Lecture 25 – Grid Search, Multicollinearity, Examples

DSC 80, Winter 2023

Announcements

Agenda

Example: Predicting diabetes

Recall, we started with a relatively simple decision tree.

Goal

Create a DecisionTreeClassifier that generalizes well to unseen data, by choosing hyperparameters that maximize average validation accuracy.

GridSearchCV takes in an un-fit instance of an estimator and a dictionary of hyperparameter values to try, and performs $k$-fold cross-validation to find the combination of hyperparameters with the best average validation performance.

The following dictionary contains the values we're considering for each hyperparameter. (We're using GridSearchCV with 3 hyperparameters, but we could use it with even just a single hyperparameter.)
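For illustration, a dictionary like the one below would work; the specific hyperparameters and values here are placeholders, chosen so that there are $10 \cdot 7 \cdot 2 = 140$ combinations:

```python
# Hypothetical grid of hyperparameter values to try.
# 10 * 7 * 2 = 140 combinations in total.
hyperparameters = {
    'max_depth': [2, 3, 4, 5, 7, 10, 13, 15, 18, None],   # 10 values
    'min_samples_split': [2, 5, 10, 20, 50, 100, 200],    # 7 values
    'criterion': ['gini', 'entropy'],                      # 2 values
}
```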

Note that there are 140 combinations of hyperparameters we need to try. We need to find the best combination of hyperparameters, not the best value for each hyperparameter individually.

GridSearchCV needs to be instantiated and fit.

After being fit, the best_params_ attribute provides us with the best combination of hyperparameters to use.
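A minimal sketch, assuming the dictionary above is named hyperparameters and that X_train and y_train hold our training data:

```python
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# cv=5 runs 5-fold cross-validation for every combination of hyperparameters.
searcher = GridSearchCV(DecisionTreeClassifier(), hyperparameters, cv=5)
searcher.fit(X_train, y_train)

# The combination of hyperparameters with the best average validation accuracy.
searcher.best_params_
```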

All of the intermediate results – validation accuracies for each fold, mean validation accuracies, etc. – are stored in the cv_results_ attribute:
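For instance, it can be inspected as a DataFrame (assuming the searcher defined above):

```python
import pandas as pd

# One row per combination of hyperparameters; columns include the validation
# accuracy on each fold and the mean validation accuracy across folds.
pd.DataFrame(searcher.cv_results_)
```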

Note that the above DataFrame tells us that 5 * 140 = 700 models were trained in total!

Now that we've found the best combination of hyperparameters, we should fit a decision tree instance using those hyperparameters on our entire training set.
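One way to do that, continuing with the assumed X_train and y_train names from above:

```python
# Fit a fresh decision tree on the entire training set, using the best
# combination of hyperparameters found by the grid search.
final_tree = DecisionTreeClassifier(**searcher.best_params_)
final_tree.fit(X_train, y_train)
```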

Remember, searcher itself is a model object (we had to fit it). After performing $k$-fold cross-validation, behind the scenes, searcher is trained on the entire training set using the optimal combination of hyperparameters.

In other words, searcher makes the same predictions that final_tree does!
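A quick check (hypothetical, assuming X_test comes from the same train-test split):

```python
import numpy as np

# searcher, refit on the full training set, agrees with final_tree on every test point.
np.all(searcher.predict(X_test) == final_tree.predict(X_test))
```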

Choosing possible hyperparameter values

Key takeaways

Multicollinearity

Heights and weights

We have a dataset containing the weights and heights of 25,000 18-year-olds, taken from here.

Motivating example

Suppose we fit a simple linear regression model that uses height in inches to predict weight in pounds.

$$\text{predicted weight (pounds)} = w_0 + w_1 \cdot \text{height (inches)}$$

$w_0^*$ and $w_1^*$ are shown below, along with the model's testing RMSE.
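A sketch of how these could be computed, assuming X_train and X_test are DataFrames with a 'Height (Inches)' column, y_train and y_test contain weights in pounds, and a train-test split has already been performed (the variable names are assumptions):

```python
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

# Simple linear regression: predict weight (pounds) from height (inches).
lr_one = LinearRegression()
lr_one.fit(X_train[['Height (Inches)']], y_train)

# w0* and w1*.
lr_one.intercept_, lr_one.coef_

# Testing RMSE.
np.sqrt(mean_squared_error(y_test, lr_one.predict(X_test[['Height (Inches)']])))
```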

Now, suppose we fit another regression model that uses both height in inches and height in centimeters to predict weight.

$$\text{predicted weight (pounds)} = w_0 + w_1 \cdot \text{height (inches)} + w_2 \cdot \text{height (cm)}$$

What are $w_0^*$, $w_1^*$, $w_2^*$, and the model's testing RMSE?
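Under the same naming assumptions, a sketch of the second model:

```python
# The redundant feature: height in centimeters is just 2.54 times height in inches.
X_train['Height (cm)'] = 2.54 * X_train['Height (Inches)']
X_test['Height (cm)'] = 2.54 * X_test['Height (Inches)']

lr_two = LinearRegression()
lr_two.fit(X_train[['Height (Inches)', 'Height (cm)']], y_train)

# w0*, (w1*, w2*), and the testing RMSE.
lr_two.intercept_, lr_two.coef_
np.sqrt(mean_squared_error(y_test, lr_two.predict(X_test[['Height (Inches)', 'Height (cm)']])))
```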

Observation: The intercept is the same as before (roughly -81.17), as is the testing RMSE. However, the coefficients on 'Height (Inches)' and 'Height (cm)' are massive in size!

What's going on?

Redundant features

Let's use simpler numbers for illustration. Suppose in the first model, $w_0^* = -80$ and $w_1^* = 3$.

$$\text{predicted weight (pounds)} = -80 + 3 \cdot \text{height (inches)}$$

In the second model, we have:

$$\begin{align*}\text{predicted weight (pounds)} &= w_0^* + w_1^* \cdot \text{height (inches)} + w_2^* \cdot \text{height (cm)} \\ &= w_0^* + w_1^* \cdot \text{height (inches)} + w_2^* \cdot \big( 2.54 \cdot \text{height (inches)} \big) \\ &= w_0^* + \left(w_1^* + 2.54 \cdot w_2^* \right) \cdot \text{height (inches)} \end{align*}$$

In the first model, we already found the "best" intercept ($-80$) and slope ($3$) in a linear model that uses height in inches to predict weight.

So, as long as $w_1^* + 2.54 \cdot w_2^* = 3$ in the second model, the second model's training predictions will be the same as the first, and hence they will also minimize RMSE.

Infinitely many parameter choices

Issue: There are an infinite number of $w_1^*$ and $w_2^*$ values that satisfy $w_1^* + 2.54 \cdot w_2^* = 3$!

$$\text{predicted weight} = -80 - 10 \cdot \text{height (inches)} + \frac{13}{2.54} \cdot \text{height (cm)}$$
$$\text{predicted weight} = -80 + 10 \cdot \text{height (inches)} - \frac{7}{2.54} \cdot \text{height (cm)}$$

Multicollinearity

One hot encoding and multicollinearity

When we one hot encode categorical features, we create several redundant columns.

Aside: You can use pd.get_dummies in EDA, but don't use it for modeling (instead, use OneHotEncoder, which works with Pipelines).

Remember that under the hood, LinearRegression() creates a design matrix that has a column of all ones (for the intercept term). Let's add that column above for demonstration.

Now, many of the above columns can be written as linear combinations of other columns!
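For instance, supposing the one hot encoded DataFrame (with the column of all ones added) is called ohe, and using hypothetical column names from the tips dataset:

```python
# With the all-ones column present, one column per categorical feature is redundant:
#   smoker_Yes = 1 - smoker_No
#   day_Thur   = 1 - day_Fri - day_Sat - day_Sun
(ohe['smoker_Yes'] == 1 - ohe['smoker_No']).all()
(ohe['day_Thur'] == 1 - ohe['day_Fri'] - ohe['day_Sat'] - ohe['day_Sun']).all()
```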

Note that if we get rid of the four redundant columns above, the rank of our design matrix – that is, the number of linearly independent columns it has – does not change (and so the "predictive power" of our features doesn't change either).

However, without the redundant columns, there is only a single unique set of optimal parameters $w^*$, and the multicollinearity is no more.

Aside: Most one hot encoding techniques (including OneHotEncoder) have a built-in drop argument, which allows you to specify that you'd like to drop one column per categorical feature.
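A sketch, assuming the tips DataFrame and that tips_features contains its four categorical columns:

```python
from sklearn.preprocessing import OneHotEncoder

# Hypothetical: the categorical columns in the tips dataset.
tips_features = ['sex', 'smoker', 'day', 'time']

# drop='first' drops the first category of each feature, so the resulting
# matrix has one fewer column per categorical feature.
ohe_drop = OneHotEncoder(drop='first')
ohe_drop.fit_transform(tips[tips_features]).toarray()
```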

The above array only has $(2-1) + (2-1) + (4-1) + (2-1) = 6$ columns, rather than $2 + 2 + 4 + 2 = 10$, since we dropped one column per categorical feature in tips_features.

Key takeaways

Example: Modeling using text features

Example: Predicting reviews

We have a dataset containing Amazon reviews and ratings for patio, lawn, and gardening products. (Aside: Here is a good source for such data.)

Goal: Use a review's 'summary' to predict its 'overall' rating.

Note that there are five possible 'overall' rating values – 1, 2, 3, 4, 5 – not just two. As such, this is an instance of multiclass classification.

Question: What is the worst possible accuracy we should expect from a ratings classifier, given the above distribution?
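One way to answer this, assuming the DataFrame of reviews is named reviews: compute the accuracy of a baseline classifier that always predicts the most common rating.

```python
# The proportion of reviews with the most common 'overall' rating.
# A classifier that always predicts that rating achieves this accuracy,
# so any reasonable classifier should do at least this well.
reviews['overall'].value_counts(normalize=True).max()
```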

Aside: CountVectorizer

Entries in the 'summary' column are not currently quantitative! We can use the bag of words encoding to create quantitative features out of each 'summary'.

Instead of performing a bag of words encoding manually as we did before, we can rely on sklearn's CountVectorizer. (There is also a TfidfVectorizer.)
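A sketch of how it's used; the corpus below is a made-up placeholder, not the one from lecture:

```python
from sklearn.feature_extraction.text import CountVectorizer

# A made-up corpus, just for illustration.
example_corp = ['billy likes your dog',
                'your dog likes to eat',
                'billy has a cool dog']

count_vec = CountVectorizer()
count_vec.fit(example_corp)

# Maps each word in the learned vocabulary to a column index.
count_vec.vocabulary_

# One row per document, one column per word; entries are word counts.
count_vec.transform(example_corp).toarray()
```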

count_vec learned a vocabulary from the corpus we fit it on.

Note that the values in count_vec.vocabulary_ correspond to the positions of the columns in count_vec.transform(example_corp).toarray(), i.e. 'billy' is the first column and 'your' is the last column.

Creating an initial Pipeline

Let's build a Pipeline that takes in summaries and overall ratings, transforms the summaries into word counts using CountVectorizer, and fits a random forest classifier to predict ratings.

But first, a train-test split (like always).

To start, we'll create a random forest with 7 trees (n_estimators), each of which has a maximum depth of 8 (max_depth).
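A sketch of such a Pipeline, assuming X_train and y_train contain the training summaries and ratings, and that the steps are named 'cv' and 'clf' (both names are assumptions):

```python
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

pl = Pipeline([
    # Turn each summary into a vector of word counts.
    ('cv', CountVectorizer()),
    # Random forest with 7 trees, each of depth at most 8.
    ('clf', RandomForestClassifier(n_estimators=7, max_depth=8)),
])

pl.fit(X_train, y_train)

# Training and testing accuracy.
pl.score(X_train, y_train), pl.score(X_test, y_test)
```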

The accuracy of our random forest is just above 50%, on both the training and testing sets. We'd get the same performance by predicting a rating of 5 every time!

Choosing tree depth via GridSearchCV

We arbitrarily chose max_depth=8 before, but it seems like that isn't working well. Let's perform a grid search to find the max_depth with the best generalization performance.

Note that while pl has already been fit, we can still give it to GridSearchCV, which will repeatedly re-fit it during cross-validation.
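A sketch of that grid search, assuming the 'clf' step name from the Pipeline sketch above; the candidate depths and number of folds are placeholders:

```python
from sklearn.model_selection import GridSearchCV

# In a Pipeline, hyperparameters are referenced as <step name>__<parameter name>.
hyperparameters = {
    'clf__max_depth': [8, 16, 32, 64, 128, None],  # placeholder values
}

grid_searcher = GridSearchCV(pl, hyperparameters, cv=5)
grid_searcher.fit(X_train, y_train)

grid_searcher.best_params_
```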

Recall, fit GridSearchCV objects are estimators on their own as well. This means we can compute the training and testing accuracies of the "best" random forest directly:
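For instance, continuing with the assumed names from above:

```python
# grid_searcher was refit on the entire training set with the best max_depth,
# so we can use it directly to compute accuracies.
grid_searcher.score(X_train, y_train), grid_searcher.score(X_test, y_test)
```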

Still not much better on the testing set! 🤷

Training and validation accuracy vs. depth

Below, we plot how training and validation accuracy varied with tree depth. Note that the $y$-axis here is accuracy, and that larger accuracies are better (unlike with RMSE, where smaller was better).

Unsurprisingly, training accuracy kept increasing, while validation accuracy leveled off around a depth of ~100.

Summary, next time

Summary

Next time

Metrics for measuring the performance of classifiers other than accuracy.