import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn-white')
plt.rc('figure', dpi=100, figsize=(7, 5))
plt.rc('font', size=12)
Today's plan:
- sklearn overview.
- Transformers in sklearn.
- Models in sklearn.

tips = sns.load_dataset('tips')
tips
  | total_bill | tip | sex | smoker | day | time | size |
---|---|---|---|---|---|---|---|
0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
... | ... | ... | ... | ... | ... | ... | ... |
239 | 29.03 | 5.92 | Male | No | Sat | Dinner | 3 |
240 | 27.18 | 2.00 | Female | Yes | Sat | Dinner | 2 |
241 | 22.67 | 2.00 | Male | Yes | Sat | Dinner | 2 |
242 | 17.82 | 1.75 | Male | No | Sat | Dinner | 2 |
243 | 18.78 | 3.00 | Female | No | Thur | Dinner | 2 |
244 rows × 7 columns
The first model we looked at last class used a constant tip amount prediction for every table.
$$\text{predicted tip} = h^*$$

If we use squared loss, the "best" prediction is the mean of the observed tips.
mean_tip = tips['tip'].mean()
mean_tip
2.9982786885245902
def rmse(actual, pred):
    return np.sqrt(np.mean((actual - pred) ** 2))
rmse(tips['tip'], mean_tip)
1.3807999538298958
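As a quick sanity check (a minimal sketch; the candidate constants 2.5 and 3.5 below are arbitrary), any constant prediction other than the mean should have a larger RMSE:

# Compare the RMSE of a few constant predictions; the mean should win
for h in [2.5, mean_tip, 3.5]:
    print(f'h = {h:.4f}, RMSE = {rmse(tips["tip"], h):.4f}')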
rmse_dict = {}
rmse_dict['constant, tip'] = rmse(tips['tip'], mean_tip)
The next model we created used a constant tip percentage prediction for every table.
$$\text{predicted tip percentage} = h^*$$

tips = tips.assign(pct_tip=(tips['tip'] / tips['total_bill']))
tips.head()
  | total_bill | tip | sex | smoker | day | time | size | pct_tip |
---|---|---|---|---|---|---|---|---|
0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 | 0.059447 |
1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 | 0.160542 |
2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 | 0.166587 |
3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 | 0.139780 |
4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 | 0.146808 |
mean_pct_tip = tips['pct_tip'].mean()
mean_pct_tip
0.16080258172250478
Remember, our goal is to predict 'tip', but above we have predicted 'pct_tip'. 'pct_tip' is a multiplier that we apply to 'total_bill' to get 'tip'. That is:

tips['total_bill'] * mean_pct_tip
0 2.732036 1 1.662699 2 3.378462 3 3.807805 4 3.954135 ... 239 4.668099 240 4.370614 241 3.645395 242 2.865502 243 3.019872 Name: total_bill, Length: 244, dtype: float64
rmse_dict['constant, pct_tip'] = rmse(tips['tip'], tips['total_bill'] * mean_pct_tip)
rmse_dict
{'constant, tip': 1.3807999538298958, 'constant, pct_tip': 1.146820820140744}
Our next model is a simple linear regression model that predicts 'tip' from 'total_bill':
$$\text{predicted tip} = w_0 + w_1 \cdot \text{total bill}$$
By choosing a loss function and minimizing empirical risk, we can find $w_0^*$ and $w_1^*$.
In order to use a linear model, the data should have a linear association.
sns.lmplot(data=tips, x='total_bill', y='tip', line_kws={'color': 'red'});
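As a rough numerical check of that linear association (a small sketch, not required for anything below), we can compute the correlation coefficient between 'total_bill' and 'tip':

# Pearson correlation; values near ±1 indicate a strong linear association
np.corrcoef(tips['total_bill'], tips['tip'])[0, 1]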
We'll learn more about sklearn today.
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X=tips[['total_bill']], y=tips['tip'])
LinearRegression()
lr.intercept_, lr.coef_
(0.9202696135546735, array([0.10502452]))
Note that the above coefficients state that the "best way" (according to squared loss) to make tip predictions using a linear model is to assume people tip a baseline of about \$0.92, plus about 10.5% of the total bill.
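To see those coefficients in action, here's a small sketch (the \$25 bill is an arbitrary example) that applies the intercept and slope by hand and compares the result to lr.predict:

# w_0* + w_1* * total_bill, for a $25 bill
manual_pred = lr.intercept_ + lr.coef_[0] * 25
manual_pred, lr.predict([[25]])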
preds = lr.predict(X=tips[['total_bill']])
rmse_dict['simple linear model'] = rmse(tips['tip'], preds)
rmse_dict
{'constant, tip': 1.3807999538298958, 'constant, pct_tip': 1.146820820140744, 'simple linear model': 1.0178504025697377}
There's a lot of information in tips that we didn't use – 'sex', 'smoker', 'day', and 'time', for example. How might we encode this information?
tips.head()
  | total_bill | tip | sex | smoker | day | time | size | pct_tip |
---|---|---|---|---|---|---|---|---|
0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 | 0.059447 |
1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 | 0.160542 |
2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 | 0.166587 |
3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 | 0.139780 |
4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 | 0.146808 |
Which of the categorical columns in tips are nominal, and which are ordinal? 'day' is ordinal, but we can still use one-hot encoding.

tips.head()
  | total_bill | tip | sex | smoker | day | time | size | pct_tip |
---|---|---|---|---|---|---|---|---|
0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 | 0.059447 |
1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 | 0.160542 |
2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 | 0.166587 |
3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 | 0.139780 |
4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 | 0.146808 |
categorical_cols = ['sex', 'smoker', 'day', 'time']
Let's one-hot encode the categorical columns in tips. Here's one way to do so manually:
features = tips.copy().loc[:, ['total_bill', 'size']]
for c in categorical_cols:
    for val in tips[c].unique():
        features[f'{c}={val}'] = (tips[c] == val).astype(int)
features.head()
  | total_bill | size | sex=Female | sex=Male | smoker=No | smoker=Yes | day=Sun | day=Sat | day=Thur | day=Fri | time=Dinner | time=Lunch |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 16.99 | 2 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
1 | 10.34 | 3 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
2 | 21.01 | 3 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
3 | 23.68 | 2 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
4 | 24.59 | 4 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
Note that features has the same number of rows as tips; it just has more columns.
tips.shape
(244, 8)
features.shape
(244, 12)
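As an aside, pandas can build a similar encoding for us. This is a sketch assuming pd.get_dummies' default column naming (it uses an underscore, e.g. 'sex_Female', rather than our 'sex='-style names); we'll keep using the manually built features below.

# One-hot encode the categorical columns with pandas, then attach the quantitative columns
features_alt = pd.concat(
    [tips[['total_bill', 'size']], pd.get_dummies(tips[categorical_cols])],
    axis=1
)
features_alt.shape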
Let's fit a linear model using all features in features!
lr = LinearRegression()
lr.fit(X=features, y=tips['tip'])
LinearRegression()
rmse_dict['all features'] = rmse(tips['tip'], lr.predict(X=features))
rmse_dict
{'constant, tip': 1.3807999538298958, 'constant, pct_tip': 1.146820820140744, 'simple linear model': 1.0178504025697377, 'all features': 1.0051634500049156}
The RMSE of our latest model is the lowest of all linear models we've built so far (which is to be expected), but not by much. Perhaps these latest features weren't that useful.
We can visualize our latest model's predictions, too:
preds = lr.predict(X=features)
plt.figure(figsize=(10, 5))
plt.scatter(tips['total_bill'], tips['tip'], label='actual tips')
plt.scatter(tips['total_bill'], preds, label='predicted tips')
plt.xlabel('total_bill')
plt.ylabel('tip')
plt.legend();
Why don't our model's predictions lie on a straight line? 🤔
What's the relationship between 'day' and 'units sold'?

sales = pd.read_csv('data/sinusoidal.csv').sort_values(by='day').reset_index(drop=True)
sales.plot(kind='scatter', x='day', y='units sold', title='Daily Sales Volume');
sns.lmplot(data=sales, x='day', y='units sold', line_kws={'color': 'red'});
sns.residplot(data=sales, x='day', y='units sold', color='orange', label='residuals');
plt.title('Units Sold vs. Day')
plt.legend();
There is a pattern in the residual plot here, which is indicative that a linear model is not the best choice.
One idea: transform 'day' or 'units sold' so that the resulting relationship between the two variables is roughly linear. Here, we'll transform 'day' using the feature function:

def transform_day(day):
    return day + 5 * np.sin(2 * np.pi * day / 7)
sales['day_transformed'] = transform_day(sales['day'])
Let's draw two scatter plots:
- 'day_transformed' vs. 'day'.
- 'units sold' vs. 'day'.

plt.scatter(sales['day'], sales['day_transformed'], label='day_transformed')
plt.scatter(sales['day'], sales['units sold'], label='units sold')
plt.xlabel('day')
plt.legend();
While neither the orange scatter plot nor the blue scatter plot looks linear, the relationship between the $y$-values in the two scatter plots is roughly linear! Our new linear model will use 'day_transformed' as the $x$ and 'units sold' as the $y$.
sales.plot(kind='scatter', x='day_transformed', y='units sold');
sns.lmplot(data=sales, x='day_transformed', y='units sold', line_kws={'color': 'red'})
sns.residplot(data=sales, x='day_transformed', y='units sold', color='orange', label='residuals')
plt.title('Units Sold vs. Transformed Day')
plt.legend();
Now, the residual plot seems random, which is ideal!
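To quantify the improvement (a minimal sketch, reusing the rmse helper from earlier), we can fit one simple linear model on 'day' and another on 'day_transformed' and compare their RMSEs:

# Model fit on the raw 'day' feature
lr_day = LinearRegression()
lr_day.fit(X=sales[['day']], y=sales['units sold'])

# Model fit on the transformed feature
lr_transformed = LinearRegression()
lr_transformed.fit(X=sales[['day_transformed']], y=sales['units sold'])

# The transformed feature should yield a noticeably lower RMSE
(rmse(sales['units sold'], lr_day.predict(sales[['day']])),
 rmse(sales['units sold'], lr_transformed.predict(sales[['day_transformed']])))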
sklearn overview¶

sklearn¶
- scikit-learn (sklearn) implements many common steps in the feature and model creation pipeline.
- It interfaces with numpy arrays, and to an extent, pandas DataFrames.

preprocessing and linear_model¶
- For the feature creation step of the modeling pipeline, we will use sklearn's preprocessing module.
- For the model creation step of the modeling pipeline, we will use sklearn's linear_model module.
Transformers in sklearn¶
- Transformer classes take in data and output a transformed numpy array.
- sklearn only looks at the values (i.e. it calls to_numpy() on input DataFrames).
- The output of a transformer is always a numpy array (never a DataFrame or Series).
- Transformers, like most of sklearn, are classes, not functions, meaning you need to instantiate them and call their methods.

Binarizer¶
The Binarizer transformer allows us to map a quantitative sequence to a sequence of 1s and 0s, depending on whether values are above or below a threshold.
Property | Example | Description |
---|---|---|
Initialize with parameters | binar = Binarizer(thresh) | set x=1 if x > thresh, else 0 |
Transform data in a dataset | feat = binar.transform(data) | Binarize all columns in data |
First, we need to import the relevant class from sklearn.preprocessing. (Tip: import just the relevant classes you need from sklearn.)
from sklearn.preprocessing import Binarizer
Let's try binarizing 'total_bill'. We'll say a "large" bill is one that is over \$20.
tips = sns.load_dataset('tips') # To remove the columns we "engineered" before
tips['total_bill'].head()
0 16.99 1 10.34 2 21.01 3 23.68 4 24.59 Name: total_bill, dtype: float64
First, we initialize a Binarizer
object with the threshold we want.
bi = Binarizer(threshold=20)
Then, we call bi's transform method and pass it the data we'd like to transform. Note that its input and output are both 2D.
transformed_bills = bi.transform(tips[['total_bill']]) # Must pass transform a 2D array/DataFrame
transformed_bills[:5]
array([[0.], [0.], [1.], [1.], [1.]])
Cool! We can verify that it worked correctly:
((tips['total_bill'] > 20).astype(int) == transformed_bills.flatten()).all()
True
StandardScaler¶
- StandardScaler standardizes data using the mean and standard deviation of the data.
- Unlike Binarizer, StandardScaler requires some knowledge (the mean and SD) of the dataset before transforming.
- As a result, we need to fit a StandardScaler transformer before we can use its transform method.

Property | Example | Description |
---|---|---|
Initialize with parameters | stdscaler = StandardScaler() | z-scale the data (no parameters) |
Fit the transformer | stdscaler.fit(data) | compute the mean and SD of data |
Transform data in a dataset | feat = stdscaler.transform(newdata) | z-scale newdata with mean and SD of data |
It only makes sense to standardize the already-quantitative columns of tips, so let's select just those.
tips_quant = tips[['total_bill', 'tip', 'size']]
tips_quant.head()
  | total_bill | tip | size |
---|---|---|---|
0 | 16.99 | 1.01 | 2 |
1 | 10.34 | 1.66 | 3 |
2 | 21.01 | 3.50 | 3 |
3 | 23.68 | 3.31 | 2 |
4 | 24.59 | 3.61 | 4 |
Let's initialize a StandardScaler
object.
from sklearn.preprocessing import StandardScaler
stdscaler = StandardScaler()
Note that the following does not work! The error message is very helpful.
stdscaler.transform(tips_quant)
--------------------------------------------------------------------------- NotFittedError Traceback (most recent call last) /var/folders/pd/w73mdrsj2836_7gp0brr2q7r0000gn/T/ipykernel_14076/3962348888.py in <module> ----> 1 stdscaler.transform(tips_quant) ~/opt/anaconda3/lib/python3.9/site-packages/sklearn/preprocessing/_data.py in transform(self, X, copy) 878 Transformed array. 879 """ --> 880 check_is_fitted(self) 881 882 copy = copy if copy is not None else self.copy ~/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs) 61 extra_args = len(args) - len(all_args) 62 if extra_args <= 0: ---> 63 return f(*args, **kwargs) 64 65 # extra_args > 0 ~/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py in check_is_fitted(estimator, attributes, msg, all_or_any) 1096 1097 if not attrs: -> 1098 raise NotFittedError(msg % {'name': type(estimator).__name__}) 1099 1100 NotFittedError: This StandardScaler instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.
Instead, we first need to call the fit method on stdscaler.
stdscaler.fit(tips_quant)
StandardScaler()
Now, transform
will work.
# First column is 'total_bill', second column is 'tip', third column is 'size'
tips_quant_z = stdscaler.transform(tips_quant)
tips_quant_z[:5]
array([[-0.31471131, -1.43994695, -0.60019263], [-1.06323531, -0.96920534, 0.45338292], [ 0.1377799 , 0.36335554, 0.45338292], [ 0.4383151 , 0.22575414, -0.60019263], [ 0.5407447 , 0.4430195 , 1.50695847]])
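We can verify the result by hand (a small sketch; note the use of the population SD, ddof=0, to match StandardScaler):

# Manually z-scale each column using its mean and population standard deviation
manual_z = (tips_quant - tips_quant.mean()) / tips_quant.std(ddof=0)
np.allclose(manual_z, tips_quant_z)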
We can also access the mean and variance stdscaler
computed for each column:
stdscaler.mean_
array([19.78594262, 2.99827869, 2.56967213])
stdscaler.var_
array([78.92813149, 1.90660851, 0.9008835 ])
Note that we can call transform on DataFrames other than tips_quant:
stdscaler.transform(tips_quant.head(5))
array([[-0.31471131, -1.43994695, -0.60019263], [-1.06323531, -0.96920534, 0.45338292], [ 0.1377799 , 0.36335554, 0.45338292], [ 0.4383151 , 0.22575414, -0.60019263], [ 0.5407447 , 0.4430195 , 1.50695847]])
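As a shortcut, transformers also have a fit_transform method, which fits to and transforms the same data in one call. Here's a sketch that reproduces the two-step process above:

# Equivalent to calling .fit(tips_quant) and then .transform(tips_quant)
StandardScaler().fit_transform(tips_quant)[:5]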
OneHotEncoder¶
Let's keep just the categorical columns in tips.
tips_cat = tips[['sex', 'smoker', 'day', 'time']]
tips_cat.head()
  | sex | smoker | day | time |
---|---|---|---|---|
0 | Female | No | Sun | Dinner |
1 | Male | No | Sun | Dinner |
2 | Male | No | Sun | Dinner |
3 | Male | No | Sun | Dinner |
4 | Female | No | Sun | Dinner |
Like StandardScaler, we will need to fit our OneHotEncoder transformer before it can transform anything.
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()
ohe.fit(tips_cat)
OneHotEncoder()
We can look at the unique values (i.e. categories) in each column by using the categories_
attribute:
ohe.categories_
[array(['Female', 'Male'], dtype=object), array(['No', 'Yes'], dtype=object), array(['Fri', 'Sat', 'Sun', 'Thur'], dtype=object), array(['Dinner', 'Lunch'], dtype=object)]
ohe_features = ohe.transform(tips_cat)
ohe_features
<244x10 sparse matrix of type '<class 'numpy.float64'>' with 976 stored elements in Compressed Sparse Row format>
Since the resulting matrix is sparse – most of its elements are 0 – sklearn
uses a more efficient representation than a regular numpy
array. That's no issue, though:
ohe_features.toarray()
array([[1., 0., 1., ..., 0., 1., 0.], [0., 1., 1., ..., 0., 1., 0.], [0., 1., 1., ..., 0., 1., 0.], ..., [0., 1., 0., ..., 0., 1., 0.], [0., 1., 1., ..., 0., 1., 0.], [1., 0., 1., ..., 1., 1., 0.]])
Notice that the column names from tips_cat
are no longer stored anywhere (remember, fit
converts the input to a numpy
array before proceeding).
We can use the get_feature_names
method on ohe
to access the names of the one-hot-encoded columns, though:
ohe.get_feature_names() # x0, x1, x2, and x3 correspond to column names in tips_cat
array(['x0_Female', 'x0_Male', 'x1_No', 'x1_Yes', 'x2_Fri', 'x2_Sat', 'x2_Sun', 'x2_Thur', 'x3_Dinner', 'x3_Lunch'], dtype=object)
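If we'd like the encoded features back in a labeled DataFrame, one option (a sketch, not something we need below) is to combine toarray with those generated names:

# Wrap the dense one-hot-encoded matrix in a DataFrame with readable column names
pd.DataFrame(ohe_features.toarray(), columns=ohe.get_feature_names()).head()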
ohe
also has an inverse_transform
method, which takes a one-hot-encoded matrix and returns a categorical matrix.
ohe.inverse_transform(ohe_features[:10])
array([['Female', 'No', 'Sun', 'Dinner'], ['Male', 'No', 'Sun', 'Dinner'], ['Male', 'No', 'Sun', 'Dinner'], ['Male', 'No', 'Sun', 'Dinner'], ['Female', 'No', 'Sun', 'Dinner'], ['Male', 'No', 'Sun', 'Dinner'], ['Male', 'No', 'Sun', 'Dinner'], ['Male', 'No', 'Sun', 'Dinner'], ['Male', 'No', 'Sun', 'Dinner'], ['Male', 'No', 'Sun', 'Dinner']], dtype=object)
Models in sklearn¶
- sklearn model classes (called "estimators") behave like transformers, in that we need to instantiate and fit them.
- Calling fit is the same as "training our model".
- Model classes live in the linear_model package; we will start with LinearRegression.

The LinearRegression class¶
We've seen this a few times in lecture already, but never formally.
from sklearn.linear_model import LinearRegression
Important: From the documentation, we have
LinearRegression fits a linear model with coefficients w = (w1, …, wp) to minimize the residual sum of squares between the observed targets in the dataset, and the targets predicted by the linear approximation.
In other words, LinearRegression
minimizes mean squared error by default.
Additionally, by default the fit_intercept argument is set to True.
LinearRegression?
Predicting 'tip' from 'total_bill' and 'size'¶

tips.head()
  | total_bill | tip | sex | smoker | day | time | size |
---|---|---|---|---|---|---|---|
0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
First, we instantiate and fit. By calling fit, we are saying "minimize mean squared error and find $w^*$".
lr = LinearRegression()
# Note that there are two arguments to fit – X and y!
# (It is not necessary to write X= and y=)
lr.fit(X=tips[['total_bill', 'size']], y=tips['tip'])
LinearRegression()
After fitting, the predict
method is available. Note that the argument to predict
can be any 2D array with two columns.
# Predicted tip from a table of 3 that spends $25
lr.predict([[25, 3]])
array([3.56457154])
# Predicted tip from a table of 14 that spends $1000 – probably not accurate!
lr.predict([[1000, 14]])
array([96.07865069])
We can access the intercept and slopes individually. This model is of the form
$$\text{predicted tip} = w_0^* + w_1^* \cdot \text{total bill} + w_2^* \cdot \text{table size}$$
so we should expect three parameters total.
lr.intercept_
0.6689447408125031
lr.coef_
array([0.09271334, 0.19259779])
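To connect these numbers back to the model, here's a quick check (a sketch reusing the table of 3 that spends \$25 from above) that applying the intercept and coefficients by hand matches lr.predict:

# w_0* + w_1* * total_bill + w_2* * size, for total_bill = 25 and size = 3
manual_pred = lr.intercept_ + lr.coef_ @ np.array([25, 3])
manual_pred, lr.predict([[25, 3]])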
If we want to compute the RMSE of our model, we need to find its predictions on every row in the training data (tips
).
all_preds = lr.predict(tips[['total_bill', 'size']])
np.sqrt(np.mean((all_preds - tips['tip']) ** 2))
1.0072561271146618
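Equivalently (a sketch; mean_squared_error's squared=False option is available in recent sklearn versions), sklearn.metrics can compute the RMSE for us:

from sklearn.metrics import mean_squared_error

# squared=False returns the root mean squared error instead of the MSE
mean_squared_error(tips['tip'], all_preds, squared=False)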
It turns out that fit LinearRegression
objects also have a score
method:
lr.score(tips[['total_bill', 'size']], tips['tip'])
0.46786930879612587
That doesn't look like the RMSE... what is it? 🤔
Recall, all_preds
contains the predicted 'tip'
for every data point in tips
.
tips.head()
  | total_bill | tip | sex | smoker | day | time | size |
---|---|---|---|---|---|---|---|
0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
all_preds[:5]
array([2.62933992, 2.20539403, 3.19464533, 3.24959215, 3.71915687])
Method 1: $R^2 = \frac{\text{var}(\text{predicted $y$ values})}{\text{var}(\text{actual $y$ values})}$
np.var(all_preds) / np.var(tips['tip'])
0.4678693087961248
Method 2: $R^2 = \left[ \text{correlation}(\text{predicted $y$ values}, \text{actual $y$ values}) \right]^2$
Note: By correlation here, we are referring to $r$.
(np.corrcoef(all_preds, tips['tip'])) ** 2
array([[1. , 0.46786931], [0.46786931, 1. ]])
Method 3: lr.score
lr.score(tips[['total_bill', 'size']], tips['tip'])
0.46786930879612587
All three methods provide the same result!
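A fourth equivalent route (a sketch; r2_score lives in sklearn.metrics) computes $R^2$ directly from the actual and predicted values:

from sklearn.metrics import r2_score

# R^2 computed from (actual, predicted) pairs
r2_score(tips['tip'], all_preds)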
LinearRegression summary¶

Property | Example | Description |
---|---|---|
Initialize model parameters | lr = LinearRegression() | Create (empty) linear regression model |
Fit the model to the data | lr.fit(data, responses) | Determines regression coefficients |
Use model for prediction | lr.predict(newdata) | Use regression line to make predictions |
Evaluate the model | lr.score(data, responses) | Calculate the $R^2$ of the LR model |
Access model attributes | lr.coef_ | Access the regression coefficients |
Note: Once fit, estimators like LinearRegression behave just like transformers (predict <-> transform).
- Transformers in sklearn are used for feature engineering, while estimators in sklearn are used for models.
- Both need to be instantiated, and then fit.
- After fitting, use transform (for transformers) or predict (for estimators).
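To tie the pieces together, here is a minimal end-to-end sketch (a recap under the assumptions of this lecture, not a new result) that one-hot encodes the categorical columns with OneHotEncoder, stacks them next to the quantitative columns, and fits a LinearRegression; its RMSE should match the 'all features' model from earlier.

# One-hot encode the categorical columns (fit and transform in one step)
ohe = OneHotEncoder()
cat_features = ohe.fit_transform(tips[['sex', 'smoker', 'day', 'time']]).toarray()

# Stack the quantitative columns next to the one-hot-encoded ones
X = np.hstack([tips[['total_bill', 'size']].to_numpy(), cat_features])

# Fit a linear model on the combined feature matrix and evaluate with RMSE
model = LinearRegression()
model.fit(X, tips['tip'])
rmse(tips['tip'], model.predict(X))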