import pandas as pd
import numpy as np
import os
import plotly.express as px
import plotly.graph_objects as go
pd.options.plotting.backend = 'plotly'
TEMPLATE = 'seaborn'
from sklearn.linear_model import LinearRegression
import util
We can perform all of the above directly in sklearn!

### preprocessing and linear_model

For the feature engineering step of the modeling pipeline, we will use sklearn's preprocessing module.

For the model creation step of the modeling pipeline, we will use sklearn's linear_model module, as we've already seen. linear_model.LinearRegression is an example of an estimator class.

### Transformers in sklearn

- A transformer takes in data and returns a transformed numpy array.
- sklearn only looks at the values (i.e. it calls to_numpy() on input DataFrames).
- The output of a transformer is a numpy array (never a DataFrame or Series).
- Transformers, like most things in sklearn, are classes, not functions, meaning you need to instantiate them and call their methods.

We'll continue working with our trusty tips dataset.
tips = px.data.tips()
tips.head()
| | total_bill | tip | sex | smoker | day | time | size |
|---|---|---|---|---|---|---|---|
| 0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
| 1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
| 2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
| 3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
| 4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
### Binarizer

The Binarizer transformer allows us to map a quantitative sequence to a sequence of 1s and 0s, depending on whether values are above or below a threshold.
| Property | Example | Description |
|---|---|---|
| Initialize with parameters | binar = Binarizer(thresh) | Set x=1 if x > thresh, else 0 |
| Transform data in a dataset | feat = binar.transform(data) | Binarize all columns in data |
First, we need to import the relevant class from sklearn.preprocessing. (Tip: import just the relevant classes you need from sklearn.)
from sklearn.preprocessing import Binarizer
Let's try binarizing 'total_bill'. We'll say a "large" bill is one that is strictly greater than \$20.
tips['total_bill'].head()
0    16.99
1    10.34
2    21.01
3    23.68
4    24.59
Name: total_bill, dtype: float64
First, we initialize a Binarizer object with the threshold we want.
bi = Binarizer(threshold=20)
Then, we call bi's transform method and pass it the data we'd like to transform. Note that its input and output are both 2D.
transformed_bills = bi.transform(tips[['total_bill']]) # Must give transform a 2D array/DataFrame.
transformed_bills[:5]
array([[0.],
[0.],
[1.],
[1.],
[1.]])
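As a quick sanity check, we could compute the same thing directly in pandas (equivalent to a Binarizer with a threshold of 20):

(tips['total_bill'] > 20).astype(int).head()  # 1 if the bill is strictly greater than 20, else 0.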
### StdScaler

- StdScaler (sklearn's StandardScaler) standardizes data using the mean and standard deviation of the data.
- Unlike Binarizer, StdScaler requires some knowledge (the mean and SD) of the dataset before transforming.
- As a result, we need to fit a StdScaler transformer before we can use its transform method.

### Example: StdScaler

It only makes sense to standardize the already-quantitative features of tips, so let's select just those.
tips_quant = tips[['total_bill', 'size']]
tips_quant.head()
| | total_bill | size |
|---|---|---|
| 0 | 16.99 | 2 |
| 1 | 10.34 | 3 |
| 2 | 21.01 | 3 |
| 3 | 23.68 | 2 |
| 4 | 24.59 | 4 |
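Recall that standardizing (z-scoring) a column $x$ replaces each value $x_i$ with

$$z(x_i) = \frac{x_i - \bar{x}}{\sigma_x}$$

where $\bar{x}$ and $\sigma_x$ are the mean and standard deviation of that column.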
Let's initialize a StandardScaler object.
from sklearn.preprocessing import StandardScaler
stdscaler = StandardScaler()
Note that the following does not work! The error message is very helpful.
stdscaler.transform(tips_quant)
---------------------------------------------------------------------------
NotFittedError                            Traceback (most recent call last)
...
NotFittedError: This StandardScaler instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.
Instead, we need to first call the fit method on stdscaler.
# This is like saying "determine the mean and SD of each column in tips_quant".
stdscaler.fit(tips_quant)
StandardScaler()
Now, transform will work.
# First column is 'total_bill', second column is 'size'.
tips_quant_z = stdscaler.transform(tips_quant)
tips_quant_z[:5]
array([[-0.31471131, -0.60019263],
[-1.06323531, 0.45338292],
[ 0.1377799 , 0.45338292],
[ 0.4383151 , -0.60019263],
[ 0.5407447 , 1.50695847]])
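As a shorthand, fit_transform fits and transforms the same input in a single call (we'll see it again later):

stdscaler.fit_transform(tips_quant)[:5]  # Equivalent to calling fit and then transform.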
We can also access the mean and variance stdscaler computed for each column:
stdscaler.mean_
array([19.78594262, 2.56967213])
stdscaler.var_
array([78.92813149, 0.9008835 ])
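These stored statistics are exactly what transform uses. Just to double-check, we can re-create the z-scores by hand:

# Should match tips_quant_z from above.
((tips_quant - stdscaler.mean_) / np.sqrt(stdscaler.var_)).head()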
Note that we can call transform on DataFrames other than tips_quant. We will do this often – fit a transformer on one dataset (training data) and use it to transform other datasets (test data).
stdscaler.transform(tips_quant.sample(5))
array([[-0.53758011, -0.60019263],
[-0.2257889 , -0.60019263],
[-0.3057065 , 0.45338292],
[ 0.3246295 , -0.60019263],
[-0.92028411, -0.60019263]])
### StdScaler summary

| Property | Example | Description |
|---|---|---|
| Initialize with parameters | stdscaler = StandardScaler() | z-score the data (no parameters) |
| Fit the transformer | stdscaler.fit(X) | Compute the mean and SD of X |
| Transform data in a dataset | feat = stdscaler.transform(X_new) | z-score X_new with mean and SD of X |
### OneHotEncoder

Let's keep just the categorical columns in tips.
tips_cat = tips[['sex', 'smoker', 'day', 'time']]
tips_cat.head()
| | sex | smoker | day | time |
|---|---|---|---|---|
| 0 | Female | No | Sun | Dinner |
| 1 | Male | No | Sun | Dinner |
| 2 | Male | No | Sun | Dinner |
| 3 | Male | No | Sun | Dinner |
| 4 | Female | No | Sun | Dinner |
Like StdScaler, we will need to fit our OneHotEncoder transformer before it can transform anything.
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()
ohe.fit(tips_cat)
OneHotEncoder()
We can look at the unique values (i.e. categories) in each column by using the categories_ attribute:
ohe.categories_
[array(['Female', 'Male'], dtype=object), array(['No', 'Yes'], dtype=object), array(['Fri', 'Sat', 'Sun', 'Thur'], dtype=object), array(['Dinner', 'Lunch'], dtype=object)]
ohe.transform(tips_cat)
<244x10 sparse matrix of type '<class 'numpy.float64'>' with 976 stored elements in Compressed Sparse Row format>
Since the resulting matrix is sparse – most of its elements are 0 – sklearn uses a more efficient representation than a regular numpy array. That's no issue, though:
ohe.transform(tips_cat).toarray()
array([[1., 0., 1., ..., 0., 1., 0.],
[0., 1., 1., ..., 0., 1., 0.],
[0., 1., 1., ..., 0., 1., 0.],
...,
[0., 1., 0., ..., 0., 1., 0.],
[0., 1., 1., ..., 0., 1., 0.],
[1., 0., 1., ..., 1., 1., 0.]])
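If you'd rather get a regular (dense) array back directly, you can say so when instantiating the encoder. (Depending on your sklearn version, the keyword is sparse or sparse_output.)

ohe_dense = OneHotEncoder(sparse=False)  # In newer versions of sklearn, use sparse_output=False instead.
ohe_dense.fit_transform(tips_cat)[:5]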
Notice that the column names from tips_cat are no longer stored anywhere (remember, fit converts the input to a numpy array before proceeding).
We can use the get_feature_names method on ohe to access the names of the one-hot-encoded columns, though:
ohe.get_feature_names() # x0, x1, x2, and x3 correspond to column names in tips_cat.
array(['x0_Female', 'x0_Male', 'x1_No', 'x1_Yes', 'x2_Fri', 'x2_Sat',
'x2_Sun', 'x2_Thur', 'x3_Dinner', 'x3_Lunch'], dtype=object)
pd.DataFrame(ohe.transform(tips_cat).toarray(),
columns=ohe.get_feature_names()) # If we need a DataFrame back, for some reason.
| | x0_Female | x0_Male | x1_No | x1_Yes | x2_Fri | x2_Sat | x2_Sun | x2_Thur | x3_Dinner | x3_Lunch |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 |
| 1 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 |
| 2 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 |
| 3 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 |
| 4 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 239 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 240 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 241 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 242 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 243 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 |
244 rows × 10 columns

So far, we've used transformers for feature engineering and models for prediction. We can combine these steps into a single Pipeline.
### Pipelines in sklearn

- To instantiate a Pipeline, we must provide a list with zero or more transformers followed by a single model.
- All "steps" must have fit methods, and all but the last must have transform methods.
- Example: pl = Pipeline([feat_trans1, feat_trans2, ..., mdl]).
- Once a Pipeline is instantiated, you can fit all steps (transformers and model) using a single call to the fit method: pl.fit(X, y).
- To make predictions using raw, untransformed data, use pl.predict.
- The list we provide a Pipeline with must be a list of tuples, where each tuple contains a name for the step and the transformer or estimator instance itself.

### Our first Pipeline

Let's build a Pipeline that one-hot encodes the categorical columns of tips, then fits a linear regression model.

tips_cat = tips[['sex', 'smoker', 'day', 'time']]
tips_cat.head()
| | sex | smoker | day | time |
|---|---|---|---|---|
| 0 | Female | No | Sun | Dinner |
| 1 | Male | No | Sun | Dinner |
| 2 | Male | No | Sun | Dinner |
| 3 | Male | No | Sun | Dinner |
| 4 | Female | No | Sun | Dinner |
from sklearn.pipeline import Pipeline
pl = Pipeline([
('one-hot', OneHotEncoder()),
('lin-reg', LinearRegression())
])
Now that pl is instantiated, we fit it the same way we would fit the individual steps.
pl.fit(tips_cat, tips['tip'])
Pipeline(steps=[('one-hot', OneHotEncoder()), ('lin-reg', LinearRegression())])
Now, to make predictions using raw data, all we need to do is use pl.predict:
pl.predict([['Female', 'Yes', 'Sat', 'Lunch']])
array([2.41792163])
pl.predict(tips_cat.iloc[:5])
array([3.10415414, 3.27436302, 3.27436302, 3.27436302, 3.10415414])
pl performs both feature transformation and prediction with just a single call to predict!
We can access individual "steps" of a Pipeline through the named_steps attribute:
pl.named_steps
{'one-hot': OneHotEncoder(), 'lin-reg': LinearRegression()}
pl.named_steps['one-hot'].transform(tips_cat).toarray()
array([[1., 0., 1., ..., 0., 1., 0.],
[0., 1., 1., ..., 0., 1., 0.],
[0., 1., 1., ..., 0., 1., 0.],
...,
[0., 1., 0., ..., 0., 1., 0.],
[0., 1., 1., ..., 0., 1., 0.],
[1., 0., 1., ..., 1., 1., 0.]])
pl.named_steps['lin-reg'].coef_
array([-0.08510444, 0.08510444, -0.04216238, 0.04216238, -0.20256076,
-0.12962763, 0.13756057, 0.19462781, 0.25168453, -0.25168453])
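Chaining these steps by hand gives the same results as pl.predict – this is exactly what the Pipeline does internally:

# One-hot encode, then feed the result to the fitted regression model.
pl.named_steps['lin-reg'].predict(pl.named_steps['one-hot'].transform(tips_cat.iloc[:5]))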
pl also has a score method, the same way a fit LinearRegression instance does:
pl.score(tips_cat, tips['tip'])
0.027496790201475663
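For a regression Pipeline, score returns the R² of its predictions on the given data; the standalone r2_score metric gives the same number:

from sklearn.metrics import r2_score
r2_score(tips['tip'], pl.predict(tips_cat))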
### More sophisticated Pipelines

- So far, we've applied the same transformer to every column. To apply different transformations to different columns, use a ColumnTransformer.
- We instantiate a ColumnTransformer using a list of tuples, where each tuple contains a name for the step, a transformer instance (e.g. OneHotEncoder()), and the list of columns the transformer applies to.
- ColumnTransformer is extremely useful, but it was only added to sklearn in 2018!

### Our first ColumnTransformer

from sklearn.compose import ColumnTransformer
Let's perform different transformations on the quantitative and categorical features of tips (note that we are not transforming 'tip').
tips_features = tips.drop('tip', axis=1)
tips_features.head()
| | total_bill | sex | smoker | day | time | size |
|---|---|---|---|---|---|---|
| 0 | 16.99 | Female | No | Sun | Dinner | 2 |
| 1 | 10.34 | Male | No | Sun | Dinner | 3 |
| 2 | 21.01 | Male | No | Sun | Dinner | 3 |
| 3 | 23.68 | Male | No | Sun | Dinner | 2 |
| 4 | 24.59 | Female | No | Sun | Dinner | 4 |
- We will leave the 'total_bill' column untouched.
- To the 'size' column, we will apply the Binarizer transformer with a threshold of 2 (big tables vs. small tables).
- To the categorical columns ('sex', 'smoker', 'day', 'time'), we will apply the OneHotEncoder transformer.

The transformed data will look something like this:

| | size | x0_Female | x0_Male | x1_No | x1_Yes | x2_Fri | x2_Sat | x2_Sun | x2_Thur | x3_Dinner | x3_Lunch | total_bill |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 16.99 |
| 1 | 1 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 10.34 |
| 2 | 1 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 21.01 |
| 3 | 0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 23.68 |
| 4 | 1 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 24.59 |
### Pipeline using a ColumnTransformer

Let's start by creating our ColumnTransformer.
preproc = ColumnTransformer(
transformers=[
('size', Binarizer(threshold=2), ['size']),
('categorical_cols', OneHotEncoder(), ['sex', 'smoker', 'day', 'time'])
],
remainder='passthrough' # Specify what to do with all other columns ('total_bill' here) – drop or passthrough.
)
Now, let's create a Pipeline using preproc as a transformer, and fit it:
pl = Pipeline([
('preprocessor', preproc),
('lin-reg', LinearRegression())
])
pl.fit(tips_features, tips['tip'])
Pipeline(steps=[('preprocessor',
ColumnTransformer(remainder='passthrough',
transformers=[('size',
Binarizer(threshold=2),
['size']),
('categorical_cols',
OneHotEncoder(),
['sex', 'smoker', 'day',
'time'])])),
('lin-reg', LinearRegression())])
Prediction is as easy as calling predict:
tips_features.head()
| | total_bill | sex | smoker | day | time | size |
|---|---|---|---|---|---|---|
| 0 | 16.99 | Female | No | Sun | Dinner | 2 |
| 1 | 10.34 | Male | No | Sun | Dinner | 3 |
| 2 | 21.01 | Male | No | Sun | Dinner | 3 |
| 3 | 23.68 | Male | No | Sun | Dinner | 2 |
| 4 | 24.59 | Female | No | Sun | Dinner | 4 |
# Note that we fit the Pipeline using tips_features, not tips_features.head()!
pl.predict(tips_features.head())
array([2.73813307, 2.32343202, 3.3700388 , 3.36798392, 3.74755924])
We can even call each transformer in pl['preprocessor'] individually to re-create the transformed DataFrame. (There's no practical reason to do this; it's just for illustration.)
dfs = []
for trans in pl['preprocessor'].transformers_:
if isinstance(trans[1], str) and trans[1] == 'passthrough':
df = tips_features.iloc[:, trans[2]]
else:
vals = trans[1].transform(tips_features[trans[2]])
columns = trans[2]
if str(trans[1]) == 'OneHotEncoder()':
vals = vals.toarray()
columns = trans[1].get_feature_names()
df = pd.DataFrame(vals, columns=columns)
dfs.append(df)
pd.concat(dfs, axis=1)
| | size | x0_Female | x0_Male | x1_No | x1_Yes | x2_Fri | x2_Sat | x2_Sun | x2_Thur | x3_Dinner | x3_Lunch | total_bill |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 16.99 |
| 1 | 1 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 10.34 |
| 2 | 1 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 21.01 |
| 3 | 0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 23.68 |
| 4 | 1 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 24.59 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 239 | 1 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 29.03 |
| 240 | 0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 27.18 |
| 241 | 0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 22.67 |
| 242 | 0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 17.82 |
| 243 | 0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 18.78 |
244 rows × 12 columns
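In practice, there's a shortcut: the fitted preprocessor can transform the data directly (as a plain array, without column labels):

pl['preprocessor'].transform(tips_features)[:5]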
### FunctionTransformer

A transformer you'll often use as part of a ColumnTransformer is the FunctionTransformer, which enables you to use your own functions on entire columns. Think of it as the sklearn equivalent of apply.
from sklearn.preprocessing import FunctionTransformer
f = FunctionTransformer(np.sqrt)
f.transform([1, 2, 3])
array([1. , 1.41421356, 1.73205081])
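For instance, we could log-scale 'total_bill' inside a ColumnTransformer like the one above (a sketch; the step names here are just illustrative):

preproc_log = ColumnTransformer(
    transformers=[
        ('log_bill', FunctionTransformer(np.log), ['total_bill']),
        ('categorical_cols', OneHotEncoder(), ['sex', 'smoker', 'day', 'time'])
    ],
    remainder='passthrough'
)
preproc_log.fit_transform(tips_features)[:2]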
### Summary: Pipelines

- Pipelines are powerful because they allow you to perform feature engineering and training/prediction all through a single object.
- Neural networks work similarly to sklearn Pipelines, in that they follow a well-defined sequence of steps to make predictions.

### Generalization

Let's collect two samples $\{(x_i, y_i)\}$ from the same data generating process.
np.random.seed(23) # For reproducibility.
def sample_dgp(n=100):
x = np.linspace(-2, 3, n)
y = x ** 3 + (np.random.normal(0, 3, size=n))
return pd.DataFrame({'x': x, 'y': y})
sample_1 = sample_dgp()
sample_2 = sample_dgp()
For now, let's just look at Sample 1. The relationship between $x$ and $y$ is roughly cubic; that is, $y \approx x^3$ (remember, in reality, you won't get to see the DGP).
px.scatter(sample_1, x='x', y='y', title='Sample 1', template=TEMPLATE)
Let's fit three polynomial models on Sample 1: one of degree 1, one of degree 3, and one of degree 25.
The PolynomialFeatures transformer will be helpful here.
from sklearn.preprocessing import PolynomialFeatures
# fit_transform fits and transforms the same input.
d3 = PolynomialFeatures(3)
d3.fit_transform(np.array([1, 2, 3, 4, -2]).reshape(-1, 1))
array([[ 1., 1., 1., 1.],
[ 1., 2., 4., 8.],
[ 1., 3., 9., 27.],
[ 1., 4., 16., 64.],
[ 1., -2., 4., -8.]])
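Under the hood, util.train_and_plot presumably fits a Pipeline like the one sketched below for each degree (util.py isn't shown here, and the helper name fit_poly is ours):

def fit_poly(sample, degree):
    # Polynomial features followed by linear regression, all in one Pipeline.
    pl = Pipeline([
        ('poly', PolynomialFeatures(degree)),
        ('lin-reg', LinearRegression())
    ])
    pl.fit(sample[['x']], sample['y'])
    return pl

fit_poly(sample_1, 3).predict(sample_1[['x']])[:5]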
Below, we look at our three models' predictions on Sample 1 (which they were trained on).
# Look at the definition of train_and_plot in util.py if you're curious as to how the plotting works.
fig = util.train_and_plot(train_sample=sample_1, test_sample=sample_1, degs=[1, 3, 25])
fig.update_layout(title='Trained on Sample 1, Performance on Sample 1')
The degree 25 polynomial has the lowest RMSE on Sample 1.
How do the same fit polynomials look on Sample 2?
fig = util.train_and_plot(train_sample=sample_1, test_sample=sample_2, degs=[1, 3, 25])
fig.update_layout(title='Trained on Sample 1, Performance on Sample 2')
What if we fit a degree 1, degree 3, and degree 25 polynomial on Sample 2 as well?
util.plot_multiple_models(sample_1, sample_2, degs=[1, 3, 25])
Key idea: Degree 25 polynomials seem to vary more when trained on different samples than degree 3 and 1 polynomials do.
The training data we have access to is a sample from the DGP. We are concerned with our model's ability to generalize and work well on different datasets drawn from the same DGP.
Suppose we fit a model $H$ (e.g. a degree 3 polynomial) on several different datasets from a DGP. There are three sources of error that arise:
- Bias: how far our models' predictions are, on average across training samples, from the true values.
- Model variance: how much our models' predictions vary from one training sample to another.
- Observation variance: the irreducible noise in the observations themselves.

(Figure: the classic bias-variance diagram, in which the top-left panel has both low bias and low variance.)
We'd like our models to be in the top left, but in practice that's hard to achieve!
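For squared-error loss, these three sources combine in the standard decomposition (stated here without derivation; $\sigma^2$ is the observation variance):

$$\mathbb{E}\left[\left(y_{\text{new}} - H(x_{\text{new}})\right)^2\right] = \sigma^2 + \left(\text{model bias}\right)^2 + \text{model variance}$$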
### Summary

- Pipelines in sklearn combine one or more transformers with a single model (estimator), allowing us to perform feature engineering and prediction through a single object.
- A question for next time: how do we choose the right model complexity, so that our model has the right "balance" between bias and variance?