In [1]:
import pandas as pd
import numpy as np
import os

import plotly.express as px
import plotly.graph_objects as go
pd.options.plotting.backend = 'plotly'
TEMPLATE = 'seaborn'

from sklearn.linear_model import LinearRegression

import util

Lecture 22 – Pipelines, Generalization¶

DSC 80, Spring 2023¶

Agenda¶

  • The modeling process.
  • Transformers in sklearn.
  • Pipelines.
  • Generalization.

The modeling process¶

The modeling process¶

  1. Create (engineer) features to best reflect the "meaning" behind data.
  2. Choose a model that is appropriate to capture the relationships between features ($X$) and the target/response ($y$).
  3. Select a loss function and fit the model (i.e., determine $w^*$).
  4. Evaluate the model (e.g. using RMSE or $R^2$).

We can perform all of the above directly in sklearn!

preprocessing and linear_model¶

For the feature engineering step of the modeling pipeline, we will use sklearn's preprocessing module.

For the model creation step of the modeling pipeline, we will use sklearn's linear_model module, as we've already seen. linear_model.LinearRegression is an example of an estimator class.

Transformers in sklearn¶

Transformer classes¶

  • Transformers take in "raw" data and output "processed" data. They are used for creating features.
  • The input to a transformer should be a multi-dimensional numpy array.
    • Inputs can be DataFrames, but sklearn only looks at the values (i.e. it calls to_numpy() on input DataFrames).
  • The output of a transformer is a numpy array (never a DataFrame or Series).
  • Transformers, like most of sklearn's tools, are classes, not functions, meaning you need to instantiate them and call their methods. (A minimal sketch of this pattern follows this list.)
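For instance, here is a minimal sketch of the instantiate-fit-transform pattern, using StandardScaler (covered in more detail below) on a toy array; this isn't part of the tips case study:

# A sketch of the general transformer workflow.
import numpy as np
from sklearn.preprocessing import StandardScaler

X = np.array([[1.0], [2.0], [3.0]])  # Input must be 2D.
scaler = StandardScaler()            # 1. Instantiate the transformer (a class, not a function).
scaler.fit(X)                        # 2. Fit it on data (here, learn the column's mean and SD).
scaler.transform(X)                  # 3. Transform; the output is a numpy array, never a DataFrame.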

Case study: Restaurant tips 🧑‍🍳¶

We'll continue working with our trusty tips dataset.

In [2]:
tips = px.data.tips()
tips.head()
Out[2]:
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
4 24.59 3.61 Female No Sun Dinner 4

Example transformer: Binarizer¶

The Binarizer transformer allows us to map a quantitative sequence to a sequence of 1s and 0s, depending on whether values are above or below a threshold.

Property Example Description
Initialize with parameters binar = Binarizer(threshold=thresh) Set x=1 if x > thresh, else 0
Transform data in a dataset feat = binar.transform(data) Binarize all columns in data

First, we need to import the relevant class from sklearn.preprocessing. (Tip: import just the relevant classes you need from sklearn.)

In [3]:
from sklearn.preprocessing import Binarizer

Let's try binarizing 'total_bill'. We'll say a "large" bill is one that is strictly greater than $20.

In [4]:
tips['total_bill'].head()
Out[4]:
0    16.99
1    10.34
2    21.01
3    23.68
4    24.59
Name: total_bill, dtype: float64

First, we initialize a Binarizer object with the threshold we want.

In [5]:
bi = Binarizer(threshold=20)

Then, we call bi's transform method and pass it the data we'd like to transform. Note that its input and output are both 2D.

In [6]:
transformed_bills = bi.transform(tips[['total_bill']]) # Must give transform a 2D array/DataFrame.
transformed_bills[:5]
/Users/larry/opt/anaconda3/envs/dsc80/lib/python3.8/site-packages/sklearn/base.py:434: UserWarning: X has feature names, but Binarizer was fitted without feature names
  warnings.warn(
Out[6]:
array([[0.],
       [0.],
       [1.],
       [1.],
       [1.]])

Example transformer: StandardScaler¶

  • StandardScaler standardizes data using the mean and standard deviation of the data.
$$z(x_i) = \frac{x_i - \text{mean of } x}{\text{SD of } x}$$
  • Unlike Binarizer, StandardScaler requires some knowledge (mean and SD) of the dataset before transforming.
  • As such, we need to fit a StandardScaler transformer before we can use the transform method.
  • Typical usage: fit the transformer on a sample, then use that fitted transformer to transform future data.

Example transformer: StandardScaler¶

It only makes sense to standardize the already-quantitative features of tips, so let's select just those.

In [7]:
tips_quant = tips[['total_bill', 'size']]
tips_quant.head()
Out[7]:
total_bill size
0 16.99 2
1 10.34 3
2 21.01 3
3 23.68 2
4 24.59 4

Let's initialize a StandardScaler object.

In [8]:
from sklearn.preprocessing import StandardScaler
In [9]:
stdscaler = StandardScaler()

Note that the following does not work! The error message is very helpful.

In [10]:
stdscaler.transform(tips_quant)
---------------------------------------------------------------------------
NotFittedError                            Traceback (most recent call last)
Input In [10], in <cell line: 1>()
----> 1 stdscaler.transform(tips_quant)

File ~/opt/anaconda3/envs/dsc80/lib/python3.8/site-packages/sklearn/preprocessing/_data.py:970, in StandardScaler.transform(self, X, copy)
    955 def transform(self, X, copy=None):
    956     """Perform standardization by centering and scaling.
    957 
    958     Parameters
   (...)
    968         Transformed array.
    969     """
--> 970     check_is_fitted(self)
    972     copy = copy if copy is not None else self.copy
    973     X = self._validate_data(
    974         X,
    975         reset=False,
   (...)
    980         force_all_finite="allow-nan",
    981     )

File ~/opt/anaconda3/envs/dsc80/lib/python3.8/site-packages/sklearn/utils/validation.py:1208, in check_is_fitted(estimator, attributes, msg, all_or_any)
   1203     fitted = [
   1204         v for v in vars(estimator) if v.endswith("_") and not v.startswith("__")
   1205     ]
   1207 if not fitted:
-> 1208     raise NotFittedError(msg % {"name": type(estimator).__name__})

NotFittedError: This StandardScaler instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

Instead, we need to first call the fit method on stdscaler.

In [11]:
# This is like saying "determine the mean and SD of each column in tips_quant".
stdscaler.fit(tips_quant)
Out[11]:
StandardScaler()

Now, transform will work.

In [12]:
# First column is 'total_bill', second column is 'size'.
tips_quant_z = stdscaler.transform(tips_quant)
tips_quant_z[:5]
Out[12]:
array([[-0.31471131, -0.60019263],
       [-1.06323531,  0.45338292],
       [ 0.1377799 ,  0.45338292],
       [ 0.4383151 , -0.60019263],
       [ 0.5407447 ,  1.50695847]])

We can also access the mean and variance stdscaler computed for each column:

In [13]:
stdscaler.mean_
Out[13]:
array([19.78594262,  2.56967213])
In [14]:
stdscaler.var_
Out[14]:
array([78.92813149,  0.9008835 ])
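As a quick sanity check (a sketch, not part of the original demo), we can reproduce the transformed values by hand using the formula from above together with the fitted mean_ and var_ attributes:

# z = (x - mean) / SD, where SD = sqrt(var); this should match stdscaler.transform exactly.
manual_z = (tips_quant - stdscaler.mean_) / np.sqrt(stdscaler.var_)
np.allclose(manual_z, tips_quant_z)  # Expected to evaluate to True.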

Note that we can call transform on DataFrames other than tips_quant. We will do this often – fit a transformer on one dataset (training data) and use it to transform other datasets (test data).

In [15]:
stdscaler.transform(tips_quant.sample(5))
Out[15]:
array([[-1.02834171, -0.60019263],
       [ 0.0792487 , -0.60019263],
       [ 0.66681191,  0.45338292],
       [ 0.97410071, -0.60019263],
       [ 0.1377799 ,  0.45338292]])

StandardScaler summary¶

Property Example Description
Initialize with parameters stdscaler = StandardScaler() z-score the data (no parameters)
Fit the transformer stdscaler.fit(X) Compute the mean and SD of X
Transform data in a dataset feat = stdscaler.transform(X_new) z-score X_new with mean and SD of X

Example transformer: OneHotEncoder¶

Let's keep just the categorical columns in tips.

In [16]:
tips_cat = tips[['sex', 'smoker', 'day', 'time']]
tips_cat.head()
Out[16]:
sex smoker day time
0 Female No Sun Dinner
1 Male No Sun Dinner
2 Male No Sun Dinner
3 Male No Sun Dinner
4 Female No Sun Dinner

Like StdScaler, we will need to fit our OneHotEncoder transformer before it can transform anything.

In [17]:
from sklearn.preprocessing import OneHotEncoder
In [18]:
ohe = OneHotEncoder()
ohe.fit(tips_cat)
Out[18]:
OneHotEncoder()

We can look at the unique values (i.e. categories) in each column by using the categories_ attribute:

In [19]:
ohe.categories_
Out[19]:
[array(['Female', 'Male'], dtype=object),
 array(['No', 'Yes'], dtype=object),
 array(['Fri', 'Sat', 'Sun', 'Thur'], dtype=object),
 array(['Dinner', 'Lunch'], dtype=object)]
In [20]:
ohe.transform(tips_cat)
Out[20]:
<244x10 sparse matrix of type '<class 'numpy.float64'>'
	with 976 stored elements in Compressed Sparse Row format>

Since the resulting matrix is sparse – most of its elements are 0 – sklearn uses a more efficient representation than a regular numpy array. That's no issue, though:

In [21]:
ohe.transform(tips_cat).toarray()
Out[21]:
array([[1., 0., 1., ..., 0., 1., 0.],
       [0., 1., 1., ..., 0., 1., 0.],
       [0., 1., 1., ..., 0., 1., 0.],
       ...,
       [0., 1., 0., ..., 0., 1., 0.],
       [0., 1., 1., ..., 0., 1., 0.],
       [1., 0., 1., ..., 1., 1., 0.]])

Notice that the column names from tips_cat are no longer stored anywhere (remember, fit converts the input to a numpy array before proceeding).

We can use the get_feature_names method on ohe to access the names of the one-hot-encoded columns, though (in newer versions of sklearn, this method has been replaced by get_feature_names_out, as the warning below mentions):

In [22]:
ohe.get_feature_names() # x0, x1, x2, and x3 correspond to column names in tips_cat.
/Users/larry/opt/anaconda3/envs/dsc80/lib/python3.8/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.
  warnings.warn(msg, category=FutureWarning)
Out[22]:
array(['x0_Female', 'x0_Male', 'x1_No', 'x1_Yes', 'x2_Fri', 'x2_Sat',
       'x2_Sun', 'x2_Thur', 'x3_Dinner', 'x3_Lunch'], dtype=object)
In [23]:
pd.DataFrame(ohe.transform(tips_cat).toarray(), 
             columns=ohe.get_feature_names()) # If we need a DataFrame back, for some reason.
/Users/larry/opt/anaconda3/envs/dsc80/lib/python3.8/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.
  warnings.warn(msg, category=FutureWarning)
Out[23]:
x0_Female x0_Male x1_No x1_Yes x2_Fri x2_Sat x2_Sun x2_Thur x3_Dinner x3_Lunch
0 1.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0
1 0.0 1.0 1.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0
2 0.0 1.0 1.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0
3 0.0 1.0 1.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0
4 1.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0
... ... ... ... ... ... ... ... ... ... ...
239 0.0 1.0 1.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0
240 1.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 1.0 0.0
241 0.0 1.0 0.0 1.0 0.0 1.0 0.0 0.0 1.0 0.0
242 0.0 1.0 1.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0
243 1.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 1.0 0.0

244 rows × 10 columns

Pipelines¶


So far, we've used transformers for feature engineering and models for prediction. We can combine these steps into a single Pipeline.

Pipelines in sklearn¶

  • To instantiate a Pipeline, we must provide a list with zero or more transformers followed by a single model.
    • All "steps" must have fit methods, and all but the last must have transform methods.
    • Template: pl = Pipeline([feat_trans1, feat_trans2, ..., mdl]).

  • Once a Pipeline is instantiated, you can fit all steps (transformers and model) using a single call to the fit method.
pl.fit(X, y)
  • To make predictions using raw, untransformed data, use pl.predict.
  • The actual list we provide Pipeline with must be a list of tuples, where
    • The first element is a "name" (that we choose) for the step.
    • The second element is a transformer or estimator instance.

Our first Pipeline¶

Let's build a Pipeline that:

  • One hot encodes the categorical features in tips.
  • Fits a regression model on the one hot encoded data.
In [24]:
tips_cat = tips[['sex', 'smoker', 'day', 'time']]
tips_cat.head()
Out[24]:
sex smoker day time
0 Female No Sun Dinner
1 Male No Sun Dinner
2 Male No Sun Dinner
3 Male No Sun Dinner
4 Female No Sun Dinner
In [25]:
from sklearn.pipeline import Pipeline
In [26]:
pl = Pipeline([
    ('one-hot', OneHotEncoder()),
    ('lin-reg', LinearRegression())
])

Now that pl is instantiated, we fit it the same way we would fit the individual steps.

In [27]:
pl.fit(tips_cat, tips['tip'])
Out[27]:
Pipeline(steps=[('one-hot', OneHotEncoder()), ('lin-reg', LinearRegression())])

Now, to make predictions using raw data, all we need to do is use pl.predict:

In [28]:
pl.predict([['Female', 'Yes', 'Sat', 'Lunch']])
/Users/larry/opt/anaconda3/envs/dsc80/lib/python3.8/site-packages/sklearn/base.py:441: UserWarning: X does not have valid feature names, but OneHotEncoder was fitted with feature names
  warnings.warn(
Out[28]:
array([2.41792163])
In [29]:
pl.predict(tips_cat.iloc[:5])
Out[29]:
array([3.10415414, 3.27436302, 3.27436302, 3.27436302, 3.10415414])

pl performs both feature transformation and prediction with just a single call to predict!

We can access individual "steps" of a Pipeline through the named_steps attribute:

In [30]:
pl.named_steps
Out[30]:
{'one-hot': OneHotEncoder(), 'lin-reg': LinearRegression()}
In [31]:
pl.named_steps['one-hot'].transform(tips_cat).toarray()
Out[31]:
array([[1., 0., 1., ..., 0., 1., 0.],
       [0., 1., 1., ..., 0., 1., 0.],
       [0., 1., 1., ..., 0., 1., 0.],
       ...,
       [0., 1., 0., ..., 0., 1., 0.],
       [0., 1., 1., ..., 0., 1., 0.],
       [1., 0., 1., ..., 1., 1., 0.]])
In [32]:
pl.named_steps['lin-reg'].coef_
Out[32]:
array([-0.08510444,  0.08510444, -0.04216238,  0.04216238, -0.20256076,
       -0.12962763,  0.13756057,  0.19462781,  0.25168453, -0.25168453])

pl also has a score method (which, for regression, returns $R^2$), the same way a fit LinearRegression instance does:

In [33]:
pl.score(tips_cat, tips['tip'])
Out[33]:
0.027496790201475663
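As a quick check (a sketch; sklearn.metrics.r2_score isn't used elsewhere in this lecture), this value should match the $R^2$ of the Pipeline's predictions computed directly:

# For regressors, score returns R^2, so this should agree with pl.score above.
from sklearn.metrics import r2_score
r2_score(tips['tip'], pl.predict(tips_cat))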

More sophisticated Pipelines¶

  • In the previous example, we one hot encoded every input column. What if we want to perform different transformations on different columns?
  • Solution: Use a ColumnTransformer.
    • Instantiate a ColumnTransformer using a list of tuples, where:
      • The first element is a "name" we choose for the transformer.
      • The second element is a transformer instance (e.g. OneHotEncoder()).
      • The third element is a list of relevant column names.
  • ColumnTransformer is extremely useful, but it was only added to sklearn in 2018!

Planning our first ColumnTransformer¶

In [34]:
from sklearn.compose import ColumnTransformer

Let's perform different transformations on the quantitative and categorical features of tips (note that we are not transforming 'tip').

In [35]:
tips_features = tips.drop('tip', axis=1)
tips_features.head()
Out[35]:
total_bill sex smoker day time size
0 16.99 Female No Sun Dinner 2
1 10.34 Male No Sun Dinner 3
2 21.01 Male No Sun Dinner 3
3 23.68 Male No Sun Dinner 2
4 24.59 Female No Sun Dinner 4
  • We will leave the 'total_bill' column untouched.
  • To the 'size' column, we will apply the Binarizer transformer with a threshold of 2 (big tables vs. small tables).
  • To the categorical columns, we will apply the OneHotEncoder transformer.
  • In essence, we will create a transformer that reproduces the following DataFrame:
size x0_Female x0_Male x1_No x1_Yes x2_Fri x2_Sat x2_Sun x2_Thur x3_Dinner x3_Lunch total_bill
0 0 1.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 16.99
1 1 0.0 1.0 1.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 10.34
2 1 0.0 1.0 1.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 21.01
3 0 0.0 1.0 1.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 23.68
4 1 1.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 24.59

Building a Pipeline using a ColumnTransformer¶

Let's start by creating our ColumnTransformer.

In [36]:
preproc = ColumnTransformer(
    transformers=[
        ('size', Binarizer(threshold=2), ['size']),
        ('categorical_cols', OneHotEncoder(), ['sex', 'smoker', 'day', 'time'])
    ],
    remainder='passthrough' # Specify what to do with all other columns ('total_bill' here) – drop or passthrough.
)

Now, let's create a Pipeline using preproc as a transformer, and fit it:

In [37]:
pl = Pipeline([
    ('preprocessor', preproc), 
    ('lin-reg', LinearRegression())
])
In [38]:
pl.fit(tips_features, tips['tip'])
Out[38]:
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('size',
                                                  Binarizer(threshold=2),
                                                  ['size']),
                                                 ('categorical_cols',
                                                  OneHotEncoder(),
                                                  ['sex', 'smoker', 'day',
                                                   'time'])])),
                ('lin-reg', LinearRegression())])

Prediction is as easy as calling predict:

In [39]:
tips_features.head()
Out[39]:
total_bill sex smoker day time size
0 16.99 Female No Sun Dinner 2
1 10.34 Male No Sun Dinner 3
2 21.01 Male No Sun Dinner 3
3 23.68 Male No Sun Dinner 2
4 24.59 Female No Sun Dinner 4
In [40]:
# Note that we fit the Pipeline using tips_features, not tips_features.head()!
pl.predict(tips_features.head())
Out[40]:
array([2.73813307, 2.32343202, 3.3700388 , 3.36798392, 3.74755924])

We can even call each transformer in pl['preprocessor'] individually to re-create the transformed DataFrame. (There's no practical reason to do this; it's just for illustration.)

In [41]:
dfs = []
# Each element of transformers_ is a (name, transformer, columns) tuple.
for name, transformer, cols in pl['preprocessor'].transformers_:
    if transformer == 'passthrough':
        # Passed-through columns are referred to by position, not by name.
        df = tips_features.iloc[:, cols]
    else:
        vals = transformer.transform(tips_features[cols])
        columns = cols
        if isinstance(transformer, OneHotEncoder):
            vals = vals.toarray()
            columns = transformer.get_feature_names()
        df = pd.DataFrame(vals, columns=columns)
    dfs.append(df)

pd.concat(dfs, axis=1)
/Users/larry/opt/anaconda3/envs/dsc80/lib/python3.8/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.
  warnings.warn(msg, category=FutureWarning)
Out[41]:
size x0_Female x0_Male x1_No x1_Yes x2_Fri x2_Sat x2_Sun x2_Thur x3_Dinner x3_Lunch total_bill
0 0 1.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 16.99
1 1 0.0 1.0 1.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 10.34
2 1 0.0 1.0 1.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 21.01
3 0 0.0 1.0 1.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 23.68
4 1 1.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 1.0 0.0 24.59
... ... ... ... ... ... ... ... ... ... ... ... ...
239 1 0.0 1.0 1.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 29.03
240 0 1.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 1.0 0.0 27.18
241 0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 0.0 1.0 0.0 22.67
242 0 0.0 1.0 1.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 17.82
243 0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 1.0 0.0 18.78

244 rows × 12 columns

Aside: FunctionTransformer¶

A transformer you'll often use as part of a ColumnTransformer is the FunctionTransformer, which enables you to use your own functions on entire columns. Think of it as the sklearn equivalent of apply.

In [42]:
from sklearn.preprocessing import FunctionTransformer
In [43]:
f = FunctionTransformer(np.sqrt)
f.transform([1, 2, 3])
Out[43]:
array([1.        , 1.41421356, 1.73205081])
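For example, here's a sketch (not something we'll rely on later) of how a FunctionTransformer could slot into a ColumnTransformer, log-transforming 'total_bill' while one hot encoding the categorical columns:

# A sketch: log-transform 'total_bill', one hot encode the categorical columns, drop everything else.
log_preproc = ColumnTransformer(
    transformers=[
        ('log_bill', FunctionTransformer(np.log), ['total_bill']),
        ('categorical_cols', OneHotEncoder(), ['sex', 'smoker', 'day', 'time'])
    ],
    remainder='drop'
)
log_preproc.fit_transform(tips)[:2]  # First two rows of the transformed output.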

Summary: Pipelines¶

  • Pipelines are powerful because they allow you to perform feature engineering and training/prediction all through a single object.
  • It's important to understand what each step of a Pipeline does. Neural networks work similarly to sklearn Pipelines, in that they follow a well-defined sequence of steps to make predictions.

Generalization¶

Motivation¶

  • You and Billy are studying for an upcoming exam. You both decide to test your understanding by taking a practice exam.
    • Your logic: If you do well on the practice exam, you should do well on the real exam.
  • You each take the practice exam once and look at the solutions afterwards.
  • Your strategy: Memorize the answers to all practice exam questions, e.g. "Question 1: A; Question 2: C; Question 3: A."
  • Billy's strategy: Learn high-level concepts from the solutions, e.g. "data are NMAR if the likelihood of missingness depends on the missing values themselves."
  • Who will do better on the practice exam? Who will probably do better on the real exam? 🧐

Evaluating the quality of a model¶

  • So far, we've computed the RMSE (and $R^2$) of our fit regression models on the data that we used to fit them, i.e. the training data.
  • We've said that Model A is better than Model B if Model A's RMSE is lower than Model B's RMSE.
    • Remember, our training data is a sample from the data generating process.
    • Just because a model fits the training data well doesn't mean it will generalize and work well on similar, unseen samples!

Example: Overfitting and underfitting¶

Let's collect two samples $\{(x_i, y_i)\}$ from the same data generating process.

In [44]:
np.random.seed(23) # For reproducibility.

def sample_dgp(n=100):
    x = np.linspace(-2, 3, n)
    y = x ** 3 + (np.random.normal(0, 3, size=n))
    return pd.DataFrame({'x': x, 'y': y})

sample_1 = sample_dgp()
sample_2 = sample_dgp()

For now, let's just look at Sample 1. The relationship between $x$ and $y$ is roughly cubic; that is, $y \approx x^3$ (remember, in reality, you won't get to see the DGP).

In [45]:
px.scatter(sample_1, x='x', y='y', title='Sample 1', template=TEMPLATE)

Polynomial regression¶

Let's fit three polynomial models on Sample 1:

  • Degree 1.
  • Degree 3.
  • Degree 25.

The PolynomialFeatures transformer will be helpful here.

In [46]:
from sklearn.preprocessing import PolynomialFeatures
In [47]:
# fit_transform fits the transformer and transforms the input, all in one step.
d3 = PolynomialFeatures(3)
d3.fit_transform(np.array([1, 2, 3, 4, -2]).reshape(-1, 1))
Out[47]:
array([[ 1.,  1.,  1.,  1.],
       [ 1.,  2.,  4.,  8.],
       [ 1.,  3.,  9., 27.],
       [ 1.,  4., 16., 64.],
       [ 1., -2.,  4., -8.]])
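For reference, each of the polynomial models below can be expressed as a Pipeline; here's a sketch of fitting the degree 3 model on Sample 1 (util.train_and_plot does something along these lines internally, though its details may differ):

# A sketch: a degree 3 polynomial model as a Pipeline.
deg3_pl = Pipeline([
    ('poly', PolynomialFeatures(3)),
    ('lin-reg', LinearRegression())
])
deg3_pl.fit(sample_1[['x']], sample_1['y'])
deg3_pl.predict(sample_1[['x']])[:5]  # Predictions for the first five x values in Sample 1.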

Below, we look at our three models' predictions on Sample 1 (which they were trained on).

In [48]:
# Look at the definition of train_and_plot in util.py if you're curious as to how the plotting works.
fig = util.train_and_plot(train_sample=sample_1, test_sample=sample_1, degs=[1, 3, 25])
fig.update_layout(title='Trained on Sample 1, Performance on Sample 1')

The degree 25 polynomial has the lowest RMSE on Sample 1.

How do the same fit polynomials look on Sample 2?

In [49]:
fig = util.train_and_plot(train_sample=sample_1, test_sample=sample_2, degs=[1, 3, 25])
fig.update_layout(title='Trained on Sample 1, Performance on Sample 2')
  • The degree 3 polynomial has the lowest RMSE on Sample 2.
  • Note that we didn't get to see Sample 2 when fitting our models!
  • As such, it seems that the degree 3 polynomial generalizes better to unseen data than the degree 25 polynomial does.

What if we fit a degree 1, degree 3, and degree 25 polynomial on Sample 2 as well?

In [50]:
util.plot_multiple_models(sample_1, sample_2, degs=[1, 3, 25])

Key idea: Degree 25 polynomials seem to vary more when trained on different samples than degree 3 and 1 polynomials do.

Bias and variance¶

The training data we have access to is a sample from the DGP. We are concerned with our model's ability to generalize and work well on different datasets drawn from the same DGP.

Suppose we fit a model $H$ (e.g. a degree 3 polynomial) on several different datasets from a DGP. There are three sources of error that arise:

  • ⭐️ Bias: The expected deviation between a predicted value and an actual value.
    • In other words, for a given $x_i$, how far is $H(x_i)$ from the true $y_i$, on average?
    • Low bias is good! ✅
    • High bias is a sign of underfitting, i.e. that our model is too basic to capture the relationship between our features and response.
  • ⭐️ Model variance ("variance"): The variance of a model's predictions.
    • In other words, for a given $x_i$, what is the variance of $H(x_i)$ across all datasets?
    • Low model variance is good! ✅
    • High model variance is a sign of overfitting, i.e. that our model is too complicated and is prone to fitting to the noise in our training data.
  • Observation variance: The variance due to the random noise in the process we are trying to model (e.g. measurement error). We can't control this without collecting more (or better) data! (The three sources combine as sketched below.)
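For squared error, these three sources combine into the classic decomposition (stated here without proof, as a sketch of what's to come):

$$\text{expected squared error} = \text{observation variance} + (\text{model bias})^2 + \text{model variance}$$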

In the classic bias-variance dartboard picture, suppose:

  • The red bulls-eye represents your true weight and height 🧍.
  • The dark blue darts represent predictions of your weight and height using different models that were fit on the same DGP.

We'd like our models to have both low bias and low variance, but in practice that's hard to achieve!

Summary, next time¶

Summary¶

  • Pipelines in sklearn combine one or more transformers with a single model (estimator), allowing us to perform feature engineering and prediction through a single object.
  • We want to build models that generalize well to unseen data.
    • Models that have high bias are too simple to represent complex relationships in data, and underfit.
    • Models that have high variance are overly complex for the relationships in the data, and vary a lot when fit on different datasets. Such models overfit to the training data.

Next time¶

How do we choose the right model complexity, so that our model has the right "balance" between bias and variance?