from dsc80_utils import *
Announcements 📣¶
- Midterm Survey due tonight: https://forms.gle/8pMeYeHk6ktLa4867
- If ≥80% of the class fills it out, everyone will get +1% EC on the midterm.
- Lab 6 is due tomorrow.
- The Final Project will be released next week.
Agenda 📆¶
- Last bits of TF-IDF.
- Modeling.
- Case study: Restaurant tips 🧑🍳.
- Regression in sklearn.
- Announcement about HDSI career services.
Conceptually, today will mostly be review from DSC 40A, but we'll introduce a few new practical tools that we'll build upon next week.
State of the Union addresses¶
from pathlib import Path

sotu_txt = Path('data') / 'stateoftheunion1790-2023.txt'
sotu = sotu_txt.read_text()
# Speeches are separated by '***'; the first chunk is file header text, so skip it.
speeches = sotu.split('\n***\n')[1:]
import re
def extract_struct(speech):
    # Split into at most 4 pieces: title, president, date, and contents.
    L = speech.strip().split('\n', maxsplit=3)
    # Keep only letters, apostrophes, and spaces in the contents, then lowercase.
    L[3] = re.sub(r"[^A-Za-z' ]", ' ', L[3]).lower()
    return dict(zip(['speech', 'president', 'date', 'contents'], L))
speeches_df = pd.DataFrame(list(map(extract_struct, speeches)))
speeches_df
 | speech | president | date | contents |
---|---|---|---|---|
0 | State of the Union Address | George Washington | January 8, 1790 | fellow citizens of the senate and house of re... |
1 | State of the Union Address | George Washington | December 8, 1790 | fellow citizens of the senate and house of re... |
2 | State of the Union Address | George Washington | October 25, 1791 | fellow citizens of the senate and house of re... |
... | ... | ... | ... | ... |
230 | State of the Union Address | Joseph R. Biden Jr. | April 28, 2021 | thank you thank you thank you good to be b... |
231 | State of the Union Address | Joseph R. Biden Jr. | March 1, 2022 | madam speaker madam vice president and our ... |
232 | State of the Union Address | Joseph R. Biden Jr. | February 7, 2023 | mr speaker madam vice president our firs... |
233 rows × 4 columns
Finding the most important words in each speech¶
Here, a "document" is a speech. We have 233 documents.
speeches_df
 | speech | president | date | contents |
---|---|---|---|---|
0 | State of the Union Address | George Washington | January 8, 1790 | fellow citizens of the senate and house of re... |
1 | State of the Union Address | George Washington | December 8, 1790 | fellow citizens of the senate and house of re... |
2 | State of the Union Address | George Washington | October 25, 1791 | fellow citizens of the senate and house of re... |
... | ... | ... | ... | ... |
230 | State of the Union Address | Joseph R. Biden Jr. | April 28, 2021 | thank you thank you thank you good to be b... |
231 | State of the Union Address | Joseph R. Biden Jr. | March 1, 2022 | madam speaker madam vice president and our ... |
232 | State of the Union Address | Joseph R. Biden Jr. | February 7, 2023 | mr speaker madam vice president our firs... |
233 rows × 4 columns
A rough sketch of what we'll compute:
for each word t:
    for each speech d:
        compute tfidf(t, d)
unique_words = speeches_df['contents'].str.split().explode().value_counts()
# Take the top 500 most common words for speed
unique_words = unique_words.iloc[:500].index
unique_words
Index(['the', 'of', 'to', 'and', 'in', 'a', 'that', 'for', 'be', 'our', ... 'desire', 'call', 'submitted', 'increasing', 'months', 'point', 'trust', 'throughout', 'set', 'object'], dtype='object', name='contents', length=500)
💡 Pro-Tip: Using tqdm¶
This code takes a while to run, so we'll use the tqdm package to track its progress. (Install it with mamba install tqdm if needed.)
from tqdm.notebook import tqdm

tfidf_dict = {}
tf_denom = speeches_df['contents'].str.split().str.len()

# Wrap the sequence with `tqdm()` to display a progress bar.
for word in tqdm(unique_words):
    re_pat = fr' {word} '  # Imperfect pattern for speed.
    tf = speeches_df['contents'].str.count(re_pat) / tf_denom
    idf = np.log(len(speeches_df) / speeches_df['contents'].str.contains(re_pat).sum())
    tfidf_dict[word] = tf * idf
tfidf = pd.DataFrame(tfidf_dict)
tfidf.head()
 | the | of | to | and | ... | trust | throughout | set | object |
---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 4.29e-04 | 0.00e+00 | 0.00e+00 | 2.04e-03 |
1 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.00e+00 | 0.00e+00 | 0.00e+00 | 1.06e-03 |
2 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 4.06e-04 | 0.00e+00 | 3.48e-04 | 6.44e-04 |
3 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 6.70e-04 | 2.17e-04 | 0.00e+00 | 7.09e-04 |
4 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 2.38e-04 | 4.62e-04 | 0.00e+00 | 3.77e-04 |
5 rows × 500 columns
Note that the TF-IDFs of many common words are all 0!
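We can verify this directly: any word that appears in all 233 speeches has $\text{idf}(t) = \log \frac{233}{233} = 0$, which zeroes out its TF-IDF in every document. A quick sanity check (a sketch using the tfidf DataFrame from above):
# Should print True for each column if these words appear in every speech.
tfidf[['the', 'of', 'to', 'and']].eq(0).all()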
Summarizing speeches¶
By using idxmax, we can find the word with the highest TF-IDF in each speech.
summaries = tfidf.idxmax(axis=1)
summaries
0          object
1      convention
2       provision
          ...
230          it's
231       tonight
232          it's
Length: 233, dtype: object
What if we want to see the 5 words with the highest TF-IDFs, for each speech?
def five_largest(row):
    # Sort the row's TF-IDF values (ascending) and keep the last 5 words.
    return ', '.join(row.index[row.argsort()][-5:])
keywords = tfidf.apply(five_largest, axis=1)
keywords_df = pd.concat([
    speeches_df['president'],
    speeches_df['date'],
    keywords
], axis=1)
keywords_df
 | president | date | 0 |
---|---|---|---|
0 | George Washington | January 8, 1790 | your, proper, regard, ought, object |
1 | George Washington | December 8, 1790 | case, established, object, commerce, convention |
2 | George Washington | October 25, 1791 | community, upon, lands, proper, provision |
... | ... | ... | ... |
230 | Joseph R. Biden Jr. | April 28, 2021 | get, americans, percent, jobs, it's |
231 | Joseph R. Biden Jr. | March 1, 2022 | let, jobs, americans, get, tonight |
232 | Joseph R. Biden Jr. | February 7, 2023 | down, percent, jobs, tonight, it's |
233 rows × 3 columns
Uncomment the cell below to see every single row of keywords_df.
# display_df(keywords_df, rows=233)
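An aside on five_largest: instead of argsort, pandas' Series.nlargest gets the same five words more directly (in descending rather than ascending order). A sketch; five_largest_alt is a hypothetical name:
def five_largest_alt(row):
    # nlargest(5) keeps the 5 highest TF-IDF values; .index recovers the words.
    return ', '.join(row.nlargest(5).index)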
Aside: What if we remove the $\log$ from $\text{idf}(t)$?¶
Let's try it and see what happens.
tfidf_nl_dict = {}
tf_denom = speeches_df['contents'].str.split().str.len()

for word in tqdm(unique_words):
    re_pat = fr' {word} '  # Imperfect pattern for speed.
    tf = speeches_df['contents'].str.count(re_pat) / tf_denom
    idf_nl = len(speeches_df) / speeches_df['contents'].str.contains(re_pat).sum()
    tfidf_nl_dict[word] = tf * idf_nl
tfidf_nl = pd.DataFrame(tfidf_nl_dict)
tfidf_nl.head()
 | the | of | to | and | ... | trust | throughout | set | object |
---|---|---|---|---|---|---|---|---|---|
0 | 0.09 | 0.06 | 0.05 | 0.04 | ... | 1.47e-03 | 0.00e+00 | 0.00e+00 | 5.78e-03 |
1 | 0.09 | 0.06 | 0.03 | 0.03 | ... | 0.00e+00 | 0.00e+00 | 0.00e+00 | 2.99e-03 |
2 | 0.11 | 0.07 | 0.04 | 0.03 | ... | 1.39e-03 | 0.00e+00 | 1.30e-03 | 1.82e-03 |
3 | 0.09 | 0.07 | 0.04 | 0.03 | ... | 2.29e-03 | 7.53e-04 | 0.00e+00 | 2.01e-03 |
4 | 0.09 | 0.07 | 0.04 | 0.02 | ... | 8.12e-04 | 1.60e-03 | 0.00e+00 | 1.07e-03 |
5 rows × 500 columns
keywords_nl = tfidf_nl.apply(five_largest, axis=1)
keywords_nl_df = pd.concat([
    speeches_df['president'],
    speeches_df['date'],
    keywords_nl
], axis=1)
keywords_nl_df
 | president | date | 0 |
---|---|---|---|
0 | George Washington | January 8, 1790 | a, and, to, of, the |
1 | George Washington | December 8, 1790 | in, and, to, of, the |
2 | George Washington | October 25, 1791 | a, and, to, of, the |
... | ... | ... | ... |
230 | Joseph R. Biden Jr. | April 28, 2021 | of, it's, and, to, the |
231 | Joseph R. Biden Jr. | March 1, 2022 | we, of, to, and, the |
232 | Joseph R. Biden Jr. | February 7, 2023 | a, of, and, to, the |
233 rows × 3 columns
The role of $\log$ in $\text{idf}(t)$¶
$$ \begin{align*} \text{tfidf}(t, d) &= \text{tf}(t, d) \cdot \text{idf}(t) \\ &= \frac{\text{\# of occurrences of $t$ in $d$}}{\text{total \# of words in $d$}} \cdot \log \left(\frac{\text{total \# of documents}}{\text{\# of documents in which $t$ appears}} \right) \end{align*} $$
- Remember, for any positive input $x$, $\log(x)$ is (much) smaller than $x$.
- In $\text{idf}(t)$, the $\log$ "dampens" the impact of the ratio $\frac{\text{\# documents}}{\text{\# documents with $t$}}$.
- If a word is very common, the ratio will be close to 1. The log of the ratio will be close to 0.
(1000 / 999)
1.001001001001001
np.log(1000 / 999)
np.float64(0.001000500333583622)
- If a word is very common (e.g. 'the'), its ratio is close to 1 and so its $\log$-IDF is close to 0. Removing the $\log$ replaces that near-zero value with a value near 1, multiplying the statistic by a large factor and letting common words dominate.
- If a word is very rare, the ratio will be very large. However, a word seen in 2 out of 50 documents is not meaningfully different from one seen in 2 out of 500 documents (it is very rare in both cases), so $\text{idf}(t)$ should be similar in both cases.
(50 / 2)
25.0
(500 / 2)
250.0
np.log(50 / 2)
np.float64(3.2188758248682006)
np.log(500 / 2)
np.float64(5.521460917862246)
Question 🤔 (Answer at dsc80.com/q)
Code: tfidf
From the Fa23 final: Consider the following corpus:
Document number | Content
---|---
1 | yesterday rainy today sunny
2 | yesterday sunny today sunny
3 | today rainy yesterday today
4 | yesterday yesterday today today
Which words have a TF-IDF score of 0 for all four documents?
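After answering, you can check yourself with code. A minimal sketch that applies the same TF-IDF definition used above to this toy corpus (docs is a made-up name):
docs = pd.Series([
    'yesterday rainy today sunny',
    'yesterday sunny today sunny',
    'today rainy yesterday today',
    'yesterday yesterday today today',
])
for t in ['yesterday', 'rainy', 'today', 'sunny']:
    # tf: proportion of each document's words equal to t; idf: log(N / # docs containing t).
    tf = docs.str.split().apply(lambda words: words.count(t)) / docs.str.split().str.len()
    idf = np.log(len(docs) / docs.str.contains(t).sum())
    print(t, (tf * idf).round(3).tolist())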
Modeling¶
Reflection¶
So far this quarter, we've learned how to:
- Extract information from tabular data using pandas and regular expressions.
- Clean data so that it best represents an underlying data generating process.
    - Missingness analyses and imputation.
- Collect data from the internet through scraping and APIs, and parse it using BeautifulSoup.
- Perform exploratory data analysis through aggregation, visualization, and the computation of summary statistics like TF-IDF.
- Infer about the relationships between samples and populations through hypothesis and permutation testing.
Now, let's make predictions.
Modeling¶
A model is a set of assumptions about how data were generated.
George Box, a famous statistician, once said "All models are wrong, but some are useful." What did he mean?
Philosophy¶
"It has been said that "all models are wrong but some models are useful." In other words, any model is at best a useful fiction—there never was, or ever will be, an exactly normal distribution or an exact linear relationship. Nevertheless, enormous progress has been made by entertaining such fictions and using them as approximations."
"Since all models are wrong the scientist cannot obtain a "correct" one by excessive elaboration. On the contrary following William of Occam he should seek an economical description of natural phenomena. Just as the ability to devise simple but evocative models is the signature of the great scientist so overelaboration and overparameterization is often the mark of mediocrity."
Goals of modeling¶
- To make accurate predictions regarding unseen data.
    - Given this dataset of past UCSD data science students' salaries, can we predict your future salary? (regression)
    - Given this dataset of images, can we predict if this new image is of a dog, cat, or zebra? (classification)
- To make inferences about complex phenomena in nature.
    - Is there a linear relationship between the heights of children and the heights of their biological mothers?
    - The weights of babies born to smoking and non-smoking mothers in my sample are different; how confident am I that this difference exists in the population?
Of these two goals, we will focus on prediction.
In the above taxonomy, we will focus on supervised learning.
We'll start with regression before moving to classification.
Features¶
A feature is a measurable property of a phenomenon being observed.
- Other terms for "feature" include "(explanatory) variable" and "attribute".
- Typically, features are the inputs to models.
In DataFrames, features typically correspond to columns, while rows typically correspond to different individuals.
Some features come as part of a dataset, e.g. weight and height, but others we need to create given existing features, for example: $$\text{BMI} = \frac{\text{weight (kg)}}{\text{[height (m)]}^2}$$
Example: TF-IDF creates features that summarize documents!
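As a hypothetical illustration of creating such a feature, here's a minimal sketch; the people DataFrame and its values are made up:
# Hypothetical data, for illustration only.
people = pd.DataFrame({
    'weight (kg)': [70, 85, 62],
    'height (m)': [1.75, 1.80, 1.60],
})
# Derived feature: BMI = weight (kg) / [height (m)]^2.
people['BMI'] = people['weight (kg)'] / people['height (m)'] ** 2
people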
Example: Restaurant tips 🧑🍳¶
About the data¶
What features does the dataset contain? Is this likely a recent dataset, or an older one?
# The dataset is built into plotly!
tips = px.data.tips()
tips
 | total_bill | tip | sex | smoker | day | time | size |
---|---|---|---|---|---|---|---|
0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
... | ... | ... | ... | ... | ... | ... | ... |
241 | 22.67 | 2.00 | Male | Yes | Sat | Dinner | 2 |
242 | 17.82 | 1.75 | Male | No | Sat | Dinner | 2 |
243 | 18.78 | 3.00 | Female | No | Thur | Dinner | 2 |
244 rows × 7 columns
Predicting tips¶
- Goal: Given various information about a table at a restaurant, we want to predict the tip that a server will earn.
- Why might a server be interested in doing this?
- To determine which tables are likely to tip the most (inference).
- To predict earnings over the next month (prediction).
Exploratory data analysis¶
- The most natural feature to look at first is total bills.
- As such, we should explore the relationship between total bills and tips. Moving forward:
- $x$: Total bills.
- $y$: Tips.
fig = tips.plot(kind='scatter', x='total_bill', y='tip', title='Tip vs. Total Bill')
fig.update_layout(xaxis_title='Total Bill', yaxis_title='Tip')
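As a preview of regression in sklearn, here's a minimal sketch of the kind of model we'll build on this data, using sklearn's standard LinearRegression API (we'll unpack this properly soon):
from sklearn.linear_model import LinearRegression

# Minimal sketch: fit tip ≈ w0 + w1 * total_bill.
model = LinearRegression()
model.fit(X=tips[['total_bill']], y=tips['tip'])
model.intercept_, model.coef_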