In [10]:
from dsc80_utils import *

Lecture 13 – Linear Regression¶

DSC 80, Fall 2023¶

📣 Announcements 📣¶

  • Project 3 due tomorrow!
  • Lab 7 out, due Monday, Nov 20.

📆 Agenda¶

  • TF-IDF Example: State of the Union addresses 🎤.
  • Modeling.
  • Case study: Restaurant tips 🧑‍🍳.
  • Regression in sklearn.

🙋🙋🏽‍♀️ Slido¶

https://app.sli.do/event/2LZSnXWNpGPiuVnCZMa5J8


Example: State of the Union addresses 🎤¶

State of the Union addresses¶

The 2023 State of the Union address was on February 7th, 2023.

In [1]:
from IPython.display import YouTubeVideo
YouTubeVideo('gzcBTUvVp7M')
Out[1]:

The data¶

In [16]:
from pathlib import Path
import re

sotu_txt = Path('data') / 'stateoftheunion1790-2023.txt'
sotu = sotu_txt.read_text()
speeches = sotu.split('\n***\n')[1:]

def extract_struct(speech):
    L = speech.strip().split('\n', maxsplit=3)
    L[3] = re.sub(r"[^A-Za-z' ]", ' ', L[3]).lower()
    return dict(zip(['speech', 'president', 'date', 'contents'], L))

speeches_df = pd.DataFrame(list(map(extract_struct, speeches)))
speeches_df
Out[16]:
speech president date contents
0 State of the Union Address George Washington January 8, 1790 fellow citizens of the senate and house of re...
1 State of the Union Address George Washington December 8, 1790 fellow citizens of the senate and house of re...
2 State of the Union Address George Washington October 25, 1791 fellow citizens of the senate and house of re...
... ... ... ... ...
230 State of the Union Address Joseph R. Biden Jr. April 28, 2021 thank you thank you thank you good to be b...
231 State of the Union Address Joseph R. Biden Jr. March 1, 2022 madam speaker madam vice president and our ...
232 State of the Union Address Joseph R. Biden Jr. February 7, 2023 mr speaker madam vice president our firs...

233 rows × 4 columns

Finding the most important words in each speech¶

Here, a "document" is a speech. We have 233 documents.

In [17]:
speeches_df
Out[17]:
speech president date contents
0 State of the Union Address George Washington January 8, 1790 fellow citizens of the senate and house of re...
1 State of the Union Address George Washington December 8, 1790 fellow citizens of the senate and house of re...
2 State of the Union Address George Washington October 25, 1791 fellow citizens of the senate and house of re...
... ... ... ... ...
230 State of the Union Address Joseph R. Biden Jr. April 28, 2021 thank you thank you thank you good to be b...
231 State of the Union Address Joseph R. Biden Jr. March 1, 2022 madam speaker madam vice president and our ...
232 State of the Union Address Joseph R. Biden Jr. February 7, 2023 mr speaker madam vice president our firs...

233 rows × 4 columns

A rough sketch of what we'll compute:

for each word t:
    for each speech d:
        compute tfidf(t, d)
In [18]:
unique_words = speeches_df['contents'].str.split().explode().value_counts()
# Take the top 500 most common words for speed
unique_words = unique_words.iloc[:500].index
unique_words
Out[18]:
Index(['the', 'of', 'to', 'and', 'in', 'a', 'that', 'for', 'be', 'our',
       ...
       'desire', 'call', 'submitted', 'increasing', 'months', 'point', 'trust',
       'throughout', 'set', 'object'],
      dtype='object', length=500)

💡 Pro-Tip: Using tqdm¶

This code takes a while to run, so we'll use the tqdm package to track its progress. (Install it with pip install tqdm if needed.)

In [19]:
from tqdm.notebook import tqdm

tfidf_dict = {}
tf_denom = speeches_df['contents'].str.split().str.len()

# Wrap the sequence with `tqdm()` to display a progress bar
for word in tqdm(unique_words):
    re_pat = fr' {word} ' # Imperfect pattern for speed.
    tf = speeches_df['contents'].str.count(re_pat) / tf_denom
    idf = np.log(len(speeches_df) / speeches_df['contents'].str.contains(re_pat).sum())
    tfidf_dict[word] =  tf * idf
  0%|          | 0/500 [00:00<?, ?it/s]
In [20]:
tfidf = pd.DataFrame(tfidf_dict)
tfidf.head()
Out[20]:
the of to and ... trust throughout set object
0 0.0 0.0 0.0 0.0 ... 4.29e-04 0.00e+00 0.00e+00 2.04e-03
1 0.0 0.0 0.0 0.0 ... 0.00e+00 0.00e+00 0.00e+00 1.06e-03
2 0.0 0.0 0.0 0.0 ... 4.06e-04 0.00e+00 3.48e-04 6.44e-04
3 0.0 0.0 0.0 0.0 ... 6.70e-04 2.17e-04 0.00e+00 7.09e-04
4 0.0 0.0 0.0 0.0 ... 2.38e-04 4.62e-04 0.00e+00 3.77e-04

5 rows × 500 columns

Note that the TF-IDFs of many common words are all 0!

Summarizing speeches¶

By using idxmax, we can find the word with the highest TF-IDF in each speech.

In [21]:
summaries = tfidf.idxmax(axis=1)
summaries
Out[21]:
0          object
1      convention
2       provision
          ...    
230          it's
231       tonight
232          it's
Length: 233, dtype: object

What if we want to see the 5 words with the highest TF-IDFs, for each speech?

In [22]:
def five_largest(row):
    return list(row.index[row.argsort()][-5:])
In [23]:
keywords = tfidf.apply(five_largest, axis=1)
keywords_df = pd.concat([
    speeches_df['president'],
    speeches_df['date'],
    keywords
], axis=1)

Run the cell below to see every single row of keywords_df.

In [24]:
display_df(keywords_df, rows=233)
president date 0
0 George Washington January 8, 1790 [your, proper, regard, ought, object]
1 George Washington December 8, 1790 [case, established, object, commerce, convention]
2 George Washington October 25, 1791 [community, upon, lands, proper, provision]
3 George Washington November 6, 1792 [subject, upon, information, proper, provision]
4 George Washington December 3, 1793 [having, vessels, executive, shall, ought]
5 George Washington November 19, 1794 [too, army, let, ought, constitution]
6 George Washington December 8, 1795 [army, prevent, object, provision, treaty]
7 George Washington December 7, 1796 [republic, treaty, britain, ought, object]
8 John Adams November 22, 1797 [spain, british, claims, treaty, vessels]
9 John Adams December 8, 1798 [st, minister, treaty, spain, commerce]
10 John Adams December 3, 1799 [civil, period, british, minister, treaty]
11 John Adams November 11, 1800 [experience, protection, navy, commerce, ought]
12 Thomas Jefferson December 8, 1801 [consideration, shall, object, vessels, subject]
13 Thomas Jefferson December 15, 1802 [shall, debt, naval, duties, vessels]
14 Thomas Jefferson October 17, 1803 [debt, vessels, sum, millions, friendly]
15 Thomas Jefferson November 8, 1804 [received, convention, having, due, friendly]
16 Thomas Jefferson December 3, 1805 [families, convention, sum, millions, vessels]
17 Thomas Jefferson December 2, 1806 [due, consideration, millions, shall, spain]
18 Thomas Jefferson October 27, 1807 [whether, army, british, vessels, shall]
19 Thomas Jefferson November 8, 1808 [thus, british, millions, commerce, her]
20 James Madison November 29, 1809 [cases, having, due, british, minister]
21 James Madison December 5, 1810 [provisions, view, minister, commerce, british]
22 James Madison November 5, 1811 [britain, provisions, commerce, minister, brit...
23 James Madison November 4, 1812 [nor, subject, provisions, britain, british]
24 James Madison December 7, 1813 [number, having, naval, britain, british]
25 James Madison September 20, 1814 [naval, vessels, britain, his, british]
26 James Madison December 5, 1815 [debt, treasury, millions, establishment, sum]
27 James Madison December 3, 1816 [constitution, annual, sum, treasury, british]
28 James Monroe December 12, 1817 [improvement, territory, indian, millions, lands]
29 James Monroe November 16, 1818 [minister, object, territory, her, spain]
30 James Monroe December 7, 1819 [parties, friendly, minister, treaty, spain]
31 James Monroe November 14, 1820 [amount, minister, extent, vessels, spain]
32 James Monroe December 3, 1821 [powers, duties, revenue, spain, vessels]
33 James Monroe December 3, 1822 [object, proper, vessels, spain, convention]
34 James Monroe December 2, 1823 [th, department, object, minister, spain]
35 James Monroe December 7, 1824 [spain, governments, convention, parties, object]
36 John Quincy Adams December 6, 1825 [officers, commerce, condition, upon, improvem...
37 John Quincy Adams December 5, 1826 [commercial, upon, vessels, british, duties]
38 John Quincy Adams December 4, 1827 [lands, british, receipts, upon, th]
39 John Quincy Adams December 2, 1828 [duties, revenue, upon, commercial, britain]
40 Andrew Jackson December 8, 1829 [attention, subject, her, upon, duties]
41 Andrew Jackson December 6, 1830 [general, subject, character, vessels, upon]
42 Andrew Jackson December 6, 1831 [indian, commerce, claims, treaty, minister]
43 Andrew Jackson December 4, 1832 [general, subject, duties, lands, commerce]
44 Andrew Jackson December 3, 1833 [treasury, convention, minister, spain, duties]
45 Andrew Jackson December 1, 1834 [bill, treaty, minister, claims, upon]
46 Andrew Jackson December 7, 1835 [treaty, upon, claims, subject, minister]
47 Andrew Jackson December 5, 1836 [upon, treasury, duties, revenue, banks]
48 Martin van Buren December 5, 1837 [price, subject, upon, banks, lands]
49 Martin van Buren December 3, 1838 [subject, upon, indian, banks, court]
50 Martin van Buren December 2, 1839 [duties, treasury, extent, institutions, banks]
51 Martin van Buren December 5, 1840 [general, revenue, upon, extent, having]
52 John Tyler December 7, 1841 [banks, britain, amount, duties, treasury]
53 John Tyler December 6, 1842 [claims, minister, thus, amount, treasury]
54 John Tyler December 6, 1843 [treasury, british, her, minister, mexico]
55 John Tyler December 3, 1844 [minister, upon, treaty, her, mexico]
56 James Polk December 2, 1845 [british, convention, territory, duties, mexico]
57 James Polk December 8, 1846 [army, territory, minister, her, mexico]
58 James Polk December 7, 1847 [amount, treaty, her, army, mexico]
59 James Polk December 5, 1848 [tariff, upon, bill, constitution, mexico]
60 Zachary Taylor December 4, 1849 [territory, treaty, recommend, minister, mexico]
61 Millard Fillmore December 2, 1850 [recommend, claims, upon, mexico, duties]
62 Millard Fillmore December 2, 1851 [department, annual, fiscal, subject, mexico]
63 Millard Fillmore December 6, 1852 [duties, navy, mexico, subject, her]
64 Franklin Pierce December 5, 1853 [commercial, regard, upon, construction, subject]
65 Franklin Pierce December 4, 1854 [character, duties, naval, minister, property]
66 Franklin Pierce December 31, 1855 [constitution, british, territory, convention,...
67 Franklin Pierce December 2, 1856 [institutions, property, condition, thus, terr...
68 James Buchanan December 8, 1857 [treaty, constitution, territory, convention, ...
69 James Buchanan December 6, 1858 [june, mexico, minister, constitution, territory]
70 James Buchanan December 19, 1859 [minister, th, fiscal, mexico, june]
71 James Buchanan December 3, 1860 [minister, duties, claims, convention, constit...
72 Abraham Lincoln December 3, 1861 [army, claims, labor, capital, court]
73 Abraham Lincoln December 1, 1862 [upon, population, shall, per, sum]
74 Abraham Lincoln December 8, 1863 [upon, receipts, subject, navy, naval]
75 Abraham Lincoln December 6, 1864 [condition, secretary, naval, treasury, navy]
76 Andrew Johnson December 4, 1865 [form, commerce, powers, general, constitution]
77 Andrew Johnson December 3, 1866 [thus, june, constitution, mexico, condition]
78 Andrew Johnson December 3, 1867 [june, value, department, upon, constitution]
79 Andrew Johnson December 9, 1868 [millions, amount, expenditures, june, per]
80 Ulysses S. Grant December 6, 1869 [subject, upon, receipts, per, spain]
81 Ulysses S. Grant December 5, 1870 [her, convention, vessels, spain, british]
82 Ulysses S. Grant December 4, 1871 [object, powers, treaty, desire, recommend]
83 Ulysses S. Grant December 2, 1872 [territory, line, her, britain, treaty]
84 Ulysses S. Grant December 1, 1873 [consideration, banks, subject, amount, claims]
85 Ulysses S. Grant December 7, 1874 [duties, upon, attention, claims, convention]
86 Ulysses S. Grant December 7, 1875 [parties, territory, court, spain, claims]
87 Ulysses S. Grant December 5, 1876 [subject, court, per, commission, claims]
88 Rutherford B. Hayes December 3, 1877 [upon, sum, fiscal, commercial, value]
89 Rutherford B. Hayes December 2, 1878 [per, secretary, fiscal, june, indian]
90 Rutherford B. Hayes December 1, 1879 [subject, territory, june, commission, indian]
91 Rutherford B. Hayes December 6, 1880 [subject, office, relations, attention, commer...
92 Chester A. Arthur December 6, 1881 [spain, international, british, relations, fri...
93 Chester A. Arthur December 4, 1882 [territory, establishment, mexico, internation...
94 Chester A. Arthur December 4, 1883 [total, convention, mexico, commission, treaty]
95 Chester A. Arthur December 1, 1884 [treaty, territory, commercial, secretary, ves...
96 Grover Cleveland December 8, 1885 [duties, vessels, treaty, condition, upon]
97 Grover Cleveland December 6, 1886 [mexico, claims, subject, convention, fiscal]
98 Grover Cleveland December 6, 1887 [condition, sum, thus, price, tariff]
99 Grover Cleveland December 3, 1888 [secretary, treaty, upon, per, june]
100 Benjamin Harrison December 3, 1889 [general, commission, indian, upon, lands]
101 Benjamin Harrison December 1, 1890 [receipts, subject, upon, per, tariff]
102 Benjamin Harrison December 9, 1891 [court, tariff, indian, upon, per]
103 Benjamin Harrison December 6, 1892 [tariff, secretary, upon, value, per]
104 William McKinley December 6, 1897 [conditions, upon, international, territory, s...
105 William McKinley December 5, 1898 [navy, commission, naval, june, spain]
106 William McKinley December 5, 1899 [treaty, officers, commission, international, ...
107 William McKinley December 3, 1900 [settlement, civil, shall, convention, commiss...
108 Theodore Roosevelt December 3, 1901 [army, commercial, conditions, navy, man]
109 Theodore Roosevelt December 2, 1902 [upon, man, navy, conditions, tariff]
110 Theodore Roosevelt December 7, 1903 [june, lands, territory, property, treaty]
111 Theodore Roosevelt December 6, 1904 [cases, conditions, indian, labor, man]
112 Theodore Roosevelt December 5, 1905 [upon, conditions, commission, cannot, man]
113 Theodore Roosevelt December 3, 1906 [upon, navy, tax, court, man]
114 Theodore Roosevelt December 3, 1907 [conditions, navy, upon, army, man]
115 Theodore Roosevelt December 8, 1908 [man, officers, labor, control, banks]
116 William H. Taft December 7, 1909 [convention, banks, court, department, tariff]
117 William H. Taft December 6, 1910 [department, court, commercial, international,...
118 William H. Taft December 5, 1911 [mexico, department, per, tariff, court]
119 William H. Taft December 3, 1912 [tariff, upon, army, per, department]
120 Woodrow Wilson December 2, 1913 [how, shall, upon, mexico, ought]
121 Woodrow Wilson December 8, 1914 [shall, convention, ought, matter, upon]
122 Woodrow Wilson December 7, 1915 [her, navy, millions, economic, cannot]
123 Woodrow Wilson December 5, 1916 [commerce, shall, upon, commission, bill]
124 Woodrow Wilson December 4, 1917 [purpose, her, know, settlement, shall]
125 Woodrow Wilson December 2, 1918 [shall, go, men, upon, back]
126 Woodrow Wilson December 2, 1919 [economic, her, budget, labor, conditions]
127 Woodrow Wilson December 7, 1920 [expenditures, receipts, treasury, budget, upon]
128 Warren Harding December 6, 1921 [capital, ought, problems, conditions, tariff]
129 Warren Harding December 8, 1922 [responsibility, republic, problems, ought, per]
130 Calvin Coolidge December 6, 1923 [conditions, production, commission, ought, co...
131 Calvin Coolidge December 3, 1924 [navy, international, desire, economic, court]
132 Calvin Coolidge December 8, 1925 [international, budget, economic, ought, court]
133 Calvin Coolidge December 7, 1926 [tax, federal, reduction, tariff, ought]
134 Calvin Coolidge December 6, 1927 [construction, banks, per, program, property]
135 Calvin Coolidge December 4, 1928 [federal, department, production, program, per]
136 Herbert Hoover December 3, 1929 [commission, federal, construction, tariff, per]
137 Herbert Hoover December 2, 1930 [about, budget, economic, per, construction]
138 Herbert Hoover December 8, 1931 [upon, construction, federal, economic, banks]
139 Herbert Hoover December 6, 1932 [health, june, value, economic, banks]
140 Franklin D. Roosevelt January 3, 1934 [labor, permanent, problems, cannot, banks]
141 Franklin D. Roosevelt January 4, 1935 [private, work, local, program, cannot]
142 Franklin D. Roosevelt January 3, 1936 [income, shall, let, say, today]
143 Franklin D. Roosevelt January 6, 1937 [powers, convention, needs, help, problems]
144 Franklin D. Roosevelt January 3, 1938 [budget, business, economic, today, income]
145 Franklin D. Roosevelt January 4, 1939 [labor, cannot, capital, income, billion]
146 Franklin D. Roosevelt January 3, 1940 [world, domestic, cannot, economic, today]
147 Franklin D. Roosevelt January 6, 1941 [freedom, problems, cannot, program, today]
148 Franklin D. Roosevelt January 6, 1942 [him, today, know, forces, production]
149 Franklin D. Roosevelt January 7, 1943 [pacific, get, cannot, americans, production]
150 Franklin D. Roosevelt January 11, 1944 [individual, total, know, economic, cannot]
151 Franklin D. Roosevelt January 6, 1945 [cannot, production, army, forces, jobs]
152 Harry S. Truman January 21, 1946 [fiscal, program, billion, million, dollars]
153 Harry S. Truman January 6, 1947 [commission, budget, economic, labor, program]
154 Harry S. Truman January 7, 1948 [tax, billion, today, program, economic]
155 Harry S. Truman January 5, 1949 [economic, price, program, cannot, production]
156 Harry S. Truman January 4, 1950 [income, today, program, programs, economic]
157 Harry S. Truman January 8, 1951 [help, program, production, strength, economic]
158 Harry S. Truman January 9, 1952 [defense, working, program, help, production]
159 Harry S. Truman January 7, 1953 [republic, free, cannot, world, economic]
160 Dwight D. Eisenhower February 2, 1953 [federal, labor, budget, economic, programs]
161 Dwight D. Eisenhower January 7, 1954 [federal, programs, economic, budget, program]
162 Dwight D. Eisenhower January 6, 1955 [problems, federal, economic, programs, program]
163 Dwight D. Eisenhower January 5, 1956 [billion, federal, problems, economic, program]
164 Dwight D. Eisenhower January 10, 1957 [cannot, programs, human, program, economic]
165 Dwight D. Eisenhower January 9, 1958 [program, strength, today, programs, economic]
166 Dwight D. Eisenhower January 9, 1959 [growth, help, billion, programs, economic]
167 Dwight D. Eisenhower January 7, 1960 [freedom, cannot, today, economic, help]
168 Dwight D. Eisenhower January 12, 1961 [million, percent, billion, program, programs]
169 John F. Kennedy January 30, 1961 [budget, programs, problems, economic, program]
170 John F. Kennedy January 11, 1962 [billion, help, program, jobs, cannot]
171 John F. Kennedy January 14, 1963 [help, cannot, tax, percent, billion]
172 Lyndon B. Johnson January 8, 1964 [help, billion, americans, budget, million]
173 Lyndon B. Johnson January 4, 1965 [americans, man, programs, tonight, help]
174 Lyndon B. Johnson January 12, 1966 [program, percent, help, billion, tonight]
175 Lyndon B. Johnson January 10, 1967 [programs, americans, billion, tonight, percent]
176 Lyndon B. Johnson January 17, 1968 [programs, million, budget, tonight, billion]
177 Lyndon B. Johnson January 14, 1969 [americans, program, billion, budget, tonight]
178 Richard Nixon January 22, 1970 [billion, percent, america, today, programs]
179 Richard Nixon January 22, 1971 [federal, americans, budget, tonight, let]
180 Richard Nixon January 20, 1972 [america, program, programs, today, help]
181 Richard Nixon February 2, 1973 [economic, help, americans, working, programs]
182 Richard Nixon January 30, 1974 [program, americans, today, energy, tonight]
183 Gerald R. Ford January 15, 1975 [program, percent, billion, programs, energy]
184 Gerald R. Ford January 19, 1976 [federal, americans, budget, jobs, programs]
185 Gerald R. Ford January 12, 1977 [programs, today, percent, jobs, energy]
186 Jimmy Carter January 19, 1978 [cannot, economic, tonight, jobs, it's]
187 Jimmy Carter January 25, 1979 [cannot, budget, tonight, americans, it's]
188 Jimmy Carter January 21, 1980 [help, america, energy, tonight, it's]
189 Jimmy Carter January 16, 1981 [percent, economic, energy, program, programs]
190 Ronald Reagan January 26, 1982 [jobs, help, program, billion, programs]
191 Ronald Reagan January 25, 1983 [problems, programs, americans, economic, perc...
192 Ronald Reagan January 25, 1984 [budget, help, americans, tonight, it's]
193 Ronald Reagan February 6, 1985 [help, tax, jobs, tonight, it's]
194 Ronald Reagan February 4, 1986 [america, cannot, it's, budget, tonight]
195 Ronald Reagan January 27, 1987 [percent, let, budget, tonight, it's]
196 Ronald Reagan January 25, 1988 [let, americans, it's, budget, tonight]
197 George H.W. Bush February 9, 1989 [help, ask, it's, budget, tonight]
198 George H.W. Bush January 31, 1990 [percent, budget, today, tonight, it's]
199 George H.W. Bush January 29, 1991 [jobs, budget, americans, know, tonight]
200 George H.W. Bush January 28, 1992 [know, get, tonight, help, it's]
201 William J. Clinton February 17, 1993 [tax, budget, percent, tonight, jobs]
202 William J. Clinton January 25, 1994 [americans, it's, health, get, jobs]
203 William J. Clinton January 24, 1995 [jobs, americans, get, tonight, it's]
204 William J. Clinton January 23, 1996 [tonight, families, working, americans, children]
205 William J. Clinton February 4, 1997 [america, children, budget, americans, tonight]
206 William J. Clinton January 27, 1998 [ask, americans, children, help, tonight]
207 William J. Clinton January 19, 1999 [children, budget, help, americans, tonight]
208 William J. Clinton January 27, 2000 [families, help, children, americans, tonight]
209 George W. Bush February 27, 2001 [help, tax, percent, tonight, budget]
210 George W. Bush September 20, 2001 [freedom, america, ask, americans, tonight]
211 George W. Bush January 29, 2002 [americans, budget, tonight, america, jobs]
212 George W. Bush January 28, 2003 [america, help, million, americans, tonight]
213 George W. Bush January 20, 2004 [children, america, americans, help, tonight]
214 George W. Bush February 2, 2005 [freedom, tonight, help, social, americans]
215 George W. Bush January 31, 2006 [reform, jobs, americans, america, tonight]
216 George W. Bush January 23, 2007 [children, health, americans, tonight, help]
217 George W. Bush January 29, 2008 [america, americans, trust, tonight, help]
218 Barack Obama February 24, 2009 [know, budget, jobs, tonight, it's]
219 Barack Obama January 27, 2010 [get, tonight, americans, jobs, it's]
220 Barack Obama January 25, 2011 [percent, get, tonight, jobs, it's]
221 Barack Obama January 24, 2012 [americans, tonight, get, it's, jobs]
222 Barack Obama February 12, 2013 [families, it's, get, tonight, jobs]
223 Barack Obama January 28, 2014 [get, tonight, help, it's, jobs]
224 Barack Obama January 20, 2015 [families, americans, tonight, jobs, it's]
225 Barack Obama January 12, 2016 [tonight, jobs, americans, get, it's]
226 Donald J. Trump February 27, 2017 [america, jobs, americans, it's, tonight]
227 Donald J. Trump January 30, 2018 [tax, get, it's, americans, tonight]
228 Donald J. Trump February 5, 2019 [get, jobs, americans, it's, tonight]
229 Donald J. Trump February 4, 2020 [jobs, it's, americans, percent, tonight]
230 Joseph R. Biden Jr. April 28, 2021 [get, americans, percent, jobs, it's]
231 Joseph R. Biden Jr. March 1, 2022 [let, jobs, americans, get, tonight]
232 Joseph R. Biden Jr. February 7, 2023 [down, percent, jobs, tonight, it's]

Aside: What if we remove the $\log$ from $\text{idf}(t)$?¶

Let's try it and see what happens.

In [29]:
tfidf_nl_dict = {}
tf_denom = speeches_df['contents'].str.split().str.len()

for word in tqdm(unique_words):
    re_pat = fr' {word} ' # Imperfect pattern for speed.
    tf = speeches_df['contents'].str.count(re_pat) / tf_denom
    idf_nl = len(speeches_df) / speeches_df['contents'].str.contains(re_pat).sum()
    tfidf_nl_dict[word] =  tf * idf_nl
  0%|          | 0/500 [00:00<?, ?it/s]
In [30]:
tfidf_nl = pd.DataFrame(tfidf_nl_dict)
tfidf_nl.head()
Out[30]:
the of to and ... trust throughout set object
0 0.09 0.06 0.05 0.04 ... 1.47e-03 0.00e+00 0.00e+00 5.78e-03
1 0.09 0.06 0.03 0.03 ... 0.00e+00 0.00e+00 0.00e+00 2.99e-03
2 0.11 0.07 0.04 0.03 ... 1.39e-03 0.00e+00 1.30e-03 1.82e-03
3 0.09 0.07 0.04 0.03 ... 2.29e-03 7.53e-04 0.00e+00 2.01e-03
4 0.09 0.07 0.04 0.02 ... 8.12e-04 1.60e-03 0.00e+00 1.07e-03

5 rows × 500 columns

In [31]:
keywords_nl = tfidf_nl.apply(five_largest, axis=1)
keywords_nl_df = pd.concat([
    speeches_df['president'],
    speeches_df['date'],
    keywords_nl
], axis=1)
keywords_nl_df
Out[31]:
president date 0
0 George Washington January 8, 1790 [a, and, to, of, the]
1 George Washington December 8, 1790 [in, and, to, of, the]
2 George Washington October 25, 1791 [a, and, to, of, the]
... ... ... ...
230 Joseph R. Biden Jr. April 28, 2021 [of, it's, and, to, the]
231 Joseph R. Biden Jr. March 1, 2022 [we, of, to, and, the]
232 Joseph R. Biden Jr. February 7, 2023 [a, of, and, to, the]

233 rows × 3 columns

The role of $\log$ in $\text{idf}(t)$¶

$$\begin{align*}\text{tfidf}(t, d) &= \text{tf}(t, d) \cdot \text{idf}(t) \\ &= \frac{\text{# of occurrences of $t$ in $d$}}{\text{total # of words in $d$}} \cdot \log \left(\frac{\text{total # of documents}}{\text{# of documents in which $t$ appears}} \right) \end{align*}$$
  • Remember, for any positive input $x$, $\log(x)$ is (much) smaller than $x$.
  • In $\text{idf}(t)$, the $\log$ "dampens" the impact of the ratio $\frac{\text{# documents}}{\text{# documents with $t$}}$.
  • If a word is very common, the ratio will be close to 1. The log of the ratio will be close to 0.
In [32]:
(1000 / 999)
Out[32]:
1.001001001001001
In [33]:
np.log(1000 / 999)
Out[33]:
0.001000500333583622
  • If a word is very common (e.g. 'the'), removing the log multiplies its idf – and therefore its TF-IDF – by a large factor (above, 1.001 vs. 0.001: roughly a factor of 1000).
  • If a word is very rare, the ratio will be very large. However, for instance, a word being seen in 2 out of 50 documents is not very different than being seen in 2 out of 500 documents (it is very rare in both cases), and so $\text{idf}(t)$ should be similar in both cases.
In [34]:
(50 / 2)
Out[34]:
25.0
In [35]:
(500 / 2)
Out[35]:
250.0
In [36]:
np.log(50 / 2)
Out[36]:
3.2188758248682006
In [37]:
np.log(500 / 2)
Out[37]:
5.521460917862246

🙋🙋🏽‍♀️ Questions?¶

https://app.sli.do/event/2LZSnXWNpGPiuVnCZMa5J8


Modeling¶


Reflection¶

So far this quarter, we've learned how to:

  • Extract information from tabular data using pandas and regular expressions.
  • Clean data so that it best represents a data generating process.
    • Missingness analyses and imputation.
  • Collect data from the internet through scraping and APIs, and parse it using BeautifulSoup.
  • Perform exploratory data analysis through aggregation, visualization, and the computation of summary statistics like TF-IDF.
  • Infer about the relationships between samples and populations through hypothesis and permutation testing.
  • Now, let's make predictions.

Modeling¶

  • Data generating process: A real-world phenomena that we are interested in studying.

    • Example: Every year, city employees are hired and fired, earn salaries and benefits, etc.
    • Unless we work for the city, we can't observe this process directly.
  • Model: A theory about the data generating process.

    • Example: If an employee is $X$ years older than average, then they will make $100,000 in salary.
  • Fit Model: A model that is learned from a particular set of observations, i.e. training data.

    • Example: If an employee is 5 years older than average, they will make $100,000 in salary.
    • How is this estimate determined? What makes it "good"?

Goals of modeling¶

  1. To make accurate predictions regarding unseen data drawn from the data generating process.

    • Given this dataset of past UCSD data science students' salaries, can we predict your future salary? (regression)
    • Given this dataset of images, can we predict if this new image is of a dog, cat, or zebra? (classification)
  2. To make inferences about the structure of the data generating process, i.e. to understand complex phenomena.

    • Is there a linear relationship between the heights of children and the heights of their biological mothers?
    • The weights of smoking and non-smoking mothers' babies in my sample are different – how confident am I that this difference exists in the population?
(Figure: a taxonomy of machine learning models.)
  • Of the two goals of modeling above, we will focus on prediction.

  • In the above taxonomy, we will focus on supervised learning.

Features¶

  • A feature is a measurable property of a phenomenon being observed.

    • Other terms for "feature" include "(explanatory) variable" and "attribute".
    • Typically, features are the inputs to models.
  • In DataFrames, features typically correspond to columns, while rows typically correspond to different individuals.

  • There are two types of features:
    • Features that come as part of a dataset, e.g. weight and height.
    • Features that we create, e.g. $\text{BMI} = \frac{\text{weight (kg)}}{\text{[height (m)]}^2}$.
  • Example: TF-IDF creates features that summarize documents!
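
For example, here's a minimal sketch of creating the BMI feature described above from two existing columns. (The people DataFrame here is made up for illustration; it isn't one of this lecture's datasets.)

In [ ]:
import pandas as pd

# Two features that come with this (hypothetical) dataset.
people = pd.DataFrame({
    'weight (kg)': [70, 85, 62],
    'height (m)': [1.75, 1.80, 1.68],
})

# A feature we create: BMI = weight (kg) / [height (m)]^2.
people.assign(bmi=people['weight (kg)'] / people['height (m)'] ** 2)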

Example: Restaurant tips 🧑‍🍳¶

About the data¶

What features does the dataset contain?

In [38]:
# The dataset is built into plotly (and seaborn)!
tips = px.data.tips()
tips
Out[38]:
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
... ... ... ... ... ... ... ...
241 22.67 2.00 Male Yes Sat Dinner 2
242 17.82 1.75 Male No Sat Dinner 2
243 18.78 3.00 Female No Thur Dinner 2

244 rows × 7 columns

Predicting tips¶

  • Goal: Given various information about a table at a restaurant, we want to predict the tip that a server will earn.

  • Why might a server be interested in doing this?

    • To determine which tables are likely to tip the most (inference).
    • To predict earnings over the next month (prediction).

Exploratory data analysis (EDA)¶

  • The most natural feature to look at first is 'total_bill'.

  • As such, we should explore the relationship between 'total_bill' and 'tip', as well as the distributions of both columns individually.

  • As we do so, try to describe each distribution in words.

Visualizing distributions¶

In [40]:
tips.plot(kind='scatter', 
          x='total_bill', y='tip',
          title='Tip vs. Total Bill')
In [41]:
tips.plot(kind='hist', 
          x='total_bill', 
          title='Distribution of Total Bill',
          nbins=50)
In [42]:
tips.plot(kind='hist', 
          x='tip', 
          title='Distribution of Tip',
          nbins=50)

Observations¶

'total_bill'                 'tip'
Right skewed                 Right skewed
Mean around $20              Mean around $3
Mode around $16              Possibly bimodal at $2 and $3?
No particularly large bills  Large outliers?

Model #1: Constant¶

  • Let's start simple, by ignoring all features. Suppose our model assumes every tip is given by a constant dollar amount:
$$\text{tip} = h^{\text{true}}$$
  • Model: There is a single tip amount $h^{\text{true}}$ that all customers pay.
    • Correct? No!
    • Useful? Perhaps. An estimate of $h^{\text{true}}$, denoted by $h^*$, can allow us to predict future tips.
  • The true parameter $h^{\text{true}}$ is determined by the universe (i.e. the data generating process).
    • We can't observe the true parameter; we need to estimate it from the data.
    • Hence, our estimate depends on our dataset!

"All models are wrong, but some are useful."¶

"Since all models are wrong the scientist cannot obtain a "correct" one by excessive elaboration. On the contrary following William of Occam he should seek an economical description of natural phenomena. Just as the ability to devise simple but evocative models is the signature of the great scientist so overelaboration and overparameterization is often the mark of mediocrity."

"Since all models are wrong the scientist must be alert to what is importantly wrong. It is inappropriate to be concerned about mice when there are tigers abroad."

Estimating $h^{\text{true}}$¶

  • There are several ways we could estimate $h^{\text{true}}$.

    • We could use domain knowledge (e.g. everyone clicks the $1 tip option when buying coffee).
  • From DSC 40A, we already know one way:

    • Choose a loss function, which measures how "good" a single prediction is.
    • Minimize empirical risk, to find the best estimate for the dataset that we have.

Empirical risk minimization¶

  • Depending on which loss function we choose, we will end up with different $h^*$ (which are estimates of $h^{\text{true}})$.

  • If we choose squared loss, then our empirical risk is mean squared error:

$$\text{MSE} = \frac{1}{n} \sum_{i = 1}^n ( y_i - h )^2 \overset{\text{calculus}}\implies h^* = \text{mean}(y)$$
  • If we choose absolute loss, then our empirical risk is mean absolute error:
$$\text{MAE} = \frac{1}{n} \sum_{i = 1}^n | y_i - h | \overset{\text{algebra}}\implies h^* = \text{median}(y)$$
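
As a quick numerical check (a sketch, not part of the original lecture code), we can sweep over candidate constants $h$ and confirm that the minimizer under squared loss is (approximately) the mean of the tips, while the minimizer under absolute loss is (approximately) the median. This uses the tips DataFrame loaded earlier.

In [ ]:
import numpy as np

# Candidate constant predictions h, spanning the range of observed tips.
hs = np.linspace(tips['tip'].min(), tips['tip'].max(), 1001)

# Empirical risk for each candidate h, under squared loss and absolute loss.
mses = np.array([np.mean((tips['tip'] - h) ** 2) for h in hs])
maes = np.array([np.mean(np.abs(tips['tip'] - h)) for h in hs])

# The minimizers should be close to the mean and median, respectively.
hs[mses.argmin()], tips['tip'].mean(), hs[maes.argmin()], tips['tip'].median()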

The mean tip¶

Let's suppose we choose squared loss, meaning that $h^* = \text{mean}(y)$.

In [43]:
mean_tip = tips['tip'].mean()
mean_tip
Out[43]:
2.9982786885245902

Let's visualize this prediction.

In [66]:
fig = px.scatter(tips, x='total_bill', y='tip')
fig.add_hline(mean_tip, line_width=3, line_color='orange', opacity=1)
fig.update_layout(title='Tip vs. Total Bill',
                  xaxis_title='Total Bill', yaxis_title='Tip')

Note that to make predictions, this model ignores total bill (and all other features), and predicts the same tip for all tables.

The quality of predictions¶

  • Question: How can we quantify how good this constant prediction is at predicting tips in our training data – that is, the data we used to fit the model?

  • One answer: use the mean squared error. If $y_i$ represents the $i$th actual value and $H(x_i)$ represents the $i$th predicted value, then:

$$\text{MSE} = \frac{1}{n} \sum_{i = 1}^n \big( y_i - H(x_i) \big)^2$$
In [67]:
np.mean((tips['tip'] - mean_tip) ** 2)
Out[67]:
1.9066085124966428
In [68]:
# The same! A fact from 40A.
np.var(tips['tip'])
Out[68]:
1.9066085124966428
  • Issue: The units of MSE are "dollars squared", which are a little hard to interpret.

Root mean squared error¶

  • Often, to measure the quality of a regression model's predictions, we will use the root mean squared error (RMSE):
$$\text{RMSE} = \sqrt{\frac{1}{n} \sum_{i = 1}^n \big( y_i - H(x_i) \big)^2}$$
  • The units of the RMSE are the same as the units of the original $y$ values – dollars, in this case.

  • Important: Minimizing MSE is the same as minimizing RMSE; the constant tip $h^*$ that minimizes MSE is the same $h^*$ that minimizes RMSE.

Computing and storing the RMSE¶

Since we'll compute the RMSE for our future models too, we'll define a function that can compute it for us.

In [69]:
def rmse(actual, pred):
    return np.sqrt(np.mean((actual - pred) ** 2))

Let's compute the RMSE of our constant tip's predictions, and store it in a dictionary that we can refer to later on.

In [70]:
rmse(tips['tip'], mean_tip)
Out[70]:
1.3807999538298958
In [71]:
rmse_dict = {}
rmse_dict['constant tip amount'] = rmse(tips['tip'], mean_tip)
rmse_dict
Out[71]:
{'constant tip amount': 1.3807999538298958}

Key idea: Since the mean minimizes RMSE for the constant model, it is impossible to change the mean_tip argument above to another number and yield a lower RMSE.

🙋🙋🏽‍♀️ Questions?¶

https://app.sli.do/event/2LZSnXWNpGPiuVnCZMa5J8


Model #2: Simple linear regression using total bill¶

  • We haven't yet used any of the features in the dataset. The first natural feature to look at is 'total_bill'.
In [72]:
tips.head()
Out[72]:
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
4 24.59 3.61 Female No Sun Dinner 4
  • We can fit a simple linear model to predict tips as a function of total bill:
$$\text{predicted tip} = w_0 + w_1 \cdot \text{total bill}$$
  • This is a reasonable thing to do, because total bills and tips appeared to be linearly associated when we visualized them on a scatter plot a few slides ago.

Recap: Simple linear regression¶

A simple linear regression model is a linear model with a single feature, as we have here. For any total bill $x_i$, the predicted tip $H(x_i)$ is given by

$$H(x_i) = w_0 + w_1x_i$$
  • Question: How do we determine which intercept, $w_0$, and slope, $w_1$, to use?

  • One answer: Pick the $w_0$ and $w_1$ that minimize mean squared error. If $x_i$ and $y_i$ correspond to the $i$th total bill and tip, respectively, then:

$$\begin{align*}\text{MSE} &= \frac{1}{n} \sum_{i = 1}^n \big( y_i - H(x_i) \big)^2 \\ &= \frac{1}{n} \sum_{i = 1}^n \big( y_i - w_0 - w_1x_i \big)^2\end{align*}$$
  • Key idea: The lower the MSE on our training data is, the "better" the model fits the training data.

Empirical risk minimization, by hand¶

$$\begin{align*}\text{MSE} &= \frac{1}{n} \sum_{i = 1}^n \big( y_i - w_0 - w_1x_i \big)^2\end{align*}$$
  • In DSC 40A, you found the formulas for the best intercept, $w_0^*$, and the best slope, $w_1^*$, through calculus.

    • The resulting line, $H(x_i) = w_0^* + w_1^* x_i$, is called the line of best fit, or the regression line.
  • Specifically, if $r$ is the correlation coefficient, $\sigma_x$ and $\sigma_y$ are the standard deviations of $x$ and $y$, and $\bar{x}$ and $\bar{y}$ are the means of $x$ and $y$, then:

$$w_1^* = r \cdot \frac{\sigma_y}{\sigma_x}$$$$w_0^* = \bar{y} - w_1^* \bar{x}$$
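
Before turning to sklearn, here's a sketch that applies these formulas directly to the tips data; the slope and intercept it produces should match what LinearRegression finds in the next section (roughly 0.105 and 0.92).

In [ ]:
import numpy as np

x = tips['total_bill']
y = tips['tip']

# Correlation coefficient, then the formulas above.
r = np.corrcoef(x, y)[0, 1]
w1_star = r * np.std(y) / np.std(x)      # Best slope.
w0_star = y.mean() - w1_star * x.mean()  # Best intercept.
w0_star, w1_star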

Regression in sklearn¶

sklearn¶

  • sklearn (scikit-learn) implements many common steps in the feature and model creation pipeline.

    • It is widely used throughout industry and academia.
  • It interfaces with numpy arrays, and to an extent, pandas DataFrames.

  • Huge benefit: the documentation online is excellent.

The LinearRegression class¶

  • sklearn comes with several subpackages, including linear_model and tree, each of which contains several classes of models.

  • We'll start with the LinearRegression class from linear_model.

In [73]:
from sklearn.linear_model import LinearRegression
  • Important: From the documentation, we have:

LinearRegression fits a linear model with coefficients w = (w1, …, wp) to minimize the residual sum of squares between the observed targets in the dataset, and the targets predicted by the linear approximation.

  • In other words, LinearRegression minimizes mean squared error by default! (Per the documentation, it also includes an intercept term by default.)
In [74]:
LinearRegression?

Fitting a simple linear model¶

First, we must instantiate a LinearRegression object and fit it. By calling fit, we are saying "minimize mean squared error on this dataset and find $w^*$."

In [75]:
model = LinearRegression()

# Note that there are two arguments to fit – X and y!
# (It is not necessary to write X= and y=)
model.fit(X=tips[['total_bill']], y=tips['tip'])
Out[75]:
LinearRegression()

After fitting, we can access $w^*$ – that is, the best slope and intercept.

In [76]:
model.intercept_, model.coef_
Out[76]:
(0.9202696135546731, array([0.11]))

These coefficients tell us that the "best way" (according to squared loss) to make tip predictions using a linear model is using:

$$\text{predicted tip} = 0.92 + 0.105 \cdot \text{total bill}$$

This model predicts that people tip by:

  • Tipping a constant 92 cents.
  • Tipping 10.5% for every dollar spent.

Let's visualize this model, along with our previous model.

In [82]:
line_pts = pd.DataFrame({ 'total_bill': [0, 60] })

fig = px.scatter(tips, x='total_bill', y='tip')
fig.add_trace(go.Scatter(
    x=line_pts['total_bill'],
    y=model.predict(line_pts),
    mode='lines',
    name='Linear: Total Bill Only'
))
fig.update_layout(title='Tip vs. Total Bill',
                  xaxis_title='Total Bill', yaxis_title='Tip')

Visually, our linear model seems to be a better fit for our dataset than our constant model.

  • Can we quantify whether or not it is better?
  • Does it better reflect reality?

Making predictions¶

Fit LinearRegression objects also have a predict method, which can be used to predict tips for any total bill, new or old.

In [84]:
model.predict([[15]])
/Users/sam/mambaforge/envs/dsc80/lib/python3.8/site-packages/sklearn/base.py:450: UserWarning:

X does not have valid feature names, but LinearRegression was fitted with feature names

Out[84]:
array([2.5])
In [83]:
# Since we trained on a DataFrame, the input to model.predict should also
# be a DataFrame.
test_points = pd.DataFrame({'total_bill': [15, 4, 100]})
model.predict(test_points)
Out[83]:
array([ 2.5 ,  1.34, 11.42])

Comparing models¶

If we want to compute the RMSE of our model on the training data, we need to find its predictions on every row in the training data, tips.

In [85]:
all_preds = model.predict(tips[['total_bill']])
In [86]:
rmse_dict['one feature: total bill'] = rmse(tips['tip'], all_preds)
rmse_dict
Out[86]:
{'constant tip amount': 1.3807999538298958,
 'one feature: total bill': 1.0178504025697377}
  • The RMSE of our simple linear model is lower than that of our constant model, which means it does a better job at modeling the training data than our constant model.

  • In theory, it's impossible for the RMSE on the training data to increase as we add more features to the same model (see the sketch below). However, adding more features may increase the RMSE on unseen data; we'll discuss this idea more soon.
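
To see the first point concretely, here's a sketch (not from the original lecture) that adds a column of pure noise alongside 'total_bill' and refits. Even though the new feature has nothing to do with tips, the training RMSE does not increase.

In [ ]:
import numpy as np

np.random.seed(42)  # The noise feature is made up purely for this demonstration.
X_noise = tips[['total_bill']].assign(noise=np.random.randn(len(tips)))

model_noise = LinearRegression()
model_noise.fit(X=X_noise, y=tips['tip'])

# Training RMSE is no larger than the single-feature model's 1.0179.
rmse(tips['tip'], model_noise.predict(X_noise))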

🙋🙋🏽‍♀️ Questions?¶

https://app.sli.do/event/2LZSnXWNpGPiuVnCZMa5J8


Model #3: Multiple linear regression using total bill and table size¶

  • There are still many features in tips we haven't touched:
In [87]:
tips.head()
Out[87]:
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
4 24.59 3.61 Female No Sun Dinner 4
  • Let's try using another feature – table size. Such a model would predict tips using:
$$\text{predicted tip} = w_0 + w_1 \cdot \text{total bill} + w_2 \cdot \text{table size}$$

Multiple linear regression¶

To find the optimal parameters $w^*$, we will again use sklearn's LinearRegression class. The code is not all that different!

In [88]:
model_two = LinearRegression()
model_two.fit(X=tips[['total_bill', 'size']], y=tips['tip'])
Out[88]:
LinearRegression()
In [89]:
model_two.intercept_, model_two.coef_
Out[89]:
(0.6689447408125031, array([0.09, 0.19]))
In [90]:
test_pts = pd.DataFrame({'total_bill': [25], 'size': [4]})
model_two.predict(test_pts)
Out[90]:
array([3.76])

What does this model look like?

Plane of best fit ✈️¶

Here, we must draw a 3D scatter plot and plane, with one axis for total bill, one axis for table size, and one axis for tip. The code below does this.

In [100]:
XX, YY = np.mgrid[0:50:2, 0:8:1]
Z = model_two.intercept_ + model_two.coef_[0] * XX + model_two.coef_[1] * YY
plane = go.Surface(x=XX, y=YY, z=Z, colorscale='Oranges')

fig = go.Figure(data=[plane])
fig.add_trace(go.Scatter3d(x=tips['total_bill'], 
                           y=tips['size'], 
                           z=tips['tip'], mode='markers', marker = {'color': '#656DF1'}))

fig.update_layout(scene=dict(xaxis_title='total bill',
                             yaxis_title='table size',
                             zaxis_title='tip'),
                  title='Tip vs. Total Bill and Table Size',
                  width=500, height=500)

Comparing models, again¶

How does our two-feature linear model stack up to our single feature linear model and our constant model?

In [94]:
rmse_dict['two features'] = rmse(
    tips['tip'], model_two.predict(tips[['total_bill', 'size']])
)
In [95]:
rmse_dict
Out[95]:
{'constant tip amount': 1.3807999538298958,
 'one feature: total bill': 1.0178504025697377,
 'two features': 1.007256127114662}
  • The RMSE of our two-feature model is the lowest of the three models we've looked at so far, but not by much. We didn't gain much by adding table size to our linear model.

  • It's also not clear whether table sizes are practically useful in predicting tips.

Residual Plots¶

  • One important technique for diagnosing model fit is the residual plot.
  • Residuals: $ y - H(x) $
  • Residual plot:
    • X-axis: predicted values $H(x)$
    • Y-axis: residuals $ y - H(x) $
  • Common mistake: putting the observed values $y$ on the x-axis instead of the predicted values $H(x)$.
In [118]:
# Let's start with the single-variable model:
with_resid = tips.assign(
    pred=model.predict(tips[['total_bill']]),
    resid=tips['tip'] - model.predict(tips[['total_bill']]),
)
fig = px.scatter(with_resid, x='pred', y='resid')
fig.add_hline(0, line_width=2, opacity=1)
  • If all of the assumptions of linear regression hold, then the residual plot should look like a patternless cloud of points, randomly scattered around the y=0 line.
  • Here, we see that the model makes bigger mistakes for larger predicted values.
  • But overall, there's no apparent trend, so the linear model seems to fit the data reasonably well.
In [119]:
# What about the two-variable model?
with_resid = tips.assign(
    pred=model_two.predict(tips[['total_bill', 'size']]),
    resid=tips['tip'] - model_two.predict(tips[['total_bill', 'size']]),
)
fig = px.scatter(with_resid, x='pred', y='resid')
fig.add_hline(0, line_width=2, opacity=1)
  • Looks about the same as the previous plot!

Conclusion¶

  • We built three models:
    • A constant model: $\text{predicted tip} = h^*$.
    • A simple linear regression model: $\text{predicted tip} = w_0^* + w_1^* \cdot \text{total bill}$.
    • A multiple linear regression model: $\text{predicted tip} = w_0^* + w_1^* \cdot \text{total bill} + w_2^* \cdot \text{table size}$.
  • As we added more features, our RMSEs decreased.
    • This was guaranteed to happen, since we were only looking at our training data.
  • It is not clear that the final linear model is actually "better"; it doesn't seem to capture the variation in tips any better than the single-feature model.

🙋🙋🏽‍♀️ Questions?¶

https://app.sli.do/event/2LZSnXWNpGPiuVnCZMa5J8


The .score method of a LinearRegression object¶

Model objects in sklearn that have already been fit have a score method.

In [120]:
model_two.score(tips[['total_bill', 'size']], tips['tip'])
Out[120]:
0.46786930879612587

That doesn't look like the RMSE... what is it? 🤔

Aside: $R^2$¶

  • $R^2$, or the coefficient of determination, is a measure of the quality of a linear fit.

  • There are a few equivalent ways of computing it, assuming your model is linear and has an intercept term:

$$R^2 = \frac{\text{var}(\text{predicted $y$ values})}{\text{var}(\text{actual $y$ values})}$$$$R^2 = \left[ \text{correlation}(\text{predicted $y$ values}, \text{actual $y$ values}) \right]^2$$
  • Interpretation: $R^2$ is the proportion of variance in $y$ that the linear model explains.

  • In the simple linear regression case, it is the square of the correlation coefficient, $r$.

  • Key idea: $R^2$ ranges from 0 to 1. The closer it is to 1, the better the linear fit is.

    • $R^2$ has no units of measurement, unlike RMSE.

Calculating $R^2$¶

Below, the 'predicted' column contains model_two's predicted 'tip' for every row in tips.

In [121]:
tips.head()
Out[121]:
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
4 24.59 3.61 Female No Sun Dinner 4
In [122]:
pred = tips.assign(predicted=model_two.predict(tips[['total_bill', 'size']]))
pred
Out[122]:
total_bill tip sex smoker day time size predicted
0 16.99 1.01 Female No Sun Dinner 2 2.63
1 10.34 1.66 Male No Sun Dinner 3 2.21
2 21.01 3.50 Male No Sun Dinner 3 3.19
... ... ... ... ... ... ... ... ...
241 22.67 2.00 Male Yes Sat Dinner 2 3.16
242 17.82 1.75 Male No Sat Dinner 2 2.71
243 18.78 3.00 Female No Thur Dinner 2 2.80

244 rows × 8 columns

Method 1: $R^2 = \frac{\text{var}(\text{predicted $y$ values})}{\text{var}(\text{actual $y$ values})}$

In [123]:
np.var(pred['predicted']) / np.var(pred['tip'])
Out[123]:
0.46786930879612504

Method 2: $R^2 = \left[ \text{correlation}(\text{predicted $y$ values}, \text{actual $y$ values}) \right]^2$

Note: By correlation here, we are referring to $r$, the same correlation coefficient you saw in DSC 10.

In [128]:
# Correlate the predictions with the actual tips, then square.
(np.corrcoef(pred['predicted'], pred['tip'])) ** 2
Out[128]:
array([[1.  , 0.47],
       [0.47, 1.  ]])

Method 3: LinearRegression.score

In [127]:
model_two.score(tips[['total_bill', 'size']], tips['tip'])
Out[127]:
0.46786930879612587

All three methods provide the same result!

LinearRegression summary¶

Property                     Example                    Description
Initialize model parameters  lr = LinearRegression()    Create (empty) linear regression model
Fit the model to the data    lr.fit(X, y)               Determines regression coefficients
Use model for prediction     lr.predict(X_new)          Uses regression line to make predictions
Evaluate the model           lr.score(X, y)             Calculates the $R^2$ of the LR model
Access model attributes      lr.coef_, lr.intercept_    Accesses the regression coefficients and intercept
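
As a recap, here's the whole table as one compact sketch on the tips data (the same steps we ran above, condensed into a single cell):

In [ ]:
from sklearn.linear_model import LinearRegression

X = tips[['total_bill', 'size']]
y = tips['tip']

lr = LinearRegression()       # Initialize model parameters.
lr.fit(X, y)                  # Fit the model to the data.
preds = lr.predict(X)         # Use model for prediction.
r_squared = lr.score(X, y)    # Evaluate the model (R^2, about 0.47 here).
lr.coef_, lr.intercept_, r_squared  # Access model attributes.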

What's next?¶

In [25]:
tips.head()
Out[25]:
total_bill tip sex smoker day time size
0 3.07 1.00 Female Yes Sat Dinner 1
1 18.78 3.00 Female No Thur Dinner 2
2 26.59 3.41 Male Yes Sat Dinner 3
3 14.26 2.50 Male No Thur Lunch 2
4 21.16 3.00 Male No Thur Lunch 2
  • So far, in our journey to predict 'tip', we've only used the existing numerical features in our dataset, 'total_bill' and 'size'.

  • There's a lot of information in tips that we didn't use – 'sex', 'smoker', 'day', and 'time', for example. We can't use these features in their current form, because they're non-numeric.

  • How do we use categorical features in a regression model?

In [ ]: