import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go

import re
import util

plt.style.use('seaborn-white')
plt.rc('figure', dpi=100, figsize=(7, 5))
plt.rc('font', size=12)


def extract_struct(speech):
    L = speech.strip().split('\n', maxsplit=3)
    L[3] = re.sub(r"[^A-Za-z' ]", ' ', L[3]).lower()
    return dict(zip(['speech', 'president', 'date', 'contents'], L))

def five_largest(row):
    return list(row.index[row.argsort()][-5:])

sotu = open('data/stateoftheunion1790-2022.txt').read()
speeches = sotu.split('\n***\n')[1:]
speeches_df = pd.DataFrame(list(map(extract_struct, speeches)))
unique_words = pd.Series(speeches_df['contents'].str.split().sum()).value_counts()
unique_words = unique_words.iloc[:500].index

tfidf_dict = {}
tf_denom = speeches_df['contents'].str.split().str.len()
for word in unique_words:
    re_pat = fr' {word} ' # Imperfect pattern for speed
    tf = speeches_df['contents'].str.count(re_pat) / tf_denom
    idf = np.log(len(speeches_df) / speeches_df['contents'].str.contains(re_pat).sum())
    tfidf_dict[word] =  tf * idf
    
tfidf = pd.DataFrame(tfidf_dict)

keywords = tfidf.apply(five_largest, axis=1)
keywords_df = pd.concat([
    speeches_df['president'],
    speeches_df['date'],
    keywords
], axis=1)


tfidf


keywords_df


tfidf_nl_dict = {}
tf_denom = speeches_df['contents'].str.split().str.len()
for word in unique_words:
    re_pat = fr' {word} ' # Imperfect pattern for speed
    tf = speeches_df['contents'].str.count(re_pat) / tf_denom
    idf_nl = len(speeches_df) / speeches_df['contents'].str.contains(re_pat).sum()
    tfidf_nl_dict[word] =  tf * idf_nl
    
tfidf_nl = pd.DataFrame(tfidf_nl_dict)

keywords_nl = tfidf_nl.apply(five_largest, axis=1)
keywords_nl_df = pd.concat([
    speeches_df['president'],
    speeches_df['date'],
    keywords_nl
], axis=1)


tfidf_nl


keywords_nl_df


(1000 / 999)

1.001001001001001


np.log(1000 / 999)

0.001000500333583622


(50 / 2)

25.0


(500 / 2)

250.0


np.log(50 / 2)

3.2188758248682006


np.log(500 / 2)

5.521460917862246


salaries = pd.read_csv('https://transcal.s3.amazonaws.com/public/export/san-diego-2020.csv')
util.anonymize_names(salaries)


salaries.head()


galton = pd.read_csv('data/galton.csv')
galton.head()


pd.plotting.scatter_matrix(galton, figsize=(12, 8));


sns.lmplot(data=galton, x='father', y='childHeight');


from scipy.stats import linregress


lm = linregress(x=galton['father'], y=galton['childHeight'])
lm

LinregressResult(slope=0.38450503160660654, intercept=40.139294814520184, rvalue=0.26603853892271895, pvalue=1.3498077148871706e-16, stderr=0.045636214044328646, intercept_stderr=3.1599136481260373)


lm.intercept

40.139294814520184


lm.slope

0.38450503160660654


def pred_child(father):
    return lm.intercept + lm.slope * father


pred_child(60)

63.20959671091658


galton


pred_child(galton['father'])

0      70.322940
1      70.322940
2      70.322940
3      70.322940
4      69.169425
         ...    
929    63.978607
930    63.978607
931    63.978607
932    64.170859
933    64.170859
Name: father, Length: 934, dtype: float64


def mse(actual, pred):
    return np.mean((actual - pred) ** 2)


mse(galton['childHeight'], pred_child(galton['father']))

11.891573073385155


mse(galton['childHeight'], pred_child(galton['father']))

11.891573073385155


def rmse(actual, pred):
    return np.sqrt(np.mean((actual - pred) ** 2))


rmse_dict = {}
rmse_dict['father only'] = rmse(galton['childHeight'], pred_child(galton['father']))
rmse_dict

{'father only': 3.448416023826759}


sns.scatterplot(data=galton, x='father', y='childHeight', label='actual child heights')
sns.scatterplot(x=galton['father'], 
                y=pred_child(galton['father']), 
                label='predicted child heights'
);


from sklearn.linear_model import LinearRegression


lr = LinearRegression()
lr.fit(X=galton[['father', 'mother']], y=galton['childHeight'])

LinearRegression()


lr.intercept_, lr.coef_

(22.643279708153415, array([0.36828233, 0.29050997]))


predictions = lr.predict(galton[['father', 'mother']])
predictions[:5]

array([71.01761111, 71.01761111, 71.01761111, 71.01761111, 69.76750912])


rmse_dict['father and mother'] = rmse(galton['childHeight'], predictions)
rmse_dict

{'father only': 3.448416023826759, 'father and mother': 3.3838935702036945}


XX, YY = np.mgrid[60:80:2, 55:75:2]
Z = lr.intercept_ + lr.coef_[0] * XX + lr.coef_[1] * YY
plane = go.Surface(x=XX, y=YY, z=Z, colorscale='Oranges')

fig = go.Figure(data=[plane])
fig.add_trace(go.Scatter3d(x=galton['father'], 
                           y=galton['mother'], 
                           z=galton['childHeight'], mode='markers', marker = {'color': '#656DF1'}))

fig.update_layout(scene = dict(
    xaxis_title = 'father',
    yaxis_title = 'mother',
    zaxis_title = 'child'),
    width=1000, height=800)


sns.scatterplot(data=galton, x='father', y='childHeight', label='actual child heights')
sns.scatterplot(x=galton['father'], 
                y=predictions, 
                label='predicted child heights using father and mother'
);


sns.scatterplot(data=galton, x='mother', y='childHeight', label='actual child heights')
sns.scatterplot(x=galton['mother'], 
                y=predictions, 
                label='predicted child heights using father and mother'
);


sns.lmplot(data=galton, x='father', y='childHeight', hue='gender', 
           palette={'male': 'purple', 'female': 'green'});


galton.head()


galton['gender=female'] = (galton['gender'] == 'female').astype(int)
galton.head()


lr_three_features = LinearRegression()
lr_three_features.fit(galton[['father', 'mother', 'gender=female']], galton['childHeight'])

LinearRegression()


predictions_three_features = lr_three_features.predict(galton[['father', 'mother', 'gender=female']])


rmse_dict['father, mother, and gender'] = rmse(galton['childHeight'], predictions_three_features)
rmse_dict

{'father only': 3.448416023826759,
 'father and mother': 3.3838935702036945,
 'father, mother, and gender': 2.1600506762028053}


lr_three_features.intercept_, lr_three_features.coef_

(21.736229255415104, array([ 0.39284333,  0.31761007, -5.21498935]))


XX, YY = np.mgrid[60:80:2, 55:75:2]
Z_female = (lr_three_features.intercept_ + lr_three_features.coef_[2]) + lr_three_features.coef_[0] * XX + lr_three_features.coef_[1] * YY
Z_male = lr_three_features.intercept_ + lr_three_features.coef_[0] * XX + lr_three_features.coef_[1] * YY

plane_female = go.Surface(x=XX, y=YY, z=Z_female, colorscale ='Greens')
plane_male = go.Surface(x=XX, y=YY, z=Z_male, colorscale='Purples')

fig = go.Figure(data=[plane_female, plane_male])

galton_female = galton[galton['gender'] == 'female']
galton_male = galton[galton['gender'] == 'male']

fig.add_trace(go.Scatter3d(x=galton_female['father'], 
                           y=galton_female['mother'], 
                           z=galton_female['childHeight'], mode='markers', marker = {'color': 'green'}))

fig.add_trace(go.Scatter3d(x=galton_male['father'], 
                           y=galton_male['mother'], 
                           z=galton_male['childHeight'], mode='markers', marker = {'color': 'purple'}))

fig.update_layout(scene = dict(
    xaxis_title = 'father',
    yaxis_title = 'mother',
    zaxis_title = 'child'),
    width=1000, height=800,
    showlegend=False,
    title="Predicted child's heights given parents' heights and gender (purple=male, green=female)")


sns.scatterplot(data=galton, x='father', y='childHeight', label='actual child heights')
sns.scatterplot(x=galton['father'], 
                y=predictions_three_features, 
                label='predicted child heights using father, mother, and gender'
);

	president	date	0
0	George Washington	January 8, 1790	[proper, your, regard, ought, object]
1	George Washington	December 8, 1790	[case, established, object, commerce, convention]
2	George Washington	October 25, 1791	[upon, community, lands, proper, provision]
3	George Washington	November 6, 1792	[subject, upon, information, proper, provision]
4	George Washington	December 3, 1793	[having, vessels, executive, shall, ought]
...	...	...	...
227	Donald J. Trump	January 30, 2018	[jobs, tax, get, americans, tonight]
228	Donald J. Trump	February 5, 2019	[members, get, jobs, americans, tonight]
229	Donald J. Trump	February 4, 2020	[million, jobs, americans, percent, tonight]
230	Joseph R. Biden Jr.	April 28, 2021	[america, get, americans, percent, jobs]
231	Joseph R. Biden Jr.	March 1, 2022	[let, jobs, americans, get, tonight]

	the	of	to	and	in	a	that	for	be	our	...	submitted	did	increasing	throughout	point	months	set	object	agreement	almost
0	0.089073	0.063361	0.051423	0.037649	0.018365	0.019284	0.013774	0.006428	0.018365	0.009183	...	0.000000	0.000000	0.001392	0.000000	0.000000	0.000000	0.000000	0.005758	0.000000	0.000000
1	0.086957	0.063435	0.034925	0.032074	0.019244	0.014968	0.012117	0.011404	0.012830	0.012117	...	0.001312	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.002979	0.000000	0.000000
2	0.105035	0.069010	0.038194	0.031684	0.017795	0.018229	0.013889	0.009549	0.014757	0.002170	...	0.000799	0.000000	0.000658	0.000000	0.000000	0.000000	0.001291	0.001814	0.000000	0.000000
3	0.093212	0.066444	0.042065	0.026769	0.022945	0.015296	0.011472	0.014340	0.013862	0.005258	...	0.000000	0.000000	0.000000	0.000749	0.000000	0.000000	0.000000	0.001998	0.000000	0.000000
4	0.091603	0.067176	0.037659	0.024936	0.013232	0.017303	0.006107	0.011705	0.021883	0.008142	...	0.000000	0.000772	0.000000	0.001595	0.000000	0.000000	0.000000	0.001064	0.000000	0.000000
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
227	0.040610	0.023646	0.033756	0.041981	0.017992	0.017135	0.011652	0.007882	0.005312	0.017820	...	0.000000	0.000520	0.000000	0.000000	0.000253	0.001343	0.000510	0.000000	0.000000	0.000834
228	0.049582	0.025413	0.029145	0.034654	0.020970	0.017949	0.011374	0.011374	0.004443	0.017416	...	0.000000	0.002156	0.000000	0.000000	0.000263	0.001114	0.000793	0.000000	0.001309	0.002307
229	0.047201	0.023920	0.026631	0.035242	0.018338	0.016425	0.011162	0.010046	0.003987	0.015149	...	0.000000	0.000242	0.000000	0.000250	0.000236	0.001250	0.000237	0.000000	0.000587	0.000776
230	0.053181	0.026402	0.035831	0.033819	0.023007	0.014835	0.010686	0.008423	0.004652	0.009806	...	0.000000	0.001906	0.000191	0.000591	0.000557	0.000197	0.000374	0.000000	0.000231	0.000816
231	0.044309	0.023458	0.033759	0.039221	0.017004	0.017376	0.012908	0.009433	0.004716	0.008316	...	0.000000	0.001129	0.000188	0.000389	0.000550	0.000584	0.000554	0.000000	0.000000	0.000604

	president	date	0
0	George Washington	January 8, 1790	[a, and, to, of, the]
1	George Washington	December 8, 1790	[in, and, to, of, the]
2	George Washington	October 25, 1791	[a, and, to, of, the]
3	George Washington	November 6, 1792	[in, and, to, of, the]
4	George Washington	December 3, 1793	[be, and, to, of, the]
...	...	...	...
227	Donald J. Trump	January 30, 2018	[we, of, to, the, and]
228	Donald J. Trump	February 5, 2019	[in, of, to, and, the]
229	Donald J. Trump	February 4, 2020	[in, of, to, and, the]
230	Joseph R. Biden Jr.	April 28, 2021	[in, of, and, to, the]
231	Joseph R. Biden Jr.	March 1, 2022	[we, of, to, and, the]

	Employee Name	Job Title	Base Pay	Overtime Pay	Other Pay	Benefits	Total Pay	Pension Debt	Total Pay & Benefits	Year	Notes	Agency	Status
0	Michael Xxxx	Police Officer	117691.0	187290.0	13331.00	36380.0	318312.0	NaN	354692.0	2020	NaN	San Diego	FT
1	Gary Xxxx	Police Officer	117691.0	160062.0	42946.00	31795.0	320699.0	NaN	352494.0	2020	NaN	San Diego	FT
2	Eric Xxxx	Fire Engineer	35698.0	204462.0	69121.00	38362.0	309281.0	NaN	347643.0	2020	NaN	San Diego	PT
3	Gregg Xxxx	Retirement Administrator	305000.0	0.0	12814.00	24792.0	317814.0	NaN	342606.0	2020	NaN	San Diego	FT
4	Joseph Xxxx	Fire Battalion Chief	94451.0	157778.0	48151.00	42096.0	300380.0	NaN	342476.0	2020	NaN	San Diego	FT

	family	father	mother	children	childNum	gender	childHeight
0	1	78.5	67.0	4	1	male	73.2
1	1	78.5	67.0	4	2	female	69.2
2	1	78.5	67.0	4	3	female	69.0
3	1	78.5	67.0	4	4	female	69.0
4	2	75.5	66.5	4	1	male	73.5

Lecture 20 – Features¶

DSC 80, Spring 2022¶

Announcements¶

Agenda¶

Recap: TF-IDF¶

Term frequency-inverse document frequency¶

Example: State of the Union addresses 🎤¶

Aside: What if we remove the $\log$ from $\text{idf}(t)$?¶

The role of $\log$ in $\text{idf}(t)$¶

Features¶

Reflection¶

Features¶

Example: San Diego employee salaries¶

What makes a good feature?¶

Example: Predicting child heights 📏¶

Galton's heights dataset¶

Exploratory data analysis¶

Attempt #1: Predict child's height using father's height¶

Recap: Simple linear regression¶

Finding the regression line programatically¶

Aside: MSE vs. RMSE¶

Visualizing our single-feature predictions¶

Attempt #2: Predict child's height using father's and mother's heights¶

Multiple regression in `sklearn`¶

Visualizing our two-feature predictions¶

Attempt #3: Adding gender as a feature¶

Attempt #3: Adding gender as a feature¶

Visualizing our three-feature predictions¶

Summary, next time¶

Summary¶

Next time: feature engineering¶

	the	of	to	and	in	a	that	for	be	our	...	submitted	did	increasing	throughout	point	months	set	object	agreement	almost
0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.000000	0.000000	0.000382	0.000000	0.000000	0.000000	0.000000	0.002031	0.000000	0.000000
1	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.000435	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.001051	0.000000	0.000000
2	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.000265	0.000000	0.000181	0.000000	0.000000	0.000000	0.000345	0.000640	0.000000	0.000000
3	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.000000	0.000000	0.000000	0.000215	0.000000	0.000000	0.000000	0.000705	0.000000	0.000000
4	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.000000	0.000212	0.000000	0.000458	0.000000	0.000000	0.000000	0.000375	0.000000	0.000000
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
227	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.000000	0.000143	0.000000	0.000000	0.000067	0.000385	0.000136	0.000000	0.000000	0.000249
228	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.000000	0.000592	0.000000	0.000000	0.000069	0.000320	0.000212	0.000000	0.000434	0.000688
229	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.000000	0.000066	0.000000	0.000072	0.000062	0.000358	0.000063	0.000000	0.000195	0.000231
230	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.000000	0.000523	0.000052	0.000170	0.000147	0.000057	0.000100	0.000000	0.000077	0.000243
231	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.000000	0.000310	0.000052	0.000112	0.000145	0.000167	0.000148	0.000000	0.000000	0.000180

Lecture 20 – Features¶

DSC 80, Spring 2022¶

Announcements¶

Agenda¶

Recap: TF-IDF¶

Term frequency-inverse document frequency¶

Example: State of the Union addresses 🎤¶

Aside: What if we remove the $\log$ from $\text{idf}(t)$?¶

The role of $\log$ in $\text{idf}(t)$¶

Features¶

Reflection¶

Features¶

Example: San Diego employee salaries¶

What makes a good feature?¶

Example: Predicting child heights 📏¶

Galton's heights dataset¶

Exploratory data analysis¶

Attempt #1: Predict child's height using father's height¶

Recap: Simple linear regression¶

Finding the regression line programatically¶

Aside: MSE vs. RMSE¶

Visualizing our single-feature predictions¶

Attempt #2: Predict child's height using father's and mother's heights¶

Multiple regression in sklearn¶

Visualizing our two-feature predictions¶

Attempt #3: Adding gender as a feature¶

Attempt #3: Adding gender as a feature¶

Visualizing our three-feature predictions¶

Summary, next time¶

Summary¶

Next time: feature engineering¶

Multiple regression in `sklearn`¶