import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import re
import util
plt.style.use('seaborn-white')
plt.rc('figure', dpi=100, figsize=(7, 5))
plt.rc('font', size=12)
The term frequency-inverse document frequency (TF-IDF) of word $t$ in document $d$ is the product:
$$ \begin{align*}\text{tfidf}(t, d) &= \text{tf}(t, d) \cdot \text{idf}(t) \\ &= \frac{\text{number of occurrences of $t$ in $d$}}{\text{total number of words in $d$}} \cdot \log \left(\frac{\text{total number of documents}}{\text{number of documents in which $t$ appears}} \right) \end{align*} $$

If $\text{tfidf}(t, d)$ is large, then $t$ is a good summary of $d$.
TF-IDF is a heuristic – it has no probabilistic justification.
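As a quick illustration, here is a minimal sketch that computes $\text{tfidf}(t, d)$ for one word in one document. The toy three-document corpus is made up for this example and is not part of the SOTU data.
docs = ['billy likes data science',
        'billy likes data',
        'billy likes billy']
t, d = 'data', docs[0].split()
tf = d.count(t) / len(d)                                          # 1/4
idf = np.log(len(docs) / sum(t in doc.split() for doc in docs))   # log(3/2)
tf * idf   # ≈ 0.101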
Recall that last class, we computed the TF-IDF of every word in every State of the Union (SOTU) speech, and used the words with the largest TF-IDFs to summarize each speech.
def extract_struct(speech):
    # Each speech has the form: number \n president \n date \n contents.
    L = speech.strip().split('\n', maxsplit=3)
    # Keep only letters, apostrophes, and spaces in the contents; lowercase everything.
    L[3] = re.sub(r"[^A-Za-z' ]", ' ', L[3]).lower()
    return dict(zip(['speech', 'president', 'date', 'contents'], L))
def five_largest(row):
    # The five column labels with the largest values in the row.
    return list(row.index[row.argsort()][-5:])
sotu = open('data/stateoftheunion1790-2022.txt').read()
speeches = sotu.split('\n***\n')[1:]
speeches_df = pd.DataFrame(list(map(extract_struct, speeches)))
# Keep the 500 most common words across all speeches.
unique_words = pd.Series(speeches_df['contents'].str.split().sum()).value_counts()
unique_words = unique_words.iloc[:500].index
tfidf_dict = {}
tf_denom = speeches_df['contents'].str.split().str.len()
for word in unique_words:
    re_pat = fr' {word} ' # Imperfect pattern for speed
    tf = speeches_df['contents'].str.count(re_pat) / tf_denom
    idf = np.log(len(speeches_df) / speeches_df['contents'].str.contains(re_pat).sum())
    tfidf_dict[word] = tf * idf
tfidf = pd.DataFrame(tfidf_dict)
keywords = tfidf.apply(five_largest, axis=1)
keywords_df = pd.concat([
speeches_df['president'],
speeches_df['date'],
keywords
], axis=1)
tfidf
 | the | of | to | and | in | a | that | for | be | our | ... | submitted | did | increasing | throughout | point | months | set | object | agreement | almost |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.000000 | 0.000000 | 0.000382 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.002031 | 0.000000 | 0.000000 |
1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.000435 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.001051 | 0.000000 | 0.000000 |
2 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.000265 | 0.000000 | 0.000181 | 0.000000 | 0.000000 | 0.000000 | 0.000345 | 0.000640 | 0.000000 | 0.000000 |
3 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000215 | 0.000000 | 0.000000 | 0.000000 | 0.000705 | 0.000000 | 0.000000 |
4 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.000000 | 0.000212 | 0.000000 | 0.000458 | 0.000000 | 0.000000 | 0.000000 | 0.000375 | 0.000000 | 0.000000 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
227 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.000000 | 0.000143 | 0.000000 | 0.000000 | 0.000067 | 0.000385 | 0.000136 | 0.000000 | 0.000000 | 0.000249 |
228 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.000000 | 0.000592 | 0.000000 | 0.000000 | 0.000069 | 0.000320 | 0.000212 | 0.000000 | 0.000434 | 0.000688 |
229 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.000000 | 0.000066 | 0.000000 | 0.000072 | 0.000062 | 0.000358 | 0.000063 | 0.000000 | 0.000195 | 0.000231 |
230 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.000000 | 0.000523 | 0.000052 | 0.000170 | 0.000147 | 0.000057 | 0.000100 | 0.000000 | 0.000077 | 0.000243 |
231 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.000000 | 0.000310 | 0.000052 | 0.000112 | 0.000145 | 0.000167 | 0.000148 | 0.000000 | 0.000000 | 0.000180 |
232 rows × 500 columns
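As an aside, you'd rarely compute TF-IDF by hand in practice; scikit-learn (which we'll meet later in this lecture) provides `TfidfVectorizer`. A rough sketch is below, with the caveat that its formula differs slightly from ours (by default it smooths the IDF and normalizes each row), so its numbers won't match the table above.
from sklearn.feature_extraction.text import TfidfVectorizer
# Sketch only: sklearn's version of TF-IDF on the same speeches, limited to 500 terms.
vec = TfidfVectorizer(max_features=500)
tfidf_sklearn = vec.fit_transform(speeches_df['contents'])
tfidf_sklearn.shape   # one row per speech, one column per term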
keywords_df
 | president | date | 0 |
---|---|---|---|
0 | George Washington | January 8, 1790 | [proper, your, regard, ought, object] |
1 | George Washington | December 8, 1790 | [case, established, object, commerce, convention] |
2 | George Washington | October 25, 1791 | [upon, community, lands, proper, provision] |
3 | George Washington | November 6, 1792 | [subject, upon, information, proper, provision] |
4 | George Washington | December 3, 1793 | [having, vessels, executive, shall, ought] |
... | ... | ... | ... |
227 | Donald J. Trump | January 30, 2018 | [jobs, tax, get, americans, tonight] |
228 | Donald J. Trump | February 5, 2019 | [members, get, jobs, americans, tonight] |
229 | Donald J. Trump | February 4, 2020 | [million, jobs, americans, percent, tonight] |
230 | Joseph R. Biden Jr. | April 28, 2021 | [america, get, americans, percent, jobs] |
231 | Joseph R. Biden Jr. | March 1, 2022 | [let, jobs, americans, get, tonight] |
232 rows × 3 columns
What happens if we leave the log out of the IDF term? Let's try it and see what happens.
tfidf_nl_dict = {}
tf_denom = speeches_df['contents'].str.split().str.len()
for word in unique_words:
    re_pat = fr' {word} ' # Imperfect pattern for speed
    tf = speeches_df['contents'].str.count(re_pat) / tf_denom
    idf_nl = len(speeches_df) / speeches_df['contents'].str.contains(re_pat).sum()
    tfidf_nl_dict[word] = tf * idf_nl
tfidf_nl = pd.DataFrame(tfidf_nl_dict)
keywords_nl = tfidf_nl.apply(five_largest, axis=1)
keywords_nl_df = pd.concat([
speeches_df['president'],
speeches_df['date'],
keywords_nl
], axis=1)
tfidf_nl
 | the | of | to | and | in | a | that | for | be | our | ... | submitted | did | increasing | throughout | point | months | set | object | agreement | almost |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.089073 | 0.063361 | 0.051423 | 0.037649 | 0.018365 | 0.019284 | 0.013774 | 0.006428 | 0.018365 | 0.009183 | ... | 0.000000 | 0.000000 | 0.001392 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.005758 | 0.000000 | 0.000000 |
1 | 0.086957 | 0.063435 | 0.034925 | 0.032074 | 0.019244 | 0.014968 | 0.012117 | 0.011404 | 0.012830 | 0.012117 | ... | 0.001312 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.002979 | 0.000000 | 0.000000 |
2 | 0.105035 | 0.069010 | 0.038194 | 0.031684 | 0.017795 | 0.018229 | 0.013889 | 0.009549 | 0.014757 | 0.002170 | ... | 0.000799 | 0.000000 | 0.000658 | 0.000000 | 0.000000 | 0.000000 | 0.001291 | 0.001814 | 0.000000 | 0.000000 |
3 | 0.093212 | 0.066444 | 0.042065 | 0.026769 | 0.022945 | 0.015296 | 0.011472 | 0.014340 | 0.013862 | 0.005258 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000749 | 0.000000 | 0.000000 | 0.000000 | 0.001998 | 0.000000 | 0.000000 |
4 | 0.091603 | 0.067176 | 0.037659 | 0.024936 | 0.013232 | 0.017303 | 0.006107 | 0.011705 | 0.021883 | 0.008142 | ... | 0.000000 | 0.000772 | 0.000000 | 0.001595 | 0.000000 | 0.000000 | 0.000000 | 0.001064 | 0.000000 | 0.000000 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
227 | 0.040610 | 0.023646 | 0.033756 | 0.041981 | 0.017992 | 0.017135 | 0.011652 | 0.007882 | 0.005312 | 0.017820 | ... | 0.000000 | 0.000520 | 0.000000 | 0.000000 | 0.000253 | 0.001343 | 0.000510 | 0.000000 | 0.000000 | 0.000834 |
228 | 0.049582 | 0.025413 | 0.029145 | 0.034654 | 0.020970 | 0.017949 | 0.011374 | 0.011374 | 0.004443 | 0.017416 | ... | 0.000000 | 0.002156 | 0.000000 | 0.000000 | 0.000263 | 0.001114 | 0.000793 | 0.000000 | 0.001309 | 0.002307 |
229 | 0.047201 | 0.023920 | 0.026631 | 0.035242 | 0.018338 | 0.016425 | 0.011162 | 0.010046 | 0.003987 | 0.015149 | ... | 0.000000 | 0.000242 | 0.000000 | 0.000250 | 0.000236 | 0.001250 | 0.000237 | 0.000000 | 0.000587 | 0.000776 |
230 | 0.053181 | 0.026402 | 0.035831 | 0.033819 | 0.023007 | 0.014835 | 0.010686 | 0.008423 | 0.004652 | 0.009806 | ... | 0.000000 | 0.001906 | 0.000191 | 0.000591 | 0.000557 | 0.000197 | 0.000374 | 0.000000 | 0.000231 | 0.000816 |
231 | 0.044309 | 0.023458 | 0.033759 | 0.039221 | 0.017004 | 0.017376 | 0.012908 | 0.009433 | 0.004716 | 0.008316 | ... | 0.000000 | 0.001129 | 0.000188 | 0.000389 | 0.000550 | 0.000584 | 0.000554 | 0.000000 | 0.000000 | 0.000604 |
232 rows × 500 columns
keywords_nl_df
 | president | date | 0 |
---|---|---|---|
0 | George Washington | January 8, 1790 | [a, and, to, of, the] |
1 | George Washington | December 8, 1790 | [in, and, to, of, the] |
2 | George Washington | October 25, 1791 | [a, and, to, of, the] |
3 | George Washington | November 6, 1792 | [in, and, to, of, the] |
4 | George Washington | December 3, 1793 | [be, and, to, of, the] |
... | ... | ... | ... |
227 | Donald J. Trump | January 30, 2018 | [we, of, to, the, and] |
228 | Donald J. Trump | February 5, 2019 | [in, of, to, and, the] |
229 | Donald J. Trump | February 4, 2020 | [in, of, to, and, the] |
230 | Joseph R. Biden Jr. | April 28, 2021 | [in, of, and, to, the] |
231 | Joseph R. Biden Jr. | March 1, 2022 | [we, of, to, and, the] |
232 rows × 3 columns
Without the log, the keywords are just the most common words. Why does the log matter? Without it, a word like 'the' that appears in essentially every document still gets an IDF of about 1, so its large term frequency isn't scaled down at all; with the log, that IDF is nearly 0. The cells below illustrate the difference.
(1000 / 999)
1.001001001001001
np.log(1000 / 999)
0.001000500333583622
(50 / 2)
25.0
(500 / 2)
250.0
np.log(50 / 2)
3.2188758248682006
np.log(500 / 2)
5.521460917862246
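The same comparison with our corpus of 232 speeches, as a small sketch; `df_count` below stands for the number of speeches containing a word.
N = len(speeches_df)
for df_count in [231, 100, 10, 2]:
    # IDF without the log vs. with the log.
    print(df_count, N / df_count, np.log(N / df_count))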
So far this quarter, we've learned how to extract and transform data using pandas and regular expressions. Note: TF-IDF is a feature we've created that summarizes documents!
What features are present in `salaries`? What features can we create?
salaries = pd.read_csv('https://transcal.s3.amazonaws.com/public/export/san-diego-2020.csv')
util.anonymize_names(salaries)
salaries.head()
 | Employee Name | Job Title | Base Pay | Overtime Pay | Other Pay | Benefits | Total Pay | Pension Debt | Total Pay & Benefits | Year | Notes | Agency | Status |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Michael Xxxx | Police Officer | 117691.0 | 187290.0 | 13331.00 | 36380.0 | 318312.0 | NaN | 354692.0 | 2020 | NaN | San Diego | FT |
1 | Gary Xxxx | Police Officer | 117691.0 | 160062.0 | 42946.00 | 31795.0 | 320699.0 | NaN | 352494.0 | 2020 | NaN | San Diego | FT |
2 | Eric Xxxx | Fire Engineer | 35698.0 | 204462.0 | 69121.00 | 38362.0 | 309281.0 | NaN | 347643.0 | 2020 | NaN | San Diego | PT |
3 | Gregg Xxxx | Retirement Administrator | 305000.0 | 0.0 | 12814.00 | 24792.0 | 317814.0 | NaN | 342606.0 | 2020 | NaN | San Diego | FT |
4 | Joseph Xxxx | Fire Battalion Chief | 94451.0 | 157778.0 | 48151.00 | 42096.0 | 300380.0 | NaN | 342476.0 | 2020 | NaN | San Diego | FT |
We could combine `salaries` with another data source, like the SSA baby names dataset, to create such a feature.
A good feature should be...
Oftentimes, the columns in a dataset aren't good features on their own. In such cases, we may need to "engineer" features that are useful.
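For instance, here's a hedged sketch of one feature we could engineer from `salaries`; the choice of "works in a fire-related job" is purely illustrative, not something from this lecture.
# Sketch: a Boolean feature derived from the 'Job Title' column.
salaries['is_fire'] = salaries['Job Title'].str.contains('Fire', na=False).astype(int)
# Engineered features can then be related to a quantity of interest, e.g. average pay.
salaries.groupby('is_fire')['Total Pay & Benefits'].mean()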
galton = pd.read_csv('data/galton.csv')
galton.head()
 | family | father | mother | children | childNum | gender | childHeight |
---|---|---|---|---|---|---|---|
0 | 1 | 78.5 | 67.0 | 4 | 1 | male | 73.2 |
1 | 1 | 78.5 | 67.0 | 4 | 2 | female | 69.2 |
2 | 1 | 78.5 | 67.0 | 4 | 3 | female | 69.0 |
3 | 1 | 78.5 | 67.0 | 4 | 4 | female | 69.0 |
4 | 2 | 75.5 | 66.5 | 4 | 1 | male | 73.5 |
The following scatter matrix contains a scatter plot of all pairs of quantitative attributes, and a histogram for each quantitative attribute on its own.
pd.plotting.scatter_matrix(galton, figsize=(12, 8));
Is a linear model suitable for prediction? If so, on which attributes?
We will assume that the relationship between fathers' heights and children's heights is linear. That is,

$$\text{predicted child's height} = w_0^* + w_1^* \cdot \text{father's height}$$

where $w_0^*$ and $w_1^*$ are carefully chosen parameters.
seaborn's `lmplot` function can automatically plot the "line of best fit" on a scatter plot.
sns.lmplot(data=galton, x='father', y='childHeight');
For any father's height $x_i$, their predicted child's height is given by

$$H(x_i) = w_0 + w_1x_i$$

There are several packages that can perform linear regression; `scipy.stats` is one of them.
from scipy.stats import linregress
lm = linregress(x=galton['father'], y=galton['childHeight'])
lm
LinregressResult(slope=0.38450503160660654, intercept=40.139294814520184, rvalue=0.26603853892271895, pvalue=1.3498077148871706e-16, stderr=0.045636214044328646, intercept_stderr=3.1599136481260373)
The `lm` object has several attributes, most notably `slope` and `intercept`.
lm.intercept
40.139294814520184
lm.slope
0.38450503160660654
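As a sanity check, these values agree with the closed-form least-squares solution for simple linear regression, $w_1^* = r \cdot \sigma_y / \sigma_x$ and $w_0^* = \bar{y} - w_1^* \bar{x}$, where $r$ is the correlation coefficient. A minimal sketch:
x, y = galton['father'], galton['childHeight']
r = np.corrcoef(x, y)[0, 1]     # same as lm.rvalue
w1 = r * y.std() / x.std()      # slope
w0 = y.mean() - w1 * x.mean()   # intercept
(w0, w1)                        # matches (lm.intercept, lm.slope)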
def pred_child(father):
    # Predicted child's height, using the slope and intercept found by linregress.
    return lm.intercept + lm.slope * father
`pred_child` works on scalar values:
pred_child(60)
63.20959671091658
But it also works on arrays/Series:
galton
 | family | father | mother | children | childNum | gender | childHeight |
---|---|---|---|---|---|---|---|
0 | 1 | 78.5 | 67.0 | 4 | 1 | male | 73.2 |
1 | 1 | 78.5 | 67.0 | 4 | 2 | female | 69.2 |
2 | 1 | 78.5 | 67.0 | 4 | 3 | female | 69.0 |
3 | 1 | 78.5 | 67.0 | 4 | 4 | female | 69.0 |
4 | 2 | 75.5 | 66.5 | 4 | 1 | male | 73.5 |
... | ... | ... | ... | ... | ... | ... | ... |
929 | 203 | 62.0 | 66.0 | 3 | 1 | male | 64.0 |
930 | 203 | 62.0 | 66.0 | 3 | 2 | female | 62.0 |
931 | 203 | 62.0 | 66.0 | 3 | 3 | female | 61.0 |
932 | 204 | 62.5 | 63.0 | 2 | 1 | male | 66.5 |
933 | 204 | 62.5 | 63.0 | 2 | 2 | female | 57.0 |
934 rows × 7 columns
pred_child(galton['father'])
0      70.322940
1      70.322940
2      70.322940
3      70.322940
4      69.169425
         ...
929    63.978607
930    63.978607
931    63.978607
932    64.170859
933    64.170859
Name: father, Length: 934, dtype: float64
Recall that a lower MSE means a better fit on the training data. Let's compute the MSE of this simple linear model; it will be useful later.
def mse(actual, pred):
    return np.mean((actual - pred) ** 2)
mse(galton['childHeight'], pred_child(galton['father']))
11.891573073385155
An issue with mean squared error is that its units are the square of the units of the $y$-values.
$$\text{MSE} = \frac{1}{n} \sum_{i = 1}^n \big( y_i - H(x_i) \big)^2$$

For instance, the number below is 11.892 "inches squared".
mse(galton['childHeight'], pred_child(galton['father']))
11.891573073385155
To correct the units of mean squared error, we can take the square root. The result, root mean squared error (RMSE), is also a measure of how well a model fits the training data.

$$\text{RMSE} = \sqrt{\frac{1}{n} \sum_{i = 1}^n \big( y_i - H(x_i) \big)^2}$$

Important: The line that minimizes MSE is the same line that minimizes RMSE and SSE (sum of squared errors), since taking a square root (or multiplying by $n$) doesn't change where the minimum occurs.
def rmse(actual, pred):
    return np.sqrt(np.mean((actual - pred) ** 2))
Let's create a dictionary to keep track of the RMSEs of the various models we create.
rmse_dict = {}
rmse_dict['father only'] = rmse(galton['childHeight'], pred_child(galton['father']))
rmse_dict
{'father only': 3.448416023826759}
sns.scatterplot(data=galton, x='father', y='childHeight', label='actual child heights')
sns.scatterplot(x=galton['father'],
y=pred_child(galton['father']),
label='predicted child heights'
);
We'll cover `sklearn` in more detail in the coming lectures.
from sklearn.linear_model import LinearRegression
A typical pattern in `sklearn` is: instantiate, fit, and predict.
lr = LinearRegression()
lr.fit(X=galton[['father', 'mother']], y=galton['childHeight'])
LinearRegression()
After calling `fit` on `lr`, we can access the intercept and coefficients of the plane of best fit (i.e., these are $w_0^*$, $w_1^*$, and $w_2^*$).
lr.intercept_, lr.coef_
(22.643279708153415, array([0.36828233, 0.29050997]))
However, we don't actually need to access these directly. Fitted `LinearRegression` objects have a `predict` method, which we can call instead:
predictions = lr.predict(galton[['father', 'mother']])
predictions[:5]
array([71.01761111, 71.01761111, 71.01761111, 71.01761111, 69.76750912])
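Under the hood, `predict` just evaluates the fitted plane $H(\text{father}, \text{mother}) = w_0^* + w_1^* \cdot \text{father} + w_2^* \cdot \text{mother}$. A quick sketch, using the first row of `galton` (father 78.5, mother 67.0):
# Recompute the first prediction by hand from the fitted parameters.
lr.intercept_ + lr.coef_ @ np.array([78.5, 67.0])   # ≈ 71.0176, matching predictions[0]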
How well does this model perform?
rmse_dict['father and mother'] = rmse(galton['childHeight'], predictions)
rmse_dict
{'father only': 3.448416023826759, 'father and mother': 3.3838935702036945}
This two-feature model has a lower RMSE than the original single-feature model, which we'd expect: the single-feature model is a special case of the two-feature model (just set the mother coefficient to 0), so adding a feature can't increase the training RMSE of the best fit. Still, the improvement is only slight.
Here, we must draw a 3D scatter plot and plane, with one axis for father's height, one axis for mother's height, and one axis for child's height. The code below does this.
XX, YY = np.mgrid[60:80:2, 55:75:2]
Z = lr.intercept_ + lr.coef_[0] * XX + lr.coef_[1] * YY
plane = go.Surface(x=XX, y=YY, z=Z, colorscale='Oranges')
fig = go.Figure(data=[plane])
fig.add_trace(go.Scatter3d(x=galton['father'],
y=galton['mother'],
z=galton['childHeight'], mode='markers', marker = {'color': '#656DF1'}))
fig.update_layout(scene = dict(
xaxis_title = 'father',
yaxis_title = 'mother',
zaxis_title = 'child'),
width=1000, height=800)
If we want to visualize in 2D, we must pick a single feature to display on the $x$-axis.
sns.scatterplot(data=galton, x='father', y='childHeight', label='actual child heights')
sns.scatterplot(x=galton['father'],
y=predictions,
label='predicted child heights using father and mother'
);
sns.scatterplot(data=galton, x='mother', y='childHeight', label='actual child heights')
sns.scatterplot(x=galton['mother'],
y=predictions,
label='predicted child heights using father and mother'
);
sns.lmplot(data=galton, x='father', y='childHeight', hue='gender',
palette={'male': 'purple', 'female': 'green'});
Observation: It appears that the two lines have similar slopes, but different intercepts.
There's an issue: gender is a categorical feature, but in order to use it as a feature in a regression model, it must be quantitative.
galton.head()
 | family | father | mother | children | childNum | gender | childHeight |
---|---|---|---|---|---|---|---|
0 | 1 | 78.5 | 67.0 | 4 | 1 | male | 73.2 |
1 | 1 | 78.5 | 67.0 | 4 | 2 | female | 69.2 |
2 | 1 | 78.5 | 67.0 | 4 | 3 | female | 69.0 |
3 | 1 | 78.5 | 67.0 | 4 | 4 | female | 69.0 |
4 | 2 | 75.5 | 66.5 | 4 | 1 | male | 73.5 |
Solution: Create a column named 'gender=female' that is 1 when 'gender' is 'female', and 0 otherwise.
galton['gender=female'] = (galton['gender'] == 'female').astype(int)
galton.head()
 | family | father | mother | children | childNum | gender | childHeight | gender=female |
---|---|---|---|---|---|---|---|---|
0 | 1 | 78.5 | 67.0 | 4 | 1 | male | 73.2 | 0 |
1 | 1 | 78.5 | 67.0 | 4 | 2 | female | 69.2 | 1 |
2 | 1 | 78.5 | 67.0 | 4 | 3 | female | 69.0 | 1 |
3 | 1 | 78.5 | 67.0 | 4 | 4 | female | 69.0 | 1 |
4 | 2 | 75.5 | 66.5 | 4 | 1 | male | 73.5 | 0 |
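This manual 0/1 column works because 'gender' takes only two values here. For categorical columns with more categories, a common generalization (shown only as an aside, not something we need in this lecture) is one-hot encoding, e.g. with pandas' `get_dummies`:
# Sketch: one indicator column per category; the 'gender_female' column
# matches the 'gender=female' column we created by hand.
pd.get_dummies(galton['gender'], prefix='gender').head()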
Now, we can use 'gender=female' as a feature, just as we used 'father' and 'mother' as features.
lr_three_features = LinearRegression()
lr_three_features.fit(galton[['father', 'mother', 'gender=female']], galton['childHeight'])
LinearRegression()
predictions_three_features = lr_three_features.predict(galton[['father', 'mother', 'gender=female']])
rmse_dict['father, mother, and gender'] = rmse(galton['childHeight'], predictions_three_features)
rmse_dict
{'father only': 3.448416023826759, 'father and mother': 3.3838935702036945, 'father, mother, and gender': 2.1600506762028053}
The RMSE of our new three-feature model is significantly lower than the RMSEs of the earlier models. This indicates that 'gender=female' is very useful in predicting children's heights.
To visualize our data and linear model, we'd need 4 dimensions: one for father's height, one for mother's height, one for 'gender=female', and one for child's height. Humans can't visualize in 4D, but there may be a solution.
lr_three_features.intercept_, lr_three_features.coef_
(21.736229255415104, array([ 0.39284333, 0.31761007, -5.21498935]))
Above, we are given the values of $w_0^*$, $w_1^*$, $w_2^*$, and $w_3^*$. This means our linear model is of the form:

$$\text{predicted child's height} \\ = 21.736 + 0.393 \cdot \text{father's height} + 0.318 \cdot \text{mother's height} - 5.215 \cdot \text{gender=female}$$

But remember, 'gender=female' is either 1 or 0. Let's look at those two cases separately.
XX, YY = np.mgrid[60:80:2, 55:75:2]
Z_female = (lr_three_features.intercept_ + lr_three_features.coef_[2]) + lr_three_features.coef_[0] * XX + lr_three_features.coef_[1] * YY
Z_male = lr_three_features.intercept_ + lr_three_features.coef_[0] * XX + lr_three_features.coef_[1] * YY
plane_female = go.Surface(x=XX, y=YY, z=Z_female, colorscale ='Greens')
plane_male = go.Surface(x=XX, y=YY, z=Z_male, colorscale='Purples')
fig = go.Figure(data=[plane_female, plane_male])
galton_female = galton[galton['gender'] == 'female']
galton_male = galton[galton['gender'] == 'male']
fig.add_trace(go.Scatter3d(x=galton_female['father'],
y=galton_female['mother'],
z=galton_female['childHeight'], mode='markers', marker = {'color': 'green'}))
fig.add_trace(go.Scatter3d(x=galton_male['father'],
y=galton_male['mother'],
z=galton_male['childHeight'], mode='markers', marker = {'color': 'purple'}))
fig.update_layout(scene = dict(
xaxis_title = 'father',
yaxis_title = 'mother',
zaxis_title = 'child'),
width=1000, height=800,
showlegend=False,
title="Predicted child's heights given parents' heights and gender (purple=male, green=female)")
If we want to visualize in 2D, we must pick a single feature to display on the $x$-axis.
sns.scatterplot(data=galton, x='father', y='childHeight', label='actual child heights')
sns.scatterplot(x=galton['father'],
y=predictions_three_features,
label='predicted child heights using father, mother, and gender'
);
In creating the 'gender=female' column in `galton`, we engineered a feature that we thought would be useful for our model.