# Set up packages for lecture. Don't worry about understanding this code,
# but make sure to run it if you're following along.
import numpy as np
import babypandas as bpd
import pandas as pd
from matplotlib_inline.backend_inline import set_matplotlib_formats
import matplotlib.pyplot as plt
set_matplotlib_formats("svg")
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (10, 5)
np.set_printoptions(threshold=20, precision=2, suppress=True)
pd.set_option("display.max_rows", 7)
pd.set_option("display.max_columns", 8)
pd.set_option("display.precision", 2)
import warnings
warnings.filterwarnings('ignore')
# Demonstration code
from IPython.display import display
import ipywidgets as widgets
import plotly.express as px
def r_scatter(r):
"Generate a scatter plot with a correlation approximately r"
x = np.random.normal(0, 1, 1000)
z = np.random.normal(0, 1, 1000)
y = r * x + (np.sqrt(1 - r ** 2)) * z
plt.scatter(x, y)
plt.xlim(-4, 4)
plt.ylim(-4, 4)
plt.title(f'$r={r}$')
def show_scatter_grid():
plt.subplots(1, 4, figsize=(10, 2))
for i, r in enumerate([-1, -2/3, -1/3, 0]):
plt.subplot(1, 4, i+1)
r_scatter(r)
plt.title(f'r = {np.round(r, 2)}')
plt.show()
plt.subplots(1, 4, figsize=(10, 2))
for i, r in enumerate([1, 2/3, 1/3]):
plt.subplot(1, 4, i+1)
r_scatter(r)
plt.title(f'$r = {np.round(r, 2)}$')
plt.subplot(1, 4, 4)
plt.axis('off')
plt.show()
At a high level, the second half of this class has been about statistical inference – using a sample to draw conclusions about the population.
hybrid = bpd.read_csv('data/hybrid.csv')
hybrid
| vehicle | year | price | acceleration | mpg | class | |
|---|---|---|---|---|---|---|
| 0 | Prius (1st Gen) | 1997 | 24509.74 | 7.46 | 41.26 | Compact |
| 1 | Tino | 2000 | 35354.97 | 8.20 | 54.10 | Compact |
| 2 | Prius (2nd Gen) | 2000 | 26832.25 | 7.97 | 45.23 | Compact |
| ... | ... | ... | ... | ... | ... | ... |
| 150 | C-Max Energi Plug-in | 2013 | 32950.00 | 11.76 | 43.00 | Midsize |
| 151 | Fusion Energi Plug-in | 2013 | 38700.00 | 11.76 | 43.00 | Midsize |
| 152 | Chevrolet Volt | 2013 | 39145.00 | 11.11 | 37.00 | Compact |
153 rows × 6 columns
'price' vs. 'acceleration'¶Is there an association between these two variables? If so, what kind?
(Note: When looking at a scatter plot, we often describe it in the form "$y$ vs. $x$.")
hybrid.plot(kind='scatter', x='acceleration', y='price');
Acceleration here is measured in kilometers per hour per second – this means that larger accelerations correspond to quicker cars!
'price' vs. 'mpg'¶Is there an association between these two variables? If so, what kind?
hybrid.plot(kind='scatter', x='mpg', y='price');
Just for fun, we can look at an interactive version of the previous plot. Hover over a point to see the name of the corresponding car.
px.scatter(hybrid.to_df(), x='mpg', y='price', hover_name='vehicle')
Do you recognize any of the most expensive or most efficient cars? (If not, Google "ActiveHybrid 7i", "Panamera S", and "Prius.")
Consider the following few scatter plots.
show_scatter_grid()
Observation: When $r = 1$, the scatter plot of $x$ and $y$ is a line with a positive slope, and when $r = -1$, the scatter plot of $x$ and $y$ is a line with a negative slope. When $r = 0$, the scatter plot of $x$ and $y$ is a patternless cloud of points.
To see more example scatter plots with different values of $r$, play with the widget that appears below.
widgets.interact(r_scatter, r=(-1, 1, 0.05));
interactive(children=(FloatSlider(value=0.0, description='r', max=1.0, min=-1.0, step=0.05), Output()), _dom_c…
Let's now compute the value of $r$ for the two scatter plots ('price' vs. 'acceleration' and 'price' vs. 'mpg') we saw earlier.
def standard_units(col):
return (col - col.mean()) / np.std(col)
Let's define a function that calculates the correlation, $r$, between two columns in a DataFrame.
def calculate_r(df, x, y):
'''Returns the average value of the product of x and y,
when both are measured in standard units.'''
x_su = standard_units(df.get(x))
y_su = standard_units(df.get(y))
return (x_su * y_su).mean()
'price' vs. 'acceleration'¶Let's calculate $r$ for 'acceleration' and 'price'.
hybrid.plot(kind='scatter', x='acceleration', y='price');
calculate_r(hybrid, 'acceleration', 'price')
0.695577899691398
Observation: $r$ is positive, and the association between 'acceleration' and 'price' is positive.
'price' vs. 'mpg'¶Let's now calculate $r$ for 'mpg' and 'price'.
hybrid.plot(kind='scatter', x='mpg', y='price');
calculate_r(hybrid, 'mpg', 'price')
-0.5318263633683789
Observation: $r$ is negative, and the association between 'mpg' and 'price' is negative.
Also, $r$ here is closer to 0 than it was on the previous slide – that is, the magnitude (absolute value) of $r$ is less than in the previous scatter plot – and the points in this scatter plot look less like a straight line than they did in the previous scatter plot.
A linear transformation to a variable results from adding, subtracting, multiplying, and/or dividing each value by a constant. For example, the conversion from degrees Celsius to degrees Fahrenheit is a linear transformation:
$$\text{temperature (Fahrenheit)} = \frac{9}{5} \times \text{temperature (Celsius)} + 32$$
hybrid.plot(kind='scatter', x='mpg', y='price', title='price (dollars) vs. mpg');
hybrid.assign(
price_yen=hybrid.get('price') * 147.20, # The USD to Japanese Yen exchange rate as of today morning.
kpg=hybrid.get('mpg') * 1.6 # 1 mile is 1.6 kilometers.
).plot(kind='scatter', x='kpg', y='price_yen', title='price (yen) vs. kpg');
For instance,
$$r\big(\text{price in dollars}, \text{fuel economy in miles per gallon}\big)$$
To better understand why $r$ is the average value of the product of $x$ and $y$, when both are measured in standard units, let's convert the 'acceleration', 'mpg', and 'price' columns to standard units, redraw the same scatter plots we saw earlier, and explore what we can learn from the product of $x$ and $y$ in standard units.
def standardize(df):
"""Return a DataFrame in which all columns of df are converted to standard units."""
df_su = bpd.DataFrame()
for column in df.columns:
# This uses syntax that is out of scope; don't worry about how it works.
df_su = df_su.assign(**{column + ' (su)': standard_units(df.get(column))})
return df_su
hybrid_su = standardize(hybrid.get(['acceleration', 'mpg', 'price']))
hybrid_su
| acceleration (su) | mpg (su) | price (su) | |
|---|---|---|---|
| 0 | -1.54 | 0.59 | -6.94e-01 |
| 1 | -1.28 | 1.76 | -1.86e-01 |
| 2 | -1.36 | 0.95 | -5.85e-01 |
| ... | ... | ... | ... |
| 150 | -0.07 | 0.75 | -2.98e-01 |
| 151 | -0.07 | 0.75 | -2.90e-02 |
| 152 | -0.29 | 0.20 | -8.17e-03 |
153 rows × 3 columns
'price' vs. 'acceleration'¶Here, 'acceleration' and 'price' are both measured in standard units. Note that the shape of the scatter plot is the same as before; it's just that the axes are on a different scale.
hybrid_su.plot(kind='scatter', x='acceleration (su)', y='price (su)')
plt.axvline(0, color='black')
plt.axhline(0, color='black');
'acceleration' ($x$) and 'price' ($y$) is positive ↗️. Most data points fall in either the lower left quadrant (where $x_{i \: \text{(su)}}$ and $y_{i \: \text{(su)}}$ are both negative) or the upper right quadrant (where $x_{i \: \text{(su)}}$ and $y_{i \: \text{(su)}}$ are both positive).calculate_r(hybrid, 'acceleration', 'price')
0.695577899691398
'price' vs. 'mpg'¶Here, 'mpg' and 'price' are both measured in standard units. Again, note that the shape of the scatter plot is the same as before.
hybrid_su.plot(kind='scatter', x='mpg (su)', y='price (su)')
plt.axvline(0, color='black');
plt.axhline(0, color='black');
'mpg' ($x$) and 'price' ($y$) is negative ↘️. Most data points fall in the upper left or lower right quadrants.calculate_r(hybrid, 'mpg', 'price')
-0.5318263633683789
# Once again, run this cell and play with the slider that appears!
widgets.interact(r_scatter, r=(-1, 1, 0.05));
interactive(children=(FloatSlider(value=0.0, description='r', max=1.0, min=-1.0, step=0.05), Output()), _dom_c…
Which of the following does the scatter plot below show?
x2 = bpd.DataFrame().assign(
x=np.arange(-6, 6.1, 0.5),
y=np.arange(-6, 6.1, 0.5) ** 2
)
x2.plot(kind='scatter', x='x', y='y');
The data below was collected in the late 1800s by Francis Galton.
galton = bpd.read_csv('data/galton.csv')
galton
| family | father | mother | midparentHeight | children | childNum | gender | childHeight | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 78.5 | 67.0 | 75.43 | 4 | 1 | male | 73.2 |
| 1 | 1 | 78.5 | 67.0 | 75.43 | 4 | 2 | female | 69.2 |
| 2 | 1 | 78.5 | 67.0 | 75.43 | 4 | 3 | female | 69.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 931 | 203 | 62.0 | 66.0 | 66.64 | 3 | 3 | female | 61.0 |
| 932 | 204 | 62.5 | 63.0 | 65.27 | 2 | 1 | male | 66.5 |
| 933 | 204 | 62.5 | 63.0 | 65.27 | 2 | 2 | female | 57.0 |
934 rows × 8 columns
Let's focus on the relationship between mothers' heights and their adult sons' heights.
male_children = galton[galton.get('gender') == 'male'].reset_index()
mom_son = bpd.DataFrame().assign(mom=male_children.get('mother'), son=male_children.get('childHeight'))
mom_son
| mom | son | |
|---|---|---|
| 0 | 67.0 | 73.2 |
| 1 | 66.5 | 73.5 |
| 2 | 66.5 | 72.5 |
| ... | ... | ... |
| 478 | 60.0 | 66.0 |
| 479 | 66.0 | 64.0 |
| 480 | 63.0 | 66.5 |
481 rows × 2 columns
mom_son.plot(kind='scatter', x='mom', y='son');
The scatter plot demonstrates a positive linear association between mothers' heights and their adult sons' heights, so $r$ should be positive.
r_mom_son = calculate_r(mom_son, 'mom', 'son')
r_mom_son
0.32300498368490554
'mom' and 'son' are measured in standard units, not inches.mom_son_su = standardize(mom_son)
def constant_prediction(prediction):
mom_son_su.plot(kind='scatter', x='mom (su)', y='son (su)', title=f'Predicting a height of {prediction} SUs for all sons', figsize=(10, 5));
plt.axhline(prediction, color='orange', lw=4);
plt.xlim(-3, 3)
plt.show()
prediction = widgets.FloatSlider(value=-3, min=-3,max=3,step=0.5, description='prediction')
ui = widgets.HBox([prediction])
out = widgets.interactive_output(constant_prediction, {'prediction': prediction})
display(ui, out)
HBox(children=(FloatSlider(value=-3.0, description='prediction', max=3.0, min=-3.0, step=0.5),))
Output()
mom_son_su.plot(kind='scatter', x='mom (su)', y='son (su)', title='A good prediction is the mean height of sons (0 SUs)', figsize=(10, 5));
plt.axhline(0, color='orange', lw=4);
plt.xlim(-3, 3);
def linear_prediction(slope):
x = np.linspace(-3, 3)
y = x * slope
mom_son_su.plot(kind='scatter', x='mom (su)', y='son (su)', figsize=(10, 5));
plt.plot(x, y, color='orange', lw=4)
plt.xlim(-3, 3)
plt.title(r"Predicting sons' heights using $\mathrm{son}_{\mathrm{(su)}}$ = " + str(np.round(slope, 2)) + r"$ \cdot \mathrm{mother}_{\mathrm{(su)}}$")
plt.show()
slope = widgets.FloatSlider(value=0, min=-1,max=1,step=1/6, description='slope')
ui = widgets.HBox([slope])
out = widgets.interactive_output(linear_prediction, {'slope': slope})
display(ui, out)
HBox(children=(FloatSlider(value=0.0, description='slope', max=1.0, min=-1.0, step=0.16666666666666666),))
Output()
x = np.linspace(-3, 3)
y = x * r_mom_son
mom_son_su.plot(kind='scatter', x='mom (su)', y='son (su)', title=r'A good line goes through the origin and has slope $r$', figsize=(10, 5));
plt.plot(x, y, color='orange', label='regression line', lw=4)
plt.xlim(-3, 3)
plt.legend();


Ideally, we'd like to be able to predict a son's height in inches, not just in standard units. Given a mother's height in inches, here's how we'll predict her son's height in inches:
Let's try it!
mom_mean = mom_son.get('mom').mean()
mom_sd = np.std(mom_son.get('mom'))
son_mean = mom_son.get('son').mean()
son_sd = np.std(mom_son.get('son'))
def predict_with_r(mom):
"""Return a prediction for the height of a son whose mother has height mom,
using linear regression.
"""
mom_su = (mom - mom_mean) / mom_sd
son_su = r_mom_son * mom_su
return son_su * son_sd + son_mean
predict_with_r(68)
70.68219686848828
predict_with_r(60)
67.76170758654767
preds = mom_son.assign(
predicted_height=mom_son.get('mom').apply(predict_with_r)
)
ax = preds.plot(kind='scatter', x='mom', y='son', title='Regression line predictions, in original units', figsize=(10, 5), label='original data')
preds.plot(kind='line', x='mom', y='predicted_height', ax=ax, color='orange', label='regression line', lw=4);
plt.legend();
A course has a midterm (mean 80, standard deviation 15) and a really hard final (mean 50, standard deviation 12).
If the scatter plot comparing midterm & final scores for students looks linearly associated with correlation 0.75, then what is the predicted final exam score for a student who received a 90 on the midterm?
More on regression, including: