# Set up packages for lecture. Don't worry about understanding this code,
# but make sure to run it if you're following along.
import numpy as np
import babypandas as bpd
import pandas as pd
from matplotlib_inline.backend_inline import set_matplotlib_formats
import matplotlib.pyplot as plt
set_matplotlib_formats("svg")
plt.style.use('ggplot')
np.set_printoptions(threshold=20, precision=2, suppress=True)
pd.set_option("display.max_rows", 7)
pd.set_option("display.max_columns", 8)
pd.set_option("display.precision", 2)
# Animations
import ipywidgets as widgets
from IPython.display import display, HTML
delays = bpd.read_csv('data/delays.csv')
delays.plot(kind='hist', y='Delay', bins=np.arange(-20.5, 210, 5), density=True, ec='w', figsize=(10, 5))
plt.title('Flight Delays')
plt.xlabel('Delay (minutes)');
Question: Which is larger – the mean or the median?
delays.get('Delay').mean()
16.658155515370705
delays.get('Delay').median()
2.0
delays.plot(kind='hist', y='Delay', bins=np.arange(-20.5, 210, 5), density=True, ec='w', alpha=0.65, figsize=(10, 5))
plt.plot([delays.get('Delay').mean(), delays.get('Delay').mean()], [0, 1], color='green', label='Mean', linewidth=2)
plt.scatter([delays.get('Delay').mean()], [-0.0017], color='green', marker='^', s=250)
plt.plot([delays.get('Delay').median(), delays.get('Delay').median()], [0, 1], color='purple', label='Median', linewidth=2)
plt.title('Flight Delays')
plt.xlabel('Delay (minutes)')
plt.ylim(-0.005, 0.065)
plt.legend();
data = np.array([2, 3, 3, 9])
np.mean(data)
4.25
deviations = data - np.mean(data)
deviations
array([-2.25, -1.25, -1.25, 4.75])
Each entry in deviations
describes how far the corresponding element in data
is from 4.25.
What is the average deviation?
np.mean(deviations)
0.0
# Square all the deviations:
deviations ** 2
array([ 5.06, 1.56, 1.56, 22.56])
variance = np.mean(deviations ** 2)
variance
7.6875
This quantity, the average squared deviation from the mean, is called the variance.
# Standard deviation (SD) is the square root of the variance.
sd = variance ** 0.5
sd
2.7726341266023544
numpy
has a function, np.std
, that calculates the standard deviation for us.# Note that this evaluates to the same number we found on the previous slide.
np.std(data)
2.7726341266023544
To summarize:
$$\begin{align*}\text{variance} &= \text{average squared deviation from the mean}\\ &= \frac{(\text{value}_1 - \text{mean})^2 + ... + (\text{value}_n - \text{mean})^2}{n}\\ \text{standard deviation} &= \sqrt{\text{variance}} \end{align*}$$where $n$ is the number of observations.
It turns out, in any numerical distribution, the bulk of the data are in the range “mean ± a few SDs”.
Let's make this more precise.
Fact: In any numerical distribution, the proportion of values in the range “mean ± $z$ SDs” is at least
$$1 - \frac{1}{z^2} $$Range | Proportion |
---|---|
mean ± 2 SDs | at least $1 - \frac{1}{4}$ (75%) |
mean ± 3 SDs | at least $1 - \frac{1}{9}$ (88.88..%) |
mean ± 4 SDs | at least $1 - \frac{1}{16}$ (93.75%) |
mean ± 5 SDs | at least $1 - \frac{1}{25}$ (96%) |
delays.plot(kind='hist', y='Delay', bins=np.arange(-20.5, 210, 5), density=True, ec='w', figsize=(10, 5), title='Flight Delays')
plt.xlabel('Delay (minutes)');
delay_mean = delays.get('Delay').mean()
delay_mean
16.658155515370705
delay_std = np.std(delays.get('Delay')) # There is no .std() method in babypandas!
delay_std
39.480199851609314
Chebyshev's inequality tells us that
delay_mean - 2 * delay_std, delay_mean + 2 * delay_std
(-62.30224418784792, 95.61855521858934)
delay_mean - 3 * delay_std, delay_mean + 3 * delay_std
(-101.78244403945723, 135.09875507019865)
Let's visualize these intervals!
delays.plot(kind='hist', y='Delay', bins=np.arange(-20.5, 210, 5), density=True, alpha=0.65, ec='w', figsize=(10, 5), title='Flight Delays')
plt.axvline(delay_mean - 2 * delay_std, color='maroon', label='± 2 SD')
plt.axvline(delay_mean + 2 * delay_std, color='maroon')
plt.axvline(delay_mean + 3 * delay_std, color='blue', label='± 3 SD')
plt.axvline(delay_mean - 3 * delay_std, color='blue')
plt.axvline(delay_mean, color='green', label='Mean')
plt.scatter([delay_mean], [-0.0017], color='green', marker='^', s=250)
plt.ylim(-0.0038, 0.06)
plt.legend();
Remember, Chebyshev's inequality states that at least $1 - \frac{1}{z^2}$ of values are within $z$ SDs from the mean, for any numerical distribution.
For instance, it tells us that at least 75% of delays are in the following interval:
delay_mean - 2 * delay_std, delay_mean + 2 * delay_std
(-62.30224418784792, 95.61855521858934)
However, in this case, a much larger fraction of delays are in that interval.
within_2_sds = delays[(delays.get('Delay') >= delay_mean - 2 * delay_std) &
(delays.get('Delay') <= delay_mean + 2 * delay_std)]
within_2_sds.shape[0] / delays.shape[0]
0.9560940325497288
If we know more about the shape of the distribution, we can provide better guarantees for the proportion of values within $z$ SDs of the mean.
For a particular set of data points, Chebyshev's inequality states that at least $\frac{8}{9}$ of the data points are between $-20$ and $40$. What is the standard deviation of the data?
We'll work with a data set containing the heights and weights of 5000 adult males.
height_and_weight = bpd.read_csv('data/height_and_weight.csv')
height_and_weight
Height | Weight | |
---|---|---|
0 | 73.85 | 241.89 |
1 | 68.78 | 162.31 |
2 | 74.11 | 212.74 |
... | ... | ... |
4997 | 67.01 | 199.20 |
4998 | 71.56 | 185.91 |
4999 | 70.35 | 198.90 |
5000 rows × 2 columns
Let's look at the distributions of both numerical variables.
height_and_weight.plot(kind='hist', y='Height', density=True, ec='w', bins=30, alpha=0.8, figsize=(10, 5));
height_and_weight.plot(kind='hist', y='Weight', density=True, ec='w', bins=30, alpha=0.8, color='C1', figsize=(10, 5));
height_and_weight.plot(kind='hist', density=True, ec='w', bins=60, alpha=0.8, figsize=(10, 5));
Observation: The two distributions look like shifted and stretched versions of the same basic shape, called a bell curve 🔔.
Suppose $x$ is a numerical variable, and $x_i$ is one value of that variable. The function $$z(x_i) = \frac{x_i - \text{mean of $x$}}{\text{SD of $x$}}$$
converts $x_i$ to standard units, which represents the number of standard deviations $x_i$ is above the mean.
Example: Suppose someone weighs 225 pounds. What is their weight in standard units?
weights = height_and_weight.get('Weight')
(225 - weights.mean()) / np.std(weights)
1.9201699181580782
The process of converting all values of a variable (i.e. a column) to standard units is known as standardization, and the resulting values are considered to be standardized.
def standard_units(col):
return (col - col.mean()) / np.std(col)
standardized_height = standard_units(height_and_weight.get('Height'))
standardized_height
0 1.68 1 -0.09 2 1.78 ... 4997 -0.70 4998 0.88 4999 0.46 Name: Height, Length: 5000, dtype: float64
standardized_weight = standard_units(height_and_weight.get('Weight'))
standardized_weight
0 2.77 1 -1.25 2 1.30 ... 4997 0.62 4998 -0.06 4999 0.60 Name: Weight, Length: 5000, dtype: float64
Standardized variables have:
We often standardize variables to bring them to the same scale.
Aside: To quickly see summary statistics for a numerical Series, use the .describe()
Series method.
# e-14 means 10^(-14), which is a very small number, effectively zero.
standardized_height.describe()
count 5.00e+03 mean 1.49e-15 std 1.00e+00 ... 50% 4.76e-04 75% 6.85e-01 max 3.48e+00 Name: Height, Length: 8, dtype: float64
standardized_weight.describe()
count 5.00e+03 mean 5.98e-16 std 1.00e+00 ... 50% 6.53e-04 75% 6.74e-01 max 4.19e+00 Name: Weight, Length: 8, dtype: float64
Let's look at how the process of standardization works visually.
HTML('data/height_anim.html')
HTML('data/weight_anim.html')
Now that we've standardized the distributions of height and weight, let's see how they look on the same set of axes.
standardized_height_and_weight = bpd.DataFrame().assign(
Height=standardized_height,
Weight=standardized_weight
)
standardized_height_and_weight.plot(kind='hist', density=True, ec='w',bins=30, alpha=0.8, figsize=(10, 5));
These both look pretty similar!
def normal_curve(z):
return 1 / np.sqrt(2 * np.pi) * np.exp((-z**2)/2)
x = np.linspace(-4, 4, 1000)
y = normal_curve(x)
plt.figure(figsize=(10, 5))
plt.plot(x, y, color='black');
plt.xlabel('$z$');
plt.title(r'$\phi(z) = \frac{1}{\sqrt{2 \pi}} e^{-\frac{1}{2}z^2}$');
If a distribution follows this shape, we say it is roughly normal.
standardized_height_and_weight.plot(kind='hist', density=True, ec='w', bins=120, alpha=0.8, figsize=(10, 5));
plt.plot(x, y, color='black', linestyle='--', label='Normal', linewidth=5)
plt.legend(loc='upper right');
def normal_area(a, b, bars=False):
x = np.linspace(-4, 4, 1000)
y = normal_curve(x)
ix = (x >= a) & (x <= b)
plt.figure(figsize=(10, 5))
plt.plot(x, y, color='black')
plt.fill_between(x[ix], y[ix], color='gold')
if bars:
plt.axvline(a, color='red')
plt.axvline(b, color='red')
plt.title(f'Area between {np.round(a, 2)} and {np.round(b, 2)}')
plt.show()
a = widgets.FloatSlider(value=0, min=-4,max=3,step=0.25, description='a')
b = widgets.FloatSlider(value=1, min=-4,max=4,step=0.25, description='b')
bars = widgets.Checkbox(value=False, description='bars')
ui = widgets.HBox([a, b, bars])
out = widgets.interactive_output(normal_area, {'a': a, 'b': b, 'bars': bars})
display(ui, out)
# cdf(0) should give us the gold area below.
normal_area(-np.inf, 0)
scipy
module in Python. The function scipy.stats.norm.cdf(z)
computes the area under the standard normal curve to the left of z
.What does scipy.stats.norm.cdf(0)
evaluate to? Why?
normal_area(-np.inf, 0)
from scipy import stats
stats.norm.cdf(0)
0.5
Suppose we want to find the area to the right of 2 under the standard normal curve.
normal_area(2, np.inf)
The following expression gives us the area to the left of 2.
stats.norm.cdf(2)
0.9772498680518208
normal_area(-np.inf, 2)
However, since the total area under the standard normal curve is 1:
$$\text{area right of $2$} = 1 - (\text{area left of $2$})$$1 - stats.norm.cdf(2)
0.02275013194817921
How might we use stats.norm.cdf
to compute the area between -1 and 0?
normal_area(-1, 0)
Strategy:
$$\text{area from $-1$ to $0$} = (\text{area left of $0$}) - (\text{area left of $-1$})$$stats.norm.cdf(0) - stats.norm.cdf(-1)
0.3413447460685429
The area under a standard normal curve in the interval $[a, b]$ is
stats.norm.cdf(b) - stats.norm.cdf(a)
What can we do with this? We're about to see!
Let's return to our data set of heights and weights.
height_and_weight
Height | Weight | |
---|---|---|
0 | 73.85 | 241.89 |
1 | 68.78 | 162.31 |
2 | 74.11 | 212.74 |
... | ... | ... |
4997 | 67.01 | 199.20 |
4998 | 71.56 | 185.91 |
4999 | 70.35 | 198.90 |
5000 rows × 2 columns
As we saw before, both variables are roughly normal. What benefit is there to knowing that the two distributions are roughly normal?
Let's suppose, as is often the case, that we don't have access to the entire distribution of weights, but just the mean and SD.
weight_mean = weights.mean()
weight_mean
187.0206206581932
weight_std = np.std(weights)
weight_std
19.779176302396458
Using just this information, we can estimate the proportion of weights between 200 and 225 pounds:
stats.norm.cdf
to find the area between (1) and (2).left = (200 - weight_mean) / weight_std
left
0.656214351061435
right = (225 - weight_mean) / weight_std
right
1.9201699181580782
normal_area(left, right)
approximation = stats.norm.cdf(right) - stats.norm.cdf(left)
approximation
0.22842488819306406
Since we have access to the entire set of weights, we can compute the true proportion of weights between 200 and 225 pounds.
# True proportion of values between 200 and 225.
height_and_weight[
(height_and_weight.get('Weight') >= 200) &
(height_and_weight.get('Weight') <= 225)
].shape[0] / height_and_weight.shape[0]
0.2294
# Approximation using the standard normal curve.
approximation
0.22842488819306406
Pretty good for an approximation! 🤩
Consider the distribution of delays from earlier in the lecture.
delays.plot(kind='hist', y='Delay', bins=np.arange(-20.5, 210, 5), density=True, ec='w', figsize=(10, 5), title='Flight Delays')
plt.xlabel('Delay (minutes)');
The distribution above does not look normal. It won't look normal even if we standardize it. By standardizing a distribution, all we do is move it horizontally and stretch it vertically – the shape itself doesn't change.
HTML('data/delay_anim.html')