# Set up packages for lecture. Don't worry about understanding this code,
# but make sure to run it if you're following along.
import numpy as np
import babypandas as bpd
import pandas as pd
from matplotlib_inline.backend_inline import set_matplotlib_formats
import matplotlib.pyplot as plt
set_matplotlib_formats("svg")
plt.style.use('ggplot')
# Imports for animation.
from lec13 import sampling_animation
from IPython.display import display, HTML
# Display settings: keep printed arrays and tables short, rounded to 2 decimals.
np.set_printoptions(suppress=True, precision=2, threshold=20)
for option_name, option_value in [
    ("display.max_rows", 7),
    ("display.max_columns", 8),
    ("display.precision", 2),
]:
    pd.set_option(option_name, option_value)
🚨 The second half of the course is more conceptual than the first. Reading the textbook (and coming to lecture) will become even more important.
Remember, today's lecture is not in scope for the Midterm Exam!
The distribution is uniform, meaning that each outcome has the same chance of occurring.
# The six faces of a die (1 through 6), as an array and as a one-column DataFrame.
die_faces = np.arange(1, 7)
die = bpd.DataFrame().assign(face=die_faces)
die
face | |
---|---|
0 | 1 |
1 | 2 |
2 | 3 |
3 | 4 |
4 | 5 |
5 | 6 |
# Bin edges at 0.5, 1.5, ..., 6.5, so that each face gets its own bar.
bins = np.arange(0.5, 6.6, 1)
# Note that you can add titles to your visualizations, like this!
die.plot(
    kind='hist',
    y='face',
    bins=bins,
    density=True,
    ec='w',
    title='Probability Distribution of a Die Roll',
    figsize=(5, 3),
)
# You can also set the y-axis label with plt.ylabel.
plt.ylabel('Probability');
We can simulate die rolls with `np.random.choice`.
num_rolls = 25
# Simulate num_rolls rolls by drawing that many values from die_faces.
many_rolls = np.random.choice(die_faces, size=num_rolls)
many_rolls
array([5, 5, 4, ..., 6, 5, 3])
# Histogram of the simulated rolls, using the same bins as the probability distribution.
(
    bpd.DataFrame()
    .assign(face=many_rolls)
    .plot(
        kind='hist',
        y='face',
        bins=bins,
        density=True,
        ec='w',
        title=f'Empirical Distribution of {num_rolls} Dice Rolls',
        figsize=(5, 3),
    )
)
plt.ylabel('Probability');
What happens as we increase the number of rolls?
# Plot one empirical distribution per sample size, from 10 rolls up to 10000.
for num_rolls in [10, 50, 100, 500, 1000, 5000, 10000]:
    # Don't worry about how .sample works just yet – we'll cover it shortly.
    (die.sample(n=num_rolls, replace=True)
        .plot(kind='hist', y='face', bins=bins, density=True, ec='w',
              title=f'Distribution of {num_rolls} Die Rolls',
              figsize=(8, 3))
    )
The law of large numbers states that if a chance experiment is repeated many times, independently and under the same conditions,
then the proportion of times that an event occurs gets closer and closer to the theoretical probability of that event.
A simple random sample (SRS) is a sample drawn uniformly at random without replacement.
To perform an SRS from a list or array `options`, we use `np.random.choice(options, n, replace=False)`.
# Course staff, one entry per person. Names must be unique: with replace=False,
# np.random.choice never reuses a list position, but a name listed twice could
# still be drawn twice and would be twice as likely to appear in the sample.
staff = ['Gina Roberg', 'Gabriel Cha', 'Sophia Fang', 'Zoe Ludena',
         'Costin Smilovici', 'Suhani Sharma', 'Jasmine Lo', 'Doris Gao',
         'Vanessa Hu', 'Arjun Malleswaran', 'Raine Hoang', 'Charlie Gillet',
         'Abel Seyoum', 'Anthony Li', 'Oren Ciolli', 'Teresa Rexin']
# Simple random sample of 4 course staff members.
np.random.choice(staff, 4, replace=False)
array(['Teresa Rexin', 'Gina Roberg', 'Suhani Sharma', 'Charlie Gillet'], dtype='<U17')
If we use `replace=True`, then we're sampling uniformly at random with replacement – there's no simpler term for this.
`united_full` contains information about all United flights leaving SFO between 6/1/15 and 8/31/15.
For this lecture, treat this dataset as our population.
# Load the flights CSV into a DataFrame; the lecture treats this table as the population.
united_full = bpd.read_csv('data/united_summer2015.csv')
united_full
Date | Flight Number | Destination | Delay | |
---|---|---|---|---|
0 | 6/1/15 | 73 | HNL | 257 |
1 | 6/1/15 | 217 | EWR | 28 |
2 | 6/1/15 | 237 | STL | -3 |
... | ... | ... | ... | ... |
13822 | 8/31/15 | 1994 | ORD | 3 |
13823 | 8/31/15 | 2000 | PHX | -1 |
13824 | 8/31/15 | 2013 | EWR | -2 |
13825 rows × 4 columns
If we want to sample rows from a DataFrame, we can use the `.sample` method on a DataFrame. That is, `df.sample(n)` returns a random subset of `n` rows of `df`, drawn without replacement (i.e. the default is `replace=False`, unlike `np.random.choice`).
# Five randomly chosen flights; no row can appear more than once.
united_full.sample(n=5)
Date | Flight Number | Destination | Delay | |
---|---|---|---|---|
7474 | 7/20/15 | 1993 | IAD | -2 |
10215 | 8/7/15 | 1263 | MCO | -4 |
4141 | 6/28/15 | 1250 | IAH | 28 |
8458 | 7/27/15 | 1118 | EWR | 1 |
6493 | 7/14/15 | 1227 | EWR | 41 |
# Five randomly chosen flights; the same row may be drawn more than once.
united_full.sample(n=5, replace=True)
Date | Flight Number | Destination | Delay | |
---|---|---|---|---|
3971 | 6/27/15 | 701 | AUS | -5 |
10128 | 8/7/15 | 268 | SNA | 8 |
6351 | 7/13/15 | 1563 | ORD | -6 |
3866 | 6/26/15 | 1456 | EWR | 27 |
783 | 6/6/15 | 498 | SAN | 0 |
Note: The probability of seeing the same row multiple times when sampling with replacement is quite low, since our sample size (5) is small relative to the size of the population (13825).
We only need the `'Delay'`s, so let's select just that column.
# Passing a list of column names to .get keeps the result a (one-column) DataFrame,
# so DataFrame methods such as .sample and .plot remain available on `united`.
united = united_full.get(['Delay'])
united
Delay | |
---|---|
0 | 257 |
1 | 28 |
2 | -3 |
... | ... |
13822 | 3 |
13823 | -1 |
13824 | -2 |
13825 rows × 1 columns
# Bin edges every 10 minutes, from -20 up to 290.
bins = np.arange(-20, 300, 10)
united.plot(
    kind='hist',
    y='Delay',
    bins=bins,
    density=True,
    ec='w',
    title='Population Distribution of Flight Delays',
    figsize=(8, 3),
)
plt.ylabel('Proportion per minute');
Note that this distribution is fixed – nothing about it is random.
The flight delays in `united` constitute our population.
Let's draw a sample from `united` without replacement.
sample_size = 100 # Change this and see what happens!
# Histogram of the delays in one random sample of sample_size flights.
(
    united
    .sample(n=sample_size)
    .plot(
        kind='hist',
        y='Delay',
        bins=bins,
        density=True,
        ec='w',
        title=f'Distribution of Flight Delays in a Sample of Size {sample_size}',
        figsize=(8, 3),
    )
);
Note that as we increase `sample_size`, the sample distribution of delays looks more and more like the true population distribution of delays.
To remember: parameter and population both start with p, statistic and sample both start with s.
Question: What was the average delay of all United flights out of SFO in Summer 2015? 🤔
The population mean is a parameter.
# Population parameter: the mean of every value in the 'Delay' column.
united_mean = (
    united
    .get('Delay')
    .mean()
)
united_mean
16.658155515370705
This number (like the population distribution) is fixed, and is not random. In reality, we would not be able to see this number – we can only see it right now because this is a demonstration for teaching!
The sample mean is a statistic. Since it depends on our sample, which was drawn at random, the sample mean is also random.
# Mean delay in one random sample of 100 flights.
(
    united
    .sample(n=100)
    .get('Delay')
    .mean()
)
12.53
What if we choose a larger sample size?
# Mean delay in one random sample of 1000 flights.
(
    united
    .sample(n=1000)
    .get('Delay')
    .mean()
)
17.357
Smaller samples:
Larger samples:
%%capture
# NOTE(review): %%capture applies to the whole cell it starts, so the HTML(...)
# display below presumably sits in its own notebook cell — confirm.
# Build the animation with the helper imported from lec13; it returns two values,
# stored here as anim and anim_means.
# presumably 1000 is the sample size — confirm against lec13.sampling_animation.
anim, anim_means = sampling_animation(united, 1000);
# Convert the animation to HTML via its to_jshtml method and display it.
HTML(anim.to_jshtml())
# Sample one thousand flights, two thousand times.
sample_size = 1000
repetitions = 2000
sample_means = np.array([])
for n in np.arange(repetitions):
    # Each repetition draws a fresh sample and records its mean delay.
    m = united.sample(sample_size).get('Delay').mean()
    sample_means = np.append(sample_means, m)

# Histogram of the collected sample means.
(bpd.DataFrame()
 .assign(sample_means=sample_means)
 .plot(kind='hist', bins=np.arange(10, 25, 0.5), density=True, ec='w',
       title=f'Distribution of Sample Mean with Sample Size {sample_size}',
       figsize=(10, 5))
);
# Mark the population mean on the histogram for comparison.
plt.axvline(x=united_mean, c='black', linewidth=4, label='population mean')
plt.legend();
We just sampled one thousand flights, two thousand times. If we now sample one hundred flights, two thousand times, how will the histogram change?