# Set up packages for lecture. Don't worry about understanding this code, but
# make sure to run it if you're following along.
import numpy as np
import babypandas as bpd
import pandas as pd
from matplotlib_inline.backend_inline import set_matplotlib_formats
import matplotlib.pyplot as plt
set_matplotlib_formats("svg")  # Render plots as crisp vector graphics.
plt.style.use('ggplot')  # Consistent color scheme for all plots.
np.set_printoptions(threshold=20, precision=2, suppress=True)  # Abbreviate long arrays.
pd.set_option("display.max_rows", 7)  # Show at most 7 rows per DataFrame.
pd.set_option("display.max_columns", 8)  # Show at most 8 columns per DataFrame.
pd.set_option("display.precision", 2)  # Display floats with 2 decimal places.
The distribution of die faces is uniform, meaning that each outcome has the same probability of occurring.
# The faces of a standard die: the integers 1 through 6.
die_faces = np.arange(1, 7)
# Store the faces in a one-column DataFrame so we can plot their distribution.
die = bpd.DataFrame().assign(face=die_faces)
die
face | |
---|---|
0 | 1 |
1 | 2 |
2 | 3 |
3 | 4 |
4 | 5 |
5 | 6 |
# Bin edges at 0.5, 1.5, ..., 6.5 so each integer face falls in its own bin.
bins = np.arange(0.5, 6.6, 1)
# Note that you can add titles to your visualizations, like this!
# density=True normalizes bar areas to sum to 1; ec='w' draws white bar edges.
die.plot(kind='hist', y='face', bins=bins, density=True, ec='w',
title='Probability Distribution of a Die Roll',
figsize=(5, 3))
# You can also set the y-axis label with plt.ylabel
plt.ylabel('Probability');
# np.random.choice draws uniformly at random (with replacement, by default)
# from the array we pass it.
np.random.choice

# Simulate 25 die rolls.
# (Fixed: the extracted line had a stray leading '.', a syntax error.)
num_rolls = 25
many_rolls = np.random.choice(die_faces, num_rolls)
many_rolls
array([5, 4, 3, ..., 1, 6, 1])
# Plot the empirical (observed) distribution of the simulated rolls.
(bpd.DataFrame()
.assign(face=many_rolls)
.plot(kind='hist', y='face', bins=bins, density=True, ec='w',
title=f'Empirical Distribution of {num_rolls} Dice Rolls',
figsize=(5, 3))
)
plt.ylabel('Probability');
# As the number of rolls grows, the empirical distribution looks more and more
# like the uniform probability distribution.
for num_rolls in [10, 50, 100, 500, 1000, 5000, 10000]:
    # Don't worry about how .sample works just yet – we'll cover it shortly
    (die.sample(n=num_rolls, replace=True)
     .plot(kind='hist', y='face', bins=bins, density=True, ec='w',
           title=f'Distribution of {num_rolls} Die Rolls',
           figsize=(8, 3))
    )
The law of large numbers states that if a chance experiment is repeated many times, independently and under the same conditions, then the proportion of times that an event occurs gets closer and closer to the theoretical probability of that event.
For example: As you roll a die repeatedly, the proportion of times you roll a 5 gets closer to $\frac{1}{6}$.
Question: How do we collect a good sample, so that the sample distribution closely approximates the population distribution?
Bad idea ❌: Survey whoever you can get ahold of (e.g. internet survey, people in line at Panda Express at PC).
# Load the dataset of 200 top-grossing movies.
top = bpd.read_csv('data/top_movies.csv')
top
Title | Studio | Gross | Gross (Adjusted) | Year | |
---|---|---|---|---|---|
0 | Star Wars: The Force Awakens | Buena Vista (Disney) | 906723418 | 906723400 | 2015 |
1 | Avatar | Fox | 760507625 | 846120800 | 2009 |
2 | Titanic | Paramount | 658672302 | 1178627900 | 1997 |
... | ... | ... | ... | ... | ... |
197 | Duel in the Sun | Selz. | 20408163 | 443877500 | 1946 |
198 | Sergeant York | Warner Bros. | 16361885 | 418671800 | 1941 |
199 | The Four Horsemen of the Apocalypse | MPC | 9183673 | 399489800 | 1921 |
200 rows × 5 columns
# Systematic sample: pick a random starting row among the first 10,
# then take every 10th row after it (20 of the 200 rows).
start = np.random.choice(np.arange(10))
top.take(np.arange(start, 200, 10))
Title | Studio | Gross | Gross (Adjusted) | Year | |
---|---|---|---|---|---|
7 | Star Wars | Fox | 460998007 | 1549640500 | 1977 |
17 | The Hunger Games | Lionsgate | 408010692 | 442510400 | 2012 |
27 | The Passion of the Christ | NM | 370782930 | 519432100 | 2004 |
... | ... | ... | ... | ... | ... |
177 | Cleopatra (1963) | Fox | 57777778 | 584496100 | 1963 |
187 | Swiss Family Robinson | Disney | 40356000 | 468129600 | 1960 |
197 | Duel in the Sun | Selz. | 20408163 | 443877500 | 1946 |
20 rows × 5 columns
If we want to sample from an array `options` without replacement, we use `np.random.choice(options, replace=False)`. If instead `replace=True`, then we're sampling uniformly at random with replacement – there's no simpler term for this.

If we want to sample rows from a DataFrame, we can use the `.sample` method on a DataFrame. That is, `df.sample(n)` returns a random subset of `n` rows of `df`, drawn without replacement (i.e. the default is `replace=False`, unlike `np.random.choice`).
# Without replacement
# Draws 5 distinct rows uniformly at random (replace=False is the default).
top.sample(5)
Title | Studio | Gross | Gross (Adjusted) | Year | |
---|---|---|---|---|---|
4 | Marvel's The Avengers | Buena Vista (Disney) | 623357910 | 668866600 | 2012 |
78 | Toy Story 2 | Buena Vista (Disney) | 245852179 | 416177700 | 1999 |
177 | Cleopatra (1963) | Fox | 57777778 | 584496100 | 1963 |
166 | Pinocchio | Disney | 84254167 | 586409000 | 1940 |
42 | Iron Man | Paramount | 318412101 | 385808100 | 2008 |
# With replacement
# The same row can appear more than once in the sample.
top.sample(5, replace=True)
Title | Studio | Gross | Gross (Adjusted) | Year | |
---|---|---|---|---|---|
163 | Peter Pan | Disney | 87404651 | 396924700 | 1953 |
177 | Cleopatra (1963) | Fox | 57777778 | 584496100 | 1963 |
178 | 2001: A Space Odyssey | MGM | 56954992 | 377027700 | 1968 |
167 | M.A.S.H. | Fox | 81600000 | 467052600 | 1970 |
78 | Toy Story 2 | Buena Vista (Disney) | 245852179 | 416177700 | 1999 |
united_full
contains information about all United flights leaving SFO between 6/1/15 and 8/31/15.
# Each row is one United flight out of SFO in summer 2015.
united_full = bpd.read_csv('data/united_summer2015.csv')
united_full
Date | Flight Number | Destination | Delay | |
---|---|---|---|---|
0 | 6/1/15 | 73 | HNL | 257 |
1 | 6/1/15 | 217 | EWR | 28 |
2 | 6/1/15 | 237 | STL | -3 |
... | ... | ... | ... | ... |
13822 | 8/31/15 | 1994 | ORD | 3 |
13823 | 8/31/15 | 2000 | PHX | -1 |
13824 | 8/31/15 | 2013 | EWR | -2 |
13825 rows × 4 columns
We only need the `'Delay'` column, so let's select just that column.
# Passing a list to .get keeps the result as a one-column DataFrame.
united = united_full.get(['Delay'])
united
Delay | |
---|---|
0 | 257 |
1 | 28 |
2 | -3 |
... | ... |
13822 | 3 |
13823 | -1 |
13824 | -2 |
13825 rows × 1 columns
# 10-minute-wide bins covering delays from -20 up to 290 minutes.
bins = np.arange(-20, 300, 10)
united.plot(kind='hist', y='Delay', bins=bins, density=True, ec='w',
title='Population Distribution of Flight Delays', figsize=(8, 3))
plt.ylabel('Proportion per minute');
Note that this distribution is fixed – nothing about it is random. The flight delays in `united` constitute our population. We'll draw samples from `united` without replacement.
# Sample distribution
sample_size = 100
# Draw 100 flights at random (without replacement) and plot their delay distribution.
(united
.sample(sample_size)
.plot(kind='hist', y='Delay', bins=bins, density=True, ec='w',
title='Sample Distribution of Flight Delays',
figsize=(8, 3))
);
Note that as we increase sample_size
, the sample distribution of delays looks more and more like the true population distribution of delays.
To remember: parameter and population both start with p, statistic and sample both start with s.
Question: What is the average delay of United flights out of SFO? 🤔
The population mean is a parameter.
# Calculate the mean of the population
# This is a parameter: a fixed number that describes the population.
united_mean = united.get('Delay').mean()
united_mean
16.658155515370705
This number (like the population distribution) is fixed, and is not random. In reality, we would not be able to see this number – we can only see it right now because this is a pedagogical demonstration!
The sample mean is a statistic. Since it depends on our sample, which was drawn at random, the sample mean is also random.
# Size 100
# The sample mean is a statistic: it varies because the sample is random.
united.sample(100).get('Delay').mean()
14.68
What if we choose a larger sample size?
# Size 1000
# Larger samples tend to produce sample means closer to the population mean.
united.sample(1000).get('Delay').mean()
16.276
Smaller samples: sample means vary more from sample to sample.
Larger samples: sample means cluster more tightly around the population mean.
# Sample one thousand flights, two thousand times
sample_size = 1000
repetitions = 2000
# Accumulate each repetition's sample mean in a list, then convert to an array.
means = []
for _ in range(repetitions):
    means.append(united.sample(sample_size).get('Delay').mean())
sample_means = np.array(means)
# Plot the empirical distribution of the 2000 sample means.
bpd.DataFrame().assign(sample_means=sample_means) \
.plot(kind='hist', bins=np.arange(10, 25, 0.5), density=True, ec='w',
title=f'Distribution of Sample Mean with Sample Size {sample_size}',
figsize=(10, 5));
# Mark the true population mean with a vertical line for comparison.
plt.axvline(x=united_mean, c='black');
We just sampled one thousand flights, two thousand times. If we now sample one hundred flights, two thousand times, how will the histogram change?
Next, we'll start talking about statistical models, which will lead us towards hypothesis testing.