from dsc80_utils import *
Aside: Fast Permutation Tests¶
Speeding things up 🏃¶
Speeding up permutation tests¶
- A permutation test, like all simulation-based hypothesis tests, generates an approximation of the distribution of the test statistic.
- If we found all permutations, the distribution would be exact!
- If there are $a$ elements in one group and $b$ in the other, the total number of permutations is ${a + b \choose a}$.
- If $a = 100$ and $b = 150$, there are more than ${250 \choose 100} \approx 6 \cdot 10^{71}$ permutations! (See the quick check after this list.)
- The more repetitions we use, the better our approximation will be.
- Unfortunately, our code is pretty slow, so we can't use many repetitions.
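As a quick check of that count, Python's built-in `math.comb` computes the binomial coefficient exactly; this snippet is just an illustration, not part of the test itself.
from math import comb
# Number of ways to choose which 100 of the 250 labels go to one group.
print(f"{comb(250, 100):.2e}")  # on the order of 6e+71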
Example: Birth weight and smoking 🚬¶
baby_path = Path('data') / 'babyweights.csv'
baby = pd.read_csv(baby_path)
baby = baby[['Maternal Smoker', 'Birth Weight']]
baby.head()
|   | Maternal Smoker | Birth Weight |
|---|---|---|
| 0 | False | 120 |
| 1 | False | 113 |
| 2 | True | 128 |
| 3 | True | 108 |
| 4 | False | 136 |
Recall our permutation test from last class:
- Null Hypothesis: In the population, birth weights of smokers' babies and non-smokers' babies have the same distribution, and the observed differences in our samples are due to random chance.
- Alternative Hypothesis: In the population, smokers' babies have lower birth weights than non-smokers' babies, on average. The observed difference in our samples cannot be explained by random chance alone.
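For reference, here is one way to compute the observed statistic (smokers' mean minus non-smokers' mean); the name `observed_difference` is ours, and the calculation mirrors Step 2 of the loop below.
observed_difference = (
    baby
    .groupby('Maternal Smoker')
    ['Birth Weight']
    .mean()
    .diff()
    .iloc[-1]
)
observed_difference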
Timing the birth weights example ⏰¶
We'll use 3000 repetitions instead of 500.
%%time
n_repetitions = 3000
differences = []
for _ in range(n_repetitions):
    # Step 1: Shuffle the weights and store them in a DataFrame.
    with_shuffled = baby.assign(Shuffled_Weights=np.random.permutation(baby['Birth Weight']))
    # Step 2: Compute the test statistic.
    # Remember, alphabetically, False comes before True,
    # so this computes True - False.
    group_means = (
        with_shuffled
        .groupby('Maternal Smoker')
        .mean()
        .loc[:, 'Shuffled_Weights']
    )
    difference = group_means.diff().iloc[-1]
    # Step 3: Store the result.
    differences.append(difference)
CPU times: user 1.65 s, sys: 4.59 ms, total: 1.66 s
Wall time: 1.66 s
pio.renderers.default = 'plotly_mimetype+notebook'  # Run this line if the plot below doesn't load.
fig = px.histogram(pd.DataFrame(differences), x=0, nbins=50, histnorm='probability',
title='Empirical Distribution of the Test Statistic, Original Approach')
fig.update_layout(xaxis_range=[-5, 5])
A faster approach¶
- Our previous approach involved calling `groupby` inside of a loop.
- We can avoid `groupby` entirely!
- Let's start by generating a Boolean array of size `(3000, 1174)`.
    - Each row will correspond to a single permutation of the `'Maternal Smoker'` (`bool`) column.
is_smoker = baby['Maternal Smoker'].to_numpy()
weights = baby['Birth Weight'].to_numpy()
is_smoker
array([False, False, True, ..., True, False, False])
%%time
np.random.seed(24)  # So that we get the same results each time (for lecture).

# We are still using a for-loop!
is_smoker_permutations = np.column_stack([
    np.random.permutation(is_smoker)
    for _ in range(3000)
]).T
CPU times: user 73.3 ms, sys: 4.24 ms, total: 77.5 ms
Wall time: 77.8 ms
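As an aside, on NumPy 1.20+ the same array can be built without a Python loop: `np.random.Generator.permuted` shuffles each slice along an axis independently. A sketch (the name `is_smoker_permutations_alt` is ours):
rng = np.random.default_rng(24)
# Tile the Boolean column into 3000 identical rows, then shuffle each
# row's entries independently along axis=1.
is_smoker_permutations_alt = rng.permuted(np.tile(is_smoker, (3000, 1)), axis=1)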
In `is_smoker_permutations`, each row is a new simulation.
- `False` means that baby comes from a non-smoking mother.
- `True` means that baby comes from a smoking mother.
is_smoker_permutations
array([[ True,  True, False, ...,  True,  True, False],
       [ True, False,  True, ...,  True, False, False],
       [False,  True, False, ...,  True, False,  True],
       ...,
       [False, False,  True, ..., False, False,  True],
       [ True,  True, False, ...,  True, False,  True],
       [ True,  True,  True, ..., False,  True,  True]])
is_smoker_permutations.shape
(3000, 1174)
Note that each row has 459 `True`s and 715 `False`s – it's just their order that differs.
is_smoker_permutations.sum(axis=1)
array([459, 459, 459, ..., 459, 459, 459])
The first row of `is_smoker_permutations` tells us that in this permutation, we'll assign baby 1 to "smoker", baby 2 to "smoker", baby 3 to "non-smoker", and so on.
is_smoker_permutations[0]
array([ True, True, False, ..., True, True, False])
Broadcasting¶
- If we multiply `is_smoker_permutations` by `weights`, we will create a new (3000, 1174) array, in which:
    - the weights of babies assigned to "smoker" are present, and
    - the weights of babies assigned to "non-smoker" are 0.
- `is_smoker_permutations` acts as a "mask".

First, let's try this on just the first permutation (i.e. the first row of `is_smoker_permutations`).
weights * is_smoker_permutations[0]
array([120, 113, 0, ..., 130, 125, 0])
Now, on all of `is_smoker_permutations`:
weights * is_smoker_permutations
array([[120, 113,   0, ..., 130, 125,   0],
       [120,   0, 128, ..., 130,   0,   0],
       [  0, 113,   0, ..., 130,   0, 117],
       ...,
       [  0,   0, 128, ...,   0,   0, 117],
       [120, 113,   0, ..., 130,   0, 117],
       [120, 113, 128, ...,   0, 125, 117]])
The mean of the non-zero entries in a row is the mean of the weights of "smoker" babies in that permutation.

Why can't we use `.mean(axis=1)`? Because the zeroed-out non-smoker entries would still count toward the denominator. Instead, we sum each row and divide by the number of smokers.
n_smokers = is_smoker.sum()
mean_smokers = (weights * is_smoker_permutations).sum(axis=1) / n_smokers
mean_smokers
array([118.94, 118.08, 120.39, ..., 119.77, 119.27, 120.22])
mean_smokers.shape
(3000,)
We also need the weights of the non-smokers in our permutations. We can get them by "inverting" the `is_smoker_permutations` mask and performing the same calculation.
n_non_smokers = len(weights) - n_smokers
mean_non_smokers = (weights * ~is_smoker_permutations).sum(axis=1) / n_non_smokers
mean_non_smokers
array([119.8 , 120.35, 118.87, ..., 119.27, 119.58, 118.97])
test_statistics = mean_smokers - mean_non_smokers
test_statistics
array([-0.86, -2.28, 1.52, ..., 0.5 , -0.31, 1.25])
Putting it all together¶
%%time
is_smoker = baby['Maternal Smoker'].to_numpy()
weights = baby['Birth Weight'].to_numpy()
n_smokers = is_smoker.sum()
n_non_smokers = len(weights) - n_smokers

is_smoker_permutations = np.column_stack([
    np.random.permutation(is_smoker)
    for _ in range(3000)
]).T

mean_smokers = (weights * is_smoker_permutations).sum(axis=1) / n_smokers
mean_non_smokers = (weights * ~is_smoker_permutations).sum(axis=1) / n_non_smokers
fast_differences = mean_smokers - mean_non_smokers
CPU times: user 87.1 ms, sys: 10.6 ms, total: 97.7 ms
Wall time: 93.4 ms
fig = px.histogram(pd.DataFrame(fast_differences), x=0, nbins=50, histnorm='probability',
title='Empirical Distribution of the Test Statistic, Faster Approach')
fig.update_layout(xaxis_range=[-5, 5])
The distribution of test statistics with the fast simulation is similar to the original distribution of test statistics.
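To finish the test, we can compute a p-value. Since the alternative hypothesis says smokers' babies weigh less on average, the p-value is the proportion of simulated differences at least as small as the observed one. A sketch, assuming `observed_difference` from the earlier snippet:
# Proportion of simulated (smoker minus non-smoker) differences that
# are as small as, or smaller than, the observed difference.
p_value = (fast_differences <= observed_difference).mean()
p_value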
Other performance considerations¶
`np.random.permutation` (fast) vs. `df.sample` (slow)¶
In lecture, we mentioned that `np.random.permutation` is faster than using the `df.sample` method. That's because `df.sample` has to shuffle the index as well.
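To see that index bookkeeping concretely, here's a small illustration; note the shuffled index labels that come along with the values.
baby['Birth Weight'].sample(frac=1).head()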
How fast does a single shuffle take for each approach?
to_shuffle = baby.copy()
weights = to_shuffle['Birth Weight']
%%timeit
np.random.permutation(weights.to_numpy())
12.9 µs ± 281 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
%%timeit
weights.sample(frac=1)
38.7 µs ± 1.67 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
Adding columns in place (fast) vs. `assign` (slow)¶
If you need extra performance, don't use `assign`; instead, add the new column in place.
Why? This way, we don't create a new copy of our DataFrame on each iteration.
%%timeit
to_shuffle['Shuffled_Weights'] = np.random.permutation(weights.to_numpy())
27.9 µs ± 685 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
%%timeit
to_shuffle.assign(Shuffled_Weights=np.random.permutation(weights.to_numpy()))
67.8 µs ± 643 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)