Aside: Fast Permutation Tests

Speeding things up 🏃

Speeding up permutation tests

Example: Birth weight and smoking 🚬

Recall our permutation test from last class:

Timing the birth weights example ⏰

We'll use 3000 repetitions instead of 500.
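A loop-based version of the test might look like the sketch below. The babies DataFrame here is a hypothetical stand-in for the dataset (synthetic weights, with the 459 smoker / 715 non-smoker split from the lecture), not the real data.

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(42)

# Hypothetical stand-in for the babies dataset: 1,174 rows,
# 459 smokers and 715 non-smokers.
babies = pd.DataFrame({
    'Maternal Smoker': np.r_[np.ones(459, dtype=bool), np.zeros(715, dtype=bool)],
    'Birth Weight': rng.normal(120, 18, size=1174),
})

n_repetitions = 3000
weights = babies['Birth Weight']

differences = []
for _ in range(n_repetitions):
    # Shuffle the group labels, then recompute the difference in group means.
    shuffled = babies['Maternal Smoker'].sample(frac=1).reset_index(drop=True)
    diff = weights[shuffled.values].mean() - weights[~shuffled.values].mean()
    differences.append(diff)
```

Each iteration shuffles with df.sample and recomputes the statistic, which is exactly the per-iteration overhead the faster approach below avoids.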

A faster approach

In is_smoker_permutations, each row is a new simulation.

Note that each row has 459 Trues and 715 Falses; only their order differs from row to row.

The first row of is_smoker_permutations tells us that in this permutation, we'll assign baby 1 to "smoker", baby 2 to "smoker", baby 3 to "non-smoker", and so on.
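One way to build such a matrix of permutations, as a sketch (the label vector is synthetic; only the 459/715 split matches the lecture):

```python
import numpy as np

rng = np.random.default_rng(42)

# Hypothetical label vector: 459 smokers (True) and 715 non-smokers (False).
is_smoker = np.r_[np.ones(459, dtype=bool), np.zeros(715, dtype=bool)]

n_repetitions = 3000

# Each row is one independently shuffled copy of the labels.
is_smoker_permutations = np.stack(
    [rng.permutation(is_smoker) for _ in range(n_repetitions)]
)
```

The result has shape (3000, 1174), and every row sums to 459, since a permutation only reorders the labels.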

Broadcasting

First, let's try this on just the first permutation (i.e. the first row of is_smoker_permutations).

Now, on all of is_smoker_permutations:

The mean of the non-zero entries in a row is the mean of the weights of "smoker" babies in that permutation.

Why can't we use .mean(axis=1)? Because it divides each row's sum by the total number of columns (1,174), not by the number of "smoker" babies (459); the zeroed-out non-smoker entries would drag the mean down.
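The broadcasting step can be sketched as follows (synthetic weights and labels; only the shapes and group sizes match the lecture):

```python
import numpy as np

rng = np.random.default_rng(0)

# Hypothetical data: 1,174 birth weights, 459 of which belong to smokers.
weights = rng.normal(120, 18, size=1174)
is_smoker = np.zeros(1174, dtype=bool)
is_smoker[:459] = True
is_smoker_permutations = np.stack(
    [rng.permutation(is_smoker) for _ in range(3000)]
)

# Broadcasting: a (3000, 1174) mask times a (1174,) vector gives (3000, 1174).
# Each row keeps the smoker weights and zeroes out the non-smokers.
masked = is_smoker_permutations * weights

# .mean(axis=1) would divide by 1174 (zeros included), so instead we
# divide each row's sum by the number of smokers, 459.
mean_smoker_weights = masked.sum(axis=1) / 459
```

Each entry of mean_smoker_weights is the mean "smoker" weight for one permutation, computed for all 3000 permutations at once.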

We also need to get the weights of the non-smokers in our permutations. We can do this by "inverting" the is_smoker_permutations mask and performing the same calculations.

Putting it all together

The distribution of test statistics with the fast simulation is similar to the original distribution of test statistics.
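The full fast pipeline might be sketched like this, again on hypothetical stand-in data with the lecture's 459/715 group sizes:

```python
import numpy as np

rng = np.random.default_rng(42)

# Hypothetical stand-in data: 459 smokers, 715 non-smokers.
weights = rng.normal(120, 18, size=1174)
is_smoker = np.zeros(1174, dtype=bool)
is_smoker[:459] = True

is_smoker_permutations = np.stack(
    [rng.permutation(is_smoker) for _ in range(3000)]
)

# Row sums over the masked weights give each group's total in one shot;
# dividing by the (fixed) group sizes gives the group means.
mean_smoker = (is_smoker_permutations * weights).sum(axis=1) / 459
# ~ flips the mask: True wherever the baby was assigned "non-smoker".
mean_non_smoker = (~is_smoker_permutations * weights).sum(axis=1) / 715

# 3000 simulated test statistics, with no Python loop over permutations
# at the statistic-computation step.
fast_differences = mean_smoker - mean_non_smoker
```

Each entry of fast_differences is one simulated difference in group means, so its histogram plays the same role as the loop-based distribution.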

Other performance considerations

np.random.permutation (fast) vs df.sample (slow)

In lecture, we mentioned that np.random.permutation is faster than the df.sample method. This is because df.sample has to shuffle the index along with the values.

How fast does a single shuffle take for each approach?
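One way to compare the two, sketched with a simple time.perf_counter timer on a synthetic 1,174-row DataFrame (the time_it helper is ours, not a library function):

```python
import time
import numpy as np
import pandas as pd

rng = np.random.default_rng(42)
df = pd.DataFrame({'Birth Weight': rng.normal(120, 18, size=1174)})

def time_it(f, n=100):
    """Average wall-clock seconds per call of f over n calls."""
    start = time.perf_counter()
    for _ in range(n):
        f()
    return (time.perf_counter() - start) / n

# np.random.permutation shuffles just the underlying array of values.
fast = time_it(lambda: np.random.permutation(df['Birth Weight'].values))

# df.sample(frac=1) shuffles the rows *and* builds a shuffled index.
slow = time_it(lambda: df.sample(frac=1))
```

On typical hardware the array shuffle comes out well ahead, since it skips the index bookkeeping entirely.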

Adding columns in place (fast) vs. assign (slow)

Don't use assign; instead, add the new column in-place.

Why? This way, we don't create a new copy of our DataFrame on each iteration.
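The two styles side by side, as a sketch on a hypothetical DataFrame (the column name 'Shuffled' is ours for illustration):

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(42)
df = pd.DataFrame({'Birth Weight': rng.normal(120, 18, size=1174)})
labels = np.zeros(1174, dtype=bool)
labels[:459] = True

# Slow: assign returns a brand-new copy of the DataFrame on every call.
with_copy = df.assign(Shuffled=np.random.permutation(labels))

# Fast: in-place assignment attaches the column to the existing DataFrame.
df['Shuffled'] = np.random.permutation(labels)
```

Inside a loop with thousands of repetitions, avoiding that per-iteration copy is what makes the in-place version noticeably faster.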