import pandas as pd
import numpy as np
import os
pd.options.plotting.backend = 'plotly'
- 1649043031 looks like a number, but is probably a date (a Unix timestamp).
- "USD 1,000,000" looks like a string, but is actually a number and a unit.
- 92093 looks like a number, but is really a ZIP code (and isn't equal to 92,093).
- Sometimes, False appears in a column of country codes. Why might this be? 🤔
import yaml
player = '''
name: Magnus Carlsen
age: 32
country: NO
'''
yaml.safe_load(player)
{'name': 'Magnus Carlsen', 'age': 32, 'country': False}

YAML 1.1 interprets the unquoted token NO as the boolean False, so Norway's country code is silently lost.
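One way to avoid this, sketched below, is to quote the value so YAML keeps it as a string (this fix is our addition, not part of the original example):

import yaml

# Quoting 'NO' prevents YAML 1.1 from coercing it to the boolean False.
player = '''
name: Magnus Carlsen
age: 32
country: 'NO'
'''
yaml.safe_load(player)

{'name': 'Magnus Carlsen', 'age': 32, 'country': 'NO'}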
In other words, how well does the data represent reality?
Does the data contain unrealistic or "incorrect" values?
The dataset we're working with contains all of the vehicle stops that the San Diego Police Department made in 2016.
stops = pd.read_csv(os.path.join('data', 'vehicle_stops_2016_datasd.csv'))
stops.head()
 | stop_id | stop_cause | service_area | subject_race | subject_sex | subject_age | timestamp | stop_date | stop_time | sd_resident | arrested | searched | obtained_consent | contraband_found | property_seized
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1308198 | Equipment Violation | 530 | W | M | 28 | 2016-01-01 00:06:00 | 2016-01-01 | 0:06 | Y | N | N | N | N | N |
1 | 1308172 | Moving Violation | 520 | B | M | 25 | 2016-01-01 00:10:00 | 2016-01-01 | 0:10 | N | N | N | NaN | NaN | NaN |
2 | 1308171 | Moving Violation | 110 | H | F | 31 | 2016-01-01 00:14:00 | 2016-01-01 | 0:14 | NaN | NaN | NaN | NaN | NaN | NaN |
3 | 1308170 | Moving Violation | Unknown | W | F | 29 | 2016-01-01 00:16:00 | 2016-01-01 | 0:16 | N | N | N | NaN | NaN | NaN |
4 | 1308197 | Moving Violation | 230 | W | M | 52 | 2016-01-01 00:30:00 | 2016-01-01 | 0:30 | N | N | N | NaN | NaN | NaN |
Are the data types correct? If not, are they easily fixable?
stops.head(1)
 | stop_id | stop_cause | service_area | subject_race | subject_sex | subject_age | timestamp | stop_date | stop_time | sd_resident | arrested | searched | obtained_consent | contraband_found | property_seized
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1308198 | Equipment Violation | 530 | W | M | 28 | 2016-01-01 00:06:00 | 2016-01-01 | 0:06 | Y | N | N | N | N | N |
stops.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103051 entries, 0 to 103050
Data columns (total 15 columns):
 #   Column            Non-Null Count   Dtype
---  ------            --------------   -----
 0   stop_id           103051 non-null  int64
 1   stop_cause        103044 non-null  object
 2   service_area      103051 non-null  object
 3   subject_race      102920 non-null  object
 4   subject_sex       102865 non-null  object
 5   subject_age       100284 non-null  object
 6   timestamp         102879 non-null  object
 7   stop_date         103051 non-null  object
 8   stop_time         103051 non-null  object
 9   sd_resident       83689 non-null   object
 10  arrested          84400 non-null   object
 11  searched          83330 non-null   object
 12  obtained_consent  4791 non-null    object
 13  contraband_found  4969 non-null    object
 14  property_seized   4924 non-null    object
dtypes: int64(1), object(14)
memory usage: 11.8+ MB
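Most columns are stored as object (strings). Here's a sketch of how a couple of them could be converted (which conversions are appropriate is our judgment call):

# Parse timestamps as datetimes and ages as numbers; entries that can't be
# parsed become NaT/NaN instead of raising errors.
stops_typed = stops.assign(
    timestamp=pd.to_datetime(stops['timestamp'], errors='coerce'),
    subject_age=pd.to_numeric(stops['subject_age'], errors='coerce'),
)
stops_typed.dtypes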
'subject_age'

Some of the ages are suspicious – some are too high to be true, and some are too low to be true.

stops['subject_age'].unique()
array(['28', '25', '31', '29', '52', '24', '20', '50', '23', '54', '53', '35', '38', '18', '48', '68', '45', '63', '49', '42', '27', '19', '55', '32', '47', '33', '41', '59', '60', '58', '26', '36', '40', '39', '21', '64', '30', '43', '17', '51', '34', '56', '44', '22', '69', '46', '16', '57', '37', '65', '72', '67', '66', '70', '62', '73', '74', '0', '77', nan, '89', '79', '61', '78', '99', '75', '85', '82', '71', '15', '80', '81', '93', '84', '76', '2', '4', '86', '91', '83', '88', '98', '87', 'No Age', '9', '100', '14', '95', '96', '92', '119', '1', '90', '163', '5', '114', '94', '10', '212', '220', '6', '145', '97', '120'], dtype=object)
# Convert ages to numbers; values that can't be parsed, like 'No Age', become NaN.
ages = pd.to_numeric(stops['subject_age'], errors='coerce')
ages.describe()
count    99648.000000
mean        37.277697
std         14.456934
min          0.000000
25%         25.000000
50%         34.000000
75%         47.000000
max        220.000000
Name: subject_age, dtype: float64
Ages range all over the place, from 0 to 220. Was a 220-year-old really pulled over?
stops[ages > 100]
 | stop_id | stop_cause | service_area | subject_race | subject_sex | subject_age | timestamp | stop_date | stop_time | sd_resident | arrested | searched | obtained_consent | contraband_found | property_seized
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
25029 | 1333254 | Equipment Violation | 820 | W | F | 119 | 2016-03-19 22:20:00 | 2016-03-19 | 22:20 | N | N | N | NaN | NaN | NaN |
34179 | 1375483 | Moving Violation | 510 | W | M | 163 | 2016-04-18 16:35:00 | 2016-04-18 | 16:35 | N | N | N | NaN | NaN | NaN |
36968 | 1378369 | Moving Violation | 240 | A | F | 114 | 2016-04-28 11:44:00 | 2016-04-28 | 11:44 | NaN | NaN | NaN | NaN | NaN | NaN |
63570 | 1405478 | Moving Violation | 110 | V | M | 212 | 2016-08-04 18:10:00 | 2016-08-04 | 18:10 | Y | N | N | NaN | NaN | NaN |
70267 | 1411694 | Moving Violation | 310 | H | F | 220 | 2016-08-30 18:28:00 | 2016-08-30 | 18:28 | Y | N | N | NaN | NaN | NaN |
77038 | 1418885 | Moving Violation | 830 | B | M | 145 | 2016-09-22 20:35:00 | 2016-09-22 | 20:35 | Y | N | Y | N | N | NaN |
99449 | 1440889 | Equipment Violation | 120 | W | F | 120 | 2016-12-15 19:28:00 | 2016-12-15 | 19:28 | Y | N | N | NaN | NaN | NaN |
What about all of the stops that involved people under the legal driving age?
ages[ages < 16].value_counts()
0.0     218
15.0     27
2.0       6
14.0      5
4.0       4
1.0       2
10.0      2
9.0       1
5.0       1
6.0       1
Name: subject_age, dtype: int64
stops[ages < 16]
 | stop_id | stop_cause | service_area | subject_race | subject_sex | subject_age | timestamp | stop_date | stop_time | sd_resident | arrested | searched | obtained_consent | contraband_found | property_seized
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
547 | 1308628 | Moving Violation | 120 | W | M | 0 | 2016-01-03 16:20:00 | 2016-01-03 | 16:20 | NaN | NaN | NaN | NaN | NaN | NaN |
747 | 1308820 | Moving Violation | Unknown | H | F | 0 | 2016-01-04 22:13:00 | 2016-01-04 | 22:13 | N | N | N | NaN | NaN | NaN |
1686 | 1309896 | Moving Violation | 120 | W | M | 15 | 2016-01-08 19:30:00 | 2016-01-08 | 19:30 | N | N | N | NaN | NaN | NaN |
1752 | 1309975 | Moving Violation | Unknown | O | M | 0 | 2016-01-08 23:20:00 | 2016-01-08 | 23:20 | N | N | N | NaN | NaN | NaN |
2054 | 1310377 | Equipment Violation | 430 | H | F | 0 | 2016-01-10 16:42:00 | 2016-01-10 | 16:42 | Y | N | N | NaN | NaN | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
100465 | 1442806 | Moving Violation | 810 | O | M | 15 | 2016-12-20 10:45:00 | 2016-12-20 | 10:45 | Y | N | N | NaN | NaN | NaN |
101495 | 1443500 | Moving Violation | 320 | W | M | 0 | 2016-12-23 22:30:00 | 2016-12-23 | 22:30 | NaN | NaN | NaN | NaN | NaN | NaN |
101784 | 1443745 | Moving Violation | Unknown | O | M | 0 | 2016-12-26 19:30:00 | 2016-12-26 | 19:30 | NaN | NaN | NaN | NaN | NaN | NaN |
101790 | 1443747 | Moving Violation | 320 | W | F | 0 | 2016-12-26 20:00:00 | 2016-12-26 | 20:00 | NaN | NaN | NaN | NaN | NaN | NaN |
102848 | 1444647 | Moving Violation | 320 | W | M | 0 | 2016-12-31 17:45:00 | 2016-12-31 | 17:45 | NaN | NaN | NaN | NaN | NaN | NaN |
267 rows × 15 columns
'subject_age'

'No Age' and 0 are likely explicit null values.
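A sketch of how we might standardize them (mapping both sentinels to NaN is our assumption about the right cleanup):

# Map the explicit nulls to NaN, then convert the remaining strings to numbers.
ages_cleaned = pd.to_numeric(
    stops['subject_age'].replace({'No Age': np.NaN, '0': np.NaN}),
    errors='coerce'
)
ages_cleaned.isna().mean()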
Let's look at all unique stop causes. Notice that there are three different causes related to bicycles, which should probably all fall under the same cause.
stops['stop_cause'].value_counts()
Moving Violation                      75200
Equipment Violation                   26234
Radio Call/Citizen Contact              571
Muni, County, H&S Code                  319
Personal Knowledge/Informant            289
No Cause Specified on a Card            184
Suspect Info (I.S., Bulletin, Log)      181
Personal Observ/Knowledge                45
MUNI, County, H&S Code                   14
Bicycle                                   2
Suspect Info                              2
BICYCLE                                   1
B & P                                     1
Bicycle Bicycle                           1
Name: stop_cause, dtype: int64
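One possible cleanup, sketched below (the mapping itself is our assumption):

# Collapse the three bicycle-related spellings into a single category.
bike_fix = {'BICYCLE': 'Bicycle', 'Bicycle Bicycle': 'Bicycle'}
stops['stop_cause'].replace(bike_fix).value_counts().tail()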
Let's plot the distribution of ages, within a reasonable range (15 to 85). What do you notice? How could we address this?
ages[(ages > 15) & (ages <= 85)].plot(kind='hist')
Now let's look at the first few and last few rows of stops.
stops[['timestamp', 'stop_date', 'stop_time']].head()
 | timestamp | stop_date | stop_time
---|---|---|---|
0 | 2016-01-01 00:06:00 | 2016-01-01 | 0:06 |
1 | 2016-01-01 00:10:00 | 2016-01-01 | 0:10 |
2 | 2016-01-01 00:14:00 | 2016-01-01 | 0:14 |
3 | 2016-01-01 00:16:00 | 2016-01-01 | 0:16 |
4 | 2016-01-01 00:30:00 | 2016-01-01 | 0:30 |
stops[['timestamp', 'stop_date', 'stop_time']].tail(10)
 | timestamp | stop_date | stop_time
---|---|---|---|
103041 | NaN | 2016-12-13 | 0.398611111 |
103042 | NaN | 2016-12-15 | 0.383333333 |
103043 | NaN | 2016-12-15 | 0.397916667 |
103044 | NaN | 2016-12-19 | 0:91 |
103045 | NaN | 2016-12-20 | 0:82 |
103046 | NaN | 2016-12-20 | 0.397916667 |
103047 | NaN | 2016-12-21 | 0:73 |
103048 | NaN | 2016-12-21 | 0:94 |
103049 | NaN | 2016-12-29 | 0:81 |
103050 | NaN | 2016-12-29 | -0:81 |
Do you think '-0:81' is a time that a computer would record?
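Probably not. One way to surface values like this, sketched below, is to try parsing every stop_time and see what fails (treating anything that isn't a valid HH:MM time as suspect is our choice):

# Try to parse each stop_time as HH:MM; invalid entries, like '-0:81', '0:91',
# and the fraction-of-a-day values, are coerced to NaT.
parsed_times = pd.to_datetime(stops['stop_time'], format='%H:%M', errors='coerce')
stops.loc[parsed_times.isna(), 'stop_time'].head()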
Unfaithful data can show up as:

- Consistently "incorrect" values.
- Abnormal artifacts from the data collection process.
- Unreasonable outliers.
Missing, or null, values in a dataset can occur for a variety of reasons. They are most often encoded with NULL, None, NaN, '', 0, etc.
- A quick fix is .fillna(0). As we'll see over the next few weeks, we must take more care in treating null values!
- Zero-like values (e.g. 0, '0', 'zero') are common substitutes for null, but as we'll see, simply filling nulls with 0 is not always useful statistically.

What are the non-NaN null values in the stops dataset?
- 'service_area': 'Unknown'.
- 'subject_age': 0, 'No Age'.
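We could also declare these sentinels when loading the data; a sketch (which strings count as null, and the per-column mapping, are our judgment calls):

# Tell read_csv to parse the sentinel strings as NaN, per column.
stops_na = pd.read_csv(
    os.path.join('data', 'vehicle_stops_2016_datasd.csv'),
    na_values={'service_area': ['Unknown'], 'subject_age': ['No Age', '0']},
)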
stops
 | stop_id | stop_cause | service_area | subject_race | subject_sex | subject_age | timestamp | stop_date | stop_time | sd_resident | arrested | searched | obtained_consent | contraband_found | property_seized
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1308198 | Equipment Violation | 530 | W | M | 28 | 2016-01-01 00:06:00 | 2016-01-01 | 0:06 | Y | N | N | N | N | N |
1 | 1308172 | Moving Violation | 520 | B | M | 25 | 2016-01-01 00:10:00 | 2016-01-01 | 0:10 | N | N | N | NaN | NaN | NaN |
2 | 1308171 | Moving Violation | 110 | H | F | 31 | 2016-01-01 00:14:00 | 2016-01-01 | 0:14 | NaN | NaN | NaN | NaN | NaN | NaN |
3 | 1308170 | Moving Violation | Unknown | W | F | 29 | 2016-01-01 00:16:00 | 2016-01-01 | 0:16 | N | N | N | NaN | NaN | NaN |
4 | 1308197 | Moving Violation | 230 | W | M | 52 | 2016-01-01 00:30:00 | 2016-01-01 | 0:30 | N | N | N | NaN | NaN | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
103046 | 1443046 | Moving Violation | 110 | H | M | 40 | NaN | 2016-12-20 | 0.397916667 | Y | N | N | NaN | NaN | NaN |
103047 | 1443132 | Moving Violation | 310 | W | M | No Age | NaN | 2016-12-21 | 0:73 | N | N | N | NaN | NaN | NaN |
103048 | 1443169 | Moving Violation | Unknown | H | M | 26 | NaN | 2016-12-21 | 0:94 | Y | N | N | NaN | NaN | NaN |
103049 | 1447090 | Moving Violation | 430 | F | F | 63 | NaN | 2016-12-29 | 0:81 | N | N | N | NaN | NaN | NaN |
103050 | 1445548 | Moving Violation | 320 | O | F | 34 | NaN | 2016-12-29 | -0:81 | Y | N | N | NaN | NaN | NaN |
103051 rows × 15 columns
Null values in pandas

- pandas represents missing values with the special NaN value, which is of type float.
- The isna method for DataFrames/Series detects missing values.
- isnull is equivalent to isna.

type(np.NaN)

float
# All of the rows where the subject age is missing.
stops[stops['subject_age'].isna()]
 | stop_id | stop_cause | service_area | subject_race | subject_sex | subject_age | timestamp | stop_date | stop_time | sd_resident | arrested | searched | obtained_consent | contraband_found | property_seized
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
606 | 1309398 | Moving Violation | Unknown | H | M | NaN | 2016-01-04 00:01:00 | 2016-01-04 | 0:01 | N | N | N | NaN | NaN | NaN |
607 | 1309404 | Moving Violation | 510 | W | M | NaN | 2016-01-04 00:01:00 | 2016-01-04 | 0:01 | Y | N | N | NaN | NaN | NaN |
653 | 1309400 | Moving Violation | 620 | W | F | NaN | 2016-01-04 13:00:00 | 2016-01-04 | 13:00 | Y | N | N | NaN | NaN | NaN |
658 | 1309399 | Moving Violation | 620 | W | F | NaN | 2016-01-04 13:55:00 | 2016-01-04 | 13:55 | Y | N | N | NaN | NaN | NaN |
816 | 1309424 | Moving Violation | 620 | W | F | NaN | 2016-01-05 08:30:00 | 2016-01-05 | 8:30 | N | N | N | NaN | NaN | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
102989 | 1412614 | Moving Violation | 120 | W | F | NaN | NaN | 2016-08-30 | 24:40:00 | Y | N | N | NaN | NaN | NaN |
102990 | 1414071 | Moving Violation | 110 | W | M | NaN | NaN | 2016-08-31 | 0:91 | N | N | N | NaN | NaN | NaN |
102991 | 1414020 | Moving Violation | 930 | O | F | NaN | NaN | 2016-08-31 | 98:35:00 | Y | N | N | NaN | NaN | NaN |
103002 | 1420099 | Moving Violation | 610 | H | M | NaN | NaN | 2016-09-27 | 0.565972222 | N | N | N | NaN | NaN | NaN |
103005 | 1422234 | Moving Violation | 620 | W | M | NaN | NaN | 2016-10-05 | 0.417361111 | N | N | N | NaN | NaN | NaN |
2767 rows × 15 columns
# Proportion of values missing in the subject_age column.
stops['subject_age'].isna().mean()
0.026850782622196777
# Proportion of missing values in all columns.
stops.isna().mean()
stop_id             0.000000
stop_cause          0.000068
service_area        0.000000
subject_race        0.001271
subject_sex         0.001805
subject_age         0.026851
timestamp           0.001669
stop_date           0.000000
stop_time           0.000000
sd_resident         0.187888
arrested            0.180988
searched            0.191371
obtained_consent    0.953508
contraband_found    0.951781
property_seized     0.952218
dtype: float64
To drop rows or columns containing null values, use the dropna method.

stops.dropna().head()
 | stop_id | stop_cause | service_area | subject_race | subject_sex | subject_age | timestamp | stop_date | stop_time | sd_resident | arrested | searched | obtained_consent | contraband_found | property_seized
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1308198 | Equipment Violation | 530 | W | M | 28 | 2016-01-01 00:06:00 | 2016-01-01 | 0:06 | Y | N | N | N | N | N |
8 | 1308979 | Moving Violation | 310 | H | F | 25 | 2016-01-01 01:03:00 | 2016-01-01 | 1:03 | Y | N | Y | N | N | N |
20 | 1308187 | Equipment Violation | 510 | H | M | 31 | 2016-01-01 08:15:00 | 2016-01-01 | 8:15 | Y | N | Y | N | Y | Y |
21 | 1308186 | Moving Violation | 710 | H | F | 48 | 2016-01-01 08:21:00 | 2016-01-01 | 8:21 | N | N | N | N | N | N |
39 | 1308193 | Moving Violation | 710 | H | F | 32 | 2016-01-01 11:09:00 | 2016-01-01 | 11:09 | N | N | N | N | N | N |
stops.dropna().shape
(4619, 15)
stops.shape
(103051, 15)
When used on a DataFrame:

- .dropna() drops rows containing at least one null value.
- .dropna(how='all') drops rows containing only null values.
- .dropna(axis=1) drops columns containing at least one null value.
- Other useful keyword arguments: thresh and subset (subset is demonstrated below, and a thresh sketch follows the examples).

nans = pd.DataFrame([['dog', 0, 1, np.NaN], ['cat', np.NaN, np.NaN, np.NaN], ['dog', 1, 2, 3]], columns='A B C D'.split())
nans
 | A | B | C | D
---|---|---|---|---|
0 | dog | 0.0 | 1.0 | NaN |
1 | cat | NaN | NaN | NaN |
2 | dog | 1.0 | 2.0 | 3.0 |
nans.dropna(how='any')
 | A | B | C | D
---|---|---|---|---|
2 | dog | 1.0 | 2.0 | 3.0 |
nans.dropna(how='all')
 | A | B | C | D
---|---|---|---|---|
0 | dog | 0.0 | 1.0 | NaN |
1 | cat | NaN | NaN | NaN |
2 | dog | 1.0 | 2.0 | 3.0 |
nans.dropna(subset=['B', 'C'])
 | A | B | C | D
---|---|---|---|---|
0 | dog | 0.0 | 1.0 | NaN |
2 | dog | 1.0 | 2.0 | 3.0 |
nans.dropna(axis=1)
 | A
---|---|
0 | dog |
1 | cat |
2 | dog |
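The thresh argument, listed above but not demonstrated, keeps rows with at least a given number of non-null values; a quick sketch:

# Keep rows with at least 2 non-null values; row 1 ('cat') has only one.
nans.dropna(thresh=2)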
As you've seen, the fillna method replaces all null values. Specifically:

- .fillna(val) fills null entries with the value val.
- .fillna(dict) fills null entries using a dictionary dict of column/row values.
- .fillna(method='bfill') and .fillna(method='ffill') fill null entries using neighboring non-null entries.

nans
 | A | B | C | D
---|---|---|---|---|
0 | dog | 0.0 | 1.0 | NaN |
1 | cat | NaN | NaN | NaN |
2 | dog | 1.0 | 2.0 | 3.0 |
# Filling all NaNs with the same value.
nans.fillna('zebra')
 | A | B | C | D
---|---|---|---|---|
0 | dog | 0.0 | 1.0 | zebra |
1 | cat | zebra | zebra | zebra |
2 | dog | 1.0 | 2.0 | 3.0 |
# Filling NaNs differently for each column.
nans.fillna({'B': 'f0', 'C': 'f1', 'D': 'f2'})
 | A | B | C | D
---|---|---|---|---|
0 | dog | 0.0 | 1.0 | f2 |
1 | cat | f0 | f1 | f2 |
2 | dog | 1.0 | 2.0 | 3.0 |
# Dictionary of column means.
# Note that most numerical methods ignore null values.
means = {c: nans[c].mean() for c in nans.columns[1:]}
means
{'B': 0.5, 'C': 1.5, 'D': 3.0}
# Filling NaNs with column means
nans.fillna(means)
 | A | B | C | D
---|---|---|---|---|
0 | dog | 0.0 | 1.0 | 3.0 |
1 | cat | 0.5 | 1.5 | 3.0 |
2 | dog | 1.0 | 2.0 | 3.0 |
# Another way of doing the same thing.
nans.iloc[:, 1:].apply(lambda x: x.fillna(x.mean()), axis=0)
 | B | C | D
---|---|---|---|
0 | 0.0 | 1.0 | 3.0 |
1 | 0.5 | 1.5 | 3.0 |
2 | 1.0 | 2.0 | 3.0 |
# bfill stands for "backfill".
nans.fillna(method='bfill')
 | A | B | C | D
---|---|---|---|---|
0 | dog | 0.0 | 1.0 | 3.0 |
1 | cat | 1.0 | 2.0 | 3.0 |
2 | dog | 1.0 | 2.0 | 3.0 |
# ffill stands for "forward fill".
nans.fillna(method='ffill')
 | A | B | C | D
---|---|---|---|---|
0 | dog | 0.0 | 1.0 | NaN |
1 | cat | 0.0 | 1.0 | NaN |
2 | dog | 1.0 | 2.0 | 3.0 |
nans
 | A | B | C | D
---|---|---|---|---|
0 | dog | 0.0 | 1.0 | NaN |
1 | cat | NaN | NaN | NaN |
2 | dog | 1.0 | 2.0 | 3.0 |
Exercise: Fill the 'D' column differently for rows where:

- the 'A' column contains 'dog'.
- the 'A' column contains 'cat'.

This is useful when the column being filled (here, 'D') looks like it depends on the value of 'A'. (A sketch follows.)
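Here's one way to do it, using a group-wise mean (choosing the mean as the fill value is our assumption):

# Fill 'D' with the mean of 'D' computed within each 'A' group. The 'dog' row's
# NaN becomes 3.0; the 'cat' group has no observed values, so its 'D' stays NaN.
nans.assign(D=nans.groupby('A')['D'].transform(lambda s: s.fillna(s.mean())))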
Example: coin flipping

Suppose we flipped a coin 100 times and saw 59 heads and 41 tails. To decide whether the coin was fair, we need to know how rare it is to see 59 heads and 41 tails, or a result that's even more biased in favor of heads, when flipping a fair coin 100 times.
For the alternative hypothesis "the coin was biased towards heads", we could use several test statistics, such as the number of heads, $N_H$, or the proportion of flips that were heads. For simplicity, we'll start with $N_H$.
The number of heads in 100 flips of a fair coin follows the $\text{Binomial}(100, 0.5)$ distribution, in which

$$P(\text{# heads} = k) = {100 \choose k} (0.5)^k (1-0.5)^{100-k} = {100 \choose k} 0.5^{100}$$

from scipy.special import comb
def p_k_heads(k):
    # P(# heads = k) in 100 flips of a fair coin.
    return comb(100, k) * (0.5) ** 100
The probability that we see at least 59 heads is then:
sum([p_k_heads(k) for k in range(59, 101)])
0.04431304005703377
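As a sanity check (scipy.stats is our addition; it isn't used in the original computation), the binomial survival function gives the same probability:

from scipy import stats

# P(N_H >= 59) = P(N_H > 58) under Binomial(100, 0.5).
stats.binom.sf(58, n=100, p=0.5)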
Let's look at this distribution visually.
plot_df = pd.DataFrame().assign(k = range(101))
plot_df['p_k'] = p_k_heads(plot_df['k'])
plot_df['color'] = plot_df['k'].apply(lambda k: 'orange' if k >= 59 else 'blue')
fig = plot_df.plot(kind='bar', x='k', y='p_k', color='color', width=1000)
fig.add_annotation(text='This orange area is called the p-value!', x=77, y=0.008, showarrow=False)
We saw that, in 100 flips of a fair coin, $P(\text{# heads} \geq 59)$ is only ~4.4%.
To summarize: detect null values with isna/isnull, drop them with dropna, and fill them with fillna.