import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (10, 5)

import util


salary_path = util.safe_download('https://transcal.s3.amazonaws.com/public/export/san-diego-2020.csv')


salaries = pd.read_csv(salary_path)
util.anonymize_names(salaries)
salaries


salaries.head()


names_path = util.safe_download('https://www.ssa.gov/oact/babynames/names.zip')


import pathlib

dfs = []
for path in pathlib.Path('data/names/').glob('*.txt'):
    year = int(str(path)[14:18])
    if year >= 1964:
        df = pd.read_csv(path, names=['firstname', 'gender', 'count']).assign(year=year)
        dfs.append(df)
        
names = pd.concat(dfs)
names


names.head()


# Get the count of each unique value in the 'gender' column
names['gender'].value_counts()

F    838442
M    561304
Name: gender, dtype: int64


# Look at a single name
names[names['firstname'] == 'Billy']


# Look at various summary statistics
names.describe()


counts_by_gender = (
    names
    .groupby(['firstname', 'gender'])
    .sum()
    .reset_index()
    .pivot('firstname', 'gender', 'count')
    .fillna(0)
)
counts_by_gender


counts_by_gender['F'] > counts_by_gender['M']

firstname
Aaban      False
Aabha       True
Aabid      False
Aabidah     True
Aabir      False
           ...  
Zyvion     False
Zyvon      False
Zyyanna     True
Zyyon      False
Zzyzx      False
Length: 91360, dtype: bool


genders = counts_by_gender.assign(gender=np.where(counts_by_gender['F'] > counts_by_gender['M'], 'F', 'M'))
genders


# Add firstname column
salaries['firstname'] = salaries['Employee Name'].str.split().str[0]
salaries


# Merge salaries and genders
salaries_with_gender = salaries.merge(genders[['gender']], on='firstname', how='left')
salaries_with_gender


pd.concat([
    salaries_with_gender.groupby('gender')['Total Pay'].describe().T,
    salaries_with_gender['Total Pay'].describe().rename('All')
], axis=1)


n_female = np.count_nonzero(salaries_with_gender['gender'] == 'F')
n_female

4075


# Observed statistic
female_median = salaries_with_gender.loc[salaries_with_gender['gender'] == 'F']['Total Pay'].median()

# Simulate 1000 samples of size n_female from the population
medians = np.array([])
for _ in np.arange(1000):
    median = salaries_with_gender.sample(n_female)['Total Pay'].median()
    medians = np.append(medians, median)

medians[:10]

array([70393., 71017., 71715., 71917., 72007., 73564., 70067., 71809.,
       70490., 71005.])


title='Median salary of randomly chosen groups from population'
pd.Series(medians).plot(kind='hist', density=True, ec='w', title=title);
plt.axvline(x=female_median, color='red')
plt.legend(['Observed Median Salary of Female Employees', 'Median Salaries of Random Groups']);


salaries_with_gender[salaries_with_gender['gender'].isnull()]


# Proportion of employees whose names aren't in SSA dataset
salaries_with_gender['gender'].isnull().mean()

0.03863546211820706


# Description of total pay by joined vs. not joined
(
    salaries_with_gender
    .assign(joined=salaries_with_gender['gender'].notnull())
    .groupby('joined')['Total Pay']
    .describe()
    .T
)


nonjoins = salaries_with_gender.loc[salaries_with_gender['gender'].isnull()]

title = 'Distribution of Salaries'
nonjoins['Total Pay'].plot(kind='hist', bins=np.arange(0, 320000, 10000), alpha=0.5, density=True, sharex=True)
salaries_with_gender['Total Pay'].plot(kind='hist', bins=np.arange(0, 320000, 10000), alpha=0.5, density=True, sharex=True, title=title)
plt.legend(['Not in SSA','All']);


names


names['firstname']

0          Emily
1         Hannah
2        Madison
3         Ashley
4          Sarah
          ...   
32025     Zyheem
32026      Zykel
32027     Zyking
32028        Zyn
32029      Zyran
Name: firstname, Length: 1399746, dtype: object


names.iloc[3]

firstname    Ashley
gender            F
count         17997
year           2000
Name: 3, dtype: object


pd.Series([10, 23, 45, 53, 87])

0    10
1    23
2    45
3    53
4    87
dtype: int64


pd.Series({'a': 10, 'b': 23, 'c': 45, 'd': 53, 'e': 87}, name='people')

a    10
b    23
c    45
d    53
e    87
Name: people, dtype: int64


pd.DataFrame?


row_data = [
    ['Granger, Hermione', 'A13245986', 1],
    ['Potter, Harry', 'A17645384', 1],
    ['Weasley, Ron', 'A32438694', 1],
    ['Longbottom, Neville', 'A52342436', 1]
]

row_data

[['Granger, Hermione', 'A13245986', 1],
 ['Potter, Harry', 'A17645384', 1],
 ['Weasley, Ron', 'A32438694', 1],
 ['Longbottom, Neville', 'A52342436', 1]]


pd.DataFrame(row_data)


pd.DataFrame(row_data, columns=['Name', 'PID', 'LVL'])


column_dict = {
    'Name': ['Granger, Hermione', 'Potter, Harry', 'Weasley, Ron', 'Longbottom, Neville'],
    'PID': ['A13245986', 'A17645384', 'A32438694', 'A52342436'],
    'LVL': [1, 1, 1, 1]
}
column_dict

{'Name': ['Granger, Hermione',
  'Potter, Harry',
  'Weasley, Ron',
  'Longbottom, Neville'],
 'PID': ['A13245986', 'A17645384', 'A32438694', 'A52342436'],
 'LVL': [1, 1, 1, 1]}


enrollments = pd.DataFrame(column_dict)
enrollments


enrollments.columns

Index(['Name', 'PID', 'LVL'], dtype='object')


enrollments.index

RangeIndex(start=0, stop=4, step=1)


A = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['A', 'B', 'C'])
A


A.sum(axis=0)

A    5
B    7
C    9
dtype: int64


A.sum(axis=1)

0     6
1    15
dtype: int64


enrollments


enrollments.get('Name')

0      Granger, Hermione
1          Potter, Harry
2           Weasley, Ron
3    Longbottom, Neville
Name: Name, dtype: object


# Doesn't error
enrollments.get('billy')


enrollments


# Returns a Series
enrollments['Name']

0      Granger, Hermione
1          Potter, Harry
2           Weasley, Ron
3    Longbottom, Neville
Name: Name, dtype: object


# Returns a DataFrame
enrollments[['Name', 'PID']]


# 🤔
enrollments[['Name']]


# KeyError
enrollments['billy']

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   3360             try:
-> 3361                 return self._engine.get_loc(casted_key)
   3362             except KeyError as err:

~/opt/anaconda3/lib/python3.9/site-packages/pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

~/opt/anaconda3/lib/python3.9/site-packages/pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'billy'

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
/var/folders/pd/w73mdrsj2836_7gp0brr2q7r0000gn/T/ipykernel_49367/2071819993.py in <module>
      1 # KeyError
----> 2 enrollments['billy']

~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/frame.py in __getitem__(self, key)
   3456             if self.columns.nlevels > 1:
   3457                 return self._getitem_multilevel(key)
-> 3458             indexer = self.columns.get_loc(key)
   3459             if is_integer(indexer):
   3460                 indexer = [indexer]

~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   3361                 return self._engine.get_loc(casted_key)
   3362             except KeyError as err:
-> 3363                 raise KeyError(key) from err
   3364 
   3365         if is_scalar(key) and isna(key) and not self.hasnans:

KeyError: 'billy'


enrollments.LVL

0    1
1    1
2    1
3    1
Name: LVL, dtype: int64


enrollments.mean

<bound method NDFrame._add_numeric_operations.<locals>.mean of                   Name        PID  LVL
0    Granger, Hermione  A13245986    1
1        Potter, Harry  A17645384    1
2         Weasley, Ron  A32438694    1
3  Longbottom, Neville  A52342436    1>


enrollments


enrollments.loc[3]

Name    Longbottom, Neville
PID               A52342436
LVL                       1
Name: 3, dtype: object


enrollments.loc[[1, 3]]


enrollments.loc[[3]]


enrollments


bool_arr = [
    False,  # Hermione
    True,   # Harry
    False,  # Ron
    True    # Neville
]

enrollments.loc[bool_arr]


enrollments


enrollments['Name'].str.contains('on')

0     True
1    False
2     True
3     True
Name: Name, dtype: bool


# Rows where Name includes 'on'
enrollments.loc[enrollments['Name'].str.contains('on')]


# Rows where the first letter of Name is between A and L
enrollments.loc[enrollments['Name'] < 'M']


enrollments[enrollments['Name'] < 'M']


enrollments.loc[enrollments['Name'] < 'M']['PID']

0    A13245986
3    A52342436
Name: PID, dtype: object


enrollments


enrollments.loc[enrollments['Name'] < 'M', 'PID']

0    A13245986
3    A52342436
Name: PID, dtype: object


enrollments.loc[enrollments['Name'] < 'M', ['PID']]


enrollments


enrollments.loc[2, 'LVL']

1


enrollments.loc[0:2, 'Name': 'PID']


enrollments


enrollments.iloc[2:4, 0:2]


other = enrollments.set_index('Name')
other


other.iloc[2]

PID    A32438694
LVL            1
Name: Weasley, Ron, dtype: object


other.loc[2]

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   3360             try:
-> 3361                 return self._engine.get_loc(casted_key)
   3362             except KeyError as err:

~/opt/anaconda3/lib/python3.9/site-packages/pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

~/opt/anaconda3/lib/python3.9/site-packages/pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 2

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
/var/folders/pd/w73mdrsj2836_7gp0brr2q7r0000gn/T/ipykernel_49367/776803957.py in <module>
----> 1 other.loc[2]

~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/indexing.py in __getitem__(self, key)
    929 
    930             maybe_callable = com.apply_if_callable(key, self.obj)
--> 931             return self._getitem_axis(maybe_callable, axis=axis)
    932 
    933     def _is_scalar_access(self, key: tuple):

~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/indexing.py in _getitem_axis(self, key, axis)
   1162         # fall thru to straight lookup
   1163         self._validate_key(key, axis)
-> 1164         return self._get_label(key, axis=axis)
   1165 
   1166     def _get_slice_axis(self, slice_obj: slice, axis: int):

~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/indexing.py in _get_label(self, label, axis)
   1111     def _get_label(self, label, axis: int):
   1112         # GH#5667 this will fail if the label is not present in the axis.
-> 1113         return self.obj.xs(label, axis=axis)
   1114 
   1115     def _handle_lowerdim_multi_index_axis0(self, tup: tuple):

~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/generic.py in xs(self, key, axis, level, drop_level)
   3774                 raise TypeError(f"Expected label or tuple of labels, got {key}") from e
   3775         else:
-> 3776             loc = index.get_loc(key)
   3777 
   3778             if isinstance(loc, np.ndarray):

~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   3361                 return self._engine.get_loc(casted_key)
   3362             except KeyError as err:
-> 3363                 raise KeyError(key) from err
   3364 
   3365         if is_scalar(key) and isna(key) and not self.hasnans:

KeyError: 2


names

...


jack = pd.DataFrame({1: ['fee', 'fi'], '1': ['fo', 'fum']})
jack


# jack[1]


# jack[[1]]


# jack['1']


# jack[[1,1]]


# jack.loc[1]


# jack.loc[jack[1] == 'fo']


# jack[1, ['1', 1]]


# jack.loc[1,1]

	Employee Name	Job Title	Base Pay	Overtime Pay	Other Pay	Benefits	Total Pay	Total Pay & Benefits	Year	Notes	Agency	Status
0	Michael Xxxx	Police Officer	117691.0	187290.0	13331.00	36380.0	318312.0	354692.0	2020	NaN	San Diego	FT
1	Gary Xxxx	Police Officer	117691.0	160062.0	42946.00	31795.0	320699.0	352494.0	2020	NaN	San Diego	FT
2	Eric Xxxx	Fire Engineer	35698.0	204462.0	69121.00	38362.0	309281.0	347643.0	2020	NaN	San Diego	PT
3	Gregg Xxxx	Retirement Administrator	305000.0	0.0	12814.00	24792.0	317814.0	342606.0	2020	NaN	San Diego	FT
4	Joseph Xxxx	Fire Battalion Chief	94451.0	157778.0	48151.00	42096.0	300380.0	342476.0	2020	NaN	San Diego	FT
...	...	...	...	...	...	...	...	...	...	...	...	...
12600	Elena Xxxx	Asst Eng-Civil	0.0	2.0	0.00	0.0	2.0	2.0	2020	NaN	San Diego	PT
12601	Gary Xxxx	Police Officer	0.0	2.0	0.00	0.0	2.0	2.0	2020	NaN	San Diego	PT
12602	Sara Xxxx	Asst Planner	0.0	1.0	0.00	0.0	1.0	1.0	2020	NaN	San Diego	PT
12603	Kevin Xxxx	Project Ofcr 1	0.0	1.0	0.00	0.0	1.0	1.0	2020	NaN	San Diego	PT
12604	Deedrick Xxxx	Utility Worker 2	0.0	0.0	1.00	0.0	1.0	1.0	2020	NaN	San Diego	PT

	Employee Name	Job Title	Base Pay	Overtime Pay	Other Pay	Benefits	Total Pay	Total Pay & Benefits	Year	Notes	Agency	Status
0	Michael Xxxx	Police Officer	117691.0	187290.0	13331.00	36380.0	318312.0	354692.0	2020	NaN	San Diego	FT
1	Gary Xxxx	Police Officer	117691.0	160062.0	42946.00	31795.0	320699.0	352494.0	2020	NaN	San Diego	FT
2	Eric Xxxx	Fire Engineer	35698.0	204462.0	69121.00	38362.0	309281.0	347643.0	2020	NaN	San Diego	PT
3	Gregg Xxxx	Retirement Administrator	305000.0	0.0	12814.00	24792.0	317814.0	342606.0	2020	NaN	San Diego	FT
4	Joseph Xxxx	Fire Battalion Chief	94451.0	157778.0	48151.00	42096.0	300380.0	342476.0	2020	NaN	San Diego	FT

	count	year
count	1.399746e+06	1.399746e+06
mean	1.459451e+02	1.996861e+03
std	1.194298e+03	1.542586e+01
min	5.000000e+00	1.964000e+03
25%	7.000000e+00	1.985000e+03
50%	1.100000e+01	1.999000e+03
75%	3.000000e+01	2.010000e+03
max	8.529100e+04	2.020000e+03

	Employee Name	Job Title	Base Pay	Overtime Pay	Other Pay	Benefits	Total Pay	Total Pay & Benefits	Year	Notes	Agency	Status	firstname
0	Michael Xxxx	Police Officer	117691.0	187290.0	13331.00	36380.0	318312.0	354692.0	2020	NaN	San Diego	FT	Michael
1	Gary Xxxx	Police Officer	117691.0	160062.0	42946.00	31795.0	320699.0	352494.0	2020	NaN	San Diego	FT	Gary
2	Eric Xxxx	Fire Engineer	35698.0	204462.0	69121.00	38362.0	309281.0	347643.0	2020	NaN	San Diego	PT	Eric
3	Gregg Xxxx	Retirement Administrator	305000.0	0.0	12814.00	24792.0	317814.0	342606.0	2020	NaN	San Diego	FT	Gregg
4	Joseph Xxxx	Fire Battalion Chief	94451.0	157778.0	48151.00	42096.0	300380.0	342476.0	2020	NaN	San Diego	FT	Joseph
...	...	...	...	...	...	...	...	...	...	...	...	...	...
12600	Elena Xxxx	Asst Eng-Civil	0.0	2.0	0.00	0.0	2.0	2.0	2020	NaN	San Diego	PT	Elena
12601	Gary Xxxx	Police Officer	0.0	2.0	0.00	0.0	2.0	2.0	2020	NaN	San Diego	PT	Gary
12602	Sara Xxxx	Asst Planner	0.0	1.0	0.00	0.0	1.0	1.0	2020	NaN	San Diego	PT	Sara
12603	Kevin Xxxx	Project Ofcr 1	0.0	1.0	0.00	0.0	1.0	1.0	2020	NaN	San Diego	PT	Kevin
12604	Deedrick Xxxx	Utility Worker 2	0.0	0.0	1.00	0.0	1.0	1.0	2020	NaN	San Diego	PT	Deedrick

	Employee Name	Job Title	Base Pay	Overtime Pay	Other Pay	Benefits	Total Pay	Total Pay & Benefits	Year	Notes	Agency	Status	firstname	gender
0	Michael Xxxx	Police Officer	117691.0	187290.0	13331.00	36380.0	318312.0	354692.0	2020	NaN	San Diego	FT	Michael	M
1	Gary Xxxx	Police Officer	117691.0	160062.0	42946.00	31795.0	320699.0	352494.0	2020	NaN	San Diego	FT	Gary	M
2	Eric Xxxx	Fire Engineer	35698.0	204462.0	69121.00	38362.0	309281.0	347643.0	2020	NaN	San Diego	PT	Eric	M
3	Gregg Xxxx	Retirement Administrator	305000.0	0.0	12814.00	24792.0	317814.0	342606.0	2020	NaN	San Diego	FT	Gregg	M
4	Joseph Xxxx	Fire Battalion Chief	94451.0	157778.0	48151.00	42096.0	300380.0	342476.0	2020	NaN	San Diego	FT	Joseph	M
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
12600	Elena Xxxx	Asst Eng-Civil	0.0	2.0	0.00	0.0	2.0	2.0	2020	NaN	San Diego	PT	Elena	F
12601	Gary Xxxx	Police Officer	0.0	2.0	0.00	0.0	2.0	2.0	2020	NaN	San Diego	PT	Gary	M
12602	Sara Xxxx	Asst Planner	0.0	1.0	0.00	0.0	1.0	1.0	2020	NaN	San Diego	PT	Sara	F
12603	Kevin Xxxx	Project Ofcr 1	0.0	1.0	0.00	0.0	1.0	1.0	2020	NaN	San Diego	PT	Kevin	M
12604	Deedrick Xxxx	Utility Worker 2	0.0	0.0	1.00	0.0	1.0	1.0	2020	NaN	San Diego	PT	Deedrick	M

	firstname	gender	count	year
0	Emily	F	25957	2000
1	Hannah	F	23084	2000
2	Madison	F	19968	2000
3	Ashley	F	17997	2000
4	Sarah	F	17706	2000
...	...	...	...	...
32025	Zyheem	M	5	2019
32026	Zykel	M	5	2019
32027	Zyking	M	5	2019
32028	Zyn	M	5	2019
32029	Zyran	M	5	2019

	firstname	gender	count	year
10875	Billy	F	8	2000
18042	Billy	M	670	2000
14851	Billy	F	6	2014
20007	Billy	M	287	2014
19912	Billy	M	281	2015
...	...	...	...	...
18364	Billy	M	208	2020
21031	Billy	M	458	2008
14044	Billy	F	6	2018
18949	Billy	M	262	2018
18836	Billy	M	245	2019

gender	F	M
firstname
Aaban	0.0	120.0
Aabha	46.0	0.0
Aabid	0.0	16.0
Aabidah	5.0	0.0
Aabir	0.0	10.0
...	...	...
Zyvion	0.0	5.0
Zyvon	0.0	7.0
Zyyanna	6.0	0.0
Zyyon	0.0	6.0
Zzyzx	0.0	10.0

	F	M	All
count	4075.000000	8043.000000	12605.000000
mean	63865.752883	81297.593808	75181.321618
std	43497.853002	51567.740425	49634.174460
min	1.000000	0.000000	0.000000
25%	33084.000000	44900.000000	41177.000000
50%	59975.000000	77579.000000	71354.000000
75%	89854.000000	117374.500000	106903.000000
max	295904.000000	320699.000000	320699.000000

	Employee Name	Job Title	Base Pay	Overtime Pay	Other Pay	Benefits	Total Pay	Total Pay & Benefits	Year	Notes	Agency	Status	firstname	gender
23	Almis Xxxx	Asst Chief Oper Ofcr	201773.0	0.0	54824.00	33373.0	256597.0	289970.0	2020	NaN	San Diego	FT	Almis	NaN
40	Devinda Xxxx	Fire Battalion Chief	71905.0	133385.0	19829.00	42743.0	225119.0	267862.0	2020	NaN	San Diego	PT	Devinda	NaN
113	Teophilson Xxxx	Police Officer	117691.0	69222.0	32135.00	26281.0	219048.0	245329.0	2020	NaN	San Diego	FT	Teophilson	NaN
153	Tevar Xxxx	Police Officer	91773.0	88703.0	26850.00	27868.0	207326.0	235194.0	2020	NaN	San Diego	FT	Tevar	NaN
154	Junar Xxxx	Police Officer	36112.0	72625.0	101272.00	25179.0	210009.0	235188.0	2020	NaN	San Diego	PT	Junar	NaN
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
12510	A Xxxx	Sanitation Driver 2	0.0	90.0	0.00	8.0	90.0	98.0	2020	NaN	San Diego	PT	A	NaN
12513	Ventice Xxxx	Plant Tech 3	0.0	90.0	0.00	5.0	90.0	95.0	2020	NaN	San Diego	PT	Ventice	NaN
12578	Anniessa Xxxx	Dispatcher 2	0.0	17.0	0.00	1.0	17.0	18.0	2020	NaN	San Diego	PT	Anniessa	NaN
12582	Navareto Xxxx	Plant Procs Cntrl Electrician	0.0	15.0	0.00	1.0	15.0	16.0	2020	NaN	San Diego	PT	Navareto	NaN
12598	Tien-Hsiang Xxxx	Plant Tech 2	0.0	4.0	0.00	0.0	4.0	4.0	2020	NaN	San Diego	PT	Tien-Hsiang	NaN

joined	False	True
count	487.000000	12118.000000
mean	68852.297741	75435.673378
std	47890.354528	49688.037713
min	4.000000	0.000000
25%	36723.500000	41348.250000
50%	63341.000000	71513.000000
75%	96587.000000	107389.250000
max	256597.000000	320699.000000

	0	1	2
0	Granger, Hermione	A13245986	1
1	Potter, Harry	A17645384	1
2	Weasley, Ron	A32438694	1
3	Longbottom, Neville	A52342436	1

	PID	LVL
Name
Granger, Hermione	A13245986	1
Potter, Harry	A17645384	1
Weasley, Ron	A32438694	1
Longbottom, Neville	A52342436	1

Lecture 2 – Pandas 🐼¶

DSC 80, Spring 2022¶

Announcements¶

Agenda¶

The data science lifecycle¶

Recap: City of San Diego salary data¶

Question: Does gender influence pay?¶

Social Security Administration baby names 👶¶

Exploring names¶

Data Modeling¶

Determining the most common gender for each name¶

Determining the most common gender for each name¶

Adding a 'gender' column to salaries¶

Predictions and Inference¶

Question: Does gender influence pay?¶

A hypothesis test¶

Running the hypothesis test¶

Even more questions...¶

Is our dataset representative of all San Diego employees?¶

How reliable is our join between salaries and names?¶

How reliable is our join between salaries and names?¶

How reliable is our join between salaries and names?¶

Introduction to pandas¶

pandas¶

pandas data structures¶

Importing pandas and related libraries¶

Series are "slices"¶

Initializing a Series¶

Initializing a DataFrame¶

Method 1: Using a list of rows¶

Method 2: Using a dictionary of columns¶

DataFrame index and column labels¶

Axis¶

DataFrame methods with axis¶

Selecting rows and columns using [] and loc¶

Throwback to babypandas 👶¶

Selecting columns with []¶

Selecting columns with attribute notation¶

Selecting rows with loc¶

Boolean sequence selection¶

Querying¶

Selecting columns and rows simultaneously¶

Selecting columns and rows simultaneously¶

Even more ways of selecting rows and columns¶

Don't forget iloc!¶

Discussion Question¶

More Practice¶

Summary, next time¶

Summary¶

Exploring `names`¶

Adding a `'gender'` column to `salaries`¶

How reliable is our join between `salaries` and `names`?¶

How reliable is our join between `salaries` and `names`?¶

How reliable is our join between `salaries` and `names`?¶

Introduction to `pandas`¶

`pandas`¶

`pandas` data structures¶

Importing `pandas` and related libraries¶

DataFrame methods with `axis`¶

Selecting rows and columns using `[]` and `loc`¶

Throwback to `babypandas` 👶¶

Selecting columns with `[]`¶

Selecting rows with `loc`¶

Don't forget `iloc`!¶