import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (10, 5)
import util
In this lecture, we'll work through a motivating example using San Diego employee salary data, then introduce pandas and how to select data with [] and loc.
Our dataset is downloaded from Transparent California.
salary_path = util.safe_download('https://transcal.s3.amazonaws.com/public/export/san-diego-2020.csv')
salaries = pd.read_csv(salary_path)
util.anonymize_names(salaries)
salaries
| | Employee Name | Job Title | Base Pay | Overtime Pay | Other Pay | Benefits | Total Pay | Total Pay & Benefits | Year | Notes | Agency | Status |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Michael Xxxx | Police Officer | 117691.0 | 187290.0 | 13331.00 | 36380.0 | 318312.0 | 354692.0 | 2020 | NaN | San Diego | FT |
| 1 | Gary Xxxx | Police Officer | 117691.0 | 160062.0 | 42946.00 | 31795.0 | 320699.0 | 352494.0 | 2020 | NaN | San Diego | FT |
| 2 | Eric Xxxx | Fire Engineer | 35698.0 | 204462.0 | 69121.00 | 38362.0 | 309281.0 | 347643.0 | 2020 | NaN | San Diego | PT |
| 3 | Gregg Xxxx | Retirement Administrator | 305000.0 | 0.0 | 12814.00 | 24792.0 | 317814.0 | 342606.0 | 2020 | NaN | San Diego | FT |
| 4 | Joseph Xxxx | Fire Battalion Chief | 94451.0 | 157778.0 | 48151.00 | 42096.0 | 300380.0 | 342476.0 | 2020 | NaN | San Diego | FT |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 12600 | Elena Xxxx | Asst Eng-Civil | 0.0 | 2.0 | 0.00 | 0.0 | 2.0 | 2.0 | 2020 | NaN | San Diego | PT |
| 12601 | Gary Xxxx | Police Officer | 0.0 | 2.0 | 0.00 | 0.0 | 2.0 | 2.0 | 2020 | NaN | San Diego | PT |
| 12602 | Sara Xxxx | Asst Planner | 0.0 | 1.0 | 0.00 | 0.0 | 1.0 | 1.0 | 2020 | NaN | San Diego | PT |
| 12603 | Kevin Xxxx | Project Ofcr 1 | 0.0 | 1.0 | 0.00 | 0.0 | 1.0 | 1.0 | 2020 | NaN | San Diego | PT |
| 12604 | Deedrick Xxxx | Utility Worker 2 | 0.0 | 0.0 | 1.00 | 0.0 | 1.0 | 1.0 | 2020 | NaN | San Diego | PT |
12605 rows × 12 columns
salaries.head()
| | Employee Name | Job Title | Base Pay | Overtime Pay | Other Pay | Benefits | Total Pay | Total Pay & Benefits | Year | Notes | Agency | Status |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Michael Xxxx | Police Officer | 117691.0 | 187290.0 | 13331.00 | 36380.0 | 318312.0 | 354692.0 | 2020 | NaN | San Diego | FT |
| 1 | Gary Xxxx | Police Officer | 117691.0 | 160062.0 | 42946.00 | 31795.0 | 320699.0 | 352494.0 | 2020 | NaN | San Diego | FT |
| 2 | Eric Xxxx | Fire Engineer | 35698.0 | 204462.0 | 69121.00 | 38362.0 | 309281.0 | 347643.0 | 2020 | NaN | San Diego | PT |
| 3 | Gregg Xxxx | Retirement Administrator | 305000.0 | 0.0 | 12814.00 | 24792.0 | 317814.0 | 342606.0 | 2020 | NaN | San Diego | FT |
| 4 | Joseph Xxxx | Fire Battalion Chief | 94451.0 | 157778.0 | 48151.00 | 42096.0 | 300380.0 | 342476.0 | 2020 | NaN | San Diego | FT |
We'll use the SSA's baby names dataset, together with the first names in the salaries dataset, to infer the gender of San Diego employees.
names_path = util.safe_download('https://www.ssa.gov/oact/babynames/names.zip')
import pathlib
dfs = []
for path in pathlib.Path('data/names/').glob('*.txt'):
    year = int(str(path)[14:18])  # e.g. 'data/names/yob2000.txt' -> 2000
    if year >= 1964:
        df = pd.read_csv(path, names=['firstname', 'gender', 'count']).assign(year=year)
        dfs.append(df)
names = pd.concat(dfs)
names
| | firstname | gender | count | year |
|---|---|---|---|---|
| 0 | Emily | F | 25957 | 2000 |
| 1 | Hannah | F | 23084 | 2000 |
| 2 | Madison | F | 19968 | 2000 |
| 3 | Ashley | F | 17997 | 2000 |
| 4 | Sarah | F | 17706 | 2000 |
| ... | ... | ... | ... | ... |
| 32025 | Zyheem | M | 5 | 2019 |
| 32026 | Zykel | M | 5 | 2019 |
| 32027 | Zyking | M | 5 | 2019 |
| 32028 | Zyn | M | 5 | 2019 |
| 32029 | Zyran | M | 5 | 2019 |
1399746 rows × 4 columns
We began compiling the baby name list in 1997, with names dating back to 1880. At the time of a child’s birth, parents supply the name to us when applying for a child’s Social Security card, thus making Social Security America’s source for the most popular baby names. Please share this with your friends and family—and help us spread the word on social media. - Social Security’s Top Baby Names for 2020
Let's explore names. The only values of 'gender' in names are 'M' and 'F'.
names.head()
| | firstname | gender | count | year |
|---|---|---|---|---|
| 0 | Emily | F | 25957 | 2000 |
| 1 | Hannah | F | 23084 | 2000 |
| 2 | Madison | F | 19968 | 2000 |
| 3 | Ashley | F | 17997 | 2000 |
| 4 | Sarah | F | 17706 | 2000 |
# Get the count of each unique value in the 'gender' column
names['gender'].value_counts()
F 838442 M 561304 Name: gender, dtype: int64
# Look at a single name
names[names['firstname'] == 'Billy']
| | firstname | gender | count | year |
|---|---|---|---|---|
| 10875 | Billy | F | 8 | 2000 |
| 18042 | Billy | M | 670 | 2000 |
| 14851 | Billy | F | 6 | 2014 |
| 20007 | Billy | M | 287 | 2014 |
| 19912 | Billy | M | 281 | 2015 |
| ... | ... | ... | ... | ... |
| 18364 | Billy | M | 208 | 2020 |
| 21031 | Billy | M | 458 | 2008 |
| 14044 | Billy | F | 6 | 2018 |
| 18949 | Billy | M | 262 | 2018 |
| 18836 | Billy | M | 245 | 2019 |
104 rows × 4 columns
# Look at various summary statistics
names.describe()
| | count | year |
|---|---|---|
| count | 1.399746e+06 | 1.399746e+06 |
| mean | 1.459451e+02 | 1.996861e+03 |
| std | 1.194298e+03 | 1.542586e+01 |
| min | 5.000000e+00 | 1.964000e+03 |
| 25% | 7.000000e+00 | 1.985000e+03 |
| 50% | 1.100000e+01 | 1.999000e+03 |
| 75% | 3.000000e+01 | 2.010000e+03 |
| max | 8.529100e+04 | 2.020000e+03 |

Let's create a DataFrame, indexed by 'firstname', that describes the total number of 'F' and 'M' babies in names for each unique 'firstname'.
counts_by_gender = (
    names
    .groupby(['firstname', 'gender'])
    .sum()
    .reset_index()
    .pivot(index='firstname', columns='gender', values='count')
    .fillna(0)
)
counts_by_gender
| firstname | F | M |
|---|---|---|
| Aaban | 0.0 | 120.0 |
| Aabha | 46.0 | 0.0 |
| Aabid | 0.0 | 16.0 |
| Aabidah | 5.0 | 0.0 |
| Aabir | 0.0 | 10.0 |
| ... | ... | ... |
| Zyvion | 0.0 | 5.0 |
| Zyvon | 0.0 | 7.0 |
| Zyyanna | 6.0 | 0.0 |
| Zyyon | 0.0 | 6.0 |
| Zzyzx | 0.0 | 10.0 |
91360 rows × 2 columns
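As an aside, the same table can be built in one step with pivot_table; a quick sketch, assuming names is defined as above (the dtypes may differ slightly from the groupby version):
# Equivalent construction using pivot_table (a sketch)
counts_by_gender_alt = names.pivot_table(
    index='firstname', columns='gender', values='count', aggfunc='sum', fill_value=0
)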
counts_by_gender['F'] > counts_by_gender['M']
firstname
Aaban False
Aabha True
Aabid False
Aabidah True
Aabir False
...
Zyvion False
Zyvon False
Zyyanna True
Zyyon False
Zzyzx False
Length: 91360, dtype: bool
genders = counts_by_gender.assign(gender=np.where(counts_by_gender['F'] > counts_by_gender['M'], 'F', 'M'))
genders
| firstname | F | M | gender |
|---|---|---|---|
| Aaban | 0.0 | 120.0 | M |
| Aabha | 46.0 | 0.0 | F |
| Aabid | 0.0 | 16.0 | M |
| Aabidah | 5.0 | 0.0 | F |
| Aabir | 0.0 | 10.0 | M |
| ... | ... | ... | ... |
| Zyvion | 0.0 | 5.0 | M |
| Zyvon | 0.0 | 7.0 | M |
| Zyyanna | 6.0 | 0.0 | F |
| Zyyon | 0.0 | 6.0 | M |
| Zzyzx | 0.0 | 10.0 | M |
91360 rows × 3 columns
Adding a 'gender' column to salaries
This involves two steps:
- Extract the first name from each 'Employee Name'.
- Merge salaries and genders on first name.
# Add firstname column
salaries['firstname'] = salaries['Employee Name'].str.split().str[0]
salaries
| | Employee Name | Job Title | Base Pay | Overtime Pay | Other Pay | Benefits | Total Pay | Total Pay & Benefits | Year | Notes | Agency | Status | firstname |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Michael Xxxx | Police Officer | 117691.0 | 187290.0 | 13331.00 | 36380.0 | 318312.0 | 354692.0 | 2020 | NaN | San Diego | FT | Michael |
| 1 | Gary Xxxx | Police Officer | 117691.0 | 160062.0 | 42946.00 | 31795.0 | 320699.0 | 352494.0 | 2020 | NaN | San Diego | FT | Gary |
| 2 | Eric Xxxx | Fire Engineer | 35698.0 | 204462.0 | 69121.00 | 38362.0 | 309281.0 | 347643.0 | 2020 | NaN | San Diego | PT | Eric |
| 3 | Gregg Xxxx | Retirement Administrator | 305000.0 | 0.0 | 12814.00 | 24792.0 | 317814.0 | 342606.0 | 2020 | NaN | San Diego | FT | Gregg |
| 4 | Joseph Xxxx | Fire Battalion Chief | 94451.0 | 157778.0 | 48151.00 | 42096.0 | 300380.0 | 342476.0 | 2020 | NaN | San Diego | FT | Joseph |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 12600 | Elena Xxxx | Asst Eng-Civil | 0.0 | 2.0 | 0.00 | 0.0 | 2.0 | 2.0 | 2020 | NaN | San Diego | PT | Elena |
| 12601 | Gary Xxxx | Police Officer | 0.0 | 2.0 | 0.00 | 0.0 | 2.0 | 2.0 | 2020 | NaN | San Diego | PT | Gary |
| 12602 | Sara Xxxx | Asst Planner | 0.0 | 1.0 | 0.00 | 0.0 | 1.0 | 1.0 | 2020 | NaN | San Diego | PT | Sara |
| 12603 | Kevin Xxxx | Project Ofcr 1 | 0.0 | 1.0 | 0.00 | 0.0 | 1.0 | 1.0 | 2020 | NaN | San Diego | PT | Kevin |
| 12604 | Deedrick Xxxx | Utility Worker 2 | 0.0 | 0.0 | 1.00 | 0.0 | 1.0 | 1.0 | 2020 | NaN | San Diego | PT | Deedrick |
12605 rows × 13 columns
# Merge salaries and genders
salaries_with_gender = salaries.merge(genders[['gender']], on='firstname', how='left')
salaries_with_gender
| | Employee Name | Job Title | Base Pay | Overtime Pay | Other Pay | Benefits | Total Pay | Total Pay & Benefits | Year | Notes | Agency | Status | firstname | gender |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Michael Xxxx | Police Officer | 117691.0 | 187290.0 | 13331.00 | 36380.0 | 318312.0 | 354692.0 | 2020 | NaN | San Diego | FT | Michael | M |
| 1 | Gary Xxxx | Police Officer | 117691.0 | 160062.0 | 42946.00 | 31795.0 | 320699.0 | 352494.0 | 2020 | NaN | San Diego | FT | Gary | M |
| 2 | Eric Xxxx | Fire Engineer | 35698.0 | 204462.0 | 69121.00 | 38362.0 | 309281.0 | 347643.0 | 2020 | NaN | San Diego | PT | Eric | M |
| 3 | Gregg Xxxx | Retirement Administrator | 305000.0 | 0.0 | 12814.00 | 24792.0 | 317814.0 | 342606.0 | 2020 | NaN | San Diego | FT | Gregg | M |
| 4 | Joseph Xxxx | Fire Battalion Chief | 94451.0 | 157778.0 | 48151.00 | 42096.0 | 300380.0 | 342476.0 | 2020 | NaN | San Diego | FT | Joseph | M |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 12600 | Elena Xxxx | Asst Eng-Civil | 0.0 | 2.0 | 0.00 | 0.0 | 2.0 | 2.0 | 2020 | NaN | San Diego | PT | Elena | F |
| 12601 | Gary Xxxx | Police Officer | 0.0 | 2.0 | 0.00 | 0.0 | 2.0 | 2.0 | 2020 | NaN | San Diego | PT | Gary | M |
| 12602 | Sara Xxxx | Asst Planner | 0.0 | 1.0 | 0.00 | 0.0 | 1.0 | 1.0 | 2020 | NaN | San Diego | PT | Sara | F |
| 12603 | Kevin Xxxx | Project Ofcr 1 | 0.0 | 1.0 | 0.00 | 0.0 | 1.0 | 1.0 | 2020 | NaN | San Diego | PT | Kevin | M |
| 12604 | Deedrick Xxxx | Utility Worker 2 | 0.0 | 0.0 | 1.00 | 0.0 | 1.0 | 1.0 | 2020 | NaN | San Diego | PT | Deedrick | M |
12605 rows × 14 columns
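As a side note, merge accepts a validate argument that raises an error if the merge keys aren't as expected; a sketch assuming each first name appears exactly once in genders:
# 'm:1' asserts a many-to-one merge: firstname may repeat in salaries but not in genders (a sketch)
salaries.merge(genders[['gender']], on='firstname', how='left', validate='m:1')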

Do male and female employees earn similar total pay? This was our original question. Let's find out!
pd.concat([
    salaries_with_gender.groupby('gender')['Total Pay'].describe().T,
    salaries_with_gender['Total Pay'].describe().rename('All')
], axis=1)
| | F | M | All |
|---|---|---|---|
| count | 4075.000000 | 8043.000000 | 12605.000000 |
| mean | 63865.752883 | 81297.593808 | 75181.321618 |
| std | 43497.853002 | 51567.740425 | 49634.174460 |
| min | 1.000000 | 0.000000 | 0.000000 |
| 25% | 33084.000000 | 44900.000000 | 41177.000000 |
| 50% | 59975.000000 | 77579.000000 | 71354.000000 |
| 75% | 89854.000000 | 117374.500000 | 106903.000000 |
| max | 295904.000000 | 320699.000000 | 320699.000000 |
n_female = np.count_nonzero(salaries_with_gender['gender'] == 'F')
n_female
4075
Strategy:
- Compute the observed median 'Total Pay' of employees inferred to be female.
- Repeatedly draw random groups of n_female employees from salaries_with_gender and compute their median salary.
- Compare the observed median to the distribution of simulated medians.
# Observed statistic
female_median = salaries_with_gender.loc[salaries_with_gender['gender'] == 'F']['Total Pay'].median()
# Simulate 1000 samples of size n_female from the population
medians = np.array([])
for _ in np.arange(1000):
    median = salaries_with_gender.sample(n_female)['Total Pay'].median()
    medians = np.append(medians, median)
medians[:10]
array([70393., 71017., 71715., 71917., 72007., 73564., 70067., 71809.,
70490., 71005.])
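Equivalently, the simulated medians can be collected with a list comprehension; a minimal sketch, assuming salaries_with_gender and n_female from above:
# Collect 1000 simulated medians in one expression (a sketch)
medians_alt = np.array([
    salaries_with_gender.sample(n_female)['Total Pay'].median()
    for _ in range(1000)
])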
title='Median salary of randomly chosen groups from population'
pd.Series(medians).plot(kind='hist', density=True, ec='w', title=title);
plt.axvline(x=female_median, color='red')
plt.legend(['Observed Median Salary of Female Employees', 'Median Salaries of Random Groups']);

While trying to answer one question, many more popped up.
- Which employees did we lose when we merged salaries and names?
- Is 'Total Pay' the right column to compare? It is not the most relevant column, as it may include reimbursements that are separate from take-home pay (e.g. gas for driving a car).

Which employees did we lose when we merged salaries and names?
salaries_with_gender[salaries_with_gender['gender'].isnull()]
| | Employee Name | Job Title | Base Pay | Overtime Pay | Other Pay | Benefits | Total Pay | Total Pay & Benefits | Year | Notes | Agency | Status | firstname | gender |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 23 | Almis Xxxx | Asst Chief Oper Ofcr | 201773.0 | 0.0 | 54824.00 | 33373.0 | 256597.0 | 289970.0 | 2020 | NaN | San Diego | FT | Almis | NaN |
| 40 | Devinda Xxxx | Fire Battalion Chief | 71905.0 | 133385.0 | 19829.00 | 42743.0 | 225119.0 | 267862.0 | 2020 | NaN | San Diego | PT | Devinda | NaN |
| 113 | Teophilson Xxxx | Police Officer | 117691.0 | 69222.0 | 32135.00 | 26281.0 | 219048.0 | 245329.0 | 2020 | NaN | San Diego | FT | Teophilson | NaN |
| 153 | Tevar Xxxx | Police Officer | 91773.0 | 88703.0 | 26850.00 | 27868.0 | 207326.0 | 235194.0 | 2020 | NaN | San Diego | FT | Tevar | NaN |
| 154 | Junar Xxxx | Police Officer | 36112.0 | 72625.0 | 101272.00 | 25179.0 | 210009.0 | 235188.0 | 2020 | NaN | San Diego | PT | Junar | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 12510 | A Xxxx | Sanitation Driver 2 | 0.0 | 90.0 | 0.00 | 8.0 | 90.0 | 98.0 | 2020 | NaN | San Diego | PT | A | NaN |
| 12513 | Ventice Xxxx | Plant Tech 3 | 0.0 | 90.0 | 0.00 | 5.0 | 90.0 | 95.0 | 2020 | NaN | San Diego | PT | Ventice | NaN |
| 12578 | Anniessa Xxxx | Dispatcher 2 | 0.0 | 17.0 | 0.00 | 1.0 | 17.0 | 18.0 | 2020 | NaN | San Diego | PT | Anniessa | NaN |
| 12582 | Navareto Xxxx | Plant Procs Cntrl Electrician | 0.0 | 15.0 | 0.00 | 1.0 | 15.0 | 16.0 | 2020 | NaN | San Diego | PT | Navareto | NaN |
| 12598 | Tien-Hsiang Xxxx | Plant Tech 2 | 0.0 | 4.0 | 0.00 | 0.0 | 4.0 | 4.0 | 2020 | NaN | San Diego | PT | Tien-Hsiang | NaN |
487 rows × 14 columns
# Proportion of employees whose names aren't in SSA dataset
salaries_with_gender['gender'].isnull().mean()
0.03863546211820706
# Description of total pay by joined vs. not joined
(
    salaries_with_gender
    .assign(joined=salaries_with_gender['gender'].notnull())
    .groupby('joined')['Total Pay']
    .describe()
    .T
)
| joined | False | True |
|---|---|---|
| count | 487.000000 | 12118.000000 |
| mean | 68852.297741 | 75435.673378 |
| std | 47890.354528 | 49688.037713 |
| min | 4.000000 | 0.000000 |
| 25% | 36723.500000 | 41348.250000 |
| 50% | 63341.000000 | 71513.000000 |
| 75% | 96587.000000 | 107389.250000 |
| max | 256597.000000 | 320699.000000 |
nonjoins = salaries_with_gender.loc[salaries_with_gender['gender'].isnull()]
title = 'Distribution of Salaries'
nonjoins['Total Pay'].plot(kind='hist', bins=np.arange(0, 320000, 10000), alpha=0.5, density=True, sharex=True)
salaries_with_gender['Total Pay'].plot(kind='hist', bins=np.arange(0, 320000, 10000), alpha=0.5, density=True, sharex=True, title=title)
plt.legend(['Not in SSA','All']);
Lesson: joining to another dataset can bias your sample!
pandas

pandas is the Python library for tabular data manipulation. Before pandas was developed, the standard data science workflow involved using multiple languages (Python, R, Java) in a single project. Wes McKinney, the original author of pandas, wanted a library that would allow everything to be done in Python.

pandas data structures
There are three key data structures at the core of pandas:
- DataFrame: a two-dimensional table.
- Series: a single column of a table (one-dimensional, with labels).
- Index: the sequence of row (or column) labels.
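To make these concrete, here is a minimal sketch constructing each one (the names ser and frame are just for illustration):
# A tiny sketch of the three core objects
ser = pd.Series([1, 2, 3])          # Series: one labeled column of values
frame = pd.DataFrame({'a': ser})    # DataFrame: a table whose columns are Series sharing one index
frame.index                         # Index: the row labels, here RangeIndex(start=0, stop=3, step=1)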

Importing pandas and related libraries
We've already run this at the top of the notebook, so we won't repeat it here. But pandas is almost always imported in conjunction with numpy:
import pandas as pd
import numpy as np
pd.Series
A pd.Series object is a one-dimensional sequence with labels (an index). Both the columns and the rows of a DataFrame are Series.
names
| | firstname | gender | count | year |
|---|---|---|---|---|
| 0 | Emily | F | 25957 | 2000 |
| 1 | Hannah | F | 23084 | 2000 |
| 2 | Madison | F | 19968 | 2000 |
| 3 | Ashley | F | 17997 | 2000 |
| 4 | Sarah | F | 17706 | 2000 |
| ... | ... | ... | ... | ... |
| 32025 | Zyheem | M | 5 | 2019 |
| 32026 | Zykel | M | 5 | 2019 |
| 32027 | Zyking | M | 5 | 2019 |
| 32028 | Zyn | M | 5 | 2019 |
| 32029 | Zyran | M | 5 | 2019 |
1399746 rows × 4 columns
names['firstname']
0 Emily
1 Hannah
2 Madison
3 Ashley
4 Sarah
...
32025 Zyheem
32026 Zykel
32027 Zyking
32028 Zyn
32029 Zyran
Name: firstname, Length: 1399746, dtype: object
names.iloc[3]
firstname Ashley gender F count 17997 year 2000 Name: 3, dtype: object
pd.Series can also create a new Series from scratch, given either an existing sequence or a dictionary (whose keys become the index). Use the index and name arguments to change the default labels and name.
pd.Series([10, 23, 45, 53, 87])
0 10 1 23 2 45 3 53 4 87 dtype: int64
pd.Series({'a': 10, 'b': 23, 'c': 45, 'd': 53, 'e': 87}, name='people')
a 10 b 23 c 45 d 53 e 87 Name: people, dtype: int64
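For instance, a quick sketch of the index argument with made-up labels:
# Custom labels instead of the default 0, 1, 2, ... (a sketch)
pd.Series([10, 23, 45], index=['x', 'y', 'z'], name='points')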
The pd.DataFrame constructor initializes a DataFrame using either a sequence of rows or a dictionary of columns, and accepts optional arguments like index, columns, and dtype. To see the documentation for a function f, run f? in a cell (e.g. pd.DataFrame?).
pd.DataFrame?
row_data = [
    ['Granger, Hermione', 'A13245986', 1],
    ['Potter, Harry', 'A17645384', 1],
    ['Weasley, Ron', 'A32438694', 1],
    ['Longbottom, Neville', 'A52342436', 1]
]
row_data
[['Granger, Hermione', 'A13245986', 1], ['Potter, Harry', 'A17645384', 1], ['Weasley, Ron', 'A32438694', 1], ['Longbottom, Neville', 'A52342436', 1]]
By default, the column names are set to 0, 1, 2, ...
pd.DataFrame(row_data)
| | 0 | 1 | 2 |
|---|---|---|---|
| 0 | Granger, Hermione | A13245986 | 1 |
| 1 | Potter, Harry | A17645384 | 1 |
| 2 | Weasley, Ron | A32438694 | 1 |
| 3 | Longbottom, Neville | A52342436 | 1 |
You can change that using the columns argument.
pd.DataFrame(row_data, columns=['Name', 'PID', 'LVL'])
| | Name | PID | LVL |
|---|---|---|---|
| 0 | Granger, Hermione | A13245986 | 1 |
| 1 | Potter, Harry | A17645384 | 1 |
| 2 | Weasley, Ron | A32438694 | 1 |
| 3 | Longbottom, Neville | A52342436 | 1 |
column_dict = {
    'Name': ['Granger, Hermione', 'Potter, Harry', 'Weasley, Ron', 'Longbottom, Neville'],
    'PID': ['A13245986', 'A17645384', 'A32438694', 'A52342436'],
    'LVL': [1, 1, 1, 1]
}
column_dict
{'Name': ['Granger, Hermione',
'Potter, Harry',
'Weasley, Ron',
'Longbottom, Neville'],
'PID': ['A13245986', 'A17645384', 'A32438694', 'A52342436'],
'LVL': [1, 1, 1, 1]}
enrollments = pd.DataFrame(column_dict)
enrollments
| | Name | PID | LVL |
|---|---|---|---|
| 0 | Granger, Hermione | A13245986 | 1 |
| 1 | Potter, Harry | A17645384 | 1 |
| 2 | Weasley, Ron | A32438694 | 1 |
| 3 | Longbottom, Neville | A52342436 | 1 |
The column labels of a DataFrame are stored in its columns attribute, and the row labels are stored in its index attribute.
enrollments.columns
Index(['Name', 'PID', 'LVL'], dtype='object')
enrollments.index
RangeIndex(start=0, stop=4, step=1)
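A couple of other commonly used attributes, shown as a quick sketch:
# Other handy DataFrame attributes (a sketch)
enrollments.shape     # (number of rows, number of columns) -> (4, 3)
enrollments.dtypes    # the data type stored in each column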

axis
Many DataFrame methods, like sum, accept an axis argument.
A = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['A', 'B', 'C'])
A
| | A | B | C |
|---|---|---|---|
| 0 | 1 | 2 | 3 |
| 1 | 4 | 5 | 6 |
If we specify axis=0, A.sum will "compress" along axis 0, and keep the column labels intact.
A.sum(axis=0)
A 5 B 7 C 9 dtype: int64
If we specify axis=1, A.sum will "compress" along axis 1, and keep the row labels (index) intact.
A.sum(axis=1)
0 6 1 15 dtype: int64
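Other aggregation methods accept axis in the same way; a quick sketch using the same A:
# axis works the same way for other aggregations (a sketch)
A.mean(axis=0)   # column means: A -> 2.5, B -> 3.5, C -> 4.5
A.max(axis=1)    # row-wise maxima: 3 and 6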

[] and loc
babypandas 👶
In babypandas, you accessed columns using the .get method. .get also works in pandas, but it is not idiomatic – people don't usually use it.
enrollments
| | Name | PID | LVL |
|---|---|---|---|
| 0 | Granger, Hermione | A13245986 | 1 |
| 1 | Potter, Harry | A17645384 | 1 |
| 2 | Weasley, Ron | A32438694 | 1 |
| 3 | Longbottom, Neville | A52342436 | 1 |
enrollments.get('Name')
0 Granger, Hermione 1 Potter, Harry 2 Weasley, Ron 3 Longbottom, Neville Name: Name, dtype: object
# Doesn't error
enrollments.get('billy')
[]
The standard way to access columns in pandas is by using the [] operator.
enrollments
| | Name | PID | LVL |
|---|---|---|---|
| 0 | Granger, Hermione | A13245986 | 1 |
| 1 | Potter, Harry | A17645384 | 1 |
| 2 | Weasley, Ron | A32438694 | 1 |
| 3 | Longbottom, Neville | A52342436 | 1 |
# Returns a Series
enrollments['Name']
0 Granger, Hermione 1 Potter, Harry 2 Weasley, Ron 3 Longbottom, Neville Name: Name, dtype: object
# Returns a DataFrame
enrollments[['Name', 'PID']]
| | Name | PID |
|---|---|---|
| 0 | Granger, Hermione | A13245986 |
| 1 | Potter, Harry | A17645384 |
| 2 | Weasley, Ron | A32438694 |
| 3 | Longbottom, Neville | A52342436 |
# 🤔
enrollments[['Name']]
| | Name |
|---|---|
| 0 | Granger, Hermione |
| 1 | Potter, Harry |
| 2 | Weasley, Ron |
| 3 | Longbottom, Neville |
# KeyError
enrollments['billy']
--------------------------------------------------------------------------- KeyError Traceback (most recent call last) ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance) 3360 try: -> 3361 return self._engine.get_loc(casted_key) 3362 except KeyError as err: ~/opt/anaconda3/lib/python3.9/site-packages/pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc() ~/opt/anaconda3/lib/python3.9/site-packages/pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc() pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item() pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item() KeyError: 'billy' The above exception was the direct cause of the following exception: KeyError Traceback (most recent call last) /var/folders/pd/w73mdrsj2836_7gp0brr2q7r0000gn/T/ipykernel_49367/2071819993.py in <module> 1 # KeyError ----> 2 enrollments['billy'] ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/frame.py in __getitem__(self, key) 3456 if self.columns.nlevels > 1: 3457 return self._getitem_multilevel(key) -> 3458 indexer = self.columns.get_loc(key) 3459 if is_integer(indexer): 3460 indexer = [indexer] ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance) 3361 return self._engine.get_loc(casted_key) 3362 except KeyError as err: -> 3363 raise KeyError(key) from err 3364 3365 if is_scalar(key) and isna(key) and not self.hasnans: KeyError: 'billy'
You can also access a column as an attribute, using .<column name>. This doesn't work if the column name contains spaces or clashes with an existing DataFrame attribute or method – what would .mean refer to?
enrollments.LVL
0 1 1 1 2 1 3 1 Name: LVL, dtype: int64
enrollments.mean
<bound method NDFrame._add_numeric_operations.<locals>.mean of Name PID LVL 0 Granger, Hermione A13245986 1 1 Potter, Harry A17645384 1 2 Weasley, Ron A32438694 1 3 Longbottom, Neville A52342436 1>
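To see why attribute access is limited, here's a sketch with a hypothetical DataFrame whose column name contains a space:
# pay is a made-up example DataFrame (a sketch)
pay = pd.DataFrame({'Total Pay': [1, 2]})
pay['Total Pay']     # works
# pay.Total Pay      # SyntaxError – attribute names can't contain spaces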
loc
If df is a DataFrame, then:
- df.loc[idx] returns the row whose index label is idx, as a Series.
- df.loc[idx_list] returns a DataFrame containing the rows whose index labels are in idx_list.
enrollments
| | Name | PID | LVL |
|---|---|---|---|
| 0 | Granger, Hermione | A13245986 | 1 |
| 1 | Potter, Harry | A17645384 | 1 |
| 2 | Weasley, Ron | A32438694 | 1 |
| 3 | Longbottom, Neville | A52342436 | 1 |
enrollments.loc[3]
Name Longbottom, Neville PID A52342436 LVL 1 Name: 3, dtype: object
enrollments.loc[[1, 3]]
| | Name | PID | LVL |
|---|---|---|---|
| 1 | Potter, Harry | A17645384 | 1 |
| 3 | Longbottom, Neville | A52342436 | 1 |
enrollments.loc[[3]]
| | Name | PID | LVL |
|---|---|---|---|
| 3 | Longbottom, Neville | A52342436 | 1 |
The loc operator also supports Boolean sequences (lists, arrays, Series) as input; it keeps only the rows where the sequence is True.
enrollments
| | Name | PID | LVL |
|---|---|---|---|
| 0 | Granger, Hermione | A13245986 | 1 |
| 1 | Potter, Harry | A17645384 | 1 |
| 2 | Weasley, Ron | A32438694 | 1 |
| 3 | Longbottom, Neville | A52342436 | 1 |
bool_arr = [
    False,  # Hermione
    True,   # Harry
    False,  # Ron
    True    # Neville
]
enrollments.loc[bool_arr]
| | Name | PID | LVL |
|---|---|---|---|
| 1 | Potter, Harry | A17645384 | 1 |
| 3 | Longbottom, Neville | A52342436 | 1 |
We often build a Boolean sequence from a condition and pass it to the loc operator to query a DataFrame.
enrollments
| | Name | PID | LVL |
|---|---|---|---|
| 0 | Granger, Hermione | A13245986 | 1 |
| 1 | Potter, Harry | A17645384 | 1 |
| 2 | Weasley, Ron | A32438694 | 1 |
| 3 | Longbottom, Neville | A52342436 | 1 |
enrollments['Name'].str.contains('on')
0 True 1 False 2 True 3 True Name: Name, dtype: bool
# Rows where Name includes 'on'
enrollments.loc[enrollments['Name'].str.contains('on')]
| | Name | PID | LVL |
|---|---|---|---|
| 0 | Granger, Hermione | A13245986 | 1 |
| 2 | Weasley, Ron | A32438694 | 1 |
| 3 | Longbottom, Neville | A52342436 | 1 |
# Rows where the first letter of Name is between A and L
enrollments.loc[enrollments['Name'] < 'M']
| | Name | PID | LVL |
|---|---|---|---|
| 0 | Granger, Hermione | A13245986 | 1 |
| 3 | Longbottom, Neville | A52342436 | 1 |
When using a Boolean sequence, e.g. enrollments['Name'] < 'M', loc is not strictly necessary:
enrollments[enrollments['Name'] < 'M']
| | Name | PID | LVL |
|---|---|---|---|
| 0 | Granger, Hermione | A13245986 | 1 |
| 3 | Longbottom, Neville | A52342436 | 1 |
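Multiple conditions can be combined with & and |, wrapping each condition in parentheses; a quick sketch using enrollments:
# Rows where Name starts with a letter before 'M' AND LVL is 1 (a sketch)
enrollments.loc[(enrollments['Name'] < 'M') & (enrollments['LVL'] == 1)]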
So far, we used [] to select columns and loc to select rows.
enrollments.loc[enrollments['Name'] < 'M']['PID']
0 A13245986 3 A52342436 Name: PID, dtype: object
loc can also be used to select both rows and columns. The general pattern is:
df.loc[<row selector>, <column selector>]
Examples:
- df.loc[idx_list, col_list] returns a DataFrame containing the rows in idx_list and the columns in col_list.
- df.loc[bool_arr, col_list] returns a DataFrame containing the rows for which bool_arr is True and the columns in col_list.
- If : is used as the first input, all rows are kept. If : is used as the second input, all columns are kept.
enrollments
| | Name | PID | LVL |
|---|---|---|---|
| 0 | Granger, Hermione | A13245986 | 1 |
| 1 | Potter, Harry | A17645384 | 1 |
| 2 | Weasley, Ron | A32438694 | 1 |
| 3 | Longbottom, Neville | A52342436 | 1 |
enrollments.loc[enrollments['Name'] < 'M', 'PID']
0 A13245986 3 A52342436 Name: PID, dtype: object
enrollments.loc[enrollments['Name'] < 'M', ['PID']]
| | PID |
|---|---|
| 0 | A13245986 |
| 3 | A52342436 |
In df.loc[<row selection>, <column selection>]:
- Each selection can be a single label, a list of labels, a Boolean sequence, or a slice using the : syntax (e.g. 0:2, 'Name':'PID'). Note that loc slices include both endpoints.
There are many, many more options – see the pandas documentation for more.
enrollments
| | Name | PID | LVL |
|---|---|---|---|
| 0 | Granger, Hermione | A13245986 | 1 |
| 1 | Potter, Harry | A17645384 | 1 |
| 2 | Weasley, Ron | A32438694 | 1 |
| 3 | Longbottom, Neville | A52342436 | 1 |
enrollments.loc[2, 'LVL']
1
enrollments.loc[0:2, 'Name': 'PID']
| | Name | PID |
|---|---|---|
| 0 | Granger, Hermione | A13245986 |
| 1 | Potter, Harry | A17645384 |
| 2 | Weasley, Ron | A32438694 |
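A quick sketch of the : shorthand mentioned above, which keeps all rows or all columns:
enrollments.loc[:, ['Name', 'LVL']]   # all rows, two columns (a sketch)
enrollments.loc[[0, 2], :]            # two rows, all columns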
iloc
iloc stands for "integer location". iloc is like loc, but it selects rows and columns based on integer positions only.
enrollments
| | Name | PID | LVL |
|---|---|---|---|
| 0 | Granger, Hermione | A13245986 | 1 |
| 1 | Potter, Harry | A17645384 | 1 |
| 2 | Weasley, Ron | A32438694 | 1 |
| 3 | Longbottom, Neville | A52342436 | 1 |
enrollments.iloc[2:4, 0:2]
| | Name | PID |
|---|---|---|
| 2 | Weasley, Ron | A32438694 |
| 3 | Longbottom, Neville | A52342436 |
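iloc also accepts single (and negative) positions; a quick sketch:
enrollments.iloc[-1]      # the last row, as a Series (a sketch)
enrollments.iloc[0, 1]    # first row, second column: 'A13245986'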
other = enrollments.set_index('Name')
other
| Name | PID | LVL |
|---|---|---|
| Granger, Hermione | A13245986 | 1 |
| Potter, Harry | A17645384 | 1 |
| Weasley, Ron | A32438694 | 1 |
| Longbottom, Neville | A52342436 | 1 |
other.iloc[2]
PID A32438694 LVL 1 Name: Weasley, Ron, dtype: object
other.loc[2]
--------------------------------------------------------------------------- KeyError Traceback (most recent call last) ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance) 3360 try: -> 3361 return self._engine.get_loc(casted_key) 3362 except KeyError as err: ~/opt/anaconda3/lib/python3.9/site-packages/pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc() ~/opt/anaconda3/lib/python3.9/site-packages/pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc() pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item() pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item() KeyError: 2 The above exception was the direct cause of the following exception: KeyError Traceback (most recent call last) /var/folders/pd/w73mdrsj2836_7gp0brr2q7r0000gn/T/ipykernel_49367/776803957.py in <module> ----> 1 other.loc[2] ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/indexing.py in __getitem__(self, key) 929 930 maybe_callable = com.apply_if_callable(key, self.obj) --> 931 return self._getitem_axis(maybe_callable, axis=axis) 932 933 def _is_scalar_access(self, key: tuple): ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/indexing.py in _getitem_axis(self, key, axis) 1162 # fall thru to straight lookup 1163 self._validate_key(key, axis) -> 1164 return self._get_label(key, axis=axis) 1165 1166 def _get_slice_axis(self, slice_obj: slice, axis: int): ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/indexing.py in _get_label(self, label, axis) 1111 def _get_label(self, label, axis: int): 1112 # GH#5667 this will fail if the label is not present in the axis. -> 1113 return self.obj.xs(label, axis=axis) 1114 1115 def _handle_lowerdim_multi_index_axis0(self, tup: tuple): ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/generic.py in xs(self, key, axis, level, drop_level) 3774 raise TypeError(f"Expected label or tuple of labels, got {key}") from e 3775 else: -> 3776 loc = index.get_loc(key) 3777 3778 if isinstance(loc, np.ndarray): ~/opt/anaconda3/lib/python3.9/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance) 3361 return self._engine.get_loc(casted_key) 3362 except KeyError as err: -> 3363 raise KeyError(key) from err 3364 3365 if is_scalar(key) and isna(key) and not self.hasnans: KeyError: 2
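With a string index, loc expects the label itself; a quick sketch using other:
other.loc['Weasley, Ron']  # label-based lookup works where other.loc[2] did not (a sketch)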
Let's return to the names DataFrame.
names
| | firstname | gender | count | year |
|---|---|---|---|---|
| 0 | Emily | F | 25957 | 2000 |
| 1 | Hannah | F | 23084 | 2000 |
| 2 | Madison | F | 19968 | 2000 |
| 3 | Ashley | F | 17997 | 2000 |
| 4 | Sarah | F | 17706 | 2000 |
| ... | ... | ... | ... | ... |
| 32025 | Zyheem | M | 5 | 2019 |
| 32026 | Zykel | M | 5 | 2019 |
| 32027 | Zyking | M | 5 | 2019 |
| 32028 | Zyn | M | 5 | 2019 |
| 32029 | Zyran | M | 5 | 2019 |
1399746 rows × 4 columns
Question: How many babies were born with the name 'Billy' and gender 'M'?
...
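One possible approach, shown as a sketch (try it yourself before peeking):
# Filter on both conditions, then sum the counts (a sketch)
names.loc[(names['firstname'] == 'Billy') & (names['gender'] == 'M'), 'count'].sum()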
Consider the DataFrame below.
jack = pd.DataFrame({1: ['fee', 'fi'], '1': ['fo', 'fum']})
jack
For each of the following pieces of code, predict what the output will be. Then, uncomment the line of code and see for yourself.
# jack[1]
# jack[[1]]
# jack['1']
# jack[[1,1]]
# jack.loc[1]
# jack.loc[jack[1] == 'fo']
# jack[1, ['1', 1]]
# jack.loc[1,1]
Summary:
- pandas is the library for tabular data manipulation in Python.
- There are three key data structures in pandas: DataFrame, Series, and Index.
- Refer to the pandas documentation for tips.