import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (10, 5)
Agenda:

  * loc and iloc
  * pandas and numpy

loc and iloc

enrollments = pd.DataFrame({
'Name': ['Granger, Hermione', 'Potter, Harry', 'Weasley, Ron', 'Longbottom, Neville'],
'PID': ['A13245986', 'A17645384', 'A32438694', 'A52342436'],
'LVL': [1, 1, 1, 1]
})
enrollments
Name | PID | LVL | |
---|---|---|---|
0 | Granger, Hermione | A13245986 | 1 |
1 | Potter, Harry | A17645384 | 1 |
2 | Weasley, Ron | A32438694 | 1 |
3 | Longbottom, Neville | A52342436 | 1 |
So far, we have used [] to select columns and loc to select rows.
enrollments.loc[enrollments['Name'] < 'M']['PID']
0 A13245986 3 A52342436 Name: PID, dtype: object
loc can also be used to select both rows and columns. The general pattern is:

df.loc[<row selector>, <column selector>]

Examples:

  * df.loc[idx_list, col_list] returns a DataFrame containing the rows in idx_list and columns in col_list.
  * df.loc[bool_arr, col_list] returns a DataFrame containing the rows for which bool_arr is True and columns in col_list.
  * If : is used as the first input, all rows are kept. If : is used as the second input, all columns are kept.

enrollments
Name | PID | LVL | |
---|---|---|---|
0 | Granger, Hermione | A13245986 | 1 |
1 | Potter, Harry | A17645384 | 1 |
2 | Weasley, Ron | A32438694 | 1 |
3 | Longbottom, Neville | A52342436 | 1 |
enrollments.loc[enrollments['Name'] < 'M', 'PID']
0 A13245986 3 A52342436 Name: PID, dtype: object
enrollments.loc[enrollments['Name'] < 'M', ['PID']]
PID | |
---|---|
0 | A13245986 |
3 | A52342436 |
In df.loc[<row selection>, <column selection>], both selections can also be slices, written with the : syntax (e.g. 0:2 or 'Name':'PID'). Note that, unlike Python list slices, these label-based slices include both endpoints. There are many, many more ways to select – see the pandas documentation for more.
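To illustrate the : selector described above, here is a quick sketch using the same enrollments DataFrame (rebuilt here so the snippet is self-contained):

```python
import pandas as pd

enrollments = pd.DataFrame({
    'Name': ['Granger, Hermione', 'Potter, Harry', 'Weasley, Ron', 'Longbottom, Neville'],
    'PID': ['A13245986', 'A17645384', 'A32438694', 'A52342436'],
    'LVL': [1, 1, 1, 1]
})

# ':' as the row selector keeps every row; here we keep only the 'Name' column.
names_only = enrollments.loc[:, ['Name']]

# ':' as the column selector keeps every column.
all_columns = enrollments.loc[enrollments['LVL'] == 1, :]
```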
enrollments
Name | PID | LVL | |
---|---|---|---|
0 | Granger, Hermione | A13245986 | 1 |
1 | Potter, Harry | A17645384 | 1 |
2 | Weasley, Ron | A32438694 | 1 |
3 | Longbottom, Neville | A52342436 | 1 |
enrollments.loc[2, 'LVL']
1
enrollments.loc[0:2, 'Name':'PID']
Name | PID | |
---|---|---|
0 | Granger, Hermione | A13245986 |
1 | Potter, Harry | A17645384 |
2 | Weasley, Ron | A32438694 |
iloc!

  * iloc stands for "integer location".
  * iloc is like loc, but it selects rows and columns based off of integer positions only.

enrollments
Name | PID | LVL | |
---|---|---|---|
0 | Granger, Hermione | A13245986 | 1 |
1 | Potter, Harry | A17645384 | 1 |
2 | Weasley, Ron | A32438694 | 1 |
3 | Longbottom, Neville | A52342436 | 1 |
enrollments.iloc[2:4, 0:2]
Name | PID | |
---|---|---|
2 | Weasley, Ron | A32438694 |
3 | Longbottom, Neville | A52342436 |
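One subtlety worth demonstrating: .loc slices include both endpoints, while .iloc slices, like Python list slices, exclude the right endpoint. A small sketch with the same enrollments data:

```python
import pandas as pd

enrollments = pd.DataFrame({
    'Name': ['Granger, Hermione', 'Potter, Harry', 'Weasley, Ron', 'Longbottom, Neville'],
    'PID': ['A13245986', 'A17645384', 'A32438694', 'A52342436'],
    'LVL': [1, 1, 1, 1]
})

# .loc[0:2] is label-based: rows labeled 0, 1, AND 2 are kept.
with_loc = enrollments.loc[0:2]

# .iloc[0:2] is position-based: only positions 0 and 1 are kept.
with_iloc = enrollments.iloc[0:2]

print(len(with_loc), len(with_iloc))  # 3 2
```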
other = enrollments.set_index('Name')
other
PID | LVL | |
---|---|---|
Name | ||
Granger, Hermione | A13245986 | 1 |
Potter, Harry | A17645384 | 1 |
Weasley, Ron | A32438694 | 1 |
Longbottom, Neville | A52342436 | 1 |
other.iloc[2]
PID A32438694 LVL 1 Name: Weasley, Ron, dtype: object
other.loc[2]
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
...
KeyError: 2
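Since 'Name' is now the index, loc needs a label, not a position. A minimal sketch (with a one-row version of the DataFrame above):

```python
import pandas as pd

other = pd.DataFrame(
    {'PID': ['A32438694'], 'LVL': [1]},
    index=pd.Index(['Weasley, Ron'], name='Name'),
)

# loc selects by label – here, a string – while iloc still works by position.
by_label = other.loc['Weasley, Ron']
by_position = other.iloc[0]
```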
Consider the DataFrame below.
jack = pd.DataFrame({1: ['fee', 'fi'], '1': ['fo', 'fum']})
jack
1 | 1 | |
---|---|---|
0 | fee | fo |
1 | fi | fum |
For each of the following pieces of code, predict what the output will be. Then, uncomment the line of code and see for yourself.
# jack[1]
# jack[[1]]
# jack['1']
# jack[[1, 1]]
# jack.loc[1]
# jack.loc[jack[1] == 'fo']
# jack[1, ['1', 1]]
# jack.loc[1, 1]
numpy

  * numpy's main object is the array.
  * numpy is fast because numpy arrays are stored more efficiently in memory than, say, Python lists, and because operations on them are vectorized.

pandas is built upon numpy

  * A Series in pandas is a numpy array with an index, and each column of a DataFrame is a numpy array.
  * Operations in pandas are fast because they use numpy's implementations.
  * To access the numpy array underlying a Series, use the to_numpy method.
  * Warning: to_numpy returns a view of the original object, not a copy! Read more in the course notes.
  * .values is a soon-to-be-deprecated version of .to_numpy().

arr = np.array([4, 2, 9, 15, -1])
arr
array([ 4, 2, 9, 15, -1])
ser = pd.Series(arr, index='a b c d e'.split(' '))
ser
a 4 b 2 c 9 d 15 e -1 dtype: int64
conv = ser.to_numpy()
conv
array([ 4, 2, 9, 15, -1])
conv[2] = 100
conv
array([ 4, 2, 100, 15, -1])
ser
a 4 b 2 c 100 d 15 e -1 dtype: int64
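If you want an array you can safely mutate, one option (assuming a reasonably recent pandas) is to pass copy=True to to_numpy, which guarantees an independent array:

```python
import numpy as np
import pandas as pd

arr = np.array([4, 2, 9, 15, -1])
ser = pd.Series(arr, index='a b c d e'.split(' '))

# copy=True guarantees an independent array, so mutating it
# does not touch the original Series.
safe = ser.to_numpy(copy=True)
safe[2] = 100

print(ser['c'])  # still 9
```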
for-loops

  * for-loops are slow when processing large datasets.
  * To see how much faster numpy arithmetic is than using a for-loop, let's compute the distances between the origin $(0, 0)$ and 2000 random points $(x, y)$ in $\mathbb{R}^2$:
    - once using a for-loop, and
    - once using vectorized arithmetic (through numpy).
  * np.random.random(N) returns an array containing N numbers selected uniformly at random from the interval $[0, 1)$.

N = 2000
x_arr = np.random.random(N)
y_arr = np.random.random(N)
coordinates = pd.DataFrame({"x": x_arr, "y": y_arr})
coordinates.head()
x | y | |
---|---|---|
0 | 0.155402 | 0.346735 |
1 | 0.922027 | 0.746329 |
2 | 0.855037 | 0.630200 |
3 | 0.755672 | 0.849689 |
4 | 0.518506 | 0.177432 |
Next, let's define a function that takes in a DataFrame like the one above and returns the distances between each point and the origin, using a for-loop.
def distances(df):
hyp_list = []
for i in df.index:
dist = (df.loc[i, 'x'] ** 2 + df.loc[i, 'y'] ** 2) ** 0.5
hyp_list.append(dist)
return hyp_list
The %timeit magic command can repeatedly run any snippet of code and give us its average runtime.
%timeit distances(coordinates)
20.3 ms ± 197 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
Now, using a vectorized approach:
%timeit (coordinates['x'] ** 2 + coordinates['y'] ** 2) ** 0.5
136 µs ± 752 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
Note that "µs" refers to microseconds, which are one-millionth of a second, whereas "ms" refers to milliseconds, which are one-thousandth of a second.
Takeaway: avoid for-loops whenever possible!
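As an aside, numpy even has a dedicated function for this particular computation: np.hypot computes the elementwise square root of the sum of squares, so it can replace the manual formula above. A sketch:

```python
import numpy as np
import pandas as pd

N = 2000
coordinates = pd.DataFrame({'x': np.random.random(N), 'y': np.random.random(N)})

# np.hypot(x, y) computes sqrt(x**2 + y**2) elementwise,
# fully vectorized, just like the arithmetic expression above.
dists = np.hypot(coordinates['x'], coordinates['y'])
```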
pandas data types

  * A data type in pandas refers to the type of the values in a column.
  * pandas tries to guess the correct data type for a given DataFrame, and is often wrong.

Pandas dtype | Python type | NumPy type | SQL type | Usage |
---|---|---|---|---|
int64 | int | int_, int8,...,int64, uint8,...,uint64 | INT, BIGINT | Integer numbers |
float64 | float | float_, float16, float32, float64 | FLOAT | Floating point numbers |
bool | bool | bool_ | BOOL | True/False values |
datetime64 | NA | datetime64[ns] | DATETIME | Date and time values |
timedelta[ns] | NA | NA | NA | Differences between two datetimes |
category | NA | NA | ENUM | Finite list of text values |
object | str | string, unicode | NA | Text |
object | NA | object | NA | Mixed types |
This article details how pandas stores different data types under the hood.
The underlying numpy array(s)

  * The dtypes attribute (of both Series and DataFrames) describes the data type of each column.
  * The to_numpy method, when used on a Series, returns an array in which all values are of the data type specified by dtypes.
  * The to_numpy method, when used on a DataFrame, returns a multi-dimensional array of type object, unless all columns in the DataFrame are homogeneous.

# Read in file
elections_fp = os.path.join('data', 'elections.csv')
elections = pd.read_csv(elections_fp)
elections.head()
Candidate | Party | % | Year | Result | |
---|---|---|---|---|---|
0 | Reagan | Republican | 50.7 | 1980 | win |
1 | Carter | Democratic | 41.0 | 1980 | loss |
2 | Anderson | Independent | 6.6 | 1980 | loss |
3 | Reagan | Republican | 58.8 | 1984 | win |
4 | Mondale | Democratic | 37.6 | 1984 | loss |
elections.dtypes
Candidate object Party object % float64 Year int64 Result object dtype: object
elections['Year'].dtypes
dtype('int64')
elections['Year'].to_numpy().dtype
dtype('int64')
elections.to_numpy()
array([['Reagan', 'Republican', 50.7, 1980, 'win'], ['Carter', 'Democratic', 41.0, 1980, 'loss'], ['Anderson', 'Independent', 6.6, 1980, 'loss'], ['Reagan', 'Republican', 58.8, 1984, 'win'], ['Mondale', 'Democratic', 37.6, 1984, 'loss'], ['Bush', 'Republican', 53.4, 1988, 'win'], ['Dukakis', 'Democratic', 45.6, 1988, 'loss'], ['Clinton', 'Democratic', 43.0, 1992, 'win'], ['Bush', 'Republican', 37.4, 1992, 'loss'], ['Perot', 'Independent', 18.9, 1992, 'loss'], ['Clinton', 'Democratic', 49.2, 1996, 'win'], ['Dole', 'Republican', 40.7, 1996, 'loss'], ['Perot', 'Independent', 8.4, 1996, 'loss'], ['Gore', 'Democratic', 48.4, 2000, 'loss'], ['Bush', 'Republican', 47.9, 2000, 'win'], ['Kerry', 'Democratic', 48.3, 2004, 'loss'], ['Bush', 'Republican', 50.7, 2004, 'win'], ['Obama', 'Democratic', 52.9, 2008, 'win'], ['McCain', 'Republican', 45.7, 2008, 'loss'], ['Obama', 'Democratic', 51.1, 2012, 'win'], ['Romney', 'Republican', 47.2, 2012, 'loss'], ['Clinton', 'Democratic', 48.2, 2016, 'loss'], ['Trump', 'Republican', 46.1, 2016, 'win'], ['Biden', 'Democratic', 51.3, 2020, 'win'], ['Trump', 'Republican', 46.9, 2020, 'loss']], dtype=object)
What do you think is happening here?
elections['Year'] ** 7
0 -9176136658659852288 1 -9176136658659852288 2 -9176136658659852288 3 -8884621300430012416 4 -8884621300430012416 5 -6382217169766957056 6 -6382217169766957056 7 -1459841187188834304 8 -1459841187188834304 9 -1459841187188834304 10 6093279273289170944 11 6093279273289170944 12 6093279273289170944 13 -1957127470578663424 14 -1957127470578663424 15 -6950134928041361408 16 -6950134928041361408 17 -8669840355176218624 18 -8669840355176218624 19 -6898610338587885568 20 -6898610338587885568 21 -1417070411446747136 22 -1417070411446747136 23 7995905371925528576 24 7995905371925528576 Name: Year, dtype: int64
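What's happening is 64-bit integer overflow: int64 arithmetic silently wraps around modulo 2**64, while plain Python ints have arbitrary precision. A sketch (the wrapped value for 1980 matches the first entry in the output above):

```python
import numpy as np

arr = np.array([1980], dtype=np.int64)
wrapped = int((arr ** 7)[0])   # silently overflows the 64-bit range
exact = 1980 ** 7              # Python ints never overflow

# The wrapped result is congruent to the exact one modulo 2**64.
print(wrapped)                           # -9176136658659852288
print(wrapped % 2**64 == exact % 2**64)  # True
```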
numpy and pandas don't always make the same decisions!

numpy prefers homogeneous data types to optimize memory and read/write speed. This leads to type coercion. Notice that the array created below contains only strings, even though there was an int in the argument list.
np.array(['a', 1])
array(['a', '1'], dtype='<U21')
On the other hand, pandas likes correctness and ease-of-use. The Series created below is of type object, which preserves the original data types in the argument list.
pd.Series(['a', 1])
0 a 1 1 dtype: object
pd.Series(['a', 1]).values
array(['a', 1], dtype=object)
You can specify the data type of an array when initializing it by using the dtype argument.
np.array(['a', 1], dtype=object)
array(['a', 1], dtype=object)
pandas does make some trade-offs for efficiency, however. For instance, a Series consisting of both ints and floats is coerced to the float64 data type.
pd.Series([1, 1.0])
0 1.0 1 1.0 dtype: float64
You can change the data type of a Series using the .astype Series method.
ser = pd.Series(['1', '2', '3', '4'])
ser
0 1 1 2 2 3 3 4 dtype: object
ser.astype(int)
0 1 1 2 2 3 3 4 dtype: int64
ser.astype(float)
0 1.0 1 2.0 2 3.0 3 4.0 dtype: float64
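When a column mixes numeric strings with junk, .astype(float) raises a ValueError. A gentler alternative is pd.to_numeric with errors='coerce', which converts what it can and fills the rest with NaN. A sketch on a hypothetical Series:

```python
import pandas as pd

messy = pd.Series(['1', '2', 'Aggregate', '4'])

# 'Aggregate' can't be parsed as a number, so it becomes NaN
# instead of raising an error.
nums = pd.to_numeric(messy, errors='coerce')

print(nums.isna().sum())  # 1
```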
As we just discovered:

  * numpy is optimized for speed and memory consumption.
  * pandas makes implementation choices that favor correctness and ease-of-use, sometimes at the cost of memory.

To demonstrate, let's create a large array in which all of the entries are non-negative numbers less than 255, meaning that they can be represented with 8 bits (i.e. as np.uint8s, where the "u" stands for "unsigned").

data = np.random.choice(np.arange(8), 10 ** 6)
When we tell pandas to use a dtype of uint8, the size of the resulting DataFrame is under a megabyte.
ser1 = pd.Series(data, dtype=np.uint8).to_frame()
ser1.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1000000 entries, 0 to 999999 Data columns (total 1 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 0 1000000 non-null uint8 dtypes: uint8(1) memory usage: 976.7 KB
But by default, even though the numbers are only 8-bit, pandas uses the int64 dtype, and the resulting DataFrame is over 7 megabytes large.
ser2 = pd.Series(data).to_frame()
ser2.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1000000 entries, 0 to 999999 Data columns (total 1 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 0 1000000 non-null int64 dtypes: int64(1) memory usage: 7.6 MB
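You can also downcast after the fact with astype; memory_usage reports the sizes in bytes, confirming the 8x difference (8 bytes per int64 entry vs. 1 byte per uint8 entry). A sketch:

```python
import numpy as np
import pandas as pd

data = np.random.choice(np.arange(8), 10 ** 6)

big = pd.Series(data, dtype=np.int64)  # 8 bytes per entry
small = big.astype(np.uint8)           # 1 byte per entry

print(big.memory_usage(index=False))    # 8000000
print(small.memory_usage(index=False))  # 1000000
```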
  * The head/tail methods return the first/last few rows (the default is 5).
  * The shape attribute returns the number of rows and columns.
  * The size attribute returns the number of entries.

elections.head()
Candidate | Party | % | Year | Result | |
---|---|---|---|---|---|
0 | Reagan | Republican | 50.7 | 1980 | win |
1 | Carter | Democratic | 41.0 | 1980 | loss |
2 | Anderson | Independent | 6.6 | 1980 | loss |
3 | Reagan | Republican | 58.8 | 1984 | win |
4 | Mondale | Democratic | 37.6 | 1984 | loss |
elections.shape
(25, 5)
elections.size
125
Method Name | Description |
---|---|
count | Returns the number of non-null entries in the Series |
unique | Returns the unique values in the Series |
nunique | Returns the number of unique values in the Series |
value_counts | Returns a Series of counts of unique values |
describe | Returns a Series of descriptive stats of values |
elections.head()
Candidate | Party | % | Year | Result | |
---|---|---|---|---|---|
0 | Reagan | Republican | 50.7 | 1980 | win |
1 | Carter | Democratic | 41.0 | 1980 | loss |
2 | Anderson | Independent | 6.6 | 1980 | loss |
3 | Reagan | Republican | 58.8 | 1984 | win |
4 | Mondale | Democratic | 37.6 | 1984 | loss |
# Distinct candidates
elections['Candidate'].unique()
array(['Reagan', 'Carter', 'Anderson', 'Mondale', 'Bush', 'Dukakis', 'Clinton', 'Perot', 'Dole', 'Gore', 'Kerry', 'Obama', 'McCain', 'Romney', 'Trump', 'Biden'], dtype=object)
# Number of distinct candidates
elections['Candidate'].nunique()
16
# Total number of candidates
elections['Candidate'].count()
25
# 🤔
republicans = elections.loc[elections['Party'] == 'Republican']
republicans['Result'].value_counts()
win 6 loss 5 Name: Result, dtype: int64
republicans['%'].describe()
count 11.000000 mean 47.772727 std 5.798464 min 37.400000 25% 45.900000 50% 47.200000 75% 50.700000 max 58.800000 Name: %, dtype: float64
Many DataFrame methods accept the axis keyword argument:

  * axis=0: the method is applied across the rows (i.e. to each column).
  * axis=1: the method is applied across the columns (i.e. to each row).
  * The default is axis=0.

elections.head()
Candidate | Party | % | Year | Result | |
---|---|---|---|---|---|
0 | Reagan | Republican | 50.7 | 1980 | win |
1 | Carter | Democratic | 41.0 | 1980 | loss |
2 | Anderson | Independent | 6.6 | 1980 | loss |
3 | Reagan | Republican | 58.8 | 1984 | win |
4 | Mondale | Democratic | 37.6 | 1984 | loss |
elections[['%', 'Year']].mean()
% 43.04 Year 1998.72 dtype: float64
The following piece of code works, but is meaningless. Why?
elections[['%', 'Year']].mean(axis=1)
0 1015.35 1 1010.50 2 993.30 3 1021.40 4 1010.80 5 1020.70 6 1016.80 7 1017.50 8 1014.70 9 1005.45 10 1022.60 11 1018.35 12 1002.20 13 1024.20 14 1023.95 15 1026.15 16 1027.35 17 1030.45 18 1026.85 19 1031.55 20 1029.60 21 1032.10 22 1031.05 23 1035.65 24 1033.45 dtype: float64
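The mean above mixes percentages with years, which is why it's meaningless. axis=1 is useful when each row's columns share units; a sketch on hypothetical homework scores:

```python
import pandas as pd

scores = pd.DataFrame({'hw1': [90, 80], 'hw2': [70, 100]})

# Each row holds the same kind of quantity (a homework score),
# so a row-wise mean is meaningful: it's each student's average.
row_means = scores.mean(axis=1)

print(list(row_means))  # [80.0, 90.0]
```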
Method Name | Description |
---|---|
sort_values | Returns a DataFrame sorted by the specified column |
drop_duplicates | Returns a DataFrame with duplicate values dropped |
describe | Returns descriptive stats of the DataFrame |
elections.sort_values('%', ascending=False).head(4)
Candidate | Party | % | Year | Result | |
---|---|---|---|---|---|
3 | Reagan | Republican | 58.8 | 1984 | win |
5 | Bush | Republican | 53.4 | 1988 | win |
17 | Obama | Democratic | 52.9 | 2008 | win |
23 | Biden | Democratic | 51.3 | 2020 | win |
# By default, drop_duplicates looks for duplicate entire rows, which elections does not have
elections.drop_duplicates(subset=['Candidate'])
Candidate | Party | % | Year | Result | |
---|---|---|---|---|---|
0 | Reagan | Republican | 50.7 | 1980 | win |
1 | Carter | Democratic | 41.0 | 1980 | loss |
2 | Anderson | Independent | 6.6 | 1980 | loss |
4 | Mondale | Democratic | 37.6 | 1984 | loss |
5 | Bush | Republican | 53.4 | 1988 | win |
6 | Dukakis | Democratic | 45.6 | 1988 | loss |
7 | Clinton | Democratic | 43.0 | 1992 | win |
9 | Perot | Independent | 18.9 | 1992 | loss |
11 | Dole | Republican | 40.7 | 1996 | loss |
13 | Gore | Democratic | 48.4 | 2000 | loss |
15 | Kerry | Democratic | 48.3 | 2004 | loss |
17 | Obama | Democratic | 52.9 | 2008 | win |
18 | McCain | Republican | 45.7 | 2008 | loss |
20 | Romney | Republican | 47.2 | 2012 | loss |
22 | Trump | Republican | 46.1 | 2016 | win |
23 | Biden | Democratic | 51.3 | 2020 | win |
elections.describe()
% | Year | |
---|---|---|
count | 25.000000 | 25.000000 |
mean | 43.040000 | 1998.720000 |
std | 13.046136 | 12.843675 |
min | 6.600000 | 1980.000000 |
25% | 41.000000 | 1988.000000 |
50% | 47.200000 | 1996.000000 |
75% | 50.700000 | 2008.000000 |
max | 58.800000 | 2020.000000 |
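sort_values also accepts a list of columns, with a matching list of ascending flags. A small sketch on hypothetical data:

```python
import pandas as pd

df = pd.DataFrame({'Party': ['R', 'D', 'R', 'D'],
                   '%': [50.7, 41.0, 58.8, 37.6]})

# Sort by Party (ascending), breaking ties by % in descending order.
out = df.sort_values(['Party', '%'], ascending=[True, False])

print(list(out['%']))  # [41.0, 37.6, 58.8, 50.7]
```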
  * To add a column to a DataFrame, use the assign method.
  * To add rows, use the append method.
  * Both assign and append return a copy of the DataFrame, which is a great feature!

As an aside, you should try your best to write chained pandas code, as follows:
(
elections
.assign(proportion_of_vote=(elections['%'] / 100))
.head()
)
Candidate | Party | % | Year | Result | proportion_of_vote | |
---|---|---|---|---|---|---|
0 | Reagan | Republican | 50.7 | 1980 | win | 0.507 |
1 | Carter | Democratic | 41.0 | 1980 | loss | 0.410 |
2 | Anderson | Independent | 6.6 | 1980 | loss | 0.066 |
3 | Reagan | Republican | 58.8 | 1984 | win | 0.588 |
4 | Mondale | Democratic | 37.6 | 1984 | loss | 0.376 |
You can chain together several steps at a time:
(
elections
.assign(proportion_of_vote=(elections['%'] / 100))
.assign(Result=elections['Result'].str.upper())
.head()
)
Candidate | Party | % | Year | Result | proportion_of_vote | |
---|---|---|---|---|---|---|
0 | Reagan | Republican | 50.7 | 1980 | WIN | 0.507 |
1 | Carter | Democratic | 41.0 | 1980 | LOSS | 0.410 |
2 | Anderson | Independent | 6.6 | 1980 | LOSS | 0.066 |
3 | Reagan | Republican | 58.8 | 1984 | WIN | 0.588 |
4 | Mondale | Democratic | 37.6 | 1984 | LOSS | 0.376 |
You can also use assign when the desired column name has spaces, by unpacking a dictionary of keyword arguments.
(
elections
.assign(**{'Proportion of Vote': elections['%'] / 100})
.head()
)
Candidate | Party | % | Year | Result | Proportion of Vote | |
---|---|---|---|---|---|---|
0 | Reagan | Republican | 50.7 | 1980 | win | 0.507 |
1 | Carter | Democratic | 41.0 | 1980 | loss | 0.410 |
2 | Anderson | Independent | 6.6 | 1980 | loss | 0.066 |
3 | Reagan | Republican | 58.8 | 1984 | win | 0.588 |
4 | Mondale | Democratic | 37.6 | 1984 | loss | 0.376 |
  * Warning: append has terrible time complexity! Avoid calling append in a loop.
  * You can also add and modify columns using loc or []. Unlike assign, this modifies the underlying DataFrame rather than a copy of it.

# By default, .copy() returns a deep copy of the object it is called on,
# meaning that if you change the copy the original remains unmodified.
mod_elec = elections.copy()
mod_elec.head()
Candidate | Party | % | Year | Result | |
---|---|---|---|---|---|
0 | Reagan | Republican | 50.7 | 1980 | win |
1 | Carter | Democratic | 41.0 | 1980 | loss |
2 | Anderson | Independent | 6.6 | 1980 | loss |
3 | Reagan | Republican | 58.8 | 1984 | win |
4 | Mondale | Democratic | 37.6 | 1984 | loss |
mod_elec['Proportion of Vote'] = mod_elec['%'] / 100
mod_elec.head()
Candidate | Party | % | Year | Result | Proportion of Vote | |
---|---|---|---|---|---|---|
0 | Reagan | Republican | 50.7 | 1980 | win | 0.507 |
1 | Carter | Democratic | 41.0 | 1980 | loss | 0.410 |
2 | Anderson | Independent | 6.6 | 1980 | loss | 0.066 |
3 | Reagan | Republican | 58.8 | 1984 | win | 0.588 |
4 | Mondale | Democratic | 37.6 | 1984 | loss | 0.376 |
mod_elec['Result'] = mod_elec['Result'].str.upper()
mod_elec.head()
Candidate | Party | % | Year | Result | Proportion of Vote | |
---|---|---|---|---|---|---|
0 | Reagan | Republican | 50.7 | 1980 | WIN | 0.507 |
1 | Carter | Democratic | 41.0 | 1980 | LOSS | 0.410 |
2 | Anderson | Independent | 6.6 | 1980 | LOSS | 0.066 |
3 | Reagan | Republican | 58.8 | 1984 | WIN | 0.588 |
4 | Mondale | Democratic | 37.6 | 1984 | LOSS | 0.376 |
# 🤔
mod_elec.loc[-1, :] = ['Carter', 'Democratic', 50.1, 1976, 'WIN', 0.501]
mod_elec.loc[-2, :] = ['Ford', 'Republican', 48.0, 1976, 'LOSS', 0.48]
mod_elec
Candidate | Party | % | Year | Result | Proportion of Vote | |
---|---|---|---|---|---|---|
0 | Reagan | Republican | 50.7 | 1980.0 | WIN | 0.507 |
1 | Carter | Democratic | 41.0 | 1980.0 | LOSS | 0.410 |
2 | Anderson | Independent | 6.6 | 1980.0 | LOSS | 0.066 |
3 | Reagan | Republican | 58.8 | 1984.0 | WIN | 0.588 |
4 | Mondale | Democratic | 37.6 | 1984.0 | LOSS | 0.376 |
5 | Bush | Republican | 53.4 | 1988.0 | WIN | 0.534 |
6 | Dukakis | Democratic | 45.6 | 1988.0 | LOSS | 0.456 |
7 | Clinton | Democratic | 43.0 | 1992.0 | WIN | 0.430 |
8 | Bush | Republican | 37.4 | 1992.0 | LOSS | 0.374 |
9 | Perot | Independent | 18.9 | 1992.0 | LOSS | 0.189 |
10 | Clinton | Democratic | 49.2 | 1996.0 | WIN | 0.492 |
11 | Dole | Republican | 40.7 | 1996.0 | LOSS | 0.407 |
12 | Perot | Independent | 8.4 | 1996.0 | LOSS | 0.084 |
13 | Gore | Democratic | 48.4 | 2000.0 | LOSS | 0.484 |
14 | Bush | Republican | 47.9 | 2000.0 | WIN | 0.479 |
15 | Kerry | Democratic | 48.3 | 2004.0 | LOSS | 0.483 |
16 | Bush | Republican | 50.7 | 2004.0 | WIN | 0.507 |
17 | Obama | Democratic | 52.9 | 2008.0 | WIN | 0.529 |
18 | McCain | Republican | 45.7 | 2008.0 | LOSS | 0.457 |
19 | Obama | Democratic | 51.1 | 2012.0 | WIN | 0.511 |
20 | Romney | Republican | 47.2 | 2012.0 | LOSS | 0.472 |
21 | Clinton | Democratic | 48.2 | 2016.0 | LOSS | 0.482 |
22 | Trump | Republican | 46.1 | 2016.0 | WIN | 0.461 |
23 | Biden | Democratic | 51.3 | 2020.0 | WIN | 0.513 |
24 | Trump | Republican | 46.9 | 2020.0 | LOSS | 0.469 |
-1 | Carter | Democratic | 50.1 | 1976.0 | WIN | 0.501 |
-2 | Ford | Republican | 48.0 | 1976.0 | LOSS | 0.480 |
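Note that in pandas 2.0+, DataFrame.append was removed entirely; pd.concat is the replacement when you need to add many rows at once, and it avoids the quadratic cost of appending in a loop. A sketch on hypothetical data:

```python
import pandas as pd

df = pd.DataFrame({'Candidate': ['Reagan'], 'Year': [1980]})
new_rows = pd.DataFrame({'Candidate': ['Carter', 'Ford'], 'Year': [1976, 1976]})

# pd.concat stacks the DataFrames; ignore_index=True rebuilds
# a fresh 0..n-1 index for the result.
combined = pd.concat([df, new_rows], ignore_index=True)

print(len(combined))  # 3
```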
mod_elec = mod_elec.sort_index()
mod_elec.head()
Candidate | Party | % | Year | Result | Proportion of Vote | |
---|---|---|---|---|---|---|
-2 | Ford | Republican | 48.0 | 1976.0 | LOSS | 0.480 |
-1 | Carter | Democratic | 50.1 | 1976.0 | WIN | 0.501 |
0 | Reagan | Republican | 50.7 | 1980.0 | WIN | 0.507 |
1 | Carter | Democratic | 41.0 | 1980.0 | LOSS | 0.410 |
2 | Anderson | Independent | 6.6 | 1980.0 | LOSS | 0.066 |
# df.reset_index(drop=True) drops the current index
# of the DataFrame and replaces it with an index of increasing integers
mod_elec.reset_index(drop=True)
Candidate | Party | % | Year | Result | Proportion of Vote | |
---|---|---|---|---|---|---|
0 | Ford | Republican | 48.0 | 1976.0 | LOSS | 0.480 |
1 | Carter | Democratic | 50.1 | 1976.0 | WIN | 0.501 |
2 | Reagan | Republican | 50.7 | 1980.0 | WIN | 0.507 |
3 | Carter | Democratic | 41.0 | 1980.0 | LOSS | 0.410 |
4 | Anderson | Independent | 6.6 | 1980.0 | LOSS | 0.066 |
5 | Reagan | Republican | 58.8 | 1984.0 | WIN | 0.588 |
6 | Mondale | Democratic | 37.6 | 1984.0 | LOSS | 0.376 |
7 | Bush | Republican | 53.4 | 1988.0 | WIN | 0.534 |
8 | Dukakis | Democratic | 45.6 | 1988.0 | LOSS | 0.456 |
9 | Clinton | Democratic | 43.0 | 1992.0 | WIN | 0.430 |
10 | Bush | Republican | 37.4 | 1992.0 | LOSS | 0.374 |
11 | Perot | Independent | 18.9 | 1992.0 | LOSS | 0.189 |
12 | Clinton | Democratic | 49.2 | 1996.0 | WIN | 0.492 |
13 | Dole | Republican | 40.7 | 1996.0 | LOSS | 0.407 |
14 | Perot | Independent | 8.4 | 1996.0 | LOSS | 0.084 |
15 | Gore | Democratic | 48.4 | 2000.0 | LOSS | 0.484 |
16 | Bush | Republican | 47.9 | 2000.0 | WIN | 0.479 |
17 | Kerry | Democratic | 48.3 | 2004.0 | LOSS | 0.483 |
18 | Bush | Republican | 50.7 | 2004.0 | WIN | 0.507 |
19 | Obama | Democratic | 52.9 | 2008.0 | WIN | 0.529 |
20 | McCain | Republican | 45.7 | 2008.0 | LOSS | 0.457 |
21 | Obama | Democratic | 51.1 | 2012.0 | WIN | 0.511 |
22 | Romney | Republican | 47.2 | 2012.0 | LOSS | 0.472 |
23 | Clinton | Democratic | 48.2 | 2016.0 | LOSS | 0.482 |
24 | Trump | Republican | 46.1 | 2016.0 | WIN | 0.461 |
25 | Biden | Democratic | 51.3 | 2020.0 | WIN | 0.513 |
26 | Trump | Republican | 46.9 | 2020.0 | LOSS | 0.469 |
Note: We probably won't finish looking at all of this code in lecture, but we will leave it here for you as a reference.
Let's work with the same dataset that we did in Lecture 1, using our new knowledge of pandas.
salaries = pd.read_csv('https://transcal.s3.amazonaws.com/public/export/san-diego-2020.csv')
salaries['Employee Name'] = salaries['Employee Name'].str.split().str[0] + ' xxxxx'
salaries.head()
Employee Name | Job Title | Base Pay | Overtime Pay | Other Pay | Benefits | Total Pay | Total Pay & Benefits | Year | Notes | Agency | Status | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Michael xxxxx | Police Officer | 117691.0 | 187290.0 | 13331.00 | 36380.0 | 318312.0 | 354692.0 | 2020 | NaN | San Diego | FT |
1 | Gary xxxxx | Police Officer | 117691.0 | 160062.0 | 42946.00 | 31795.0 | 320699.0 | 352494.0 | 2020 | NaN | San Diego | FT |
2 | Eric xxxxx | Fire Engineer | 35698.0 | 204462.0 | 69121.00 | 38362.0 | 309281.0 | 347643.0 | 2020 | NaN | San Diego | PT |
3 | Gregg xxxxx | Retirement Administrator | 305000.0 | 0.0 | 12814.00 | 24792.0 | 317814.0 | 342606.0 | 2020 | NaN | San Diego | FT |
4 | Joseph xxxxx | Fire Battalion Chief | 94451.0 | 157778.0 | 48151.00 | 42096.0 | 300380.0 | 342476.0 | 2020 | NaN | San Diego | FT |
salaries.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 12605 entries, 0 to 12604 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Employee Name 12605 non-null object 1 Job Title 12605 non-null object 2 Base Pay 12605 non-null float64 3 Overtime Pay 12605 non-null float64 4 Other Pay 12605 non-null object 5 Benefits 12605 non-null float64 6 Total Pay 12605 non-null float64 7 Total Pay & Benefits 12605 non-null float64 8 Year 12605 non-null int64 9 Notes 0 non-null float64 10 Agency 12605 non-null object 11 Status 12605 non-null object dtypes: float64(6), int64(1), object(5) memory usage: 1.2+ MB
Current issues with the dataset:

  * Some columns contain no information ('Notes') or the same value in all rows ('Agency') – let's drop them.
  * 'Other Pay' should be numeric, but it's not currently.

# Dropping useless columns
salaries = salaries.drop(['Year', 'Notes', 'Agency'], axis=1)
salaries.head()
Employee Name | Job Title | Base Pay | Overtime Pay | Other Pay | Benefits | Total Pay | Total Pay & Benefits | Status | |
---|---|---|---|---|---|---|---|---|---|
0 | Michael xxxxx | Police Officer | 117691.0 | 187290.0 | 13331.00 | 36380.0 | 318312.0 | 354692.0 | FT |
1 | Gary xxxxx | Police Officer | 117691.0 | 160062.0 | 42946.00 | 31795.0 | 320699.0 | 352494.0 | FT |
2 | Eric xxxxx | Fire Engineer | 35698.0 | 204462.0 | 69121.00 | 38362.0 | 309281.0 | 347643.0 | PT |
3 | Gregg xxxxx | Retirement Administrator | 305000.0 | 0.0 | 12814.00 | 24792.0 | 317814.0 | 342606.0 | FT |
4 | Joseph xxxxx | Fire Battalion Chief | 94451.0 | 157778.0 | 48151.00 | 42096.0 | 300380.0 | 342476.0 | FT |
The 'Other Pay' column

salaries['Other Pay'].dtype
dtype('O')
salaries['Other Pay'].unique()
array(['13331.00', '42946.00', '69121.00', ..., '119.00', '47.00', '4.00'], dtype=object)
It appears that most of the values in the 'Other Pay' column are strings containing numbers. Which values are not numbers?
salaries.loc[~salaries['Other Pay'].str.contains('.00', regex=False)]
Employee Name | Job Title | Base Pay | Overtime Pay | Other Pay | Benefits | Total Pay | Total Pay & Benefits | Status | |
---|---|---|---|---|---|---|---|---|---|
10347 | Elmo xxxxx | Laborer | 27288.0 | 489.0 | Aggregate | 1103.0 | 27774.0 | 28877.0 | PT |
10624 | John xxxxx | Golf Operations Assistant | 19278.0 | 0.0 | Aggregate | 1157.0 | 19275.0 | 20432.0 | PT |
11988 | Benjamin xxxxx | Stock Clerk(Auto Parts Stock Clrk) | 1234.0 | 54.0 | Aggregate | 681.0 | 1285.0 | 1966.0 | PT |
12079 | Eric xxxxx | Traffic Signal Technician 1 | 0.0 | 1290.0 | Aggregate | 119.0 | 1287.0 | 1406.0 | PT |
12088 | Teshome xxxxx | Instrumentation & Control Tech | 0.0 | 1248.0 | Aggregate | 115.0 | 1245.0 | 1360.0 | PT |
12093 | Nicholas xxxxx | Deputy Chief Oper Ofcr | 1231.0 | 0.0 | Aggregate | 113.0 | 1228.0 | 1341.0 | PT |
12097 | Jonathan xxxxx | Laborer | 1198.0 | 0.0 | Aggregate | 110.0 | 1195.0 | 1305.0 | PT |
12098 | Gilberto xxxxx | Laborer | 1198.0 | 0.0 | Aggregate | 110.0 | 1195.0 | 1305.0 | PT |
12101 | Jason xxxxx | Plant Procs Cntrl Electrician | 0.0 | 1150.0 | Aggregate | 106.0 | 1147.0 | 1253.0 | PT |
12115 | Raynaldo xxxxx | Asoc Eng-Electrical | 0.0 | 1113.0 | Aggregate | 102.0 | 1110.0 | 1212.0 | PT |
12124 | Mario xxxxx | Heavy Truck Drvr 2 | 0.0 | 1068.0 | Aggregate | 98.0 | 1065.0 | 1163.0 | PT |
12142 | Ivan xxxxx | Utility Worker 1 | 0.0 | 1002.0 | Aggregate | 92.0 | 999.0 | 1091.0 | PT |
We can keep just the rows where the 'Other Pay' is numeric, and then convert the 'Other Pay' column to float.
salaries = salaries.loc[salaries['Other Pay'].str.contains('.00', regex=False)]
salaries['Other Pay'] = salaries['Other Pay'].astype(float)
salaries.head()
Employee Name | Job Title | Base Pay | Overtime Pay | Other Pay | Benefits | Total Pay | Total Pay & Benefits | Status | |
---|---|---|---|---|---|---|---|---|---|
0 | Michael xxxxx | Police Officer | 117691.0 | 187290.0 | 13331.0 | 36380.0 | 318312.0 | 354692.0 | FT |
1 | Gary xxxxx | Police Officer | 117691.0 | 160062.0 | 42946.0 | 31795.0 | 320699.0 | 352494.0 | FT |
2 | Eric xxxxx | Fire Engineer | 35698.0 | 204462.0 | 69121.0 | 38362.0 | 309281.0 | 347643.0 | PT |
3 | Gregg xxxxx | Retirement Administrator | 305000.0 | 0.0 | 12814.0 | 24792.0 | 317814.0 | 342606.0 | FT |
4 | Joseph xxxxx | Fire Battalion Chief | 94451.0 | 157778.0 | 48151.0 | 42096.0 | 300380.0 | 342476.0 | FT |
The line of code above is correct, but it errors if you run it more than once. Why? 🤔
What happens when we use normalize=True with value_counts?
salaries['Status'].value_counts()
FT 8352 PT 4241 Name: Status, dtype: int64
salaries['Status'].value_counts(normalize=True)
FT 0.663226 PT 0.336774 Name: Status, dtype: float64
# Salary statistics
salaries.describe()
Base Pay | Overtime Pay | Other Pay | Benefits | Total Pay | Total Pay & Benefits | |
---|---|---|---|---|---|---|
count | 12593.000000 | 12593.000000 | 12593.000000 | 12593.000000 | 12593.000000 | 12593.000000 |
mean | 56116.049948 | 8796.886683 | 10335.356388 | 15471.286905 | 75248.293020 | 90719.579925 |
std | 35754.156457 | 18514.818694 | 13689.953848 | 11179.525879 | 49609.659842 | 58801.814650 |
min | 0.000000 | -293.000000 | -25236.000000 | -39.000000 | 0.000000 | 1.000000 |
25% | 32830.000000 | 0.000000 | 1249.000000 | 6461.000000 | 41266.000000 | 52567.000000 |
50% | 54749.000000 | 588.000000 | 6791.000000 | 14582.000000 | 71421.000000 | 86301.000000 |
75% | 77512.000000 | 9190.000000 | 13796.000000 | 22748.000000 | 107039.000000 | 128250.000000 |
max | 305000.000000 | 204462.000000 | 144201.000000 | 86066.000000 | 320699.000000 | 354692.000000 |
Question: Is 'Total Pay' equal to the sum of 'Base Pay', 'Overtime Pay', and 'Other Pay'?
We can answer this by summing the latter three columns and seeing if the resulting Series equals the former column.
salaries.loc[:, ['Base Pay', 'Overtime Pay', 'Other Pay']].sum(axis=1)
0 318312.0 1 320699.0 2 309281.0 3 317814.0 4 300380.0 ... 12600 2.0 12601 2.0 12602 1.0 12603 1.0 12604 1.0 Length: 12593, dtype: float64
salaries['Total Pay']
0 318312.0 1 320699.0 2 309281.0 3 317814.0 4 300380.0 ... 12600 2.0 12601 2.0 12602 1.0 12603 1.0 12604 1.0 Name: Total Pay, Length: 12593, dtype: float64
(salaries.loc[:, ['Base Pay', 'Overtime Pay', 'Other Pay']].sum(axis=1) == salaries['Total Pay']).all()
True
Similarly, we might ask whether 'Total Pay & Benefits' is truly the sum of 'Total Pay' and 'Benefits'.
(salaries.loc[:, ['Total Pay', 'Benefits']].sum(axis=1) == salaries.loc[:, 'Total Pay & Benefits']).all()
True
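One caveat when checking equalities like these: exact == comparisons on floats can fail due to rounding (famously, 0.1 + 0.2 != 0.3 in binary floating point). np.isclose is a safer check. A hypothetical illustration:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({'a': [0.1, 0.2], 'b': [0.2, 0.1], 'total': [0.3, 0.3]})

# The exact comparison fails due to floating-point rounding,
# but the tolerance-based one passes.
exact_match = (df['a'] + df['b'] == df['total']).all()
close_match = np.isclose(df['a'] + df['b'], df['total']).all()

print(exact_match, close_match)  # False True
```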
salaries['Total Pay & Benefits'].plot(kind='hist', density=False, bins=20, ec='w');
salaries.plot(kind='scatter', x='Base Pay', y='Overtime Pay');
pd.plotting.scatter_matrix(salaries[['Base Pay', 'Overtime Pay', 'Total Pay']], figsize=(8, 8));
Think of your own questions about the dataset, and try and answer them!
Summary

  * pandas relies heavily on numpy. An understanding of how data types work in both will allow you to write more efficient and bug-free code.
  * There are many, many more Series and DataFrame methods than we covered here (see the pandas documentation for more details).
  * Most pandas methods return copies of Series/DataFrames. Be careful when using techniques that modify values in-place.