import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib_inline.backend_inline import set_matplotlib_formats

# Notebook-wide display setup; everything below affects only how output is rendered.
# Emit inline matplotlib figures in SVG format.
set_matplotlib_formats("svg")
# Use seaborn's "poster" context and "whitegrid" style for all plots.
sns.set_context("poster")
sns.set_style("whitegrid")
# Default figure size for new matplotlib figures.
plt.rcParams["figure.figsize"] = (10, 5)
# Keep printed DataFrames/Series compact: at most 8 rows and 8 columns are shown
# (longer output is elided with "..."), and floats are shown with 2 decimal places.
pd.set_option("display.max_rows", 8)
pd.set_option("display.max_columns", 8)
pd.set_option("display.precision", 2)

arr = np.arange(10)
arr

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

2 ** arr

array([  1,   2,   4,   8,  16,  32,  64, 128, 256, 512])

%%timeit
squares = []
for i in range(1_000_000):
    squares.append(i * i)

65 ms ± 926 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

%%timeit
squares = np.arange(1_000_000) ** 2

426 µs ± 1.97 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)

import pandas as pd
import numpy as np

all_dogs = pd.read_csv('data/all_dogs.csv')
all_dogs

all_dogs.columns

Index(['breed', 'group', 'datadog', 'popularity_all', 'popularity',
       'lifetime_cost', 'intelligence_rank', 'longevity', 'ailments', 'price',
       'food_cost', 'grooming', 'kids', 'megarank_kids', 'megarank', 'size',
       'weight', 'height'],
      dtype='object')

# Fill in this cell

dogs = pd.read_csv('data/dogs43.csv')
dogs

dogs.head(3)

dogs.tail(2)

dogs.shape

(43, 7)

# Index is 0, 1, 2, ..., 42
dogs['breed']

0                   Brittany
1              Cairn Terrier
2     English Cocker Spaniel
3             Cocker Spaniel
               ...          
39                Bloodhound
40               Bullmastiff
41                   Mastiff
42             Saint Bernard
Name: breed, Length: 43, dtype: object

# The default index of a DataFrame is 0, 1, 2, 3, ...
dogs.index

RangeIndex(start=0, stop=43, step=1)

# This is review from DSC 10, but most people don't use .get() in practice.
# We'll cover the more common alternative in just a few minutes...
dogs.get(['breed', 'kind', 'longevity'])

dogs.sort_values('longevity', ascending=False)

# By reassigning dogs, our changes will persist.
dogs = dogs.set_index('breed')
dogs

# There used to be 7 columns, but now there are only 6!
dogs.shape

(43, 6)

from IPython.display import display
def display_df(df, rows=pd.options.display.max_rows, cols=pd.options.display.max_columns):
    """Display `df` with temporary limits on the number of rows and columns shown.

    `rows` and `cols` are applied as the pandas options "display.max_rows"
    and "display.max_columns" only while `df` is being displayed.

    Note that the default values are read from `pd.options` once, when this
    function is defined, so later calls to `pd.set_option` do not change them.
    """
    # option_context takes a flat sequence: option, value, option, value, ...
    overrides = ("display.max_rows", rows, "display.max_columns", cols)
    with pd.option_context(*overrides):
        display(df)

display_df(dogs, rows=43)

dogs

dogs.get('size')

breed
Brittany                  medium
Cairn Terrier              small
English Cocker Spaniel    medium
Cocker Spaniel             small
                           ...  
Bloodhound                 large
Bullmastiff                large
Mastiff                    large
Saint Bernard              large
Name: size, Length: 43, dtype: object

# This doesn't error, but sometimes we'd like it to.
dogs.get('size oops!')

dogs

# Returns a Series.
dogs['kind']

breed
Brittany                  sporting
Cairn Terrier              terrier
English Cocker Spaniel    sporting
Cocker Spaniel            sporting
                            ...   
Bloodhound                   hound
Bullmastiff                working
Mastiff                    working
Saint Bernard              working
Name: kind, Length: 43, dtype: object

# Returns a DataFrame.
dogs[['kind', 'size']]

# 🤔
dogs[['kind']]

# Breeds are stored in the index, which is not a column!
dogs['breed']

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
File ~/mambaforge/envs/dsc80/lib/python3.8/site-packages/pandas/core/indexes/base.py:3361, in Index.get_loc(self, key, method, tolerance)
   3360 try:
-> 3361     return self._engine.get_loc(casted_key)
   3362 except KeyError as err:

File ~/mambaforge/envs/dsc80/lib/python3.8/site-packages/pandas/_libs/index.pyx:76, in pandas._libs.index.IndexEngine.get_loc()

File ~/mambaforge/envs/dsc80/lib/python3.8/site-packages/pandas/_libs/index.pyx:108, in pandas._libs.index.IndexEngine.get_loc()

File pandas/_libs/hashtable_class_helper.pxi:5198, in pandas._libs.hashtable.PyObjectHashTable.get_item()

File pandas/_libs/hashtable_class_helper.pxi:5206, in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'breed'

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
Cell In[29], line 2
      1 # Breeds are stored in the index, which is not a column!
----> 2 dogs['breed']

File ~/mambaforge/envs/dsc80/lib/python3.8/site-packages/pandas/core/frame.py:3458, in DataFrame.__getitem__(self, key)
   3456 if self.columns.nlevels > 1:
   3457     return self._getitem_multilevel(key)
-> 3458 indexer = self.columns.get_loc(key)
   3459 if is_integer(indexer):
   3460     indexer = [indexer]

File ~/mambaforge/envs/dsc80/lib/python3.8/site-packages/pandas/core/indexes/base.py:3363, in Index.get_loc(self, key, method, tolerance)
   3361         return self._engine.get_loc(casted_key)
   3362     except KeyError as err:
-> 3363         raise KeyError(key) from err
   3365 if is_scalar(key) and isna(key) and not self.hasnans:
   3366     raise KeyError(key)

KeyError: 'breed'

dogs.index

Index(['Brittany', 'Cairn Terrier', 'English Cocker Spaniel', 'Cocker Spaniel',
       'Shetland Sheepdog', 'Siberian Husky', 'Lhasa Apso',
       'Miniature Schnauzer', 'Chihuahua', 'English Springer Spaniel',
       'German Shorthaired Pointer', 'Pointer', 'Tibetan Spaniel',
       'Labrador Retriever', 'Maltese', 'Shih Tzu', 'Irish Setter',
       'Golden Retriever', 'Chesapeake Bay Retriever', 'Tibetan Terrier',
       'Gordon Setter', 'Pug', 'Norfolk Terrier', 'English Toy Spaniel',
       'Cavalier King Charles Spaniel', 'Basenji',
       'Staffordshire Bull Terrier', 'Pembroke Welsh Corgi', 'Clumber Spaniel',
       'Dandie Dinmont Terrier', 'Giant Schnauzer', 'Scottish Terrier',
       'Kerry Blue Terrier', 'Afghan Hound', 'Newfoundland',
       'Rhodesian Ridgeback', 'Borzoi', 'Bull Terrier', 'Alaskan Malamute',
       'Bloodhound', 'Bullmastiff', 'Mastiff', 'Saint Bernard'],
      dtype='object', name='breed')

dogs

# What are the unique dog kinds?
dogs['kind'].unique()

array(['sporting', 'terrier', 'herding', 'working', 'non-sporting', 'toy',
       'hound'], dtype=object)

# What's the distribution of kinds?
dogs['kind'].value_counts()

sporting        12
terrier          8
working          7
toy              6
hound            5
non-sporting     3
herding          2
Name: kind, dtype: int64

# What's the mean of the 'longevity' column?
dogs['longevity'].mean()

11.340697674418605

# Tell me more about the 'weight' column.
dogs['weight'].describe()

count     43.00
mean      49.35
std       39.42
min        5.00
25%       18.00
50%       36.50
75%       67.50
max      175.00
Name: weight, dtype: float64

# Sort the 'lifetime_cost' column. Note that here we're using sort_values on a Series, not a DataFrame!
dogs['lifetime_cost'].sort_values()

breed
Mastiff                       13581.0
Bloodhound                    13824.0
Bullmastiff                   13936.0
Borzoi                        16176.0
                               ...   
Tibetan Spaniel               25549.0
German Shorthaired Pointer    25842.0
Chihuahua                     26250.0
Giant Schnauzer               26686.0
Name: lifetime_cost, Length: 43, dtype: float64

# The first argument is the row label
#        ↓
dogs.loc['Pug', 'longevity']
#                  ↑
# The second argument is the column label

11.0

# Pandas Tutor setup. You'll need to run `pip install pandas_tutor` in your terminal
# for this cell to work, but you can also ignore the error and continue onward.
%reload_ext pandas_tutor
%set_pandas_tutor_options {"maxDisplayCols": 8, "nohover": True, "projectorMode": True}

%%pt
dogs.loc['Pug', 'longevity']

dogs.loc[['Pug', 'Labrador Retriever'], ['kind', 'size']]

dogs.loc[['Pug', 'Labrador Retriever'], :]

# Shortcut for line above
dogs.loc[['Pug', 'Labrador Retriever']]

dogs

dogs.loc[dogs['weight'] < 10]

dogs.loc[dogs.index.str.contains('Spaniel')]

# Because filtering is so common, there's a shortcut:
dogs[dogs.index.str.contains('Spaniel')]

dogs

# Series!
dogs.loc['Maltese']

kind                 toy
lifetime_cost    19084.0
longevity          12.25
size               small
weight               5.0
height               9.0
Name: Maltese, dtype: object

dogs_reset = dogs.reset_index()
dogs_reset

# DataFrame!
dogs_reset[dogs_reset['breed'] == 'Maltese']

dogs[(dogs['weight'] < 20) & (dogs['kind'] == 'terrier')]

dogs.query('weight < 20 and kind == "terrier"')

dogs.query('kind in ["sporting", "terrier"] and lifetime_cost < 20000')

dogs

dogs.iloc[3:7, :-1]

dogs.sort_values('longevity', ascending=False)['weight'].iloc[0]

5.5

# Finding the breed involves sorting, but not iloc.
dogs.sort_values('longevity', ascending=False).index[0]

'Chihuahua'

jack = pd.DataFrame({1: ['fee', 'fi'], 
                     '1': ['fo', 'fum']})
jack

# jack[1]

# jack[[1]]

# jack['1']

# jack[[1, 1]]

# jack.loc[1]

# jack.loc[jack[1] == 'fo']

# jack[1, ['1', 1]]

# jack.loc[1,1]

dogs.assign(cost_per_year=dogs['lifetime_cost'] / dogs['longevity'])

dogs

(dogs
 .assign(cost_per_year=dogs['lifetime_cost'] / dogs['longevity'])
 .sort_values('cost_per_year')
 .iloc[:5]
)

(dogs
 .assign(**{'Cost per year 💵': dogs['lifetime_cost'] / dogs['longevity']})
)

# By default, .copy() returns a deep copy of the object it is called on,
# meaning that if you change the copy the original remains unmodified.
dogs_copy = dogs.copy()
dogs_copy.head(2)

dogs_copy['cost_per_year'] = dogs_copy['lifetime_cost'] / dogs_copy['longevity']
dogs_copy

dogs_copy.head(2)

def cost_in_thousands():
    """Divide the 'lifetime_cost' column of the global `dogs_copy` by 1000.

    Takes no arguments and returns nothing; the divided values overwrite the
    column on `dogs_copy` itself, so each additional call divides the stored
    values by 1000 again.
    """
    scaled = dogs_copy['lifetime_cost'].div(1000)
    dogs_copy['lifetime_cost'] = scaled

# What happens when we run this twice?
cost_in_thousands()

dogs_copy

dogs.replace({'kind': {'sporting': "Sam's favorites"}})

A = pd.DataFrame({
    'A': [1, 4],
    'B': [2, 5],
    'C': [3, 6],
})
A

A.sum(axis=1)

0     6
1    15
dtype: int64

A.sum(0)

A    5
B    7
C    9
dtype: int64

A.sum(axis=1)

0     6
1    15
dtype: int64

A

A.sum()

A    5
B    7
C    9
dtype: int64

# Max element in each column
dogs.max()

kind             working
lifetime_cost    26686.0
longevity           16.5
size               small
weight             175.0
height              30.0
dtype: object

# The number of unique values in each column.
dogs.nunique()

kind              7
lifetime_cost    43
longevity        40
size              3
weight           37
height           30
dtype: int64

# describe doesn't accept an axis argument; it works on every numeric column in the DataFrame it is called on.
dogs.describe()

dogs['lifetime_cost']

breed
Brittany                  22589.0
Cairn Terrier             21992.0
English Cocker Spaniel    18993.0
Cocker Spaniel            24330.0
                           ...   
Bloodhound                13824.0
Bullmastiff               13936.0
Mastiff                   13581.0
Saint Bernard             20022.0
Name: lifetime_cost, Length: 43, dtype: float64

dogs.dtypes

kind              object
lifetime_cost    float64
longevity        float64
size              object
weight           float64
height           float64
dtype: object

dogs.head()

dogs.dtypes

kind              object
lifetime_cost    float64
longevity        float64
size              object
weight           float64
height           float64
dtype: object

dogs['lifetime_cost'].astype(np.int64)

breed
Brittany                  22589
Cairn Terrier             21992
English Cocker Spaniel    18993
Cocker Spaniel            24330
                          ...  
Bloodhound                13824
Bullmastiff               13936
Mastiff                   13581
Saint Bernard             20022
Name: lifetime_cost, Length: 43, dtype: int64

dogs = pd.read_csv('data/dogs43.csv', dtype={'lifetime_cost': int})
dogs

dogs.dtypes

breed             object
kind              object
lifetime_cost      int64
longevity        float64
size              object
weight           float64
height           float64
dtype: object

all_dogs

Pandas dtype	Python type	NumPy type	SQL type	Usage
int64	int	int_, int8,...,int64, uint8,...,uint64	INT, BIGINT	Integer numbers
float64	float	float_, float16, float32, float64	FLOAT	Floating point numbers
bool	bool	bool_	BOOL	True/False values
datetime64 or Timestamp	datetime.datetime	datetime64	DATETIME	Date and time values
timedelta64 or Timedelta	datetime.timedelta	timedelta64	NA	Differences between two datetimes
category	NA	NA	ENUM	Finite list of text values
object	str	string, unicode	NA	Text
object	NA	object	NA	Mixed types

	breed	group	datadog	popularity_all	...	megarank	size	weight	height
0	Border Collie	herding	3.64	45	...	29.0	medium	NaN	20.0
1	Border Terrier	terrier	3.61	80	...	1.0	small	13.5	NaN
2	Brittany	sporting	3.54	30	...	11.0	medium	35.0	19.0
3	Cairn Terrier	terrier	3.53	59	...	2.0	small	14.0	10.0
...	...	...	...	...	...	...	...	...	...
168	Welsh Terrier	terrier	NaN	99	...	NaN	small	20.0	15.0
169	Wire Fox Terrier	terrier	NaN	100	...	NaN	small	17.5	15.0
170	Wirehaired Pointing Griffon	sporting	NaN	92	...	NaN	medium	NaN	22.0
171	Xoloitzcuintli	non-sporting	NaN	155	...	NaN	medium	NaN	16.5

	breed	kind	lifetime_cost	longevity	size	weight	height
0	Brittany	sporting	22589.0	12.92	medium	35.0	19.0
1	Cairn Terrier	terrier	21992.0	13.84	small	14.0	10.0
2	English Cocker Spaniel	sporting	18993.0	11.66	medium	30.0	16.0
3	Cocker Spaniel	sporting	24330.0	12.50	small	25.0	14.5
...	...	...	...	...	...	...	...
39	Bloodhound	hound	13824.0	6.75	large	85.0	25.0
40	Bullmastiff	working	13936.0	7.57	large	115.0	25.5
41	Mastiff	working	13581.0	6.50	large	175.0	30.0
42	Saint Bernard	working	20022.0	7.78	large	155.0	26.5

	breed	kind	lifetime_cost	longevity	size	weight	height
8	Chihuahua	toy	26250.0	16.50	small	5.5	5.0
12	Tibetan Spaniel	non-sporting	25549.0	14.42	small	12.0	10.0
6	Lhasa Apso	non-sporting	22031.0	13.92	small	15.0	10.5
1	Cairn Terrier	terrier	21992.0	13.84	small	14.0	10.0
...	...	...	...	...	...	...	...
42	Saint Bernard	working	20022.0	7.78	large	155.0	26.5
40	Bullmastiff	working	13936.0	7.57	large	115.0	25.5
39	Bloodhound	hound	13824.0	6.75	large	85.0	25.0
41	Mastiff	working	13581.0	6.50	large	175.0	30.0

	kind	lifetime_cost	longevity	size	weight	height
breed
Brittany	sporting	22589.0	12.92	medium	35.0	19.0
Cairn Terrier	terrier	21992.0	13.84	small	14.0	10.0
English Cocker Spaniel	sporting	18993.0	11.66	medium	30.0	16.0
Cocker Spaniel	sporting	24330.0	12.50	small	25.0	14.5
...	...	...	...	...	...	...
Bloodhound	hound	13824.0	6.75	large	85.0	25.0
Bullmastiff	working	13936.0	7.57	large	115.0	25.5
Mastiff	working	13581.0	6.50	large	175.0	30.0
Saint Bernard	working	20022.0	7.78	large	155.0	26.5

	kind	lifetime_cost	longevity	size	weight	height
breed
Pug	toy	18527.0	11.00	medium	16.0	16.0
Labrador Retriever	sporting	21299.0	12.04	medium	67.5	23.0

	kind	lifetime_cost	longevity	size	weight	height	cost_per_year
breed
Maltese	toy	19084.0	12.25	small	5.0	9.00	1557.88
Lhasa Apso	non-sporting	22031.0	13.92	small	15.0	10.50	1582.69
Cairn Terrier	terrier	21992.0	13.84	small	14.0	10.00	1589.02
Chihuahua	toy	26250.0	16.50	small	5.5	5.00	1590.91
Shih Tzu	toy	21152.0	13.20	small	12.5	9.75	1602.42

	kind	lifetime_cost	longevity	size	weight	height
breed
Brittany	Sam's favorites	22589.0	12.92	medium	35.0	19.0
Cairn Terrier	terrier	21992.0	13.84	small	14.0	10.0
English Cocker Spaniel	Sam's favorites	18993.0	11.66	medium	30.0	16.0
Cocker Spaniel	Sam's favorites	24330.0	12.50	small	25.0	14.5
...	...	...	...	...	...	...
Bloodhound	hound	13824.0	6.75	large	85.0	25.0
Bullmastiff	working	13936.0	7.57	large	115.0	25.5
Mastiff	working	13581.0	6.50	large	175.0	30.0
Saint Bernard	working	20022.0	7.78	large	155.0	26.5

	lifetime_cost	longevity	weight	height
count	43.00	43.00	43.00	43.00
mean	20532.84	11.34	49.35	18.34
std	3290.78	2.05	39.42	6.83
min	13581.00	6.50	5.00	5.00
25%	18508.50	10.05	18.00	11.75
50%	21006.00	11.81	36.50	18.50
75%	22072.50	12.52	67.50	25.00
max	26686.00	16.50	175.00	30.00

	breed	kind	lifetime_cost	longevity	size	weight	height
0	Brittany	sporting	22589	12.92	medium	35.0	19.0
1	Cairn Terrier	terrier	21992	13.84	small	14.0	10.0
2	English Cocker Spaniel	sporting	18993	11.66	medium	30.0	16.0
3	Cocker Spaniel	sporting	24330	12.50	small	25.0	14.5
...	...	...	...	...	...	...	...
39	Bloodhound	hound	13824	6.75	large	85.0	25.0
40	Bullmastiff	working	13936	7.57	large	115.0	25.5
41	Mastiff	working	13581	6.50	large	175.0	30.0
42	Saint Bernard	working	20022	7.78	large	155.0	26.5

	1	1
0	fee	fo
1	fi	fum

Lecture 2 – DataFrame Fundamentals¶

DSC 80, Fall 2023¶

Announcements 📣¶

Agenda¶

Review: numpy¶

⚠️ The dangers of for-loops¶

Introduction to pandas 🐼¶

Baby pandas¶

pandas¶

pandas¶

pandas data structures¶

Importing pandas and related libraries¶

Example: Dog Breeds (woof!) 🐶¶

But...¶

Discussion Question¶

A Smaller Dogs Dataframe¶

Review: head, tail, shape, index, get, sort_values¶

Setting the index¶

💡 Pro-tip: Displaying more rows/columns¶

Selecting columns¶

Selecting columns in babypandas 👶🐼¶

Selecting columns with []¶

Useful Series methods¶

Subsetting rows (and columns)¶

Using loc to slice rows and columns using labels¶

💡 Pro-Tip: Using Pandas Tutor¶

.loc is flexible¶

Review: Filtering (aka Querying)¶

Filtering with Multiple Conditions¶

💡 Pro-Tip: Using .query (optional)¶

Don't forget iloc!¶

More Practice¶

Adding and modifying columns¶

Adding and modifying columns, using a copy¶

💡 Pro-Tip: Method chaining¶

💡 Pro-Tip: assign for column names with special characters¶

Adding and modifying columns, in-place¶

Mutability¶

⚠️ Avoid mutation when possible¶

Replacing values¶

Axes¶

Axes¶

DataFrame methods with axis¶

DataFrame methods with axis¶

pandas and numpy¶

pandas is built upon numpy¶

pandas data types¶

pandas data types¶

Type conversion¶

💡 Pro-Tip: Setting dtypes in read_csv¶

Putting it all together¶

Summary, next time¶

Summary¶

Review: `numpy`¶

⚠️ The dangers of `for`-loops¶

Introduction to `pandas` 🐼¶

`pandas`¶

`pandas` data structures¶

Importing `pandas` and related libraries¶

Review: `head`, `tail`, `shape`, `index`, `get`, `sort_values`¶

Selecting columns in `babypandas` 👶🐼¶

Selecting columns with `[]`¶

Using `loc` to slice rows and columns using labels¶

`.loc` is flexible¶

💡 Pro-Tip: Using `.query` (optional)¶

Don't forget `iloc`!¶

💡 Pro-Tip: `assign` for column names with special characters¶

DataFrame methods with `axis`¶

DataFrame methods with `axis`¶

`pandas` and `numpy`¶

`pandas` is built upon `numpy`¶

`pandas` data types¶

`pandas` data types¶

💡 Pro-Tip: Setting dtypes in `read_csv`¶