from dsc80_utils import *

# You'll see the Path(...) / subpath syntax a lot.
# It creates the correct path to your file,
# whether you're using Windows, macOS, or Linux.
dog_path = Path('data') / 'dogs43.csv'
dogs = pd.read_csv(dog_path)
dogs

dogs.head(3)

dogs.tail(2)

dogs.shape

(43, 7)

# The default index of a DataFrame is 0, 1, 2, 3, ...
dogs.index

RangeIndex(start=0, stop=43, step=1)

# Note that sorting reorders the rows, so the index no longer reads 0, 1, 2, ...!
dogs.sort_values('height', ascending=False)

# This sorts by 'height',
# then breaks ties by 'longevity'.
# Note the difference in the last three rows between
# this DataFrame and the one above.
dogs.sort_values(['height', 'longevity'],
                 ascending=False)

dogs

dogs.set_index('breed')

# The above cell didn't involve an assignment statement,
# so dogs was unchanged.
dogs

# By reassigning dogs, our changes will persist.
dogs = dogs.set_index('breed')
dogs

# There used to be 7 columns, but now there are only 6, since 'breed' is now the index!
dogs.shape

(43, 6)

from IPython.display import display
def display_df(df, rows=pd.options.display.max_rows, cols=pd.options.display.max_columns):
    """Display df with temporary row and column display limits.

    While df is being displayed, pandas' "display.max_rows" option is set
    to `rows` and "display.max_columns" is set to `cols`; the overrides
    only apply inside the `with` block below.

    The default values of `rows` and `cols` are read from `pd.options`
    once, when this function is defined.
    """
    # option_context takes alternating (option name, value) arguments.
    overrides = ("display.max_rows", rows, "display.max_columns", cols)
    with pd.option_context(*overrides):
        display(df)

display_df(dogs.sort_values('weight', ascending=False),
           rows=43)

# Returns a Series.
dogs['kind']

breed
Brittany                  sporting
Cairn Terrier              terrier
English Cocker Spaniel    sporting
                            ...   
Bullmastiff                working
Mastiff                    working
Saint Bernard              working
Name: kind, Length: 43, dtype: object

# Returns a DataFrame.
dogs[['kind', 'size']]

# 🤔 A list containing just one column name: is the result a Series or a DataFrame?
dogs[['kind']]

# Breeds are stored in the index, which is not a column!
dogs['breed']

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
File ~/repos/dsc80/private/.venv/lib/python3.13/site-packages/pandas/core/indexes/base.py:3805, in Index.get_loc(self, key)
   3804 try:
-> 3805     return self._engine.get_loc(casted_key)
   3806 except KeyError as err:

File index.pyx:167, in pandas._libs.index.IndexEngine.get_loc()

File index.pyx:196, in pandas._libs.index.IndexEngine.get_loc()

File pandas/_libs/hashtable_class_helper.pxi:7081, in pandas._libs.hashtable.PyObjectHashTable.get_item()

File pandas/_libs/hashtable_class_helper.pxi:7089, in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'breed'

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
Cell In[19], line 2
      1 # Breeds are stored in the index, which is not a column!
----> 2 dogs['breed']

File ~/repos/dsc80/private/.venv/lib/python3.13/site-packages/pandas/core/frame.py:4102, in DataFrame.__getitem__(self, key)
   4100 if self.columns.nlevels > 1:
   4101     return self._getitem_multilevel(key)
-> 4102 indexer = self.columns.get_loc(key)
   4103 if is_integer(indexer):
   4104     indexer = [indexer]

File ~/repos/dsc80/private/.venv/lib/python3.13/site-packages/pandas/core/indexes/base.py:3812, in Index.get_loc(self, key)
   3807     if isinstance(casted_key, slice) or (
   3808         isinstance(casted_key, abc.Iterable)
   3809         and any(isinstance(x, slice) for x in casted_key)
   3810     ):
   3811         raise InvalidIndexError(key)
-> 3812     raise KeyError(key) from err
   3813 except TypeError:
   3814     # If we have a listlike key, _check_indexing_error will raise
   3815     #  InvalidIndexError. Otherwise we fall through and re-raise
   3816     #  the TypeError.
   3817     self._check_indexing_error(key)

KeyError: 'breed'

dogs.index

Index(['Brittany', 'Cairn Terrier', 'English Cocker Spaniel', 'Cocker Spaniel',
       'Shetland Sheepdog', 'Siberian Husky', 'Lhasa Apso',
       'Miniature Schnauzer', 'Chihuahua', 'English Springer Spaniel',
       'German Shorthaired Pointer', 'Pointer', 'Tibetan Spaniel',
       'Labrador Retriever', 'Maltese', 'Shih Tzu', 'Irish Setter',
       'Golden Retriever', 'Chesapeake Bay Retriever', 'Tibetan Terrier',
       'Gordon Setter', 'Pug', 'Norfolk Terrier', 'English Toy Spaniel',
       'Cavalier King Charles Spaniel', 'Basenji',
       'Staffordshire Bull Terrier', 'Pembroke Welsh Corgi', 'Clumber Spaniel',
       'Dandie Dinmont Terrier', 'Giant Schnauzer', 'Scottish Terrier',
       'Kerry Blue Terrier', 'Afghan Hound', 'Newfoundland',
       'Rhodesian Ridgeback', 'Borzoi', 'Bull Terrier', 'Alaskan Malamute',
       'Bloodhound', 'Bullmastiff', 'Mastiff', 'Saint Bernard'],
      dtype='object', name='breed')

dogs

# What are the unique kinds of dogs?
dogs['kind'].unique()

array(['sporting', 'terrier', 'herding', 'working', 'non-sporting', 'toy',
       'hound'], dtype=object)

# How many unique kinds of dogs are there?
dogs['kind'].nunique()

7

# What's the distribution of kinds?
dogs['kind'].value_counts()

kind
sporting        12
terrier          8
working          7
toy              6
hound            5
non-sporting     3
herding          2
Name: count, dtype: int64

# What's the mean of the 'longevity' column?
dogs['longevity'].mean()

np.float64(11.340697674418605)

# Tell me more about the 'weight' column.
dogs['weight'].describe()

count     43.00
mean      49.35
std       39.42
          ...  
50%       36.50
75%       67.50
max      175.00
Name: weight, Length: 8, dtype: float64

# Sort the 'lifetime_cost' column. Note that here we're using sort_values on a Series, not a DataFrame!
dogs['lifetime_cost'].sort_values()

breed
Mastiff                       13581.0
Bloodhound                    13824.0
Bullmastiff                   13936.0
                               ...   
German Shorthaired Pointer    25842.0
Chihuahua                     26250.0
Giant Schnauzer               26686.0
Name: lifetime_cost, Length: 43, dtype: float64

# Gives us the index of the largest value, not the largest value itself.
dogs['lifetime_cost'].idxmax()

'Giant Schnauzer'

dogs

# The first argument is the row label.
#        ↓
dogs.loc['Pug', 'longevity']
#                  ↑
# The second argument is the column label.

np.float64(11.0)

type(dogs.loc)

pandas.core.indexing._LocIndexer

type(dogs.sort_values)

method

%reload_ext pandas_tutor
%set_pandas_tutor_options {"maxDisplayCols": 8, "nohover": True, "projectorMode": True}

%%pt
dogs.loc['Pug', 'longevity']

dogs

dogs.loc[['Cocker Spaniel', 'Labrador Retriever'], 'size']

breed
Cocker Spaniel         small
Labrador Retriever    medium
Name: size, dtype: object

dogs.loc[['Cocker Spaniel', 'Labrador Retriever'], ['kind', 'size', 'height']]

# Note that the 'weight' column is included – slices in loc include both endpoints!
dogs.loc[['Cocker Spaniel', 'Labrador Retriever'], 'lifetime_cost': 'weight']

dogs.loc[['Cocker Spaniel', 'Labrador Retriever'], :]

# Shortcut for the line above.
dogs.loc[['Cocker Spaniel', 'Labrador Retriever']]

dogs

dogs.loc[dogs['weight'] < 10]

dogs.loc[dogs.index.str.contains('Retriever')]

# Because querying is so common, there's a shortcut:
dogs[dogs.index.str.contains('Retriever')]

# Empty DataFrame – not an error!
dogs.loc[dogs['kind'] == 'beaver']

dogs

# Series!
dogs.loc['Maltese']

kind                 toy
lifetime_cost    19084.0
longevity          12.25
size               small
weight               5.0
height               9.0
Name: Maltese, dtype: object

dogs_reset = dogs.reset_index()
dogs_reset

# DataFrame!
dogs_reset[dogs_reset['breed'] == 'Maltese']

dogs

dogs[(dogs['weight'] < 20) & (dogs['kind'] == 'terrier')]

dogs

dogs.query('weight < 20 and kind == "terrier"')

dogs.query('kind in ["sporting", "terrier"] and lifetime_cost < 20000')

dogs

dogs.iloc[1:15, :-2]

dogs.sort_values('longevity', ascending=False)['weight'].iloc[0]

np.float64(5.5)

# Finding the breed itself involves sorting, but not iloc.
dogs.sort_values('longevity', ascending=False).index[0]

'Chihuahua'

# The two column labels look identical when displayed, but they are different
# objects: the first is the integer 1 and the second is the string '1'.
jack = pd.DataFrame({1: ['fee', 'fi'],
                     '1': ['fo', 'fum']})
jack

# jack[1]

# jack[[1]]

# jack['1']

# jack[[1, 1]]

# jack.loc[1]

# jack.loc[jack[1] == 'fo']

# jack[1, ['1', 1]]

# jack.loc[1,1]

dogs.assign(cost_per_year=dogs['lifetime_cost'] / dogs['longevity'])

dogs

# Finds the rows corresponding to the five cheapest-to-own breeds on a per-year basis.
(dogs
 .assign(cost_per_year=dogs['lifetime_cost'] / dogs['longevity'])
 .sort_values('cost_per_year')
 .iloc[:5]
)

# By default, .copy() returns a deep copy of the object it is called on,
# meaning that if you change the copy, the original remains unmodified.
dogs_copy = dogs.copy()
dogs_copy.head(2)

dogs_copy['cost_per_year'] = dogs_copy['lifetime_cost'] / dogs_copy['longevity']
dogs_copy

dogs_copy

def cost_in_thousands():
    """Divide the 'lifetime_cost' column of the global dogs_copy by 1000.

    The column is overwritten in dogs_copy itself and nothing is returned,
    so each call divides the stored values by 1000 again.
    """
    scaled_cost = dogs_copy['lifetime_cost'] / 1000
    dogs_copy['lifetime_cost'] = scaled_cost

# What happens when we run this twice?
cost_in_thousands()

dogs_copy

# Build the path with the Path(...) / subpath syntax, like dog_path,
# rather than hard-coding the '/' separator in a string.
baby_path = Path('data') / 'baby.csv'
baby = pd.read_csv(baby_path)
baby

baby['Count'].sum()

np.int64(365296191)

baby.groupby('Year').size()

Year
1880     2000
1881     1934
1882     2127
        ...  
2020    31517
2021    31685
2022    31915
Length: 143, dtype: int64

baby['Year'].value_counts()

Year
2008    35094
2007    34966
2009    34724
        ...  
1883     2084
1880     2000
1881     1934
Name: count, Length: 143, dtype: int64

baby[baby['Year'] == 1880]

baby[baby['Year'] == 1880].value_counts('Name')

Name
Grace      2
Emma       2
Clair      2
          ..
Evaline    1
Evalena    1
Zula       1
Name: count, Length: 1889, dtype: int64

baby.groupby('Year')['Count'].sum()

Year
1880     201484
1881     192690
1882     221533
         ...   
2020    3333981
2021    3379713
2022    3361896
Name: Count, Length: 143, dtype: int64

baby.groupby('Year')['Count'].sum().plot()

(baby
 .assign(first_letter=baby['Name'].str[0])
 .query('first_letter == "L"')
 .groupby('Year')
 ['Count']
 .sum()
 .plot(title='Number of Babies Born with an "L" Name Per Year')
)

(baby
 .query('Name == "Siri"')
 .groupby('Year')
 ['Count']
 .sum()
 .plot(title='Number of Babies Born Named "Siri" Per Year')
)

def name_graph(name):
    """Plot the total 'Count' per 'Year' for rows of baby whose 'Name' is `name`.

    `name` is interpolated into both the query string and the plot title.
    Returns whatever the Series `.plot` call returns.
    """
    # Keep only the rows for the requested name.
    matching_rows = baby.query(f'Name == "{name}"')
    # Add up the counts within each year.
    yearly_totals = matching_rows.groupby('Year')['Count'].sum()
    return yearly_totals.plot(title=f'Number of Babies Born Named "{name}" Per Year')

name_graph('Samuel')

name_graph(...)

	Name	Sex	Count	Year
2083158	John	M	9655	1880
2083159	William	M	9532	1880
2083160	Mary	F	7065	1880
...	...	...	...	...
2085155	Wright	M	5	1880
2085156	York	M	5	1880
2085157	Zachariah	M	5	1880

	breed	kind	lifetime_cost	longevity	size	weight	height
0	Brittany	sporting	22589.0	12.92	medium	35.0	19.0
1	Cairn Terrier	terrier	21992.0	13.84	small	14.0	10.0
2	English Cocker Spaniel	sporting	18993.0	11.66	medium	30.0	16.0
...	...	...	...	...	...	...	...
40	Bullmastiff	working	13936.0	7.57	large	115.0	25.5
41	Mastiff	working	13581.0	6.50	large	175.0	30.0
42	Saint Bernard	working	20022.0	7.78	large	155.0	26.5

	kind	lifetime_cost	longevity	size	weight	height
breed
Brittany	sporting	22589.0	12.92	medium	35.0	19.0
Cairn Terrier	terrier	21992.0	13.84	small	14.0	10.0
English Cocker Spaniel	sporting	18993.0	11.66	medium	30.0	16.0
...	...	...	...	...	...	...
Bullmastiff	working	13936.0	7.57	large	115.0	25.5
Mastiff	working	13581.0	6.50	large	175.0	30.0
Saint Bernard	working	20022.0	7.78	large	155.0	26.5

	kind	size	height
breed
Cocker Spaniel	sporting	small	14.5
Labrador Retriever	sporting	medium	23.0

	lifetime_cost	longevity	size	weight
breed
Cocker Spaniel	24330.0	12.50	small	25.0
Labrador Retriever	21299.0	12.04	medium	67.5

	kind	lifetime_cost	longevity	size	weight	height
breed
Chihuahua	toy	26250.0	16.50	small	5.5	5.0
Maltese	toy	19084.0	12.25	small	5.0	9.0

	Name	Sex	Count	Year
0	Liam	M	20456	2022
1	Noah	M	18621	2022
2	Olivia	F	16573	2022
...	...	...	...	...
2085155	Wright	M	5	1880
2085156	York	M	5	1880
2085157	Zachariah	M	5	1880

	1	1
0	fee	fo
1	fi	fum

Lecture 1 – Introduction, Data Science Lifecycle¶

Welcome to DSC 259R! 🎉

Agenda¶

Instructor: Samuel Lau (call me Sam)¶

Prof. Sam Lau¶

Course staff¶

What is data science? 🤔¶

What is data science?¶

One common definition¶

What is data science?¶

What is data science?¶

What does a data scientist do?¶

What does a data scientist do?¶

Do people care about climate change?¶

Do people care about climate change?¶

Data science involves people 🧍¶

What is this course really about, then?¶

Course content¶

Course goals¶

Course outcomes¶

Topics¶

The data science lifecycle 🚴¶

The scientific method¶

The data science lifecycle¶

DataFrame Fundamentals¶

Example: Dog Breeds (woof!) 🐶¶

Review: head, tail, shape, index, and sort_values¶

Setting the index¶

💡 Pro-Tip: Displaying more rows/columns¶

Subsetting¶

Useful Series methods¶

Use loc to slice rows and columns using labels¶

💡 Pro-Tip: Using Pandas Tutor¶

.loc is flexible 🧘¶

Filtering (or Querying)¶

Filtering with multiple conditions¶

💡 Pro-Tip: Using .query¶

Don't forget iloc!¶

Practice¶

Adding and modifying columns¶

Adding and modifying columns, using a copy¶

💡 Pro-Tip: Method chaining¶

Adding and modifying columns, in-place¶

Mutability¶

⚠️ Avoid mutation when possible¶

Example: What's in a name?¶

Lilith, Lilibet … Lucifer? How Baby Names Went to 'L'¶

The data¶

How many unique names were there per year?¶

How many babies were recorded per year?¶

"'L' has to be like the consonant of the decade."¶

What about individual names?¶

What about other names?¶

Review: `head`, `tail`, `shape`, `index`, and `sort_values`¶

Use `loc` to slice rows and columns using labels¶

`.loc` is flexible 🧘¶

💡 Pro-Tip: Using `.query`¶

Don't forget `iloc`!¶