from course_utils import *

# Load the dog breeds dataset into a DataFrame named dogs.
# NOTE(review): Path and pd are not imported explicitly in this file;
# presumably they come from the wildcard import of course_utils – confirm.
#
# You'll see the Path(...) / subpath syntax a lot.
# It creates the correct path to your file,
# whether you're using Windows, macOS, or Linux.
dog_path = Path('data') / 'dogs43.csv'
dogs = pd.read_csv(dog_path)
dogs

# The first 3 rows.
dogs.head(3)

# The last 2 rows.
dogs.tail(2)

# A tuple of (number of rows, number of columns).
dogs.shape

# The default index of a DataFrame is 0, 1, 2, 3, ...
dogs.index

# Sort rows from tallest to shortest. Each row keeps its original index label,
# so note that the index is no longer 0, 1, 2, ...!
dogs.sort_values('height', ascending=False)

# This sorts by 'height',
# then breaks ties by 'longevity'.
# The single ascending=False applies to both sort keys.
# Note the difference in the last three rows between
# this DataFrame and the one above.
dogs.sort_values(['height', 'longevity'],
                 ascending=False)

# sort_values returned new DataFrames; dogs itself was never reassigned,
# so it is still in its original order.
dogs

# Returns a new DataFrame whose index is the 'breed' column.
dogs.set_index('breed')

# The above cell didn't involve an assignment statement,
# so dogs was unchanged.
dogs

# By reassigning dogs, our changes will persist.
dogs = dogs.set_index('breed')
dogs

# There used to be 7 columns, but now there are only 6!
# 'breed' moved out of the columns and into the index.
dogs.shape

from IPython.display import display
def display_df(df, rows=pd.options.display.max_rows, cols=pd.options.display.max_columns):
    """Display df with temporary limits on the number of rows/columns shown.

    Args:
        df: The object to pass to display.
        rows: Value used for pandas' "display.max_rows" option while
            displaying. The default is read from pd.options once, when this
            function is defined, not each time it is called.
        cols: Value used for pandas' "display.max_columns" option while
            displaying. The default is captured the same way as for rows.

    The options are only overridden inside the option_context block, so the
    global pandas display settings are restored afterwards.
    """
    temporary_options = (
        "display.max_rows", rows,
        "display.max_columns", cols,
    )
    with pd.option_context(*temporary_options):
        display(df)

# Show dogs sorted from heaviest to lightest, allowing up to 43 rows to be
# displayed (presumably every row of dogs43.csv – TODO confirm the row count).
display_df(dogs.sort_values('weight', ascending=False),
           rows=43)

# A single column label in [] returns a Series.
dogs['kind']

# A list of column labels in [] returns a DataFrame.
dogs[['kind', 'size']]

# 🤔 A list containing just one label still returns a DataFrame
# (with one column), not a Series.
dogs[['kind']]

# Breeds are stored in the index, which is not a column!
# This lookup is therefore expected to raise a KeyError.
dogs['breed']

# The breeds live here instead.
dogs.index

dogs

# What are the unique kinds of dogs?
dogs['kind'].unique()

# How many unique kinds of dogs are there?
dogs['kind'].nunique()

# What's the distribution of kinds?
dogs['kind'].value_counts()

# What's the mean of the 'longevity' column?
dogs['longevity'].mean()

# Tell me more about the 'weight' column.
dogs['weight'].describe()

# Sort the 'lifetime_cost' column. Note that here we're using sort_values on a Series, not a DataFrame!
dogs['lifetime_cost'].sort_values()

# Gives us the index of the largest value, not the largest value itself.
# Because dogs is indexed by 'breed', the result is a breed label.
dogs['lifetime_cost'].idxmax()

dogs

# The first argument is the row label.
#        ↓
dogs.loc['Pug', 'longevity']
#                  ↑
# The second argument is the column label.

# loc is indexed with [] rather than called with ();
# compare its type with that of a regular method such as sort_values.
type(dogs.loc)

type(dogs.sort_values)

%reload_ext pandas_tutor
%set_pandas_tutor_options {"maxDisplayCols": 8, "nohover": True, "projectorMode": True}

%%pt
dogs.loc['Pug', 'longevity']

dogs

# A list of row labels with a single column label returns a Series.
dogs.loc[['Cocker Spaniel', 'Labrador Retriever'], 'size']

# A list of row labels with a list of column labels returns a DataFrame.
dogs.loc[['Cocker Spaniel', 'Labrador Retriever'], ['kind', 'size', 'height']]

# Label-based slices in loc include both endpoints.
# Note that the 'weight' column is included!
dogs.loc[['Cocker Spaniel', 'Labrador Retriever'], 'lifetime_cost': 'weight']

# A bare : selects every column.
dogs.loc[['Cocker Spaniel', 'Labrador Retriever'], :]

# Shortcut for the line above.
dogs.loc[['Cocker Spaniel', 'Labrador Retriever']]

dogs

# loc also accepts a Boolean Series: only rows where it is True are kept.
dogs.loc[dogs['weight'] < 10]

# The Boolean mask can be built from the index too
# (here: breeds whose name contains 'Retriever').
dogs.loc[dogs.index.str.contains('Retriever')]

# Because querying is so common, there's a shortcut:
dogs[dogs.index.str.contains('Retriever')]

# Empty DataFrame – not an error!
dogs.loc[dogs['kind'] == 'beaver']

dogs

# A single row label returns that row as a Series!
dogs.loc['Maltese']

# reset_index moves 'breed' from the index back into a regular column.
dogs_reset = dogs.reset_index()
dogs_reset

# Filtering with a Boolean mask returns a DataFrame,
# even when only one row matches!
dogs_reset[dogs_reset['breed'] == 'Maltese']

dogs

# Combine conditions with & (and). Each condition needs its own parentheses
# because & binds more tightly than < and ==.
dogs[(dogs['weight'] < 20) & (dogs['kind'] == 'terrier')]

dogs

# The same filter written as a query string, where column names
# are used directly and conditions are joined with `and`.
dogs.query('weight < 20 and kind == "terrier"')

dogs.query('kind in ["sporting", "terrier"] and lifetime_cost < 20000')

dogs

# iloc selects by integer position: rows at positions 1 through 14,
# and every column except the last two.
dogs.iloc[1:15, :-2]

# The weight of the breed with the greatest longevity:
# sort in descending order, then take the first element by position.
dogs.sort_values('longevity', ascending=False)['weight'].iloc[0]

# Finding the breed itself involves sorting, but not iloc.
dogs.sort_values('longevity', ascending=False).index[0]

# A tiny DataFrame whose two column labels differ only in type:
# the integer 1 and the string '1'.
jack = pd.DataFrame({1: ['fee', 'fi'],
                     '1': ['fo', 'fum']})
jack

# The expressions below are left commented out, presumably as practice:
# predict what each one evaluates to (or whether it errors) before running it.

# jack[1]

# jack[[1]]

# jack['1']

# jack[[1, 1]]

# jack.loc[1]

# jack.loc[jack[1] == 'fo']

# jack[1, ['1', 1]]

# jack.loc[1,1]

# assign returns a new DataFrame with the extra 'cost_per_year' column.
dogs.assign(cost_per_year=dogs['lifetime_cost'] / dogs['longevity'])

# There was no reassignment above, so dogs has no 'cost_per_year' column.
dogs

# Finds the rows corresponding to the five cheapest to own breeds on a per-year basis.
# sort_values sorts in ascending order by default, so the cheapest come first.
(dogs
 .assign(cost_per_year=dogs['lifetime_cost'] / dogs['longevity'])
 .sort_values('cost_per_year')
 .iloc[:5]
)

# By default, .copy() returns a deep copy of the object it is called on,
# meaning that if you change the copy the original remains unmodified.
dogs_copy = dogs.copy()
dogs_copy.head(2)

# Assigning to a column with [] modifies dogs_copy in place.
dogs_copy['cost_per_year'] = dogs_copy['lifetime_cost'] / dogs_copy['longevity']
dogs_copy

dogs_copy

def cost_in_thousands():
    """Rescale the 'lifetime_cost' column of the global dogs_copy by 1/1000.

    Takes no arguments and returns None; the result is written back into
    dogs_copy. The function is not idempotent: every call divides the
    column by 1000 again.
    """
    scaled_cost = dogs_copy['lifetime_cost'] / 1000
    dogs_copy['lifetime_cost'] = scaled_cost

# What happens when we run this twice?
# cost_in_thousands reassigns dogs_copy['lifetime_cost'] each time it runs,
# so every call divides the column by 1000 again.
cost_in_thousands()

dogs_copy

# Load the baby names dataset into a DataFrame named baby.
# The path is built with the Path(...) / subpath syntax, matching how the
# dogs data is loaded, so the same code works on Windows, macOS, and Linux.
baby_path = Path('data') / 'baby.csv'
baby = pd.read_csv(baby_path)
baby

# The sum of the 'Count' column over every row.
baby['Count'].sum()

# The number of rows for each 'Year'.
# NOTE(review): this equals the number of unique names per year only if each
# name appears at most once per year – verify against the data.
baby.groupby('Year').size()

# The same per-year row counts, but ordered from most rows to fewest.
baby['Year'].value_counts()

# Only the rows from 1880.
baby[baby['Year'] == 1880]

# How many rows each name has within 1880.
baby[baby['Year'] == 1880].value_counts('Name')

# The total 'Count' recorded in each year.
baby.groupby('Year')['Count'].sum()

# The same totals, drawn as a plot with 'Year' on the x-axis.
baby.groupby('Year')['Count'].sum().plot()

# Total 'Count' per year, restricted to names whose first character is "L".
(baby
 .assign(first_letter=baby['Name'].str[0])
 .query('first_letter == "L"')
 .groupby('Year')
 ['Count']
 .sum()
 .plot(title='Number of Babies Born with an "L" Name Per Year')
)

# Total 'Count' per year for one specific name.
(baby
 .query('Name == "Siri"')
 .groupby('Year')
 ['Count']
 .sum()
 .plot(title='Number of Babies Born Named "Siri" Per Year')
)

def name_graph(name):
    """Plot how many babies were recorded with the given name in each year.

    Reads the global baby DataFrame, keeps the rows whose 'Name' equals
    name, sums 'Count' within each 'Year', and plots the resulting Series.

    Args:
        name: The name to look up; rows are matched by exact equality.

    Returns:
        Whatever Series.plot returns for the per-year totals.
    """
    # Refer to the local variable with @name rather than pasting its text into
    # the query string, so a name containing quotes or backslashes cannot
    # break (or change the meaning of) the query expression.
    count_per_year = (
        baby
        .query('Name == @name')
        .groupby('Year')
        ['Count']
        .sum()
    )
    return count_per_year.plot(title=f'Number of Babies Born Named "{name}" Per Year')

# Plot the per-year totals for the name "Samuel".
name_graph('Samuel')

# NOTE(review): ... (Ellipsis) looks like a fill-in-the-blank placeholder;
# presumably it should be replaced with a name string of your choice.
name_graph(...)

Lecture 1 – Introduction, Data Science Lifecycle

Welcome to DSC 259R! 🎉

Agenda

Instructor: Samuel Lau (call me Sam)

Prof. Sam Lau

What is data science? 🤔

What is data science?

One common definition

What is data science?

What is data science?

What does a data scientist do?

What does a data scientist do?

Do people care about climate change?

Do people care about climate change?

Data science involves people 🧍

What is this course really about, then?

Course content

Course goals

Course outcomes

Topics

The data science lifecycle 🚴

The scientific method

The data science lifecycle

DataFrame Fundamentals

Example: Dog Breeds (woof!) 🐶

Review: head, tail, shape, index, and sort_values

Setting the index

💡 Pro-Tip: Displaying more rows/columns

Subsetting

Useful Series methods

Use loc to slice rows and columns using labels

💡 Pro-Tip: Using Pandas Tutor

.loc is flexible 🧘

Filtering (or Querying)

Filtering with multiple conditions

💡 Pro-Tip: Using .query

Don't forget iloc!

Practice

Adding and modifying columns

Adding and modifying columns, using a copy

💡 Pro-Tip: Method chaining

Adding and modifying columns, in-place

Mutability

⚠️ Avoid mutation when possible

Example: What's in a name?

Lilith, Lilibet … Lucifer? How Baby Names Went to 'L'

The data

How many unique names were there per year?

How many babies were recorded per year?

"'L' has to be like the consonant of the decade."

What about individual names?

What about other names?

Review: `head`, `tail`, `shape`, `index`, and `sort_values`

Use `loc` to slice rows and columns using labels

`.loc` is flexible 🧘

💡 Pro-Tip: Using `.query`

Don't forget `iloc`!