# You'll start seeing this cell in most lectures.
# It exists to hide all of the import statements and other setup
# code we need in lecture notebooks.
from dsc80_utils import *
Announcements 📣¶
- Lab 1 is released, and is due Wednesday, April 10th at 11:59pm.
- See the Tech Support page for instructions and watch this video 🎥 for tips on how to set up your environment and work on assignments.
- Please try to set up your computer ASAP, since we have OH on Friday but not over the weekend to help debug your environment.
- Project 1 will be released Friday.
- Please fill out the Welcome Survey ASAP.
- Please fill out the Exam Accommodation Form ASAP.
- Lecture recordings are available here, and are linked on the course website.
Agenda¶
- `numpy` arrays.
- From `babypandas` to `pandas`.
- Deep dive into DataFrames.
    - Accessing subsets of rows and columns in DataFrames.
        - `.loc` and `.iloc`.
    - Querying (i.e. filtering).
    - Adding and modifying columns.
- `pandas` and `numpy`.
We can't cover every single detail! The `pandas` documentation will be your friend.
Throughout lecture, ask questions!¶
- You're always free to ask questions during lecture, and I'll try to stop for them frequently.
- But, you may not feel like asking your question out loud.
- You can type your questions throughout lecture at the following link:
q.dsc80.com
Bookmark it!
- I'll check the form responses periodically.
- You'll also use this form to answer questions that I ask you during lecture.
Question 🤔 (Answer at q.dsc80.com)
dogs = pd.read_csv('data/dogs43.csv')
dogs.head(2)
What does this code do?
whoa = np.random.choice([True, False], size=len(dogs))
(dogs[whoa]
.groupby('size')
.max()
.get('longevity')
)
`numpy` arrays¶
`numpy` overview¶
- `numpy` stands for "numerical Python". It is a commonly-used Python module that enables fast computation involving arrays and matrices.
- `numpy`'s main object is the array. In `numpy`, arrays are:
    - Homogenous – all values are of the same type.
    - (Potentially) multi-dimensional.
- Computation in `numpy` is fast because:
    - Much of it is implemented in C.
    - `numpy` arrays are stored more efficiently in memory than, say, Python lists.
- This site provides a good overview of `numpy` arrays.
We used `numpy` in DSC 10 to work with sequences of data:
arr = ...
arr
Arrays come equipped with several handy methods; some examples are below, but you can read about them all here.
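For instance, here's a quick sketch with a made-up array (the values are just for illustration):
arr = np.array([4, 2, 9, 7])
arr.sum()      # 22
arr.mean()     # 5.5
arr.max()      # 9
arr.cumsum()   # array([ 4,  6, 15, 22])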
⚠️ The dangers of `for`-loops¶
- `for`-loops are slow when processing large datasets. You will rarely write `for`-loops in DSC 80 (except for Lab 1 and Project 1), and may be penalized on assignments for using them when unnecessary!
- One of the biggest benefits of `numpy` is that it supports vectorized operations.
    - If `a` and `b` are two arrays of the same length, then `a + b` is a new array of the same length containing the element-wise sum of `a` and `b`.
- To illustrate how much faster `numpy` arithmetic is than using a `for`-loop, let's compute the squares of the numbers between 0 and 1,000,000:
    - Using a `for`-loop.
    - Using vectorized arithmetic, through `numpy`.
%%timeit
squares = []
for i in range(1_000_000):
squares.append(i * i)
In vanilla Python, this takes about 0.04 seconds per loop.
%%timeit
squares = np.arange(1_000_000) ** 2
In `numpy`, this only takes about 0.001 seconds per loop, more than 40x faster! Note that under the hood, `numpy` is also using a `for`-loop, but it's a `for`-loop implemented in C, which is much faster than Python.
Multi-dimensional arrays¶
While we didn't see these very often in DSC 10, multi-dimensional lists/arrays may have since come up in DSC 20, 30, or 40A (especially in the context of linear algebra).
We'll spend a bit of time talking about 2D (and 3D) arrays here, since in some ways, they behave similarly to DataFrames.
Below, we create a 2D array from scratch.
nums = np.array([
[5, 1, 9, 7],
[9, 8, 2, 3],
[2, 5, 0, 4]
])
nums
# nums has 3 rows and 4 columns.
...
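One plausible fill for the elided line above – a quick check of the dimensions:
nums.shape   # (3, 4)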
We can also create 2D arrays by reshaping other arrays.
# Here, we're asking to reshape np.arange(1, 7)
# so that it has 2 rows and 3 columns.
a = np.arange(1, 7) # ...
a
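A possible completed version of the cell above, assuming the intent is exactly the reshape described in the comment:
a = np.arange(1, 7).reshape(2, 3)
a
# array([[1, 2, 3],
#        [4, 5, 6]])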
Operations along axes¶
In 2D arrays (and DataFrames), axis 0 refers to the rows (up and down) and axis 1 refers to the columns (left and right).
a
If we specify axis=0
, a.sum
will "compress" along axis 0.
a.sum(axis=0)
If we specify axis=1
, a.sum
will "compress" along axis 1.
a.sum(axis=1)
Selecting rows and columns from 2D arrays¶
You can use square brackets (`[]`) to slice rows and columns out of an array, using the same slicing conventions you saw in DSC 20.
a
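For example, here's a quick sketch of slicing (assuming `a` has 2 rows and 3 columns, as in the reshape above):
a[0, :]    # the first row
a[:, -1]   # the last column
a[1, 1:]   # the second row, from the second column onwards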
Question 🤔 (Answer at q.dsc80.com)
Try to predict the value of `grid[-1, 1:].sum()` without running the code below.
s = (5, 3)
grid = np.ones(s) * 2 * np.arange(1, 16).reshape(s)
# grid[-1, 1:].sum()
Ask ChatGPT: 🧐
- To explain what the code above does.
- To tell you what the code outputs.
Example: Image processing¶
`numpy` arrays are homogenous and potentially multi-dimensional.
It turns out that images can be represented as 3D `numpy` arrays. The color of each pixel can be described with three numbers under the RGB model – a red value, a green value, and a blue value. Each of these can vary from 0 to 1.
from PIL import Image
img_path = Path('imgs') / 'bentley.jpg'
img = np.asarray(Image.open(img_path)) / 255
img
img.shape
plt.imshow(img)
plt.axis('off');
Applying a greyscale filter¶
One way to convert an image to greyscale is to average its red, green, and blue values.
mean_2d = ...
mean_2d
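One plausible fill for the elided cell above – a sketch that averages over the color axis, as the text describes:
mean_2d = img.mean(axis=2)
mean_2d.shape   # 2D now – the color axis is gone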
This is just a single red channel!
plt.imshow(mean_2d)
plt.axis('off');
We need to repeat `mean_2d` three times along axis 2, to use the same values for the red, green, and blue channels. `np.repeat` will help us here.
# np.newaxis is an alias for None.
# It helps us introduce an additional axis.
np.arange(5) # ...
mean_3d = ...
plt.imshow(mean_3d)
plt.axis('off');
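A sketch of how the two elided cells above might be completed, assuming `mean_2d` holds the per-pixel averages from earlier:
# mean_2d[:, :, np.newaxis] adds a length-1 color axis, giving shape (height, width, 1).
# np.repeat then copies it 3 times along axis 2, giving shape (height, width, 3).
mean_3d = np.repeat(mean_2d[:, :, np.newaxis], 3, axis=2)
mean_3d.shape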
Applying a sepia filter¶
Let's sepia-fy Junior!
$$\begin{align*} R_{\text{sepia}} &= 0.393R + 0.769G + 0.189B \\ G_{\text{sepia}} &= 0.349R + 0.686G + 0.168B \\ B_{\text{sepia}} &= 0.272R + 0.534G + 0.131B \end{align*}$$
From here, we can apply this conversion to each pixel.
sepia_filter = np.array([
    [0.393, 0.769, 0.189],
    [0.349, 0.686, 0.168],
    [0.272, 0.534, 0.131]
])
# Multiplies each pixel by the sepia_filter matrix.
# Then, clips each RGB value to be between 0 and 1.
filtered = ...
filtered
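One way the elided cell above might be written – a sketch that applies the three equations to every pixel at once, then clips:
# img has shape (height, width, 3); multiplying by sepia_filter.T applies the
# sepia equations to each pixel's RGB values, and np.clip keeps results in [0, 1].
filtered = np.clip(img @ sepia_filter.T, 0, 1)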
plt.imshow(filtered)
plt.axis('off');
Key takeaway: avoid `for`-loops whenever possible!¶
You can do a lot without `for`-loops, both in `numpy` and in `pandas`.
From `babypandas` to `pandas` 🐼¶
`babypandas`¶
In DSC 10, you used `babypandas`, which was a subset of `pandas` designed to be friendly for beginners.
pandas¶
You're not a beginner anymore – you've taken DSC 20, 30, and 40A. You're ready for the real deal.
Fortunately, everything you learned in `babypandas` will carry over!
`pandas`¶
- `pandas` is the Python library for tabular data manipulation.
- Before `pandas` was developed, the standard data science workflow involved using multiple languages (Python, R, Java) in a single project.
- Wes McKinney, the original developer of `pandas`, wanted a library which would allow everything to be done in Python.
    - Python is faster to develop in than Java, and is more general-purpose than R.
`pandas` data structures¶
There are three key data structures at the core of `pandas`:
- DataFrame: 2 dimensional tables.
- Series: 1 dimensional array-like object, typically representing a column or row.
- Index: sequence of column or row labels.
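As a rough sketch of how the three relate (the tiny DataFrame below is made up purely for illustration):
example = pd.DataFrame({'x': [1, 2], 'y': [3, 4]})
example['x']      # a Series – one column of the DataFrame
example.index     # an Index – the row labels (here, 0 and 1)
example.columns   # also an Index – the column labels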
Importing `pandas` and related libraries¶
`pandas` is almost always imported in conjunction with `numpy`.
import pandas as pd
import numpy as np
# You'll see the Path(...) / subpath syntax a lot.
# It creates the correct path to your file,
# whether you're using Windows, macOS, or Linux.
dog_path = Path('data') / 'dogs43.csv'
dogs = pd.read_csv(dog_path)
dogs
Review: `head`, `tail`, `shape`, `index`, `get`, and `sort_values`¶
To extract the first or last few rows of a DataFrame, use the `head` or `tail` methods.
dogs.head(3)
dogs.tail(2)
The `shape` attribute returns the DataFrame's number of rows and columns.
dogs.shape
# The default index of a DataFrame is 0, 1, 2, 3, ...
dogs.index
We know that we can use `.get()` to select out a column or multiple columns...
dogs.get('breed')
dogs.get(['breed', 'kind', 'longevity'])
Most people don't use `.get` in practice; we'll see the more common technique in a few slides.
And lastly, remember that to sort by a column, use the `sort_values` method. Like most DataFrame and Series methods, `sort_values` returns a new DataFrame, and doesn't modify the original.
# Note that the index is no longer 0, 1, 2, ...!
dogs.sort_values('height', ascending=False)
# This sorts by 'height',
# then breaks ties by 'longevity'.
# Note the difference in the last three rows between
# this DataFrame and the one above.
dogs.sort_values(['height', 'longevity'],
ascending=False)
Note that `dogs` is not the DataFrame above. To save our changes, we'd need to say something like `dogs = dogs.sort_values(...)`.
dogs
Setting the index¶
Think of each row's index as its unique identifier or name. Often, we like to set the index of a DataFrame to a unique identifier if we have one available. We can do so with the `set_index` method.
dogs # .set_index('breed')
# The above cell didn't involve an assignment statement,
# so dogs was unchanged.
dogs
# By reassigning dogs, our changes will persist.
dogs = dogs.set_index('breed')
dogs
# There used to be 7 columns, but now there are only 6!
dogs.shape
Ask ChatGPT: 🧐
- To explain what happens if you have duplicate values in a column and use `set_index()` on it.
💡 Pro-Tip: Displaying more rows/columns¶
Sometimes, you just want `pandas` to display a lot of rows and columns. You can use this helper function to do that:
from IPython.display import display
def display_df(df, rows=pd.options.display.max_rows, cols=pd.options.display.max_columns):
    """Displays at most `rows` rows and `cols` columns of df."""
    with pd.option_context("display.max_rows", rows,
                           "display.max_columns", cols):
        display(df)
display_df(dogs.sort_values('weight', ascending=False),
rows=43)
Selecting columns¶
Selecting columns in `babypandas` 👶🐼¶
- In `babypandas`, you selected columns using the `.get` method.
- `.get` also works in `pandas`, but it is not idiomatic – people don't usually use it.
dogs
dogs.get('size')
# This doesn't error, but sometimes we'd like it to.
dogs.get('size oops!')
Selecting columns with `[]`¶
- The standard way to select a column in `pandas` is by using the `[]` operator.
- Specifying a column name returns the column as a Series.
- Specifying a list of column names returns a DataFrame.
dogs
dogs
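For example, a quick sketch using columns that exist in `dogs`:
dogs['size']                       # a single column name – returns a Series
dogs[['kind', 'size', 'height']]   # a list of column names – returns a DataFrame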
Selecting subsets of rows (and columns)¶
Use `loc` to slice rows and columns using labels¶
You saw slicing in DSC 20. `loc` works similarly to slicing 2D arrays, but it uses row labels and column labels, not positions.
dogs
# The first argument is the row label.
# ↓
dogs.loc['Pug', 'longevity']
# ↑
# The second argument is the column label.
As an aside, `loc` is not a method – it's an indexer.
type(dogs.loc)
type(dogs.sort_values)
💡 Pro-Tip: Using Pandas Tutor¶
If you want, you can install `pandas_tutor` from `pip` in your Terminal (once you've entered your DSC 80 `mamba` environment):
pip install pandas_tutor
Then, you can load the extension by adding `%reload_ext pandas_tutor` at the top of your notebook. After that, you can render visualizations with the `%%pt` cell magic 🪄:
# Pandas Tutor setup. You'll need to run `pip install pandas_tutor` in your Terminal
# for this cell to work, but you can also ignore the error and continue onward.
%reload_ext pandas_tutor
%set_pandas_tutor_options {"maxDisplayCols": 8, "nohover": True, "projectorMode": True}
%%pt
dogs.loc['Pug', 'longevity']
`.loc` is flexible 🧘¶
You can provide a sequence (list, array, Series) as either argument to `.loc`.
dogs
dogs.loc[['Cocker Spaniel', 'Labrador Retriever'], 'size']
dogs.loc[['Cocker Spaniel', 'Labrador Retriever'], ['kind', 'size', 'height']]
# Note that the 'weight' column is included!
dogs.loc[['Cocker Spaniel', 'Labrador Retriever'], 'lifetime_cost': 'weight']
dogs.loc[['Cocker Spaniel', 'Labrador Retriever'], :]
# Shortcut for the line above.
dogs.loc[['Cocker Spaniel', 'Labrador Retriever']]
Review: Querying¶
- As we saw in DSC 10, querying is the act of selecting rows in a DataFrame that satisfy certain condition(s).
- Comparisons with arrays (or Series) result in Boolean arrays (or Series).
- We can use comparisons along with the `loc` operator to filter a DataFrame.
dogs
dogs.loc[dogs['weight'] < 10]
dogs.loc[dogs.index.str.contains('Retriever')]
# Because querying is so common, there's a shortcut:
dogs[dogs.index.str.contains('Retriever')]
# Empty DataFrame – not an error!
dogs.loc[dogs['kind'] == 'beaver']
Note that because we set the index to `'breed'` earlier, we can select rows based on dog breeds without having to query.
dogs
# Series!
dogs.loc['Maltese']
If `'breed'` were instead a column, then we'd need to query to access information about a particular breed.
dogs_reset = dogs.reset_index()
dogs_reset
# DataFrame!
dogs_reset[dogs_reset['breed'] == 'Maltese']
Querying with multiple conditions¶
Remember, you need parentheses around each condition. Also, you must use the bitwise operators `&` and `|` instead of the standard `and` and `or` keywords. `pandas` makes weird decisions sometimes!
dogs
dogs[(dogs['weight'] < 20) & (dogs['kind'] == 'terrier')]
💡 Pro-Tip: Using `.query`¶
`.query` is a convenient way to query, since you don't need parentheses and you can use the `and` and `or` keywords.
dogs
dogs.query('weight < 20 and kind == "terrier"')
dogs.query('kind in ["sporting", "terrier"] and lifetime_cost < 20000')
Ask ChatGPT: 🧐
- To explain when you would use `.query()` instead of `.loc[]`, or the other way around.
Don't forget `iloc`!¶
- `iloc` stands for "integer location."
- `iloc` is like `loc`, but it selects rows and columns based off of integer positions only, just like with 2D arrays.
dogs
dogs.iloc[1:15, :-2]
`iloc` is often most useful when we sort first. For instance, to find the weight of the longest-living dog breed in the dataset:
dogs.sort_values('longevity', ascending=False)['weight'].iloc[0]
# Finding the breed itself involves sorting, but not iloc.
dogs.sort_values('longevity', ascending=False).index[0]
Question 🤔 (Answer at q.dsc80.com)
jack = pd.DataFrame({1: ['fee', 'fi'],
'1': ['fo', 'fum']})
jack
For each of the following pieces of code, predict what the output will be. Then, uncomment the line of code and see for yourself. We may not be able to cover these all in class; if so, make sure to try them on your own. Here's a Pandas Tutor link to visualize these!
# jack[1]
# jack[[1]]
# jack['1']
# jack[[1, 1]]
# jack.loc[1]
# jack.loc[jack[1] == 'fo']
# jack[1, ['1', 1]]
# jack.loc[1,1]
Question 🤔 (Answer at q.dsc80.com)
What questions do you have?
Adding and modifying columns¶
Adding and modifying columns, using a copy¶
- To add a new column to a DataFrame, use the `assign` method.
    - To change the values in a column, add a new column with the same name as the existing column.
- Like most `pandas` methods, `assign` returns a new DataFrame.
    - Pro ✅: This doesn't inadvertently change any existing variables.
    - Con ❌: It is not very space efficient, as it creates a new copy each time it is called.
dogs.assign(cost_per_year=dogs['lifetime_cost'] / dogs['longevity'])
dogs
💡 Pro-Tip: Method chaining¶
Chain methods together instead of writing long, hard-to-read lines.
# Finds the rows corresponding to the five cheapest to own breeds on a per-year basis.
(dogs
.assign(cost_per_year=dogs['lifetime_cost'] / dogs['longevity'])
.sort_values('cost_per_year')
.iloc[:5]
)
💡 Pro-Tip: `assign` for column names with special characters¶
You can also use `assign` when the desired column name has spaces (and other special characters) by unpacking a dictionary:
dogs.assign(**{'cost per year 💵': dogs['lifetime_cost'] / dogs['longevity']})
Adding and modifying columns, in-place¶
- You can assign a new column to a DataFrame in-place using `[]`.
    - This works like dictionary assignment.
    - This modifies the underlying DataFrame, unlike `assign`, which returns a new DataFrame.
- This is the more "common" way of adding/modifying columns.
- ⚠️ Warning: Exercise caution when using this approach, since this approach changes the values of existing variables.
# By default, .copy() returns a deep copy of the object it is called on,
# meaning that if you change the copy the original remains unmodified.
dogs_copy = dogs.copy()
dogs_copy.head(2)
dogs_copy['cost_per_year'] = dogs_copy['lifetime_cost'] / dogs_copy['longevity']
dogs_copy
Note that we never reassigned `dogs_copy` in the cell above – that is, we never wrote `dogs_copy = ...` – though it was still modified.
Mutability¶
DataFrames, like lists, arrays, and dictionaries, are mutable. As you learned in DSC 20, this means that they can be modified after being created. (For instance, the list `.append` method mutates in-place.)
Not only does this explain the behavior on the previous slide, but it also explains the following:
dogs_copy
def cost_in_thousands():
    dogs_copy['lifetime_cost'] = dogs_copy['lifetime_cost'] / 1000
# What happens when we run this twice?
cost_in_thousands()
dogs_copy
⚠️ Avoid mutation when possible¶
Note that `dogs_copy` was modified, even though we didn't reassign it! These unintended consequences can influence the behavior of test cases on labs and projects, among other things!
To avoid this, it's a good idea to avoid mutation when possible. If you must use mutation, include `df = df.copy()` as the first line in functions that take DataFrames as input.
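For example, here's a sketch of that defensive pattern (the function name is hypothetical):
def add_cost_per_year(df):
    # Work on a copy so the caller's DataFrame is never mutated.
    df = df.copy()
    df['cost_per_year'] = df['lifetime_cost'] / df['longevity']
    return df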
Also, some methods let you use the `inplace=True` argument to mutate the original. Don't use this argument, since future `pandas` releases plan to remove it.
`pandas` and `numpy`¶
`pandas` is built upon `numpy`!¶
- A Series in `pandas` is a `numpy` array with an index.
- A DataFrame is like a dictionary of columns, each of which is a `numpy` array.
- Many operations in `pandas` are fast because they use `numpy`'s implementations, which are written in fast languages like C.
- If you need to access the array underlying a DataFrame or Series, use the `to_numpy` method.
dogs['lifetime_cost']
dogs['lifetime_cost'].to_numpy()
`pandas` data types¶
- Each Series (column) has a `numpy` data type, which refers to the type of the values stored within. Access it using the `dtypes` attribute.
- A column's data type determines which operations can be applied to it.
- `pandas` tries to guess the correct data types for a given DataFrame, and is often wrong.
    - This can lead to incorrect calculations and poor memory/time performance.
- As a result, you will often need to explicitly convert between data types.
dogs
dogs.dtypes
`pandas` data types¶
Notice that Python `str` types are `object` types in `numpy` and `pandas`.
Pandas dtype | Python type | NumPy type | SQL type | Usage |
---|---|---|---|---|
int64 | int | int_, int8,...,int64, uint8,...,uint64 | INT, BIGINT | Integer numbers |
float64 | float | float_, float16, float32, float64 | FLOAT | Floating point numbers |
bool | bool | bool_ | BOOL | True/False values |
datetime64 or Timestamp | datetime.datetime | datetime64 | DATETIME | Date and time values |
timedelta64 or Timedelta | datetime.timedelta | timedelta64 | NA | Differences between two datetimes |
category | NA | NA | ENUM | Finite list of text values |
object | str | string, unicode | NA | Text |
object | NA | object | NA | Mixed types |
This article details how `pandas` stores different data types under the hood.
This article explains how `numpy`/`pandas` `int64` operations differ from vanilla `int` operations.
Type conversion¶
You can change the data type of a Series using the `.astype` Series method.
For example, we can change the data type of the `'lifetime_cost'` column in `dogs` to be `uint32`:
dogs
# Gives the types as well as the space taken up by the DataFrame.
dogs.info()
dogs['lifetime_cost'] = dogs['lifetime_cost'].astype('uint32')
Now, the DataFrame takes up less space! This may be insignificant in our DataFrame, but makes a difference when working with larger datasets.
dogs.info()
💡 Pro-Tip: Setting `dtype`s in `read_csv`¶
Usually, we prefer to set the correct `dtype`s in `read_csv`, since it can help `pandas` load in files more quickly:
dog_path
dogs = pd.read_csv(dog_path, dtype={'lifetime_cost': 'uint32'})
dogs
dogs.dtypes
Axes¶
- The rows and columns of a DataFrame are both stored as Series.
- The axis specifies the direction of a slice of a DataFrame.
- Axis 0 refers to the index (rows).
- Axis 1 refers to the columns.
- These are the same axes definitions that 2D `numpy` arrays have!
DataFrame methods with `axis`¶
- Many Series methods work on DataFrames.
- In such cases, the DataFrame method usually applies the Series method to every row or column.
- Many of these methods accept an `axis` argument; the default is usually `axis=0`.
dogs
# Max element in each column.
dogs.max()
# Max element in each row – a little nonsensical, since there are different types in each row.
dogs.max(axis=1)
# The number of unique values in each column.
dogs.nunique()
# describe doesn't accept an axis argument; it works on every numeric column in the DataFrame it is called on.
dogs.describe()
Exercise
Pick a dog breed that you personally like or know the name of. Then:
- Try to find a few other dog breeds that are similar in weight to yours in `all_dogs`.
- Which similar breeds have the lowest and highest `'lifetime_cost'`? `'intelligence_rank'`?
- Are there any similar breeds that you haven't heard of before?
For fun, look up these dog breeds on the AKC website to see what they look like!
all_dogs = pd.read_csv(Path('data') / 'all_dogs.csv')
all_dogs
# Your code goes here.
Summary, next time¶
Summary¶
- `pandas` is the library for tabular data manipulation in Python.
- There are three key data structures in `pandas`: DataFrame, Series, and Index.
- Refer to the lecture notebook and the `pandas` documentation for tips.
- `pandas` relies heavily on `numpy`. An understanding of how data types work in both will allow you to write more efficient and bug-free code.
- Series and DataFrames share many methods (refer to the `pandas` documentation for more details).
- Most `pandas` methods return copies of Series/DataFrames. Be careful when using techniques that modify values in-place.
- Next time: `groupby` and data granularity.