In [55]:
# You'll start seeing this cell in most lectures.
# It exists to hide all of the import statements and other setup
# code we need in lecture notebooks.
from dsc80_utils import *
In [56]:
%reload_ext pandas_tutor
%set_pandas_tutor_options {"maxDisplayCols": 8, "nohover": True, "projectorMode": True}

Lecture 2 – DataFrame Fundamentals¶

Agenda¶

  • numpy arrays.
  • From babypandas to pandas.
    • Deep dive into DataFrames.
  • Accessing subsets of rows and columns in DataFrames.
    • .loc and .iloc.
    • Querying (i.e. filtering).
  • Adding and modifying columns.
  • pandas and numpy.

We can't cover every single detail! The pandas documentation will be your friend.

numpy arrays¶

numpy overview¶

  • numpy stands for "numerical Python". It is a commonly-used Python module that enables fast computation involving arrays and matrices.
  • numpy's main object is the array. In numpy, arrays are:
    • Homogeneous – all values are of the same type (see the example after this list).
    • (Potentially) multi-dimensional.
  • Computation in numpy is fast because:
    • Much of it is implemented in C.
    • numpy arrays are stored more efficiently in memory than, say, Python lists.
  • This site provides a good overview of numpy arrays.
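For example, mixing types when constructing an array coerces everything to a single type (a quick sketch, using made-up values):

    np.array([1, 2, 3.5])   # the ints are upcast to floats: array([1. , 2. , 3.5])
    np.array([1, 'two'])    # everything becomes a string: array(['1', 'two'])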

We use numpy to work with sequences of data:

In [57]:
arr = np.arange(10)
arr
Out[57]:
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
In [58]:
# The shape (10,) means that the array only has a single dimension,
# of size 10.
arr.shape
Out[58]:
(10,)
In [59]:
2 ** arr
Out[59]:
array([  1,   2,   4,   8,  16,  32,  64, 128, 256, 512])

Arrays come equipped with several handy methods; some examples are below, but you can read about them all here.

In [60]:
(2 ** arr).sum()
Out[60]:
np.int64(1023)
In [61]:
(2 ** arr).mean()
Out[61]:
np.float64(102.3)
In [62]:
(2 ** arr).max()
Out[62]:
np.int64(512)
In [63]:
(2 ** arr).argmax()
Out[63]:
np.int64(9)

⚠️ The dangers of for-loops¶

  • for-loops are slow when processing large datasets. You will rarely write for-loops (except for Lab 1 and Project 1), and may be penalized on assignments for using them when unnecessary!
  • One of the biggest benefits of numpy is that it supports vectorized operations.
    • If a and b are two arrays of the same length, then a + b is a new array of the same length containing the element-wise sum of a and b (a short example appears at the end of this subsection).
  • To illustrate how much faster numpy arithmetic is than using a for-loop, let's compute the squares of the numbers between 0 and 1,000,000:
    • Using a for-loop.
    • Using vectorized arithmetic, through numpy.
In [64]:
%%timeit
squares = []
for i in range(1_000_000):
    squares.append(i * i)
32 ms ± 767 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)

In vanilla Python, this takes about 0.03 seconds per loop.

In [65]:
%%timeit
squares = np.arange(1_000_000) ** 2
1.1 ms ± 35.5 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)

In numpy, this only takes about 0.001 seconds per loop, roughly 30x faster! Note that under the hood, numpy is also using a for-loop, but it's a for-loop implemented in C, which is much faster than Python.
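As a quick aside, vectorized arithmetic isn't limited to squaring: any element-wise operation between two same-length arrays works the same way, as in the a + b case mentioned above. A small sketch with made-up arrays:

    a = np.array([1, 2, 3])
    b = np.array([10, 20, 30])
    a + b   # array([11, 22, 33])
    a * b   # array([10, 40, 90])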

Multi-dimensional arrays¶

While we didn't see these very often in DSC 10, multi-dimensional lists/arrays may have since come up in DSC 20, 30, or 40A (especially in the context of linear algebra).

We'll spend a bit of time talking about 2D (and 3D) arrays here, since in some ways, they behave similarly to DataFrames.

Below, we create a 2D array from scratch.

In [66]:
nums = np.array([
    [5, 1, 9, 7],
    [9, 8, 2, 3],
    [2, 5, 0, 4]
])

nums
Out[66]:
array([[5, 1, 9, 7],
       [9, 8, 2, 3],
       [2, 5, 0, 4]])
In [67]:
# nums has 3 rows and 4 columns.
nums.shape
Out[67]:
(3, 4)

We can also create 2D arrays by reshaping other arrays.

In [68]:
# Here, we're asking to reshape np.arange(1, 7)
# so that it has 2 rows and 3 columns.
a = np.arange(1, 7).reshape((2, 3))
a
Out[68]:
array([[1, 2, 3],
       [4, 5, 6]])

Operations along axes¶

In 2D arrays (and DataFrames), axis 0 refers to the rows (up and down) and axis 1 refers to the columns (left and right).

[Diagram: axis 0 points down, along the rows; axis 1 points across, along the columns.]
In [69]:
a
Out[69]:
array([[1, 2, 3],
       [4, 5, 6]])

If we specify axis=0, a.sum will "compress" along axis 0.

In [70]:
a.sum(axis=0)
Out[70]:
array([5, 7, 9])

If we specify axis=1, a.sum will "compress" along axis 1.

In [71]:
a.sum(axis=1)
Out[71]:
array([ 6, 15])

Selecting rows and columns from 2D arrays¶

You can use [square brackets] to slice rows and columns out of an array, using the same slicing conventions you saw in DSC 20.

In [72]:
a
Out[72]:
array([[1, 2, 3],
       [4, 5, 6]])
In [73]:
# Accesses row 0 and all columns.
a[0, :]
Out[73]:
array([1, 2, 3])
In [74]:
# Same as the above.
a[0]
Out[74]:
array([1, 2, 3])
In [75]:
# Accesses all rows and column 1.
a[:, 1]
Out[75]:
array([2, 5])
In [76]:
# Accesses row 0 and columns 1 and onwards.
a[0, 1:]
Out[76]:
array([2, 3])

Question 🤔

Try to predict the value of grid[-1, 1:].sum() without running the code below.

In [77]:
s = (5, 3)
grid = np.ones(s) * 2 * np.arange(1, 16).reshape(s)
# grid[-1, 1:].sum()

Example: Image processing¶

numpy arrays are homogeneous and potentially multi-dimensional.

It turns out that images can be represented as 3D numpy arrays. The color of each pixel can be described with three numbers under the RGB model – a red value, green value, and blue value. Raw image files store each of these as an integer from 0 to 255; below, we divide by 255 so that each value varies from 0 to 1.

[Diagram: the RGB color model (image source).]
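For instance, here's a tiny, hypothetical 2×2 image built by hand, in which each pixel is a length-3 RGB triple:

    # Top row: a red pixel and a green pixel.
    # Bottom row: a blue pixel and a white pixel.
    tiny_img = np.array([
        [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]],
        [[0.0, 0.0, 1.0], [1.0, 1.0, 1.0]],
    ])
    tiny_img.shape   # (2, 2, 3): height, width, color channels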
In [78]:
from PIL import Image
img_path = Path('imgs') / 'bentley.jpg'
img = np.asarray(Image.open(img_path)) / 255
In [79]:
img
Out[79]:
array([[[0.4 , 0.33, 0.24],
        [0.42, 0.35, 0.25],
        [0.43, 0.36, 0.26],
        ...,
        [0.5 , 0.44, 0.36],
        [0.51, 0.44, 0.36],
        [0.51, 0.44, 0.36]],

       [[0.39, 0.33, 0.23],
        [0.42, 0.36, 0.26],
        [0.44, 0.37, 0.27],
        ...,
        [0.51, 0.44, 0.36],
        [0.52, 0.45, 0.37],
        [0.52, 0.45, 0.38]],

       [[0.38, 0.31, 0.21],
        [0.41, 0.35, 0.24],
        [0.44, 0.37, 0.27],
        ...,
        [0.52, 0.45, 0.38],
        [0.53, 0.46, 0.39],
        [0.53, 0.47, 0.4 ]],

       ...,

       [[0.71, 0.64, 0.55],
        [0.71, 0.65, 0.55],
        [0.68, 0.62, 0.52],
        ...,
        [0.58, 0.49, 0.41],
        [0.56, 0.47, 0.39],
        [0.56, 0.47, 0.39]],

       [[0.5 , 0.44, 0.34],
        [0.42, 0.37, 0.26],
        [0.44, 0.38, 0.28],
        ...,
        [0.4 , 0.33, 0.25],
        [0.55, 0.48, 0.4 ],
        [0.58, 0.5 , 0.42]],

       [[0.38, 0.33, 0.22],
        [0.49, 0.44, 0.33],
        [0.56, 0.51, 0.4 ],
        ...,
        [0.15, 0.08, 0.  ],
        [0.28, 0.21, 0.13],
        [0.42, 0.35, 0.27]]])
In [80]:
img.shape
Out[80]:
(200, 263, 3)
In [81]:
plt.imshow(img)
plt.axis('off');
[Image: the original photo, displayed with plt.imshow.]

Applying a greyscale filter¶

One way to convert an image to greyscale is to average its red, green, and blue values.

In [82]:
mean_2d = img.mean(axis=2)
mean_2d
Out[82]:
array([[0.32, 0.34, 0.35, ..., 0.43, 0.44, 0.44],
       [0.31, 0.35, 0.36, ..., 0.44, 0.45, 0.45],
       [0.3 , 0.33, 0.36, ..., 0.45, 0.46, 0.47],
       ...,
       [0.64, 0.64, 0.6 , ..., 0.49, 0.47, 0.47],
       [0.43, 0.35, 0.37, ..., 0.32, 0.48, 0.5 ],
       [0.31, 0.42, 0.49, ..., 0.07, 0.21, 0.34]])

mean_2d now has just a single channel per pixel, so plt.imshow renders it with a default colormap instead of in greyscale!

In [83]:
plt.imshow(mean_2d)
plt.axis('off');
[Image: mean_2d displayed by plt.imshow, rendered with matplotlib's default colormap.]

We need to repeat mean_2d three times along axis 2, to use the same values for the red, green, and blue channels. np.repeat will help us here.

In [84]:
# np.newaxis is an alias for None.
# It helps us introduce an additional axis.
np.arange(5)[:, np.newaxis]
Out[84]:
array([[0],
       [1],
       [2],
       [3],
       [4]])
In [85]:
np.repeat(np.arange(5)[:, np.newaxis], 3, axis=1)
Out[85]:
array([[0, 0, 0],
       [1, 1, 1],
       [2, 2, 2],
       [3, 3, 3],
       [4, 4, 4]])
In [86]:
mean_3d = np.repeat(mean_2d[:, :, np.newaxis], 3, axis=2)
In [87]:
plt.imshow(mean_3d)
plt.axis('off');
[Image: the greyscale version of the photo.]

Applying a sepia filter¶

Let's sepia-fy Junior!

[Image: an example sepia-toned photo (Image credits).]

A sepia filter applies the following conversion to each pixel:

$$\begin{align*} R_{\text{sepia}} &= 0.393R + 0.769G + 0.189B \\ G_{\text{sepia}} &= 0.349R + 0.686G + 0.168B \\ B_{\text{sepia}} &= 0.272R + 0.534G + 0.131B\end{align*}$$
In [88]:
sepia_filter = np.array([
    [0.393, 0.769, 0.189],
    [0.349, 0.686, 0.168],
    [0.272, 0.534, 0.131]
])
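Why does img @ sepia_filter.T work? Each pixel is a length-3 RGB vector $p$, and its sepia version is sepia_filter @ p. Since img stores each pixel along its last axis, multiplying by the transpose applies the matrix to every pixel at once. A quick sanity check on a single, made-up pixel:

    p = np.array([0.5, 0.2, 0.1])   # a hypothetical pixel
    sepia_filter @ p                # transform one pixel directly
    p @ sepia_filter.T              # same result, in the row-vector form used below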
In [89]:
# Multiplies each pixel by the sepia_filter matrix.
# Then, clips each RGB value to be between 0 and 1.
filtered = (img @ sepia_filter.T).clip(0, 1)
filtered
Out[89]:
array([[[0.46, 0.41, 0.32],
        [0.48, 0.43, 0.33],
        [0.5 , 0.44, 0.35],
        ...,
        [0.6 , 0.53, 0.42],
        [0.6 , 0.54, 0.42],
        [0.61, 0.54, 0.42]],

       [[0.45, 0.4 , 0.31],
        [0.49, 0.43, 0.34],
        [0.5 , 0.45, 0.35],
        ...,
        [0.61, 0.54, 0.42],
        [0.62, 0.55, 0.43],
        [0.63, 0.56, 0.43]],

       [[0.43, 0.38, 0.3 ],
        [0.47, 0.42, 0.33],
        [0.51, 0.45, 0.35],
        ...,
        [0.63, 0.56, 0.44],
        [0.64, 0.57, 0.44],
        [0.64, 0.57, 0.45]],

       ...,

       [[0.88, 0.78, 0.61],
        [0.89, 0.79, 0.61],
        [0.84, 0.75, 0.58],
        ...,
        [0.68, 0.61, 0.47],
        [0.65, 0.58, 0.45],
        [0.65, 0.58, 0.45]],

       [[0.6 , 0.53, 0.42],
        [0.5 , 0.44, 0.35],
        [0.52, 0.46, 0.36],
        ...,
        [0.45, 0.4 , 0.31],
        [0.66, 0.59, 0.46],
        [0.69, 0.62, 0.48]],

       [[0.45, 0.4 , 0.31],
        [0.59, 0.53, 0.41],
        [0.69, 0.61, 0.48],
        ...,
        [0.12, 0.1 , 0.08],
        [0.3 , 0.26, 0.21],
        [0.48, 0.43, 0.33]]])
In [90]:
plt.imshow(filtered)
plt.axis('off');
[Image: the sepia-filtered photo.]

Key takeaway: avoid for-loops whenever possible!¶

You can do a lot without for-loops, both in numpy and in pandas.

pandas and numpy¶


pandas is built upon numpy!¶

  • A Series in pandas is a numpy array with an index.
  • A DataFrame is like a dictionary of columns, each of which is a numpy array (sketched right after this list).
  • Many operations in pandas are fast because they use numpy's implementations.
  • If you need to access the array underlying a DataFrame or Series, use the to_numpy method.
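Here's a minimal sketch of that dictionary-of-columns mental model, using made-up data:

    # Each column of a DataFrame is a Series, backed by a numpy array.
    df = pd.DataFrame({'a': [1, 2, 3], 'b': [4.0, 5.0, 6.0]})
    df['a']             # a Series
    df['a'].to_numpy()  # the underlying array([1, 2, 3])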
In [91]:
dog_path = Path('data') / 'dogs43.csv'
dogs = pd.read_csv(dog_path)
dogs
Out[91]:
breed kind lifetime_cost longevity size weight height
0 Brittany sporting 22589.0 12.92 medium 35.0 19.0
1 Cairn Terrier terrier 21992.0 13.84 small 14.0 10.0
2 English Cocker Spaniel sporting 18993.0 11.66 medium 30.0 16.0
... ... ... ... ... ... ... ...
40 Bullmastiff working 13936.0 7.57 large 115.0 25.5
41 Mastiff working 13581.0 6.50 large 175.0 30.0
42 Saint Bernard working 20022.0 7.78 large 155.0 26.5

43 rows × 7 columns

In [92]:
dogs['lifetime_cost']
Out[92]:
0     22589.0
1     21992.0
2     18993.0
       ...   
40    13936.0
41    13581.0
42    20022.0
Name: lifetime_cost, Length: 43, dtype: float64
In [93]:
dogs['lifetime_cost'].to_numpy()
Out[93]:
array([22589., 21992., 18993., ..., 13936., 13581., 20022.])

pandas data types¶

  • Each Series (column) has a numpy data type, which refers to the type of the values stored within. Access a single Series' type with its dtype attribute, or all of a DataFrame's column types at once with the DataFrame's dtypes attribute.
  • A column's data type determines which operations can be applied to it.
  • pandas tries to guess the correct data types for a given DataFrame, and is often wrong.
    • This can lead to incorrect calculations and poor memory/time performance, as the sketch after this list illustrates.
  • As a result, you will often need to explicitly convert between data types.
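For instance, ZIP codes look numeric, so pandas guesses int64 and silently drops their leading zeros. A small sketch, using a made-up in-memory CSV:

    import io
    csv = io.StringIO('zip\n02139\n92093')
    pd.read_csv(csv)['zip']                      # int64: 2139, 92093
    csv.seek(0)
    pd.read_csv(csv, dtype={'zip': str})['zip']  # object: '02139', '92093'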
In [94]:
dogs
Out[94]:
breed kind lifetime_cost longevity size weight height
0 Brittany sporting 22589.0 12.92 medium 35.0 19.0
1 Cairn Terrier terrier 21992.0 13.84 small 14.0 10.0
2 English Cocker Spaniel sporting 18993.0 11.66 medium 30.0 16.0
... ... ... ... ... ... ... ...
40 Bullmastiff working 13936.0 7.57 large 115.0 25.5
41 Mastiff working 13581.0 6.50 large 175.0 30.0
42 Saint Bernard working 20022.0 7.78 large 155.0 26.5

43 rows × 7 columns

In [95]:
dogs.dtypes
Out[95]:
breed             object
kind              object
lifetime_cost    float64
longevity        float64
size              object
weight           float64
height           float64
dtype: object

pandas data types¶

Notice that Python str types are object types in numpy and pandas.

| Pandas dtype | Python type | NumPy type | SQL type | Usage |
|---|---|---|---|---|
| int64 | int | int_, int8, ..., int64, uint8, ..., uint64 | INT, BIGINT | Integer numbers |
| float64 | float | float_, float16, float32, float64 | FLOAT | Floating point numbers |
| bool | bool | bool_ | BOOL | True/False values |
| datetime64 or Timestamp | datetime.datetime | datetime64 | DATETIME | Date and time values |
| timedelta64 or Timedelta | datetime.timedelta | timedelta64 | NA | Differences between two datetimes |
| category | NA | NA | ENUM | Finite list of text values |
| object | str | string, unicode | NA | Text |
| object | NA | object | NA | Mixed types |

This article details how pandas stores different data types under the hood.

This article explains how numpy/pandas int64 operations differ from vanilla int operations.
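In short, numpy ints are fixed-width and can overflow, while Python ints have arbitrary precision. A quick illustration:

    np.int64(2 ** 62) * 2   # overflows and wraps around to -9223372036854775808
    (2 ** 62) * 2           # Python int: 9223372036854775808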

Type conversion¶

You can change the data type of a Series using the .astype Series method.

For example, we can change the data type of the 'lifetime_cost' column in dogs to be uint32:

In [96]:
dogs
Out[96]:
breed kind lifetime_cost longevity size weight height
0 Brittany sporting 22589.0 12.92 medium 35.0 19.0
1 Cairn Terrier terrier 21992.0 13.84 small 14.0 10.0
2 English Cocker Spaniel sporting 18993.0 11.66 medium 30.0 16.0
... ... ... ... ... ... ... ...
40 Bullmastiff working 13936.0 7.57 large 115.0 25.5
41 Mastiff working 13581.0 6.50 large 175.0 30.0
42 Saint Bernard working 20022.0 7.78 large 155.0 26.5

43 rows × 7 columns

In [97]:
# Gives the types as well as the space taken up by the DataFrame.
dogs.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43 entries, 0 to 42
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   breed          43 non-null     object 
 1   kind           43 non-null     object 
 2   lifetime_cost  43 non-null     float64
 3   longevity      43 non-null     float64
 4   size           43 non-null     object 
 5   weight         43 non-null     float64
 6   height         43 non-null     float64
dtypes: float64(4), object(3)
memory usage: 2.5+ KB
In [98]:
dogs = dogs.assign(lifetime_cost=dogs['lifetime_cost'].astype('uint32'))

Now, the DataFrame takes up less space! This may be insignificant in our DataFrame, but makes a difference when working with larger datasets.

In [99]:
dogs.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43 entries, 0 to 42
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   breed          43 non-null     object 
 1   kind           43 non-null     object 
 2   lifetime_cost  43 non-null     uint32 
 3   longevity      43 non-null     float64
 4   size           43 non-null     object 
 5   weight         43 non-null     float64
 6   height         43 non-null     float64
dtypes: float64(3), object(3), uint32(1)
memory usage: 2.3+ KB

💡 Pro-Tip: Setting dtypes in read_csv¶

Usually, we prefer to set the correct dtypes in read_csv, since it can help pandas load files more quickly:

In [100]:
dog_path
Out[100]:
PosixPath('data/dogs43.csv')
In [101]:
dogs = pd.read_csv(dog_path, dtype={'lifetime_cost': 'uint32'})
dogs
Out[101]:
breed kind lifetime_cost longevity size weight height
0 Brittany sporting 22589 12.92 medium 35.0 19.0
1 Cairn Terrier terrier 21992 13.84 small 14.0 10.0
2 English Cocker Spaniel sporting 18993 11.66 medium 30.0 16.0
... ... ... ... ... ... ... ...
40 Bullmastiff working 13936 7.57 large 115.0 25.5
41 Mastiff working 13581 6.50 large 175.0 30.0
42 Saint Bernard working 20022 7.78 large 155.0 26.5

43 rows × 7 columns

In [102]:
dogs.dtypes
Out[102]:
breed             object
kind              object
lifetime_cost     uint32
longevity        float64
size              object
weight           float64
height           float64
dtype: object

Axes¶

  • The rows and columns of a DataFrame are both stored as Series.
  • The axis specifies the direction of a slice of a DataFrame.
[Diagram: in a DataFrame, axis 0 runs down the rows and axis 1 runs across the columns.]
  • Axis 0 refers to the index (rows).
  • Axis 1 refers to the columns.
  • These are the same axes definitions that 2D numpy arrays have!

DataFrame methods with axis¶

  • Many Series methods work on DataFrames.
  • In such cases, the DataFrame method usually applies the Series method to every row or column.
  • Many of these methods accept an axis argument; the default is usually axis=0.
In [103]:
dogs
Out[103]:
breed kind lifetime_cost longevity size weight height
0 Brittany sporting 22589 12.92 medium 35.0 19.0
1 Cairn Terrier terrier 21992 13.84 small 14.0 10.0
2 English Cocker Spaniel sporting 18993 11.66 medium 30.0 16.0
... ... ... ... ... ... ... ...
40 Bullmastiff working 13936 7.57 large 115.0 25.5
41 Mastiff working 13581 6.50 large 175.0 30.0
42 Saint Bernard working 20022 7.78 large 155.0 26.5

43 rows × 7 columns

In [104]:
# Max element in each column.
dogs.max()
Out[104]:
breed            Tibetan Terrier
kind                     working
lifetime_cost              26686
longevity                   16.5
size                       small
weight                     175.0
height                      30.0
dtype: object
In [105]:
# Max element in each row – throws an error since there are different types in each row.
# dogs.max(axis=1)
In [106]:
# The number of unique values in each column.
dogs.nunique()
Out[106]:
breed            43
kind              7
lifetime_cost    43
longevity        40
size              3
weight           37
height           30
dtype: int64
In [107]:
# describe doesn't accept an axis argument; it works on every numeric column in the DataFrame it is called on.
dogs.describe()
Out[107]:
lifetime_cost longevity weight height
count 43.00 43.00 43.00 43.00
mean 20532.84 11.34 49.35 18.34
std 3290.78 2.05 39.42 6.83
... ... ... ... ...
50% 21006.00 11.81 36.50 18.50
75% 22072.50 12.52 67.50 25.00
max 26686.00 16.50 175.00 30.00

8 rows × 4 columns

Exercise

Pick a dog breed that you personally like or know the name of. Then:
  • Try to find a few other dog breeds that are similar in weight to yours in all_dogs.
  • Which similar breeds have the lowest and highest 'lifetime_cost'? 'intelligence_rank'?
  • Are there any similar breeds that you haven't heard of before?

For fun, look up these dog breeds on the AKC website to see what they look like! (One possible starting point is sketched after the cell below.)
In [108]:
all_dogs = pd.read_csv(Path('data') / 'all_dogs.csv')
all_dogs
Out[108]:
breed group datadog popularity_all ... megarank size weight height
0 Border Collie herding 3.64 45 ... 29.0 medium NaN 20.0
1 Border Terrier terrier 3.61 80 ... 1.0 small 13.5 NaN
2 Brittany sporting 3.54 30 ... 11.0 medium 35.0 19.0
... ... ... ... ... ... ... ... ... ...
169 Wire Fox Terrier terrier NaN 100 ... NaN small 17.5 15.0
170 Wirehaired Pointing Griffon sporting NaN 92 ... NaN medium NaN 22.0
171 Xoloitzcuintli non-sporting NaN 155 ... NaN medium NaN 16.5

172 rows × 18 columns
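As a starting point for the exercise, here's one possible approach, sketched with Brittany as the chosen breed and an arbitrary ±5 pound window (the exercise implies that all_dogs also has 'lifetime_cost' and 'intelligence_rank' among the columns hidden above):

    my_weight = all_dogs.loc[all_dogs['breed'] == 'Brittany', 'weight'].iloc[0]
    similar = all_dogs[(all_dogs['weight'] - my_weight).abs() <= 5]
    similar.sort_values('lifetime_cost')[['breed', 'weight', 'lifetime_cost', 'intelligence_rank']]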

Data granularity and the groupby method¶

Example: Palmer Penguins¶

[Artwork: the three penguin species, by @allison_horst.]

The dataset we'll work with for the rest of the lecture involves various measurements taken of three species of penguins in Antarctica.

In [109]:
IFrame('https://www.youtube-nocookie.com/embed/CCrNAHXUstU?si=-DntSyUNp5Kwitjm&start=11',
       width=560, height=315)
Out[109]:
In [110]:
import seaborn as sns
penguins = sns.load_dataset('penguins').dropna()
penguins
Out[110]:
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex
0 Adelie Torgersen 39.1 18.7 181.0 3750.0 Male
1 Adelie Torgersen 39.5 17.4 186.0 3800.0 Female
2 Adelie Torgersen 40.3 18.0 195.0 3250.0 Female
... ... ... ... ... ... ... ...
341 Gentoo Biscoe 50.4 15.7 222.0 5750.0 Male
342 Gentoo Biscoe 45.2 14.8 212.0 5200.0 Female
343 Gentoo Biscoe 49.9 16.1 213.0 5400.0 Male

333 rows × 7 columns

Here, each row corresponds to a single penguin, and each column corresponds to a different attribute (or feature) we have for each penguin. Data formatted in this way is called tidy data.

Granularity¶

  • Granularity refers to what each observation in a dataset represents.
    • Fine: small details.
    • Coarse: bigger picture.
  • If you can control how your dataset is created, you should opt for finer granularity, i.e. for more detail.
    • You can always remove details, but it's difficult to add detail that isn't already there.
    • But obtaining fine-grained data can take more time/money.
  • Today, we'll focus on how to remove details from fine-grained data, in order to help us understand bigger-picture trends in our data.

Aggregating¶

Aggregating is the act of combining many values into a single value.

  • What is the mean 'body_mass_g' for all penguins?
In [111]:
penguins['body_mass_g'].mean()
Out[111]:
np.float64(4207.057057057057)
  • What is the mean 'body_mass_g' for each species?
In [112]:
# ???

Naive approach: looping through unique values¶

In [113]:
species_map = pd.Series([], dtype=float)

for species in penguins['species'].unique():
    species_only = penguins.loc[penguins['species'] == species]
    species_map.loc[species] = species_only['body_mass_g'].mean()

species_map
Out[113]:
Adelie       3706.16
Chinstrap    3733.09
Gentoo       5092.44
dtype: float64
  • For each unique 'species', we make a pass through the entire dataset.
    • The asymptotic runtime of this procedure is $\Theta(ns)$, where $n$ is the number of rows and $s$ is the number of unique species.
  • While there are other loop-based solutions that only involve a single pass over the DataFrame (one is sketched below), we'd like to avoid Python loops entirely, as they're slow.
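For reference, here's one such single-pass version, sketched for illustration (still slow, since the loop runs in Python):

    totals, counts = {}, {}
    for sp, mass in zip(penguins['species'], penguins['body_mass_g']):
        totals[sp] = totals.get(sp, 0) + mass
        counts[sp] = counts.get(sp, 0) + 1
    {sp: totals[sp] / counts[sp] for sp in totals}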

Grouping¶

A better solution is to use the groupby method.

In [114]:
# Before:
penguins['body_mass_g'].mean()
Out[114]:
np.float64(4207.057057057057)
In [115]:
# After:
penguins.groupby('species')['body_mass_g'].mean()
Out[115]:
species
Adelie       3706.16
Chinstrap    3733.09
Gentoo       5092.44
Name: body_mass_g, dtype: float64

Somehow, the groupby method computes what we're looking for in just one line. How?

In [116]:
%%pt

penguins.groupby('species')['body_mass_g'].mean()

"Split-apply-combine" paradigm¶

The groupby method involves three steps: split, apply, and combine. This is the same terminology that the pandas documentation uses.

[Diagram: the split-apply-combine process.]
  • Split breaks up and "groups" the rows of a DataFrame according to the specified key. There is one "group" for every unique value of the key.

  • Apply uses a function (e.g. aggregation, transformation, filtration) within the individual groups.

  • Combine stitches the results of these operations into an output DataFrame.

  • The split-apply-combine pattern can be parallelized to work on multiple computers or threads, by sending computations for each group to different processors.

More examples¶

Before we dive into the internals, let's look at a few more examples.

Question 🤔

What proportion of penguins of each 'species' live on 'Dream' island?

Your output should look like:

species
Adelie       0.38
Chinstrap    1.00
Gentoo       0.00
In [117]:
# Fill this in, then respond on dsc80.com/q

DataFrameGroupBy objects and aggregation¶

DataFrameGroupBy objects¶

We've just evaluated a few expressions of the following form.

In [118]:
penguins
Out[118]:
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex
0 Adelie Torgersen 39.1 18.7 181.0 3750.0 Male
1 Adelie Torgersen 39.5 17.4 186.0 3800.0 Female
2 Adelie Torgersen 40.3 18.0 195.0 3250.0 Female
... ... ... ... ... ... ... ...
341 Gentoo Biscoe 50.4 15.7 222.0 5750.0 Male
342 Gentoo Biscoe 45.2 14.8 212.0 5200.0 Female
343 Gentoo Biscoe 49.9 16.1 213.0 5400.0 Male

333 rows × 7 columns

In [119]:
penguins.groupby('species')['bill_length_mm'].mean()
Out[119]:
species
Adelie       38.82
Chinstrap    48.83
Gentoo       47.57
Name: bill_length_mm, dtype: float64

There are two method calls in the expression above: .groupby('species') and .mean(). What happens in the .groupby() call?

In [120]:
penguins.groupby('species')
Out[120]:
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x174cc8950>

Peeking under the hood¶

If df is a DataFrame, then df.groupby(key) returns a DataFrameGroupBy object.

This object represents the "split" in "split-apply-combine".

In [121]:
# Simplified DataFrame for demonstration:
penguins_small = penguins.iloc[[0, 150, 300, 1, 251, 151, 301], [0, 5, 6]]
penguins_small
Out[121]:
species body_mass_g sex
0 Adelie 3750.0 Male
156 Chinstrap 3725.0 Male
308 Gentoo 4875.0 Female
1 Adelie 3800.0 Female
258 Gentoo 4350.0 Female
157 Chinstrap 3950.0 Female
309 Gentoo 5550.0 Male
In [122]:
# Creates one group for each unique value in the species column.
penguin_groups = penguins_small.groupby('species')
penguin_groups
Out[122]:
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x174cc8150>
In [123]:
%%pt
penguin_groups

DataFrameGroupBy objects have a groups attribute, which is a dictionary in which the keys are group names and the values are lists of row labels.

In [124]:
penguin_groups.groups
Out[124]:
{'Adelie': [0, 1], 'Chinstrap': [156, 157], 'Gentoo': [308, 258, 309]}

DataFrameGroupBy objects also have a get_group(key) method, which returns a DataFrame with only the values for the given key.

In [125]:
penguin_groups.get_group('Chinstrap')
Out[125]:
species body_mass_g sex
156 Chinstrap 3725.0 Male
157 Chinstrap 3950.0 Female
In [126]:
# Same as the above!
penguins_small.query('species == "Chinstrap"')
Out[126]:
species body_mass_g sex
156 Chinstrap 3725.0 Male
157 Chinstrap 3950.0 Female

We usually don't use these attributes and methods, but they're useful in understanding how groupby works under the hood.

Aggregation¶

  • Once we create a DataFrameGroupBy object, we need to apply some function to each group, and combine the results.

  • The most common operation we apply to each group is an aggregation.

    • Remember, aggregation is the act of combining many values into a single value.
  • To perform an aggregation, use an aggregation method on the DataFrameGroupBy object, e.g. .mean(), .max(), or .median().

Let's look at some examples.

In [127]:
penguins_small
Out[127]:
species body_mass_g sex
0 Adelie 3750.0 Male
156 Chinstrap 3725.0 Male
308 Gentoo 4875.0 Female
1 Adelie 3800.0 Female
258 Gentoo 4350.0 Female
157 Chinstrap 3950.0 Female
309 Gentoo 5550.0 Male
In [128]:
penguins_small.groupby('species')['body_mass_g'].mean()
Out[128]:
species
Adelie       3775.0
Chinstrap    3837.5
Gentoo       4925.0
Name: body_mass_g, dtype: float64
In [129]:
# Whoa, what happened in the sex column?
penguins_small.groupby('species').sum()
Out[129]:
body_mass_g sex
species
Adelie 7550.0 MaleFemale
Chinstrap 7675.0 MaleFemale
Gentoo 14775.0 FemaleFemaleMale
In [130]:
penguins_small.groupby('species').last()
Out[130]:
body_mass_g sex
species
Adelie 3800.0 Female
Chinstrap 3950.0 Female
Gentoo 5550.0 Male
In [131]:
penguins_small.groupby('species').max()
Out[131]:
body_mass_g sex
species
Adelie 3800.0 Male
Chinstrap 3950.0 Male
Gentoo 5550.0 Male

Column independence¶

Within each group, the aggregation method is applied to each column independently.

In [132]:
penguins_small.groupby('species').max()
Out[132]:
body_mass_g sex
species
Adelie 3800.0 Male
Chinstrap 3950.0 Male
Gentoo 5550.0 Male

It is not telling us that there is a 'Male' 'Adelie' penguin with a 'body_mass_g' of 3800.0!

In [133]:
# This penguin is Female!
penguins_small.loc[(penguins_small['species'] == 'Adelie') & (penguins_small['body_mass_g'] == 3800.0)]
Out[133]:
species body_mass_g sex
1 Adelie 3800.0 Female

Question 🤔

Find the species, island, and body_mass_g of the heaviest Male and Female penguins in penguins (not penguins_small).

In [134]:
# Your code goes here.

Column selection and performance implications¶

  • By default, the aggregator will be applied to all columns that it can be applied to.
    • max, min, and sum are defined on strings, while median and mean are not.
  • If we only care about one column, we can select that column before aggregating to save time.
    • DataFrameGroupBy objects support [] notation, just like DataFrames.
In [135]:
# Back to the big penguins dataset!
penguins
Out[135]:
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex
0 Adelie Torgersen 39.1 18.7 181.0 3750.0 Male
1 Adelie Torgersen 39.5 17.4 186.0 3800.0 Female
2 Adelie Torgersen 40.3 18.0 195.0 3250.0 Female
... ... ... ... ... ... ... ...
341 Gentoo Biscoe 50.4 15.7 222.0 5750.0 Male
342 Gentoo Biscoe 45.2 14.8 212.0 5200.0 Female
343 Gentoo Biscoe 49.9 16.1 213.0 5400.0 Male

333 rows × 7 columns

In [136]:
# Works, but involves wasted effort since the other columns had to be aggregated for no reason.
penguins.groupby('species').sum()['bill_length_mm']
Out[136]:
species
Adelie       5668.3
Chinstrap    3320.7
Gentoo       5660.6
Name: bill_length_mm, dtype: float64
In [137]:
# This is a SeriesGroupBy object!
penguins.groupby('species')['bill_length_mm']
Out[137]:
<pandas.core.groupby.generic.SeriesGroupBy object at 0x172e61640>
In [138]:
# Saves time!
penguins.groupby('species')['bill_length_mm'].sum()
Out[138]:
species
Adelie       5668.3
Chinstrap    3320.7
Gentoo       5660.6
Name: bill_length_mm, dtype: float64

To demonstrate that the former is slower than the latter, we can use %%timeit. For reference, we'll also include our earlier for-loop-based solution.

In [139]:
%%timeit
penguins.groupby('species').sum()['bill_length_mm']
251 μs ± 2.72 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
In [140]:
%%timeit
penguins.groupby('species')['bill_length_mm'].sum()
82.3 μs ± 1.63 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
In [141]:
%%timeit
species_map = pd.Series([], dtype=float)

for species in penguins['species'].unique():
    species_only = penguins.loc[penguins['species'] == species]
    species_map.loc[species] = species_only['body_mass_g'].mean()

species_map
603 μs ± 8.79 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)

Takeaways¶

  • It's important to understand what each piece of your code evaluates to – in the first two timed examples, the code is almost identical, but the performance is quite different.

    # Slower
    penguins.groupby('species').sum()['bill_length_mm']
    
    # Faster
    penguins.groupby('species')['bill_length_mm'].sum()
    
  • The groupby method is much quicker than for-looping over the DataFrame in Python. It can often produce results using just a single, fast pass over the data, updating the sum, mean, count, min, or other aggregate for each group along the way.

  • You should always select the columns you want after groupby, unless you really know what you're doing!

Beyond default aggregation methods¶

  • There are many built-in aggregation methods.
  • What if you want to apply different aggregation methods to different columns?
  • What if the aggregation method you want to use doesn't already exist in pandas?

The aggregate method¶

  • The DataFrameGroupBy object has a general aggregate method, which aggregates using one or more operations.
    • Remember, aggregation is the act of combining many values into a single value.
  • There are many ways of using aggregate; refer to the documentation for a comprehensive list.
  • Example arguments:
    • A single function.
    • A list of functions.
    • A dictionary mapping column names to functions.
  • Per the documentation, agg is an alias for aggregate.

Example¶

How many penguins are there of each 'species', and what is the mean 'body_mass_g' of each 'species'?

In [142]:
(penguins
 .groupby('species')
 ['body_mass_g']
 .aggregate(['count', 'mean'])
)
Out[142]:
count mean
species
Adelie 146 3706.16
Chinstrap 68 3733.09
Gentoo 119 5092.44

Example¶

What is the maximum 'bill_length_mm' of each 'species', and on which 'island's is each 'species' found?

In [143]:
(penguins
 .groupby('species')
 .aggregate({'bill_length_mm': 'max', 'island': 'unique'})
)
Out[143]:
bill_length_mm island
species
Adelie 46.0 [Torgersen, Biscoe, Dream]
Chinstrap 58.0 [Dream]
Gentoo 59.6 [Biscoe]

Example¶

What is the interquartile range of the 'body_mass_g' of each 'species'?

In [144]:
# Here, the argument to agg is a function,
# which takes in a pd.Series and returns a scalar.

def iqr(s):
    return np.percentile(s, 75) - np.percentile(s, 25)

(penguins
 .groupby('species')
 ['body_mass_g']
 .agg(iqr)
)
Out[144]:
species
Adelie       637.5
Chinstrap    462.5
Gentoo       800.0
Name: body_mass_g, dtype: float64
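The same aggregation can also be written inline with a lambda:

    (penguins
     .groupby('species')
     ['body_mass_g']
     .agg(lambda s: np.percentile(s, 75) - np.percentile(s, 25))
    )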