In [ ]:
# You'll start seeing this cell in most lectures.
# It exists to hide all of the import statements and other setup
# code we need in lecture notebooks.
from dsc80_utils import *
In [ ]:
%reload_ext pandas_tutor
%set_pandas_tutor_options {"maxDisplayCols": 8, "nohover": True, "projectorMode": True}

Lecture 2 – DataFrame Fundamentals¶

Agenda¶

  • numpy arrays.
  • From babypandas to pandas.
    • Deep dive into DataFrames.
  • Accessing subsets of rows and columns in DataFrames.
    • .loc and .iloc.
    • Querying (i.e. filtering).
  • Adding and modifying columns.
  • pandas and numpy.

We can't cover every single detail! The pandas documentation will be your friend.

numpy arrays¶

numpy overview¶

  • numpy stands for "numerical Python". It is a commonly-used Python module that enables fast computation involving arrays and matrices.
  • numpy's main object is the array. In numpy, arrays are:
    • Homogeneous – all values are of the same type (demonstrated right after this list).
    • (Potentially) multi-dimensional.
  • Computation in numpy is fast because:
    • Much of it is implemented in C.
    • numpy arrays are stored more efficiently in memory than, say, Python lists.
  • This site provides a good overview of numpy arrays.
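A quick way to see homogeneity in action (a small sketch; the values are arbitrary):

In [ ]:
# Mixing types forces every value into a single common type.
np.array([1, 2.5]).dtype
In [ ]:
# Even one string turns the whole array into a string array.
np.array([1, 'two']).dtype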

We use numpy to work with sequences of data:

In [ ]:
arr = np.arange(10)
arr
In [ ]:
# The shape (10,) means that the array only has a single dimension,
# of size 10.
arr.shape
In [ ]:
2 ** arr

Arrays come equipped with several handy methods; some examples are below, but you can read about them all here.

In [ ]:
(2 ** arr).sum()
In [ ]:
(2 ** arr).mean()
In [ ]:
(2 ** arr).max()
In [ ]:
(2 ** arr).argmax()

⚠️ The dangers of for-loops¶

  • for-loops are slow when processing large datasets. You will rarely write for-loops (except for Lab 1 and Project 1), and may be penalized on assignments for using them when unnecessary!
  • One of the biggest benefits of numpy is that it supports vectorized operations.
    • If a and b are two arrays of the same length, then a + b is a new array of the same length containing the element-wise sum of a and b (sketched below).
  • To illustrate how much faster numpy arithmetic is than using a for-loop, let's compute the squares of the numbers between 0 and 1,000,000:
    • Using a for-loop.
    • Using vectorized arithmetic, through numpy.
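First, a quick sketch of the element-wise behavior mentioned above (the values are arbitrary):

In [ ]:
a = np.array([1, 2, 3])
b = np.array([10, 20, 30])
# Values are paired up by position; no Python loop needed.
a + b

Now, the timing comparison: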
In [ ]:
%%timeit
squares = []
for i in range(1_000_000):
    squares.append(i * i)

In vanilla Python, this takes about 0.04 seconds per loop.

In [ ]:
%%timeit
squares = np.arange(1_000_000) ** 2

In numpy, this only takes about 0.001 seconds per loop, more than 40x faster! Note that under the hood, numpy is also using a for-loop, but it's a for-loop implemented in C, which is much faster than Python.

Multi-dimensional arrays¶

While we didn't see these very often in DSC 10, multi-dimensional lists/arrays may have since come up in DSC 20, 30, or 40A (especially in the context of linear algebra).

We'll spend a bit of time talking about 2D (and 3D) arrays here, since in some ways, they behave similarly to DataFrames.

Below, we create a 2D array from scratch.

In [ ]:
nums = np.array([
    [5, 1, 9, 7],
    [9, 8, 2, 3],
    [2, 5, 0, 4]
])

nums
In [ ]:
# nums has 3 rows and 4 columns.
nums.shape

We can also create 2D arrays by reshaping other arrays.

In [ ]:
# Here, we're asking to reshape np.arange(1, 7)
# so that it has 2 rows and 3 columns.
a = np.arange(1, 7).reshape((2, 3))
a

Operations along axes¶

In 2D arrays (and DataFrames), axis 0 refers to the rows (up and down) and axis 1 refers to the columns (left and right).

In [ ]:
a

If we specify axis=0, a.sum will "compress" along axis 0.

In [ ]:
a.sum(axis=0)

If we specify axis=1, a.sum will "compress" along axis 1.

In [ ]:
a.sum(axis=1)

Selecting rows and columns from 2D arrays¶

You can use [square brackets] to slice rows and columns out of an array, using the same slicing conventions you saw in DSC 20.

In [ ]:
a
In [ ]:
# Accesses row 0 and all columns.
a[0, :]
In [ ]:
# Same as the above.
a[0]
In [ ]:
# Accesses all rows and column 1.
a[:, 1]
In [ ]:
# Accesses row 0 and columns 1 and onwards.
a[0, 1:]

Question 🤔

Try and predict the value of grid[-1, 1:].sum() without running the code below.

In [ ]:
s = (5, 3)
grid = np.ones(s) * 2 * np.arange(1, 16).reshape(s)
# grid[-1, 1:].sum()

Example: Image processing¶

numpy arrays are homogeneous and potentially multi-dimensional.

It turns out that images can be represented as 3D numpy arrays. The color of each pixel can be described with three numbers under the RGB model – a red value, green value, and blue value. Each of these can vary from 0 to 1.

(Image: the RGB color model; image source)
In [ ]:
from PIL import Image
img_path = Path('imgs') / 'bentley.jpg'
img = np.asarray(Image.open(img_path)) / 255
In [ ]:
img
In [ ]:
img.shape
In [ ]:
plt.imshow(img)
plt.axis('off');

Applying a greyscale filter¶

One way to convert an image to greyscale is to average its red, green, and blue values.

In [ ]:
mean_2d = img.mean(axis=2)
mean_2d

But mean_2d is just a single channel, so plt.imshow renders it with a default colormap rather than in greyscale:

In [ ]:
plt.imshow(mean_2d)
plt.axis('off');

We need to repeat mean_2d three times along axis 2, to use the same values for the red, green, and blue channels. np.repeat will help us here.

In [ ]:
# np.newaxis is an alias for None.
# It helps us introduce an additional axis.
np.arange(5)[:, np.newaxis]
In [ ]:
np.repeat(np.arange(5)[:, np.newaxis], 3, axis=1)
In [ ]:
mean_3d = np.repeat(mean_2d[:, :, np.newaxis], 3, axis=2)
In [ ]:
plt.imshow(mean_3d)
plt.axis('off');

Applying a sepia filter¶

Let's sepia-fy Junior!

(Image credits)

To do so, we apply the following conversion to each pixel:

$$\begin{align*} R_{\text{sepia}} &= 0.393R + 0.769G + 0.189B \\ G_{\text{sepia}} &= 0.349R + 0.686G + 0.168B \\ B_{\text{sepia}} &= 0.272R + 0.534G + 0.131B\end{align*}$$
In [ ]:
sepia_filter = np.array([
    [0.393, 0.769, 0.189],
    [0.349, 0.686, 0.168],
    [0.272, 0.534, 0.131]
])
In [ ]:
# Multiplies each pixel by the sepia_filter matrix.
# Then, clips each RGB value to be between 0 and 1.
filtered = (img @ sepia_filter.T).clip(0, 1)
filtered
In [ ]:
plt.imshow(filtered)
plt.axis('off');

Key takeaway: avoid for-loops whenever possible!¶

You can do a lot without for-loops, both in numpy and in pandas.

pandas and numpy¶


pandas is built upon numpy!¶

  • A Series in pandas is a numpy array with an index.
  • A DataFrame is like a dictionary of columns, each of which is a numpy array (sketched below).
  • Many operations in pandas are fast because they use numpy's implementations.
  • If you need to access the array underlying a DataFrame or Series, use the to_numpy method.
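To make the "dictionary of columns" picture concrete, here's a tiny sketch (the column names and values are made up for illustration):

In [ ]:
# Each column is a numpy array; the DataFrame bundles them
# together and attaches an index.
pd.DataFrame({
    'breed': np.array(['labrador', 'beagle', 'pug']),
    'weight': np.array([70, 25, 15]),
})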
In [ ]:
dog_path = Path('data') / 'dogs43.csv'
dogs = pd.read_csv(dog_path)
dogs
In [ ]:
dogs['lifetime_cost']
In [ ]:
dogs['lifetime_cost'].to_numpy()

pandas data types¶

  • Each Series (column) has a numpy data type, which refers to the type of the values stored within. Access a Series' type with its dtype attribute, or all columns' types at once with the DataFrame's dtypes attribute.
  • A column's data type determines which operations can be applied to it.
  • pandas tries to guess the correct data types for a given DataFrame, and is often wrong.
    • This can lead to incorrect calculations and poor memory/time performance.
  • As a result, you will often need to explicitly convert between data types.
In [ ]:
dogs
In [ ]:
dogs.dtypes

pandas data types¶

Notice that Python str types are object types in numpy and pandas.

| Pandas dtype | Python type | NumPy type | SQL type | Usage |
|---|---|---|---|---|
| int64 | int | int_, int8, ..., int64, uint8, ..., uint64 | INT, BIGINT | Integer numbers |
| float64 | float | float_, float16, float32, float64 | FLOAT | Floating point numbers |
| bool | bool | bool_ | BOOL | True/False values |
| datetime64 or Timestamp | datetime.datetime | datetime64 | DATETIME | Date and time values |
| timedelta64 or Timedelta | datetime.timedelta | timedelta64 | NA | Differences between two datetimes |
| category | NA | NA | ENUM | Finite list of text values |
| object | str | string, unicode | NA | Text |
| object | NA | object | NA | Mixed types |

This article details how pandas stores different data types under the hood.

This article explains how numpy/pandas int64 operations differ from vanilla int operations.
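One such difference, sketched quickly (the numbers are chosen specifically to trigger an overflow):

In [ ]:
# Python ints have arbitrary precision, so this is exact.
2 ** 62 * 2
In [ ]:
# numpy int64 values are fixed at 64 bits, so the same computation
# silently wraps around to a negative number.
np.array([2 ** 62]) * 2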

Type conversion¶

You can change the data type of a Series using the .astype Series method.

For example, we can change the data type of the 'lifetime_cost' column in dogs to be uint32:

In [ ]:
dogs
In [ ]:
# Gives the types as well as the space taken up by the DataFrame.
dogs.info()
In [ ]:
dogs = dogs.assign(lifetime_cost=dogs['lifetime_cost'].astype('uint32'))

Now, the DataFrame takes up less space! The savings are negligible for a dataset this small, but they matter when working with larger datasets.

In [ ]:
dogs.info()

💡 Pro-Tip: Setting dtypes in read_csv¶

Usually, we prefer to set the correct dtypes in read_csv, since it can help pandas load in files more quickly:

In [ ]:
dog_path
In [ ]:
dogs = pd.read_csv(dog_path, dtype={'lifetime_cost': 'uint32'})
dogs
In [ ]:
dogs.dtypes

Axes¶

  • The rows and columns of a DataFrame are both stored as Series (verified below).
  • The axis specifies the direction of a slice of a DataFrame.
  • Axis 0 refers to the index (rows).
  • Axis 1 refers to the columns.
  • These are the same axes definitions that 2D numpy arrays have!
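To verify that first point (using the dogs DataFrame from earlier):

In [ ]:
# A single row is a Series...
type(dogs.iloc[0])
In [ ]:
# ...and so is a single column.
type(dogs['lifetime_cost'])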

DataFrame methods with axis¶

  • Many Series methods work on DataFrames.
  • In such cases, the DataFrame method usually applies the Series method to every row or column.
  • Many of these methods accept an axis argument; the default is usually axis=0.
In [ ]:
dogs
In [ ]:
# Max element in each column.
dogs.max()
In [ ]:
# Max element in each row – throws an error since there are different types in each row.
# dogs.max(axis=1)
In [ ]:
# The number of unique values in each column.
dogs.nunique()
In [ ]:
# describe doesn't accept an axis argument; it works on every numeric column in the DataFrame it is called on.
dogs.describe()

Exercise

Pick a dog breed that you personally like or know the name of. Then:
  • Try to find a few other dog breeds that are similar in weight to yours in all_dogs.
  • Which similar breeds have the lowest and highest 'lifetime_cost'? 'intelligence_rank'?
  • Are there any similar breeds that you haven't heard of before?

For fun, look up these dog breeds on the AKC website to see what they look like!
In [ ]:
all_dogs = pd.read_csv(Path('data') / 'all_dogs.csv')
all_dogs

Data granularity and the groupby method¶

Example: Palmer Penguins¶

Artwork by @allison_horst

The dataset we'll work with for the rest of the lecture involves various measurements taken of three species of penguins in Antarctica.

In [ ]:
IFrame('https://www.youtube-nocookie.com/embed/CCrNAHXUstU?si=-DntSyUNp5Kwitjm&start=11',
       width=560, height=315)
In [ ]:
import seaborn as sns
penguins = sns.load_dataset('penguins').dropna()
penguins

Here, each row corresponds to a single penguin, and each column corresponds to a different attribute (or feature) we have for each penguin. Data formatted in this way is called tidy data.

Granularity¶

  • Granularity refers to what each observation in a dataset represents.
    • Fine: small details.
    • Coarse: bigger picture.
  • If you can control how your dataset is created, you should opt for finer granularity, i.e. for more detail.
    • You can always remove details, but it's difficult to add detail that isn't already there.
    • But obtaining fine-grained data can take more time/money.
  • Today, we'll focus on how to remove details from fine-grained data, in order to help us understand bigger-picture trends in our data.

Aggregating¶

Aggregating is the act of combining many values into a single value.

  • What is the mean 'body_mass_g' for all penguins?
In [ ]:
penguins['body_mass_g'].mean()
  • What is the mean 'body_mass_g' for each species?
In [ ]:
# ???

Naive approach: looping through unique values¶

In [ ]:
species_map = pd.Series([], dtype=float)

for species in penguins['species'].unique():
    species_only = penguins.loc[penguins['species'] == species]
    species_map.loc[species] = species_only['body_mass_g'].mean()

species_map
  • For each unique 'species', we make a pass through the entire dataset.
    • The asymptotic runtime of this procedure is $\Theta(ns)$, where $n$ is the number of rows and $s$ is the number of unique species.
  • While there are other loop-based solutions that only involve a single pass over the DataFrame (one is sketched below), we'd like to avoid Python loops entirely, as they're slow.
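For reference, here's a rough sketch of such a single-pass approach: it keeps a running sum and count per species, but it still loops in Python, so it remains slow.

In [ ]:
# One pass over the data: accumulate a sum and a count per species.
totals, counts = {}, {}
for species, mass in zip(penguins['species'], penguins['body_mass_g']):
    totals[species] = totals.get(species, 0) + mass
    counts[species] = counts.get(species, 0) + 1

pd.Series({species: totals[species] / counts[species] for species in totals})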

Grouping¶

A better solution is to use the groupby method.

In [ ]:
# Before:
penguins['body_mass_g'].mean()
In [ ]:
# After:
penguins.groupby('species')['body_mass_g'].mean()

Somehow, the groupby method computes what we're looking for in just one line. How?

In [ ]:
%%pt

penguins.groupby('species')['body_mass_g'].mean()

"Split-apply-combine" paradigm¶

The groupby method involves three steps: split, apply, and combine. This is the same terminology that the pandas documentation uses.

  • Split breaks up and "groups" the rows of a DataFrame according to the specified key. There is one "group" for every unique value of the key.

  • Apply uses a function (e.g. aggregation, transformation, filtration) within the individual groups.

  • Combine stitches the results of these operations into an output DataFrame.

  • The split-apply-combine pattern can be parallelized to work on multiple computers or threads, by sending computations for each group to different processors.
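To build intuition for the three steps, here's a minimal sketch that mimics them by hand (just for illustration; pandas' actual implementation is far more efficient):

In [ ]:
# Split: one sub-DataFrame per unique value of the key.
groups = {species: penguins[penguins['species'] == species]
          for species in penguins['species'].unique()}

# Apply: aggregate within each group independently.
means = {species: group['body_mass_g'].mean()
         for species, group in groups.items()}

# Combine: stitch the per-group results into a single output.
pd.Series(means)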

More examples¶

Before we dive into the internals, let's look at a few more examples.

Question 🤔

What proportion of penguins of each 'species' live on 'Dream' island?

Your output should look like:

species
Adelie       0.38
Chinstrap    1.00
Gentoo       0.00
In [ ]:
# Fill this in, then respond on dsc80.com/q

DataFrameGroupBy objects and aggregation¶

DataFrameGroupBy objects¶

We've just evaluated a few expressions of the following form.

In [ ]:
penguins
In [ ]:
penguins.groupby('species')['bill_length_mm'].mean()

There are two method calls in the expression above: .groupby('species') and .mean(). What happens in the .groupby() call?

In [ ]:
penguins.groupby('species')

Peeking under the hood¶

If df is a DataFrame, then df.groupby(key) returns a DataFrameGroupBy object.

This object represents the "split" in "split-apply-combine".

In [ ]:
# Simplified DataFrame for demonstration:
penguins_small = penguins.iloc[[0, 150, 300, 1, 251, 151, 301], [0, 5, 6]]
penguins_small
In [ ]:
# Creates one group for each unique value in the species column.
penguin_groups = penguins_small.groupby('species')
penguin_groups
In [ ]:
%%pt
penguin_groups

DataFrameGroupBy objects have a groups attribute, which is a dictionary in which the keys are group names and the values are lists of row labels.

In [ ]:
penguin_groups.groups

DataFrameGroupBy objects also have a get_group(key) method, which returns a DataFrame with only the values for the given key.

In [ ]:
penguin_groups.get_group('Chinstrap')
In [ ]:
# Same as the above!
penguins_small.query('species == "Chinstrap"')

We usually don't use these attributes and methods, but they're useful in understanding how groupby works under the hood.

Aggregation¶

  • Once we create a DataFrameGroupBy object, we need to apply some function to each group, and combine the results.

  • The most common operation we apply to each group is an aggregation.

    • Remember, aggregation is the act of combining many values into a single value.
  • To perform an aggregation, use an aggregation method on the DataFrameGroupBy object, e.g. .mean(), .max(), or .median().

Let's look at some examples.

In [ ]:
penguins_small
In [ ]:
penguins_small.groupby('species')['body_mass_g'].mean()
In [ ]:
# Whoa, what happened in the sex column?
penguins_small.groupby('species').sum()
In [ ]:
penguins_small.groupby('species').last()
In [ ]:
penguins_small.groupby('species').max()

Column independence¶

Within each group, the aggregation method is applied to each column independently.

In [ ]:
penguins_small.groupby('species').max()

It is not telling us that there is a 'Male' 'Adelie' penguin with a 'body_mass_g' of 3800.0!

In [ ]:
# This penguin is Female!
penguins_small.loc[(penguins_small['species'] == 'Adelie') & (penguins_small['body_mass_g'] == 3800.0)]

Question 🤔

Find the species, island, and body_mass_g of the heaviest Male and Female penguins in penguins (not penguins_small).

In [ ]:
# Your code goes here.

Column selection and performance implications¶

  • By default, the aggregator will be applied to all columns that it can be applied to.
    • max, min, and sum are defined on strings, while median and mean are not.
  • If we only care about one column, we can select that column before aggregating to save time.
    • DataFrameGroupBy objects support [] notation, just like DataFrames.
In [ ]:
# Back to the big penguins dataset!
penguins
In [ ]:
# Works, but involves wasted effort since the other columns had to be aggregated for no reason.
penguins.groupby('species').sum()['bill_length_mm']
In [ ]:
# This is a SeriesGroupBy object!
penguins.groupby('species')['bill_length_mm']
In [ ]:
# Saves time!
penguins.groupby('species')['bill_length_mm'].sum()

To demonstrate that the former is slower than the latter, we can use %%timeit. For reference, we'll also include our earlier for-loop-based solution.

In [ ]:
%%timeit
penguins.groupby('species').sum()['bill_length_mm']
In [ ]:
%%timeit
penguins.groupby('species')['bill_length_mm'].sum()
In [ ]:
%%timeit
species_map = pd.Series([], dtype=float)

for species in penguins['species'].unique():
    species_only = penguins.loc[penguins['species'] == species]
    species_map.loc[species] = species_only['body_mass_g'].mean()

species_map

Takeaways¶

  • It's important to understand what each piece of your code evaluates to – in the first two timed examples, the code is almost identical, but the performance is quite different.

    # Slower
    penguins.groupby('species').sum()['bill_length_mm']
    
    # Faster
    penguins.groupby('species')['bill_length_mm'].sum()
    
  • The groupby method is much quicker than for-looping over the DataFrame in Python. It can often produce results using just a single, fast pass over the data, updating the sum, mean, count, min, or other aggregate for each group along the way.

  • You should always select the columns you want after groupby, unless you really know what you're doing!

Beyond default aggregation methods¶

  • There are many built-in aggregation methods.
  • What if you want to apply different aggregation methods to different columns?
  • What if the aggregation method you want to use doesn't already exist in pandas?

The aggregate method¶

  • The DataFrameGroupBy object has a general aggregate method, which aggregates using one or more operations.
    • Remember, aggregation is the act of combining many values into a single value.
  • There are many ways of using aggregate; refer to the documentation for a comprehensive list.
  • Example arguments:
    • A single function.
    • A list of functions.
    • A dictionary mapping column names to functions.
  • Per the documentation, agg is an alias for aggregate.

Example¶

How many penguins are there of each 'species', and what is the mean 'body_mass_g' of each 'species'?

In [ ]:
(penguins
 .groupby('species')
 ['body_mass_g']
 .aggregate(['count', 'mean'])
)

Example¶

What is the maximum 'bill_length_mm' of each 'species', and on which 'island's is each 'species' found?

In [ ]:
(penguins
 .groupby('species')
 .aggregate({'bill_length_mm': 'max', 'island': 'unique'})
)

Example¶

What is the interquartile range of the 'body_mass_g' of each 'species'?

In [ ]:
# Here, the argument to agg is a function,
# which takes in a pd.Series and returns a scalar.

def iqr(s):
    return np.percentile(s, 75) - np.percentile(s, 25)

(penguins
 .groupby('species')
 ['body_mass_g']
 .agg(iqr)
)
In [ ]: