# You'll start seeing this cell in most lectures.
# It exists to hide all of the import statements and other setup
# code we need in lecture notebooks.
from dsc80_utils import *

dogs = pd.read_csv('data/dogs43.csv')
dogs.head(2)

whoa = np.random.choice([True, False], size=len(dogs))
(dogs[whoa]
 .groupby('size')
 .max()
 .get('longevity')
)

size
large     12.42
medium    12.54
small     16.50
Name: longevity, dtype: float64

arr = np.arange(10)
arr

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

# The shape (10,) means that the array only has a single dimension,
# of size 10.
arr.shape

(10,)

2 ** arr

array([  1,   2,   4,   8,  16,  32,  64, 128, 256, 512])

(2 ** arr).sum()

1023

(2 ** arr).mean()

102.3

(2 ** arr).max()

512

(2 ** arr).argmax()

9

%%timeit
squares = []
for i in range(1_000_000):
    squares.append(i * i)

64.3 ms ± 751 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

%%timeit
squares = np.arange(1_000_000) ** 2

421 µs ± 6.13 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)

nums = np.array([
    [5, 1, 9, 7],
    [9, 8, 2, 3],
    [2, 5, 0, 4]
])

nums

array([[5, 1, 9, 7],
       [9, 8, 2, 3],
       [2, 5, 0, 4]])

# nums has 3 rows and 4 columns.
nums.shape

(3, 4)

# Here, we're asking to reshape np.arange(1, 7)
# so that it has 2 rows and 3 columns.
a = np.arange(1, 7).reshape((2, 3))
a

array([[1, 2, 3],
       [4, 5, 6]])

a

array([[1, 2, 3],
       [4, 5, 6]])

a.sum(axis=0)

array([5, 7, 9])

a.sum(axis=1)

array([ 6, 15])

a

array([[1, 2, 3],
       [4, 5, 6]])

# Accesses row 0 and all columns.
a[0, :]

array([1, 2, 3])

# Same as the above.
a[0]

array([1, 2, 3])

# Accesses all rows and column 1.
a[:, 1]

array([2, 5])

# Accesses row 0 and columns 1 and onwards.
a[0, 1:]

array([2, 3])

s = (5, 3)
grid = np.ones(s) * 2 * np.arange(1, 16).reshape(s)
# grid[-1, 1:].sum()

from PIL import Image
img_path = Path('imgs') / 'bentley.jpg'
img = np.asarray(Image.open(img_path)) / 255

img

array([[[0.4 , 0.33, 0.24],
        [0.42, 0.35, 0.25],
        [0.43, 0.36, 0.27],
        ...,
        [0.5 , 0.44, 0.36],
        [0.51, 0.44, 0.36],
        [0.51, 0.44, 0.36]],

       [[0.39, 0.33, 0.22],
        [0.42, 0.36, 0.25],
        [0.44, 0.37, 0.27],
        ...,
        [0.51, 0.44, 0.37],
        [0.52, 0.45, 0.38],
        [0.52, 0.45, 0.38]],

       [[0.38, 0.31, 0.21],
        [0.41, 0.35, 0.24],
        [0.44, 0.37, 0.27],
        ...,
        [0.52, 0.45, 0.38],
        [0.53, 0.46, 0.39],
        [0.53, 0.47, 0.4 ]],

       ...,

       [[0.71, 0.65, 0.55],
        [0.72, 0.65, 0.55],
        [0.68, 0.62, 0.52],
        ...,
        [0.58, 0.49, 0.42],
        [0.56, 0.47, 0.39],
        [0.56, 0.47, 0.39]],

       [[0.5 , 0.44, 0.34],
        [0.43, 0.36, 0.26],
        [0.44, 0.38, 0.28],
        ...,
        [0.4 , 0.33, 0.25],
        [0.55, 0.48, 0.4 ],
        [0.58, 0.5 , 0.42]],

       [[0.38, 0.33, 0.22],
        [0.49, 0.44, 0.33],
        [0.56, 0.51, 0.4 ],
        ...,
        [0.14, 0.08, 0.01],
        [0.27, 0.22, 0.14],
        [0.41, 0.35, 0.27]]])

img.shape

(200, 263, 3)

plt.imshow(img)
plt.axis('off');

mean_2d = img.mean(axis=2)
mean_2d

array([[0.32, 0.34, 0.35, ..., 0.43, 0.44, 0.44],
       [0.31, 0.34, 0.36, ..., 0.44, 0.45, 0.45],
       [0.3 , 0.33, 0.36, ..., 0.45, 0.46, 0.47],
       ...,
       [0.64, 0.64, 0.6 , ..., 0.49, 0.47, 0.47],
       [0.43, 0.35, 0.37, ..., 0.32, 0.48, 0.5 ],
       [0.31, 0.42, 0.49, ..., 0.08, 0.21, 0.34]])

plt.imshow(mean_2d)
plt.axis('off');

# np.newaxis is an alias for None.
# It helps us introduce an additional axis.
np.arange(5)[:, np.newaxis]

array([[0],
       [1],
       [2],
       [3],
       [4]])

np.repeat(np.arange(5)[:, np.newaxis], 3, axis=1)

array([[0, 0, 0],
       [1, 1, 1],
       [2, 2, 2],
       [3, 3, 3],
       [4, 4, 4]])

mean_3d = np.repeat(mean_2d[:, :, np.newaxis], 3, axis=2)

plt.imshow(mean_3d)
plt.axis('off');

sepia_filter = np.array([
    [0.393, 0.769, 0.189],
    [0.349, 0.686, 0.168],
    [0.272, 0.534, 0.131]
])

# Multiplies each pixel by the sepia_filter matrix.
# Then, clips each RGB value to be between 0 and 1.
filtered = (img @ sepia_filter.T).clip(0, 1)
filtered

array([[[0.46, 0.41, 0.32],
        [0.48, 0.43, 0.33],
        [0.5 , 0.44, 0.34],
        ...,
        [0.6 , 0.53, 0.42],
        [0.6 , 0.54, 0.42],
        [0.61, 0.54, 0.42]],

       [[0.45, 0.4 , 0.31],
        [0.49, 0.43, 0.34],
        [0.5 , 0.45, 0.35],
        ...,
        [0.61, 0.54, 0.42],
        [0.62, 0.55, 0.43],
        [0.63, 0.56, 0.44]],

       [[0.43, 0.38, 0.3 ],
        [0.47, 0.42, 0.33],
        [0.51, 0.45, 0.35],
        ...,
        [0.63, 0.56, 0.44],
        [0.64, 0.57, 0.44],
        [0.64, 0.57, 0.45]],

       ...,

       [[0.88, 0.78, 0.61],
        [0.88, 0.79, 0.61],
        [0.84, 0.75, 0.58],
        ...,
        [0.68, 0.6 , 0.47],
        [0.65, 0.58, 0.45],
        [0.65, 0.58, 0.45]],

       [[0.6 , 0.53, 0.42],
        [0.5 , 0.44, 0.35],
        [0.52, 0.46, 0.36],
        ...,
        [0.45, 0.4 , 0.31],
        [0.66, 0.59, 0.46],
        [0.69, 0.62, 0.48]],

       [[0.45, 0.4 , 0.31],
        [0.59, 0.53, 0.41],
        [0.69, 0.61, 0.48],
        ...,
        [0.12, 0.11, 0.08],
        [0.3 , 0.27, 0.21],
        [0.48, 0.43, 0.33]]])

plt.imshow(filtered)
plt.axis('off');

import pandas as pd
import numpy as np

# You'll see the Path(...) / subpath syntax a lot.
# It creates the correct path to your file, 
# whether you're using Windows, macOS, or Linux.
dog_path = Path('data') / 'dogs43.csv'
dogs = pd.read_csv(dog_path)
dogs

dogs.head(3)

dogs.tail(2)

dogs.shape

(43, 7)

# The default index of a DataFrame is 0, 1, 2, 3, ...
dogs.index

RangeIndex(start=0, stop=43, step=1)

dogs.get('breed')

0                   Brittany
1              Cairn Terrier
2     English Cocker Spaniel
               ...          
40               Bullmastiff
41                   Mastiff
42             Saint Bernard
Name: breed, Length: 43, dtype: object

dogs.get(['breed', 'kind', 'longevity'])

# Note that the index is no longer 0, 1, 2, ...!
dogs.sort_values('height', ascending=False)

# This sorts by 'height', 
# then breaks ties by 'longevity'.
# Note the difference in the last three rows between
# this DataFrame and the one above.
dogs.sort_values(['height', 'longevity'],
                 ascending=False)

dogs

dogs.set_index('breed')

# The above cell didn't involve an assignment statement,
# so dogs was unchanged.
dogs

# By reassigning dogs, our changes will persist.
dogs = dogs.set_index('breed')
dogs

# There used to be 7 columns, but now there are only 6!
dogs.shape

(43, 6)

from IPython.display import display
def display_df(df, rows=pd.options.display.max_rows, cols=pd.options.display.max_columns):
    """Display df with temporarily overridden row/column display limits.

    While df is being displayed, pandas' "display.max_rows" option is set
    to `rows` and "display.max_columns" is set to `cols`; the override only
    applies inside the `with` block below.

    Note that the default values of `rows` and `cols` are read from
    pd.options once, when this function is defined, not on every call.
    """
    overrides = ("display.max_rows", rows, "display.max_columns", cols)
    with pd.option_context(*overrides):
        display(df)

display_df(dogs.sort_values('weight', ascending=False),
           rows=43)

dogs

dogs.get('size')

breed
Brittany                  medium
Cairn Terrier              small
English Cocker Spaniel    medium
                           ...  
Bullmastiff                large
Mastiff                    large
Saint Bernard              large
Name: size, Length: 43, dtype: object

# This doesn't error – .get returns None when the column doesn't exist – but sometimes we'd like it to.
dogs.get('size oops!')

dogs

# Returns a Series.
dogs['kind']

breed
Brittany                  sporting
Cairn Terrier              terrier
English Cocker Spaniel    sporting
                            ...   
Bullmastiff                working
Mastiff                    working
Saint Bernard              working
Name: kind, Length: 43, dtype: object

# Returns a DataFrame.
dogs[['kind', 'size']]

# 🤔
dogs[['kind']]

# Breeds are stored in the index, which is not a column!
dogs['breed']

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
File ~/mambaforge/envs/dsc80/lib/python3.8/site-packages/pandas/core/indexes/base.py:3361, in Index.get_loc(self, key, method, tolerance)
   3360 try:
-> 3361     return self._engine.get_loc(casted_key)
   3362 except KeyError as err:

File ~/mambaforge/envs/dsc80/lib/python3.8/site-packages/pandas/_libs/index.pyx:76, in pandas._libs.index.IndexEngine.get_loc()

File ~/mambaforge/envs/dsc80/lib/python3.8/site-packages/pandas/_libs/index.pyx:108, in pandas._libs.index.IndexEngine.get_loc()

File pandas/_libs/hashtable_class_helper.pxi:5198, in pandas._libs.hashtable.PyObjectHashTable.get_item()

File pandas/_libs/hashtable_class_helper.pxi:5206, in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'breed'

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
Cell In[62], line 2
      1 # Breeds are stored in the index, which is not a column!
----> 2 dogs['breed']

File ~/mambaforge/envs/dsc80/lib/python3.8/site-packages/pandas/core/frame.py:3458, in DataFrame.__getitem__(self, key)
   3456 if self.columns.nlevels > 1:
   3457     return self._getitem_multilevel(key)
-> 3458 indexer = self.columns.get_loc(key)
   3459 if is_integer(indexer):
   3460     indexer = [indexer]

File ~/mambaforge/envs/dsc80/lib/python3.8/site-packages/pandas/core/indexes/base.py:3363, in Index.get_loc(self, key, method, tolerance)
   3361         return self._engine.get_loc(casted_key)
   3362     except KeyError as err:
-> 3363         raise KeyError(key) from err
   3365 if is_scalar(key) and isna(key) and not self.hasnans:
   3366     raise KeyError(key)

KeyError: 'breed'

dogs.index

Index(['Brittany', 'Cairn Terrier', 'English Cocker Spaniel', 'Cocker Spaniel',
       'Shetland Sheepdog', 'Siberian Husky', 'Lhasa Apso',
       'Miniature Schnauzer', 'Chihuahua', 'English Springer Spaniel',
       'German Shorthaired Pointer', 'Pointer', 'Tibetan Spaniel',
       'Labrador Retriever', 'Maltese', 'Shih Tzu', 'Irish Setter',
       'Golden Retriever', 'Chesapeake Bay Retriever', 'Tibetan Terrier',
       'Gordon Setter', 'Pug', 'Norfolk Terrier', 'English Toy Spaniel',
       'Cavalier King Charles Spaniel', 'Basenji',
       'Staffordshire Bull Terrier', 'Pembroke Welsh Corgi', 'Clumber Spaniel',
       'Dandie Dinmont Terrier', 'Giant Schnauzer', 'Scottish Terrier',
       'Kerry Blue Terrier', 'Afghan Hound', 'Newfoundland',
       'Rhodesian Ridgeback', 'Borzoi', 'Bull Terrier', 'Alaskan Malamute',
       'Bloodhound', 'Bullmastiff', 'Mastiff', 'Saint Bernard'],
      dtype='object', name='breed')

dogs

# What are the unique kinds of dogs?
dogs['kind'].unique()

array(['sporting', 'terrier', 'herding', 'working', 'non-sporting', 'toy',
       'hound'], dtype=object)

# How many unique kinds of dogs are there?
dogs['kind'].nunique()

7

# What's the distribution of kinds?
dogs['kind'].value_counts()

sporting        12
terrier          8
working          7
toy              6
hound            5
non-sporting     3
herding          2
Name: kind, dtype: int64

# What's the mean of the 'longevity' column?
dogs['longevity'].mean()

11.340697674418605

# Tell me more about the 'weight' column.
dogs['weight'].describe()

count     43.00
mean      49.35
std       39.42
          ...  
50%       36.50
75%       67.50
max      175.00
Name: weight, Length: 8, dtype: float64

# Sort the 'lifetime_cost' column. Note that here we're using sort_values on a Series, not a DataFrame!
dogs['lifetime_cost'].sort_values()

breed
Mastiff                       13581.0
Bloodhound                    13824.0
Bullmastiff                   13936.0
                               ...   
German Shorthaired Pointer    25842.0
Chihuahua                     26250.0
Giant Schnauzer               26686.0
Name: lifetime_cost, Length: 43, dtype: float64

# Gives us the index of the largest value, not the largest value itself.
dogs['lifetime_cost'].idxmax()

'Giant Schnauzer'

dogs

# The first argument is the row label.
#        ↓
dogs.loc['Pug', 'longevity']
#                  ↑
# The second argument is the column label.

11.0

type(dogs.loc)

pandas.core.indexing._LocIndexer

type(dogs.sort_values)

method

# Pandas Tutor setup. You'll need to run `pip install pandas_tutor` in your Terminal
# for this cell to work, but you can also ignore the error and continue onward.
%reload_ext pandas_tutor
%set_pandas_tutor_options {"maxDisplayCols": 8, "nohover": True, "projectorMode": True}

%%pt
dogs.loc['Pug', 'longevity']

dogs

dogs.loc[['Cocker Spaniel', 'Labrador Retriever'], 'size']

breed
Cocker Spaniel         small
Labrador Retriever    medium
Name: size, dtype: object

dogs.loc[['Cocker Spaniel', 'Labrador Retriever'], ['kind', 'size', 'height']]

# Unlike regular Python slices, label-based slices in loc include both endpoints,
# so the 'weight' column is included!
dogs.loc[['Cocker Spaniel', 'Labrador Retriever'], 'lifetime_cost': 'weight']

dogs.loc[['Cocker Spaniel', 'Labrador Retriever'], :]

# Shortcut for the line above.
dogs.loc[['Cocker Spaniel', 'Labrador Retriever']]

dogs

dogs.loc[dogs['weight'] < 10]

dogs.loc[dogs.index.str.contains('Retriever')]

# Because querying is so common, there's a shortcut:
dogs[dogs.index.str.contains('Retriever')]

# Empty DataFrame – not an error!
dogs.loc[dogs['kind'] == 'beaver']

dogs

# Series!
dogs.loc['Maltese']

kind                 toy
lifetime_cost    19084.0
longevity          12.25
size               small
weight               5.0
height               9.0
Name: Maltese, dtype: object

dogs_reset = dogs.reset_index()
dogs_reset

# DataFrame!
dogs_reset[dogs_reset['breed'] == 'Maltese']

dogs

dogs[(dogs['weight'] < 20) & (dogs['kind'] == 'terrier')]

dogs

dogs.query('weight < 20 and kind == "terrier"')

dogs.query('kind in ["sporting", "terrier"] and lifetime_cost < 20000')

dogs

dogs.iloc[1:15, :-2]

dogs.sort_values('longevity', ascending=False)['weight'].iloc[0]

5.5

# Finding the breed itself involves sorting, but not iloc.
dogs.sort_values('longevity', ascending=False).index[0]

'Chihuahua'

jack = pd.DataFrame({1: ['fee', 'fi'], 
                     '1': ['fo', 'fum']})
jack

# jack[1]

# jack[[1]]

# jack['1']

# jack[[1, 1]]

# jack.loc[1]

# jack.loc[jack[1] == 'fo']

# jack[1, ['1', 1]]

# jack.loc[1,1]

dogs.assign(cost_per_year=dogs['lifetime_cost'] / dogs['longevity'])

dogs

# Finds the rows corresponding to the five cheapest to own breeds on a per-year basis.
(dogs
 .assign(cost_per_year=dogs['lifetime_cost'] / dogs['longevity'])
 .sort_values('cost_per_year')
 .iloc[:5]
)

dogs.assign(**{'cost per year 💵': dogs['lifetime_cost'] / dogs['longevity']})

# By default, .copy() returns a deep copy of the object it is called on,
# meaning that if you change the copy the original remains unmodified.
dogs_copy = dogs.copy()
dogs_copy.head(2)

dogs_copy['cost_per_year'] = dogs_copy['lifetime_cost'] / dogs_copy['longevity']
dogs_copy

dogs_copy

def cost_in_thousands():
    """Divide the 'lifetime_cost' column of dogs_copy by 1000, in place.

    Takes no arguments and returns nothing: the result is written back into
    the module-level dogs_copy DataFrame, so each call divides the values
    that are currently stored by 1000 again.
    """
    scaled = dogs_copy['lifetime_cost'] / 1000
    dogs_copy['lifetime_cost'] = scaled

# What happens when we run this twice?
cost_in_thousands()

dogs_copy

dogs['lifetime_cost']

breed
Brittany                  22589.0
Cairn Terrier             21992.0
English Cocker Spaniel    18993.0
                           ...   
Bullmastiff               13936.0
Mastiff                   13581.0
Saint Bernard             20022.0
Name: lifetime_cost, Length: 43, dtype: float64

dogs['lifetime_cost'].to_numpy()

array([22589., 21992., 18993., ..., 13936., 13581., 20022.])

dogs

dogs.dtypes

kind              object
lifetime_cost    float64
longevity        float64
size              object
weight           float64
height           float64
dtype: object

dogs

# Gives the types as well as the space taken up by the DataFrame.
dogs.info()

<class 'pandas.core.frame.DataFrame'>
Index: 43 entries, Brittany to Saint Bernard
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   kind           43 non-null     object 
 1   lifetime_cost  43 non-null     float64
 2   longevity      43 non-null     float64
 3   size           43 non-null     object 
 4   weight         43 non-null     float64
 5   height         43 non-null     float64
dtypes: float64(4), object(2)
memory usage: 3.4+ KB

dogs['lifetime_cost'] = dogs['lifetime_cost'].astype('uint32')

dogs.info()

<class 'pandas.core.frame.DataFrame'>
Index: 43 entries, Brittany to Saint Bernard
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   kind           43 non-null     object 
 1   lifetime_cost  43 non-null     uint32 
 2   longevity      43 non-null     float64
 3   size           43 non-null     object 
 4   weight         43 non-null     float64
 5   height         43 non-null     float64
dtypes: float64(3), object(2), uint32(1)
memory usage: 3.2+ KB

dog_path

PosixPath('data/dogs43.csv')

dogs = pd.read_csv(dog_path, dtype={'lifetime_cost': 'uint32'})
dogs

dogs.dtypes

breed             object
kind              object
lifetime_cost     uint32
longevity        float64
size              object
weight           float64
height           float64
dtype: object

dogs

# Max element in each column.
dogs.max()

breed            Tibetan Terrier
kind                     working
lifetime_cost              26686
longevity                   16.5
size                       small
weight                     175.0
height                      30.0
dtype: object

# Max element in each row – a little nonsensical, since there are different types in each row.
# In this pandas version the non-numeric columns are dropped with a FutureWarning, which
# says a future version will raise a TypeError instead – select only the numeric columns
# before calling the reduction if you need this to keep working.
dogs.max(axis=1)

/var/folders/63/35_wxty956bfzx41wxtfm3pc0000gn/T/ipykernel_30329/342781375.py:2: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.
  dogs.max(axis=1)

0     22589.0
1     21992.0
2     18993.0
       ...   
40    13936.0
41    13581.0
42    20022.0
Length: 43, dtype: float64

# The number of unique values in each column.
dogs.nunique()

breed            43
kind              7
lifetime_cost    43
longevity        40
size              3
weight           37
height           30
dtype: int64

# describe doesn't accept an axis argument; it works on every numeric column in the DataFrame it is called on.
dogs.describe()

all_dogs = pd.read_csv(Path('data') / 'all_dogs.csv')
all_dogs

# Your code goes here.

Pandas dtype	Python type	NumPy type	SQL type	Usage
int64	int	int_, int8,...,int64, uint8,...,uint64	INT, BIGINT	Integer numbers
float64	float	float_, float16, float32, float64	FLOAT	Floating point numbers
bool	bool	bool_	BOOL	True/False values
datetime64 or Timestamp	datetime.datetime	datetime64	DATETIME	Date and time values
timedelta64 or Timedelta	datetime.timedelta	timedelta64	NA	Differences between two datetimes
category	NA	NA	ENUM	Finite list of text values
object	str	string, unicode	NA	Text
object	NA	object	NA	Mixed types

	breed	kind	lifetime_cost	longevity	size	weight	height
0	Brittany	sporting	22589.0	12.92	medium	35.0	19.0
1	Cairn Terrier	terrier	21992.0	13.84	small	14.0	10.0

	breed	kind	lifetime_cost	longevity	size	weight	height
41	Mastiff	working	13581.0	6.50	large	175.0	30.0
42	Saint Bernard	working	20022.0	7.78	large	155.0	26.5

	kind	lifetime_cost	longevity	size	weight	height
breed
Brittany	sporting	22589.0	12.92	medium	35.0	19.0
Cairn Terrier	terrier	21992.0	13.84	small	14.0	10.0
English Cocker Spaniel	sporting	18993.0	11.66	medium	30.0	16.0
...	...	...	...	...	...	...
Bullmastiff	working	13936.0	7.57	large	115.0	25.5
Mastiff	working	13581.0	6.50	large	175.0	30.0
Saint Bernard	working	20022.0	7.78	large	155.0	26.5

	kind	size	height
breed
Cocker Spaniel	sporting	small	14.5
Labrador Retriever	sporting	medium	23.0

	lifetime_cost	longevity	size	weight
breed
Cocker Spaniel	24330.0	12.50	small	25.0
Labrador Retriever	21299.0	12.04	medium	67.5

	kind	lifetime_cost	longevity	size	weight	height
breed
Chihuahua	toy	26250.0	16.50	small	5.5	5.0
Maltese	toy	19084.0	12.25	small	5.0	9.0

	breed	kind	lifetime_cost	longevity	size	weight	height
0	Brittany	sporting	22589	12.92	medium	35.0	19.0
1	Cairn Terrier	terrier	21992	13.84	small	14.0	10.0
2	English Cocker Spaniel	sporting	18993	11.66	medium	30.0	16.0
...	...	...	...	...	...	...	...
40	Bullmastiff	working	13936	7.57	large	115.0	25.5
41	Mastiff	working	13581	6.50	large	175.0	30.0
42	Saint Bernard	working	20022	7.78	large	155.0	26.5

	lifetime_cost	longevity	weight	height
count	43.00	43.00	43.00	43.00
mean	20532.84	11.34	49.35	18.34
std	3290.78	2.05	39.42	6.83
...	...	...	...	...
50%	21006.00	11.81	36.50	18.50
75%	22072.50	12.52	67.50	25.00
max	26686.00	16.50	175.00	30.00

	breed	group	datadog	popularity_all	...	megarank	size	weight	height
0	Border Collie	herding	3.64	45	...	29.0	medium	NaN	20.0
1	Border Terrier	terrier	3.61	80	...	1.0	small	13.5	NaN
2	Brittany	sporting	3.54	30	...	11.0	medium	35.0	19.0
...	...	...	...	...	...	...	...	...	...
169	Wire Fox Terrier	terrier	NaN	100	...	NaN	small	17.5	15.0
170	Wirehaired Pointing Griffon	sporting	NaN	92	...	NaN	medium	NaN	22.0
171	Xoloitzcuintli	non-sporting	NaN	155	...	NaN	medium	NaN	16.5

	1	1
0	fee	fo
1	fi	fum

Lecture 2 – DataFrame Fundamentals¶

DSC 80, Spring 2024¶

Announcements 📣¶

Agenda¶

Throughout lecture, ask questions!¶

q.dsc80.com

Bookmark it!

Question 🤔 (Answer at q.dsc80.com)

numpy arrays¶

numpy overview¶

⚠️ The dangers of for-loops¶

Multi-dimensional arrays¶

Operations along axes¶

Selecting rows and columns from 2D arrays¶

Question 🤔 (Answer at q.dsc80.com)

Ask ChatGPT: 🧐

Example: Image processing¶

Applying a greyscale filter¶

Applying a sepia filter¶

Key takeaway: avoid for-loops whenever possible!¶

From babypandas to pandas 🐼¶

babypandas¶

pandas¶

pandas¶

pandas data structures¶

Importing pandas and related libraries¶

Example: Dog Breeds (woof!) 🐶¶

Review: head, tail, shape, index, get, and sort_values¶

Setting the index¶

Ask ChatGPT: 🧐

💡 Pro-Tip: Displaying more rows/columns¶

Selecting columns¶

Selecting columns in babypandas 👶🐼¶

Selecting columns with []¶

Useful Series methods¶

Selecting subsets of rows (and columns)¶

Use loc to slice rows and columns using labels¶

💡 Pro-Tip: Using Pandas Tutor¶

.loc is flexible 🧘¶

Review: Querying¶

Querying with multiple conditions¶

💡 Pro-Tip: Using .query¶

Ask ChatGPT: 🧐

Don't forget iloc!¶

More practice¶

Question 🤔 (Answer at q.dsc80.com)

We ended lecture 2 here.

Adding and modifying columns¶

Adding and modifying columns, using a copy¶

💡 Pro-Tip: Method chaining¶

💡 Pro-Tip: assign for column names with special characters¶

Adding and modifying columns, in-place¶

Mutability¶

⚠️ Avoid mutation when possible¶

pandas and numpy¶

pandas is built upon numpy!¶

pandas data types¶

pandas data types¶

Type conversion¶

💡 Pro-Tip: Setting dtypes in read_csv¶

Axes¶

DataFrame methods with axis¶

Exercise

Summary, next time¶

Summary¶

`numpy` arrays¶

`numpy` overview¶

⚠️ The dangers of `for`-loops¶

Key takeaway: avoid `for`-loops whenever possible!¶

From `babypandas` to `pandas` 🐼¶

`babypandas`¶

`pandas`¶

`pandas` data structures¶

Importing `pandas` and related libraries¶

Review: `head`, `tail`, `shape`, `index`, `get`, and `sort_values`¶

Selecting columns in `babypandas` 👶🐼¶

Selecting columns with `[]`¶

Use `loc` to slice rows and columns using labels¶

`.loc` is flexible 🧘¶

💡 Pro-Tip: Using `.query`¶

Don't forget `iloc`!¶

💡 Pro-Tip: `assign` for column names with special characters¶

`pandas` and `numpy`¶

`pandas` is built upon `numpy`!¶

`pandas` data types¶

`pandas` data types¶

💡 Pro-Tip: Setting `dtype`s in `read_csv`¶

DataFrame methods with `axis`¶