# Set up packages for lecture. Don't worry about understanding this code,
# but make sure to run it if you're following along.
import numpy as np
import babypandas as bpd

import matplotlib.pyplot as plt
plt.style.use('ggplot')

# Keep printed NumPy arrays compact: threshold=20, two digits of precision,
# and suppress=True (see np.set_printoptions for the exact meaning of each).
np.set_printoptions(threshold=20, precision=2, suppress=True)
import pandas as pd
# Keep displayed DataFrames small: cap the rows and columns shown and use
# two digits of display precision.
pd.set_option("display.max_rows", 7)
pd.set_option("display.max_columns", 8)
pd.set_option("display.precision", 2)

from IPython.display import display, IFrame, YouTubeVideo

def show_grouping_animation():
    """Display the embedded Google Slides grouping animation in the notebook.

    Builds a 960x509 IFrame pointing at the published presentation and hands
    it to IPython's ``display``. Returns nothing.
    """
    slides_url = "https://docs.google.com/presentation/d/e/2PACX-1vTgVlFngQcLMYHP-z1vq5lVXjsBgcHebc-3TX7SW6L_gjX6TD1gsflvVDQUpWiDdeEPqJASenUIfBVd/embed?start=false&loop=false&delayms=60000&rm=minimal"
    frame = IFrame(slides_url, width=960, height=509)
    display(frame)


# Load the states data, then add a 'Density' column computed as
# 'Population' divided by 'Land Area'.
states = bpd.read_csv('data/states.csv')
states = states.assign(
    Density=states.get('Population') / states.get('Land Area')
)
states


states


# Which one is Pennsylvania?
states.get('Density')

0      99.21
1       1.29
2      62.96
       ...  
47     74.62
48    108.82
49      5.94
Name: Density, Length: 50, dtype: float64


bpd.read_csv('data/states.csv')


states


states.set_index('State')


states


states = states.set_index('State')
states


# Which one is Pennsylvania? The one whose row label is "Pennsylvania"!
states.get('Density')

State
Alabama           99.21
Alaska             1.29
Arizona           62.96
                  ...  
West Virginia     74.62
Wisconsin        108.82
Wyoming            5.94
Name: Density, Length: 50, dtype: float64


states.get('Density')

State
Alabama           99.21
Alaska             1.29
Arizona           62.96
                  ...  
West Virginia     74.62
Wisconsin        108.82
Wyoming            5.94
Name: Density, Length: 50, dtype: float64


states.get('Density').loc['Pennsylvania']

290.60858681804973


states.get('Density')

State
Alabama           99.21
Alaska             1.29
Arizona           62.96
                  ...  
West Virginia     74.62
Wisconsin        108.82
Wyoming            5.94
Name: Density, Length: 50, dtype: float64


states.get('Density').iloc[2]

62.956687853231685


states.get('Density').loc['Arizona']

62.956687853231685


bpd.read_csv('data/states.csv')


bpd.read_csv('data/states.csv').get('Capital City').loc[35]

'Oklahoma City'


bpd.read_csv('data/states.csv').get('Capital City').iloc[35]

'Oklahoma City'


# This DataFrame only contains rows where the 'Region' is 'West'!
only_west = states[states.get('Region') == 'West']
only_west


5 == 6

False


type(5 == 6)

bool


9 + 10 < 21

True


states


states.get('Region') == 'West'

State
Alabama          False
Alaska            True
Arizona           True
                 ...  
West Virginia    False
Wisconsin        False
Wyoming           True
Name: Region, Length: 50, dtype: bool


states[states.get('Region') == 'West']


states[states.get('Region') == 'Pacific Northwest']


only_rep = states[states.get('Party') == 'Republican']
only_rep


only_rep.shape

(31, 6)


# Number of rows.
only_rep.shape[0]

31


# Number of columns.
only_rep.shape[1]

6


# What proportion of US states are Republican?
only_rep.shape[0] / states.shape[0]

0.62


midwest = states[states.get('Region') == 'Midwest']
midwest


midwest_sorted = midwest.sort_values(by='Land Area', ascending=False)
midwest_sorted


midwest_sorted.get('State').iloc[0]

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
/var/folders/28/vs8cp38n1r1520g8bhzr4v5h0000gn/T/ipykernel_87309/3899766623.py in <module>
----> 1 midwest_sorted.get('State').iloc[0]

~/opt/anaconda3/lib/python3.9/site-packages/babypandas/utils.py in wrapper(*args, **kwargs)
     18         with warnings.catch_warnings():
     19             warnings.simplefilter("ignore")
---> 20             return func(*args, **kwargs)
     21 
     22     return wrapper

~/opt/anaconda3/lib/python3.9/site-packages/babypandas/bpd.py in get(self, key)
    325         if any(mask):
    326             k = [key] if isinstance(key, str) else key
--> 327             raise KeyError("{} not found in columns".format(np.array(k)[mask]))
    328 
    329         f = _lift_to_pd(self._pd.get)

KeyError: "['State'] not found in columns"


midwest_sorted.index

Index(['Kansas', 'Minnesota', 'Nebraska', 'South Dakota', 'North Dakota',
       'Missouri', 'Michigan', 'Iowa', 'Illinois', 'Wisconsin', 'Ohio',
       'Indiana'],
      dtype='object', name='State')


midwest_sorted.index[0]

'Kansas'


# Final solution, which you should build up one step at a time.
states[states.get('Region') == 'Midwest'].sort_values(by='Land Area', ascending=False).index[0]

'Kansas'


# You can space your code out like this if needed.
(
    states[states.get('Region') == 'Midwest']
    .sort_values(by='Land Area', ascending=False)
    .index[0]
)

'Kansas'

...

Ellipsis


states[(states.get('Party') == 'Republican') & (states.get('Region') == 'South')]


# You can also add line breaks within brackets.
states[(states.get('Party') == 'Republican') & 
       (states.get('Region') == 'South')]


(states.get('Party') == 'Republican')

State
Alabama          True
Alaska           True
Arizona          True
                 ... 
West Virginia    True
Wisconsin        True
Wyoming          True
Name: Party, Length: 50, dtype: bool


(states.get('Region') == 'South')

State
Alabama           True
Alaska           False
Arizona          False
                 ...  
West Virginia     True
Wisconsin        False
Wyoming          False
Name: Region, Length: 50, dtype: bool


(states.get('Party') == 'Republican') & (states.get('Region') == 'South')

State
Alabama           True
Alaska           False
Arizona          False
                 ...  
West Virginia     True
Wisconsin        False
Wyoming          False
Length: 50, dtype: bool


# Republican states in the South, ordered from most to least populated.
(
    states[
        (states.get('Party') == 'Republican')
        & (states.get('Region') == 'South')
    ]
    .sort_values(by='Population', ascending=False)
)


# Same query as before, keeping only the first three rows (positions 0, 1, 2)
# after sorting by population in descending order.
(
    states[
        (states.get('Party') == 'Republican')
        & (states.get('Region') == 'South')
    ]
    .sort_values(by='Population', ascending=False)
    .take([0, 1, 2])
)

...

Ellipsis


states[states.get('Region') == 'West'].get('Population').sum()

78588572


states[states.get('Region') == 'Midwest'].get('Population').sum()

68985454


states.groupby('Region').sum()


show_grouping_animation()


# Build a small example DataFrame one column at a time; each list has one
# entry per pet, in the same order across columns.
pets = (
    bpd.DataFrame()
    .assign(Species=['dog', 'cat', 'cat', 'dog', 'dog', 'hamster'])
    .assign(Color=['black', 'golden', 'black', 'white', 'golden', 'golden'])
    .assign(Weight=[40, 15, 20, 80, 25, 1])
    .assign(Age=[5, 8, 9, 2, 0.5, 3])
)
pets


pets.groupby('Species').mean()


states


states.groupby('Region').sum()


# Note the use of .index – remember, the index isn't a column!
(
    states
    .groupby('Region')
    .sum()
    .sort_values(by='Population', ascending=False)
    .index[0]
)

'South'


states


states.groupby('Region').sum()


states.groupby('Region').max()


12812508 / 81759 == 288.77

False


states.groupby('Region').count()


# Count the rows in each region, then keep a single column of those counts
# under the name 'States' (copied from the 'Capital City' counts).
states_by_region = states.groupby('Region').count()
states_by_region = (
    states_by_region
    .assign(States=states_by_region.get('Capital City'))
    .get(['States'])
)
states_by_region


from IPython.display import YouTubeVideo
YouTubeVideo('xg7rnjWnZ48')


# Load the IMDb data, use each movie's title as its row label, and sort the
# rows by rating.
imdb = (
    bpd.read_csv('data/imdb.csv')
    .set_index('Title')
    .sort_values(by='Rating')
)
imdb


imdb.groupby('Decade').count()


# We'll learn how to make plots like this in the next lecture!
imdb.groupby('Decade').count().plot(y='Year');


# Keep only 1990s movies, sort by rating from highest to lowest, and read the
# first row label (the title, since 'Title' is the index).
(
    imdb[imdb.get('Decade') == 1990]
    .sort_values(by='Rating', ascending=False)
    .index[0]
)

'The Shawshank Redemption'


imdb.reset_index().groupby('Decade').max()

symbol	meaning
`==`	equal to
`!=`	not equal to
`<`	less than
`<=`	less than or equal to
`>`	greater than
`>=`	greater than or equal to

	Region	Capital City	Population	Land Area	Party	Density
State
Texas	South	Austin	29145505	261232	Republican	111.57
Florida	South	Tallahassee	21538187	53625	Republican	401.64
Georgia	South	Atlanta	10711908	57513	Republican	186.25

	Population	Land Area	Density
Region
Midwest	68985454	750524	1298.78
Northeast	57609148	161912	4957.49
South	125576562	868356	3189.37
West	78588572	1751054	881.62

	Population	Land Area	Density
Region
Midwest	68985454	750524	1298.78
Northeast	57609148	161912	4957.49
South	125576562	868356	3189.37
West	78588572	1751054	881.62

	Population	Land Area	Density
Region
Midwest	68985454	750524	1298.78
Northeast	57609148	161912	4957.49
South	125576562	868356	3189.37
West	78588572	1751054	881.62

	State	Region	Capital City	Population	Land Area	Party	Density
0	Alabama	South	Montgomery	5024279	50645	Republican	99.21
1	Alaska	West	Juneau	733391	570641	Republican	1.29
2	Arizona	West	Phoenix	7151502	113594	Republican	62.96
...	...	...	...	...	...	...	...
47	West Virginia	South	Charleston	1793716	24038	Republican	74.62
48	Wisconsin	Midwest	Madison	5893718	54158	Republican	108.82
49	Wyoming	West	Cheyenne	576851	97093	Republican	5.94

	Region	Capital City	Population	Land Area	Party	Density
State
Illinois	Midwest	Springfield	12812508	55519	Democratic	230.78
Indiana	Midwest	Indianapolis	6785528	35826	Republican	189.40
Iowa	Midwest	Des Moines	3190369	55857	Republican	57.12
...	...	...	...	...	...	...
Ohio	Midwest	Columbus	11799448	40861	Republican	288.77
South Dakota	Midwest	Pierre	886667	75811	Republican	11.70
Wisconsin	Midwest	Madison	5893718	54158	Republican	108.82

	Region	Capital City	Population	Land Area	Party	Density
State
Kansas	Midwest	Topeka	2937880	81759	Republican	35.93
Minnesota	Midwest	Saint Paul	5706494	79627	Democratic	71.67
Nebraska	Midwest	Lincoln	1961504	76824	Republican	25.53
...	...	...	...	...	...	...
Wisconsin	Midwest	Madison	5893718	54158	Republican	108.82
Ohio	Midwest	Columbus	11799448	40861	Republican	288.77
Indiana	Midwest	Indianapolis	6785528	35826	Republican	189.40

	Species	Color	Weight	Age
0	dog	black	40	5.0
1	cat	golden	15	8.0
2	cat	black	20	9.0
3	dog	white	80	2.0
4	dog	golden	25	0.5
5	hamster	golden	1	3.0

	Votes	Rating	Year	Decade
Title
Akira	91652	8.0	1988	1980
Per un pugno di dollari	124671	8.0	1964	1960
Guardians of the Galaxy	527349	8.0	2014	2010
...	...	...	...	...
The Godfather: Part II	692753	9.0	1974	1970
The Shawshank Redemption	1498733	9.2	1994	1990
The Godfather	1027398	9.2	1972	1970

	Votes	Rating	Year
Decade
1920	4	4	4
1930	7	7	7
1940	14	14	14
...	...	...	...
1990	42	42	42
2000	50	50	50
2010	29	29	29

	Title	Votes	Rating	Year
Decade
1920	The Kid	98794	8.3	1927
1930	The Wizard of Oz	259235	8.5	1939
1940	The Treasure of the Sierra Madre	350551	8.6	1949
...	...	...	...	...
1990	Unforgiven	1498733	9.2	1999
2000	Yip Man	1473049	8.9	2009
2010	X-Men: Days of Future Past	1271949	8.7	2015

Lecture 5 – Querying and Grouping¶

DSC 10, Fall 2023¶

Announcements¶

Agenda¶

Don't forget about these resources!¶

You belong here! 🤝¶

The data: US states 🗽¶

Example 4: What is the population density of Pennsylvania?¶

Population density of Pennsylvania¶

Utilizing the index¶

Setting the index¶

Accessing using the row label¶

Summary: Accessing elements of a DataFrame¶

Note¶

Example 5: Which states are in the West?¶

The problem¶

The solution¶

Aside: Booleans¶

Comparison operators¶

What is a query? 🤔¶

How do we query a DataFrame?¶

What if the condition isn't satisfied?¶

Example 6: What proportion of US states are Republican?¶

Strategy¶

Shape of a DataFrame¶

Example 7: Which Midwestern state has the most land area?¶

Strategy¶

Working with the index¶

Combining multiple steps¶

Concept Check ✅ – Answer at cc.dsc10.com¶

Example 8: What are the top three most-populated Republican states in the South?¶

Multiple conditions¶

The & and | operators work element-wise!¶

Original Question: What are the top three most-populated Republican states in the South?¶

Using .take to select rows by position¶

Extra Practice¶

Example 9: Which region is most populated?¶

Organizing states by region¶

A new method: .groupby¶

An illustrative example: Pets 🐱 🐶🐹¶

Let's try it out!¶

Back to states: which region is most populated?¶

Using .groupby in general¶

Observations on grouping¶

Dropping, renaming, and reordering columns¶

Challenge problems: IMDb dataset 🎞️¶

Extra practice¶

Question: How many movies appear from each decade?¶

Question: What was the highest rated movie of the 1990s?¶

Without grouping¶

With grouping¶

Question: How many years have more than 3 movies rated above 8.5?¶

Aside: Using .sum() on a boolean array¶

Question: Out of the years with more than 3 movies, which had the highest average rating?¶

Question: Which year had the longest movie titles, on average?¶

Question: What is the average rating of movies from years that had at least 3 movies in the Top 250?¶

Summary, next time¶

Summary¶

Next time¶

Concept Check ✅ – Answer at cc.dsc10.com ¶

The `&` and `|` operators work element-wise!¶

Using `.take` to select rows by position¶

A new method: `.groupby`¶

Using `.groupby` in general¶

Aside: Using `.sum()` on a boolean array¶