# Run this cell to set up packages for lecture.
from lec05_imports import *

# Load the states table, then add a 'Density' column: each state's
# 'Population' divided by its 'Land Area'.
states = bpd.read_csv('data/states.csv')
states = states.assign(Density=states.get('Population') / states.get('Land Area'))
states

# Make 'State' the index (row labels) instead of a regular column.
states = states.set_index('State')

# Query: keep only the rows whose 'Region' is 'West'.
# The comparison produces a Boolean Series (one True/False per state), and
# indexing `states` with that Series keeps the rows marked True.
only_west = states[states.get('Region') == 'West']
only_west

5 == 6

False

type(5 == 6)

bool

9 + 10 < 21

True

states

states.get('Region') == 'West'

State
Alabama          False
Alaska            True
Arizona           True
                 ...  
West Virginia    False
Wisconsin        False
Wyoming           True
Name: Region, Length: 50, dtype: bool

states[states.get('Region') == 'West']

states[states.get('Region') == 'Pacific Northwest']

states.get('Region') == 'Pacific Northwest'

State
Alabama          False
Alaska           False
Arizona          False
                 ...  
West Virginia    False
Wisconsin        False
Wyoming          False
Name: Region, Length: 50, dtype: bool

only_rep = states[states.get('Party') == 'Republican']
only_rep

only_rep.shape

(31, 6)

# Number of rows.
only_rep.shape[0]

31

# Number of columns.
only_rep.shape[1]

6

# What proportion of US states are Republican?
# Number of rows in the filtered table divided by the number of rows in the
# full table.
only_rep.shape[0] / states.shape[0]

0.62

# Step 1: keep only the Midwestern states.
midwest = states[states.get('Region') == 'Midwest']
midwest

# Step 2: order them from largest to smallest land area.
midwest_sorted = midwest.sort_values(by='Land Area', ascending=False)
midwest_sorted

# This raises a KeyError: after set_index('State'), 'State' is the index, not
# a column, so .get('State') cannot find it.
midwest_sorted.get('State').iloc[0]

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
Cell In[21], line 1
----> 1 midwest_sorted.get('State').iloc[0]

File ~/miniforge3/lib/python3.11/site-packages/babypandas/utils.py:20, in suppress_warnings.<locals>.wrapper(*args, **kwargs)
     18 with warnings.catch_warnings():
     19     warnings.simplefilter("ignore")
---> 20     return func(*args, **kwargs)

File ~/miniforge3/lib/python3.11/site-packages/babypandas/bpd.py:327, in DataFrame.get(self, key)
    325 if any(mask):
    326     k = [key] if isinstance(key, str) else key
--> 327     raise KeyError("{} not found in columns".format(np.array(k)[mask]))
    329 f = _lift_to_pd(self._pd.get)
    330 return f(key=key)

KeyError: "['State'] not found in columns"

midwest_sorted.index

Index(['Kansas', 'Minnesota', 'Nebraska', 'South Dakota', 'North Dakota',
       'Missouri', 'Michigan', 'Iowa', 'Illinois', 'Wisconsin', 'Ohio',
       'Indiana'],
      dtype='object', name='State')

midwest_sorted.index[0]

'Kansas'

# Full answer, which you should build up one step at a time.
states[states.get('Region') == 'Midwest'].sort_values(by='Land Area', ascending=False).index[0]

'Kansas'

# You can space your code out like this if needed.
(
    states[states.get('Region') == 'Midwest']
    .sort_values(by='Land Area', ascending=False)
    .index[0]
)

'Kansas'

...

Ellipsis

states[(states.get('Party') == 'Republican') & (states.get('Region') == 'South')]

# A long query can be split across lines inside the square brackets;
# each condition stays wrapped in its own parentheses.
states[
    (states.get('Party') == 'Republican')
    & (states.get('Region') == 'South')
]

(states.get('Party') == 'Republican')

State
Alabama          True
Alaska           True
Arizona          True
                 ... 
West Virginia    True
Wisconsin        True
Wyoming          True
Name: Party, Length: 50, dtype: bool

(states.get('Region') == 'South')

State
Alabama           True
Alaska           False
Arizona          False
                 ...  
West Virginia     True
Wisconsin        False
Wyoming          False
Name: Region, Length: 50, dtype: bool

(states.get('Party') == 'Republican') & (states.get('Region') == 'South')

State
Alabama           True
Alaska           False
Arizona          False
                 ...  
West Virginia     True
Wisconsin        False
Wyoming          False
Length: 50, dtype: bool

# Republican states in the South, ordered from most to least populated.
(
    states[
        (states.get('Party') == 'Republican')
        & (states.get('Region') == 'South')
    ]
    .sort_values(by='Population', ascending=False)
)

# Top three most-populated Republican states in the South: query, sort by
# population (largest first), then keep the rows at positions 0, 1, and 2.
(
    states[
        (states.get('Party') == 'Republican')
        & (states.get('Region') == 'South')
    ]
    .sort_values(by='Population', ascending=False)
    .take([0, 1, 2])
)

...

Ellipsis

states[states.get('Region') == 'West'].get('Population').sum()

78588572

states[states.get('Region') == 'Midwest'].get('Population').sum()

68985454

states.groupby('Region').sum()

show_grouping_animation()

# Build a small example table by hand: start from an empty DataFrame and add
# one column per keyword argument (each list supplies that column's values,
# one entry per pet).
pets = bpd.DataFrame().assign(
    Species=['dog', 'cat', 'cat', 'dog', 'dog', 'hamster'],
    Color=['black', 'golden', 'black', 'white', 'golden', 'golden'],
    Weight=[40, 15, 20, 80, 25, 1],
    Age=[5, 8, 9, 2, 0.5, 3]
)
pets

pets.groupby('Species').mean()

states

states.groupby('Region').sum()

# Which region is most populated? Total each column per region, sort the
# regions by total population (largest first), and read the first label.
# Note the use of .index – after grouping, 'Region' is the index, and
# remember, the index isn't a column!
(
    states
    .groupby('Region')
    .sum()
    .sort_values(by='Population', ascending=False)
    .index[0]
)

'South'

states

states.groupby('Region').sum()

states.groupby('Region').max()

# NOTE(review): in the tables shown in this file these three numbers belong to
# three different states (Illinois's population, Kansas's land area, Ohio's
# density). Presumably they are the Midwest row of the .max() output above,
# where each column's maximum is taken independently – confirm. That is why
# this "density" does not equal population / land area.
12812508 / 81759 == 288.77

False

states.groupby('Region').count()

# Count the rows in each region, then keep that count as a single column
# named 'States' (copied from the 'Capital City' counts).
states_by_region = states.groupby('Region').count()
states_by_region = (
    states_by_region
    .assign(States=states_by_region.get('Capital City'))
    .get(['States'])
)
states_by_region

from IPython.display import YouTubeVideo
YouTubeVideo('xg7rnjWnZ48')

# Load the IMDb table, use 'Title' as the index, and sort by 'Rating'.
# No `ascending` argument is given; the displayed result runs from the
# lowest-rated movies to the highest-rated ones.
imdb = bpd.read_csv('data/imdb.csv').set_index('Title').sort_values(by='Rating')
imdb

imdb.groupby('Decade').count()

# We'll learn how to make plots like this in the next lecture!
imdb.groupby('Decade').count().plot(y='Year');

# Without grouping: keep only the movies whose 'Decade' is 1990, sort them by
# rating from highest to lowest, and read the top movie's title off the index.
imdb[imdb.get('Decade') == 1990].sort_values(by='Rating', ascending=False).index[0]

'The Shawshank Redemption'

# reset_index() turns the 'Title' index back into a regular column so that it
# is included in the aggregation. Note that .max() is computed separately for
# each column, so the values in one row need not come from the same movie: in
# the recorded output the 1990 row shows 'Unforgiven' next to the votes and
# rating listed for 'The Shawshank Redemption' in the imdb table.
imdb.reset_index().groupby('Decade').max()

	Region	Capital City	Population	Land Area	Party	Density
State
Alaska	West	Juneau	733391	570641	Republican	1.29
Arizona	West	Phoenix	7151502	113594	Republican	62.96
California	West	Sacramento	39538223	155779	Democratic	253.81
...	...	...	...	...	...	...
Utah	West	Salt Lake City	3271616	82170	Republican	39.82
Washington	West	Olympia	7705281	66456	Democratic	115.95
Wyoming	West	Cheyenne	576851	97093	Republican	5.94

symbol	meaning
`==`	equal to
`!=`	not equal to
`<`	less than
`<=`	less than or equal to
`>`	greater than
`>=`	greater than or equal to

	Region	Capital City	Population	Land Area	Party	Density
State
Alaska	West	Juneau	733391	570641	Republican	1.29
Arizona	West	Phoenix	7151502	113594	Republican	62.96
California	West	Sacramento	39538223	155779	Democratic	253.81
...	...	...	...	...	...	...
Utah	West	Salt Lake City	3271616	82170	Republican	39.82
Washington	West	Olympia	7705281	66456	Democratic	115.95
Wyoming	West	Cheyenne	576851	97093	Republican	5.94

	Region	Capital City	Population	Land Area	Party	Density
State
Texas	South	Austin	29145505	261232	Republican	111.57
Florida	South	Tallahassee	21538187	53625	Republican	401.64
Georgia	South	Atlanta	10711908	57513	Republican	186.25
...	...	...	...	...	...	...
Arkansas	South	Little Rock	3011524	52035	Republican	57.87
Mississippi	South	Jackson	2961279	46923	Republican	63.11
West Virginia	South	Charleston	1793716	24038	Republican	74.62

	Region	Capital City	Population	Land Area	Party	Density
State
Texas	South	Austin	29145505	261232	Republican	111.57
Florida	South	Tallahassee	21538187	53625	Republican	401.64
Georgia	South	Atlanta	10711908	57513	Republican	186.25

	State	Region	Capital City	Population	Land Area	Party	Density
0	Alabama	South	Montgomery	5024279	50645	Republican	99.21
1	Alaska	West	Juneau	733391	570641	Republican	1.29
2	Arizona	West	Phoenix	7151502	113594	Republican	62.96
...	...	...	...	...	...	...	...
47	West Virginia	South	Charleston	1793716	24038	Republican	74.62
48	Wisconsin	Midwest	Madison	5893718	54158	Republican	108.82
49	Wyoming	West	Cheyenne	576851	97093	Republican	5.94

	Region	Capital City	Population	Land Area	Party	Density
State
Illinois	Midwest	Springfield	12812508	55519	Democratic	230.78
Indiana	Midwest	Indianapolis	6785528	35826	Republican	189.40
Iowa	Midwest	Des Moines	3190369	55857	Republican	57.12
...	...	...	...	...	...	...
Ohio	Midwest	Columbus	11799448	40861	Republican	288.77
South Dakota	Midwest	Pierre	886667	75811	Republican	11.70
Wisconsin	Midwest	Madison	5893718	54158	Republican	108.82

	Region	Capital City	Population	Land Area	Party	Density
State
Kansas	Midwest	Topeka	2937880	81759	Republican	35.93
Minnesota	Midwest	Saint Paul	5706494	79627	Democratic	71.67
Nebraska	Midwest	Lincoln	1961504	76824	Republican	25.53
...	...	...	...	...	...	...
Wisconsin	Midwest	Madison	5893718	54158	Republican	108.82
Ohio	Midwest	Columbus	11799448	40861	Republican	288.77
Indiana	Midwest	Indianapolis	6785528	35826	Republican	189.40

	Population	Land Area	Density
Region
Midwest	68985454	750524	1298.78
Northeast	57609148	161912	4957.49
South	125576562	868356	3189.37
West	78588572	1751054	881.62

	Species	Color	Weight	Age
0	dog	black	40	5.0
1	cat	golden	15	8.0
2	cat	black	20	9.0
3	dog	white	80	2.0
4	dog	golden	25	0.5
5	hamster	golden	1	3.0

	Votes	Rating	Year	Decade
Title
Akira	91652	8.0	1988	1980
Per un pugno di dollari	124671	8.0	1964	1960
Guardians of the Galaxy	527349	8.0	2014	2010
...	...	...	...	...
The Godfather: Part II	692753	9.0	1974	1970
The Shawshank Redemption	1498733	9.2	1994	1990
The Godfather	1027398	9.2	1972	1970

	Votes	Rating	Year
Decade
1920	4	4	4
1930	7	7	7
1940	14	14	14
...	...	...	...
1990	42	42	42
2000	50	50	50
2010	29	29	29

	Title	Votes	Rating	Year
Decade
1920	The Kid	98794	8.3	1927
1930	The Wizard of Oz	259235	8.5	1939
1940	The Treasure of the Sierra Madre	350551	8.6	1949
...	...	...	...	...
1990	Unforgiven	1498733	9.2	1999
2000	Yip Man	1473049	8.9	2009
2010	X-Men: Days of Future Past	1271949	8.7	2015

Lecture 5 – Querying and Grouping¶

DSC 10, Summer 2024¶

Announcements¶

Agenda¶

Don't forget about these resources!¶

You belong here! 🤝¶

The data: US states 🗽¶

Example 5: Which states are in the West?¶

The problem¶

The solution¶

Aside: Booleans (another type)¶

Comparison operators¶

What is a query? 🤔¶

How do we query a DataFrame?¶

What if the condition isn't satisfied?¶

Example 6: What proportion of US states are Republican?¶

Strategy¶

Shape of a DataFrame¶

Example 7: Which Midwestern state has the most land area?¶

Strategy¶

Working with the index¶

Combining multiple steps¶

Concept Check ✅ – Answer at cc.dsc10.com¶

Example 8: What are the top three most-populated Republican states in the South?¶

Multiple conditions¶

The & and | operators work element-wise!¶

Original Question: What are the top three most-populated Republican states in the South?¶

Using .take to select rows by position¶

Extra practice¶

Example 9: Which region is most populated?¶

Organizing states by region¶

A new method: .groupby¶

An illustrative example: Pets 🐱 🐶🐹¶

Let's try it out!¶

Back to states: which region is most populated?¶

Using .groupby in general¶

Observations on grouping¶

Dropping, renaming, and reordering columns¶

Challenge problems: IMDb dataset 🎞️¶

Extra practice¶

Question: How many movies appear from each decade?¶

Question: What was the highest rated movie of the 1990s?¶

Without grouping¶

With grouping¶

Question: How many years have more than 3 movies rated above 8.5?¶

Aside: Using .sum() on a boolean array¶

Question: Out of the years with more than 3 movies, which had the highest average rating?¶

Question: Which year had the longest movie titles, on average?¶

Question: What is the average rating of movies from years that had at least 3 movies in the Top 250?¶

Summary, next time¶

Summary¶

Next time¶

Concept Check ✅ – Answer at cc.dsc10.com ¶

The `&` and `|` operators work element-wise!¶

Using `.take` to select rows by position¶

A new method: `.groupby`¶

Using `.groupby` in general¶

Aside: Using `.sum()` on a boolean array¶