# Run this cell, which imports packages and sets up some formatting to make lectures display nicely.
from lec04_imports import *

import numpy as np

# Temperatures in degrees Fahrenheit (the Celsius conversion further down
# subtracts 32 and scales by 5/9).
temperature_array = np.array([68, 73, 70, 74, 76, 72, 74])
temperature_array

array([68, 73, 70, 74, 76, 72, 74])

temperature_array

array([68, 73, 70, 74, 76, 72, 74])

temperature_array[1]

73

temperature_array

array([68, 73, 70, 74, 76, 72, 74])

# Convert all temperatures to Celsius.
(5 / 9) * (temperature_array - 32)

array([20.  , 22.78, 21.11, 23.33, 24.44, 22.22, 23.33])

# Arithmetic between two arrays of the same length is applied element-wise:
# position i of the result is a[i] ** 2 + b[i] ** 2.
a = np.array([4, 5, -1])
b = np.array([2, 3, 2])

a ** 2 + b ** 2

array([20, 34,  5])

temperature_array.max()

76

temperature_array.mean()

72.42857142857143

views = np.array([158, 352, 195, 1423916, 46])

views - views.mean()

array([-284775.4, -284581.4, -284738.4, 1138982.6, -284887.4])

(views - views.mean()).max()

1138982.6

# NOTE(review): presumably an earnings estimate for the most-viewed video
# at $0.03 per 1,000 views — confirm the rate and its source.
views.max() * 0.03 / 1000

42.717479999999995

# Days of the month, 1 through 31, typed out one at a time;
# np.arange(1, 32) produces the same values without listing them.
day_of_month = np.array([
    1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 
    13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 
    23, 24, 25, 26, 27, 28, 29, 30, 31
])

# Start at 0, end before 8, step by 1.
# This will be our most common use-case!
np.arange(8)

array([0, 1, 2, 3, 4, 5, 6, 7])

# Start at 5, end before 10, step by 1.
np.arange(5, 10)

array([5, 6, 7, 8, 9])

# Start at 3, end before 32, step by 5.
np.arange(3, 32, 5)

array([ 3,  8, 13, 18, 23, 28])

...

Ellipsis

...

Ellipsis

...

Ellipsis

import babypandas as bpd

# Our CSV file is not stored in the same folder as our notebook;
# it is inside a folder called data.
states = bpd.read_csv('data/states.csv')
states

# This DataFrame has 50 rows and 6 columns.
states

states

states

states.get('Population')

0      5024279
1       733391
2      7151502
3      3011524
4     39538223
        ...   
45     8631393
46     7705281
47     1793716
48     5893718
49      576851
Name: Population, Length: 50, dtype: int64

states.get('Population')

0      5024279
1       733391
2      7151502
3      3011524
4     39538223
        ...   
45     8631393
46     7705281
47     1793716
48     5893718
49      576851
Name: Population, Length: 50, dtype: int64

type(states.get('Population'))

babypandas.bpd.Series

states.get('Land Area')

0      50645
1     570641
2     113594
3      52035
4     155779
       ...  
45     39490
46     66456
47     24038
48     54158
49     97093
Name: Land Area, Length: 50, dtype: int64

states.get('Population') / states.get('Land Area')

0      99.21
1       1.29
2      62.96
3      57.87
4     253.81
       ...  
45    218.57
46    115.95
47     74.62
48    108.82
49      5.94
Length: 50, dtype: float64

# assign returns a new DataFrame with the extra 'Density' column.
# The result is not saved here, so states itself still lacks that column.
states.assign(
    Density=states.get('Population') / states.get('Land Area')
)

states

states = states.assign(
    Density=states.get('Population') / states.get('Land Area')
)
states

states.get('Density').max()

1263.1212945335872

states.get('Density').min()

1.2852055845969708

states.get('Density').mean()

206.54513507096468

states.get('Density').median()

108.31649013462203

# Lots of information at once!
states.get('Density').describe()

count      50.00
mean      206.55
std       274.93
min         1.29
25%        47.06
50%       108.32
75%       224.57
max      1263.12
Name: Density, dtype: float64

states.sort_values(by='Density')

ordered_states = states.sort_values(by='Density', ascending=False)
ordered_states

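# The call below is deliberately wrong and raises a SyntaxError: a positional
# argument (False) cannot follow a keyword argument (by='Density').
# We must say which parameter False is for by writing ascending=False.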
states.sort_values(by='Density', False)

ordered_states

ordered_states.get('State')

29       New Jersey
38     Rhode Island
20    Massachusetts
6       Connecticut
19         Maryland
          ...      
40     South Dakota
33     North Dakota
25          Montana
49          Wyoming
1            Alaska
Name: State, Length: 50, dtype: object

# We want the first entry of the Series, which is at "integer location" 0.
ordered_states.get('State').iloc[0]

'New Jersey'

ordered_states.get('State').iloc[29]

'Minnesota'

states

# Which one is Pennsylvania?
states.get('Density')

0      99.21
1       1.29
2      62.96
3      57.87
4     253.81
       ...  
45    218.57
46    115.95
47     74.62
48    108.82
49      5.94
Name: Density, Length: 50, dtype: float64

bpd.read_csv('data/states.csv')

states

# set_index returns a new DataFrame that uses 'State' as the row labels.
# The result is not saved here, so states itself keeps its original index.
states.set_index('State')

states

states = states.set_index('State')
states

# Which one is Pennsylvania? The one whose row label is "Pennsylvania"!
states.get('Density')

State
Alabama           99.21
Alaska             1.29
Arizona           62.96
Arkansas          57.87
California       253.81
                  ...  
Virginia         218.57
Washington       115.95
West Virginia     74.62
Wisconsin        108.82
Wyoming            5.94
Name: Density, Length: 50, dtype: float64

states.get('Density')

State
Alabama           99.21
Alaska             1.29
Arizona           62.96
Arkansas          57.87
California       253.81
                  ...  
Virginia         218.57
Washington       115.95
West Virginia     74.62
Wisconsin        108.82
Wyoming            5.94
Name: Density, Length: 50, dtype: float64

states.get('Density').loc['Pennsylvania']

290.60858681804973

states.get('Density')

State
Alabama           99.21
Alaska             1.29
Arizona           62.96
Arkansas          57.87
California       253.81
                  ...  
Virginia         218.57
Washington       115.95
West Virginia     74.62
Wisconsin        108.82
Wyoming            5.94
Name: Density, Length: 50, dtype: float64

states.get('Density').iloc[4]

253.80971119342146

states.get('Density').loc['California']

253.80971119342146

bpd.read_csv('data/states.csv')

bpd.read_csv('data/states.csv').get('Capital City').loc[35]

'Oklahoma City'

# With the default index the row labels are 0, 1, 2, ..., so .loc[35]
# (by label) and .iloc[35] (by integer position) select the same row.
bpd.read_csv('data/states.csv').get('Capital City').iloc[35]

'Oklahoma City'

	State	Region	Capital City	Population	Land Area	Party	Density
1	Alaska	West	Juneau	733391	570641	Republican	1.29
49	Wyoming	West	Cheyenne	576851	97093	Republican	5.94
25	Montana	West	Helena	1084225	145546	Republican	7.45
33	North Dakota	Midwest	Bismarck	779094	69001	Republican	11.29
40	South Dakota	Midwest	Pierre	886667	75811	Republican	11.70
...	...	...	...	...	...	...	...
19	Maryland	South	Annapolis	6177224	9707	Democratic	636.37
6	Connecticut	Northeast	Hartford	3605944	4842	Democratic	744.72
20	Massachusetts	Northeast	Boston	7029917	7800	Democratic	901.27
38	Rhode Island	Northeast	Providence	1097379	1034	Democratic	1061.29
29	New Jersey	Northeast	Trenton	9288994	7354	Democratic	1263.12

	State	Region	Capital City	Population	Land Area	Party	Density
29	New Jersey	Northeast	Trenton	9288994	7354	Democratic	1263.12
38	Rhode Island	Northeast	Providence	1097379	1034	Democratic	1061.29
20	Massachusetts	Northeast	Boston	7029917	7800	Democratic	901.27
6	Connecticut	Northeast	Hartford	3605944	4842	Democratic	744.72
19	Maryland	South	Annapolis	6177224	9707	Democratic	636.37
...	...	...	...	...	...	...	...
40	South Dakota	Midwest	Pierre	886667	75811	Republican	11.70
33	North Dakota	Midwest	Bismarck	779094	69001	Republican	11.29
25	Montana	West	Helena	1084225	145546	Republican	7.45
49	Wyoming	West	Cheyenne	576851	97093	Republican	5.94
1	Alaska	West	Juneau	733391	570641	Republican	1.29

	State	Region	Capital City	Population	Land Area	Party	Density
29	New Jersey	Northeast	Trenton	9288994	7354	Democratic	1263.12
38	Rhode Island	Northeast	Providence	1097379	1034	Democratic	1061.29
20	Massachusetts	Northeast	Boston	7029917	7800	Democratic	901.27
6	Connecticut	Northeast	Hartford	3605944	4842	Democratic	744.72
19	Maryland	South	Annapolis	6177224	9707	Democratic	636.37
...	...	...	...	...	...	...	...
40	South Dakota	Midwest	Pierre	886667	75811	Republican	11.70
33	North Dakota	Midwest	Bismarck	779094	69001	Republican	11.29
25	Montana	West	Helena	1084225	145546	Republican	7.45
49	Wyoming	West	Cheyenne	576851	97093	Republican	5.94
1	Alaska	West	Juneau	733391	570641	Republican	1.29

	State	Region	Capital City	Population	Land Area	Party
0	Alabama	South	Montgomery	5024279	50645	Republican
1	Alaska	West	Juneau	733391	570641	Republican
2	Arizona	West	Phoenix	7151502	113594	Republican
3	Arkansas	South	Little Rock	3011524	52035	Republican
4	California	West	Sacramento	39538223	155779	Democratic
...	...	...	...	...	...	...
45	Virginia	South	Richmond	8631393	39490	Democratic
46	Washington	West	Olympia	7705281	66456	Democratic
47	West Virginia	South	Charleston	1793716	24038	Republican
48	Wisconsin	Midwest	Madison	5893718	54158	Republican
49	Wyoming	West	Cheyenne	576851	97093	Republican

Lecture 4 – Arrays and DataFrames¶

DSC 10, Winter 2024¶

Announcements¶

Agenda¶

Note:¶

Arrays¶

Recap: arrays¶

Array-number arithmetic¶

Element-wise arithmetic¶

Array methods¶

Example: TikTok views 🎬¶

Ranges¶

Motivation¶

Ranges¶

Extra practice¶

Challenge¶

DataFrames¶

pandas¶

But pandas is not so cute...¶

Enter babypandas!¶

DataFrames in babypandas 🐼¶

Reading data from a file 📖¶

About the data 🗽¶

Structure of a DataFrame¶

Example 1: Population density¶

Finding population density¶

Step 1 – Getting the 'Population' column¶

Digression: Series¶

Steps 2 and 3 – Getting the 'Land Area' column and dividing element-wise¶

Step 4 – Adding the densities to the DataFrame as a new column¶

Example 2: Exploring population density¶

Questions¶

Example 3: Which state has the highest population density?¶

Step 1 – Sorting the DataFrame¶

Step 1 – Sorting the DataFrame in descending order¶

Step 2 – Extracting the state name¶

Example 4: What is the population density of Pennsylvania?¶

Population density of Pennsylvania¶

Utilizing the index¶

Setting the index¶

Accessing using the row label¶

Summary: Accessing elements of a DataFrame¶

Note¶

Summary, next time¶

Summary¶

Next time¶

`pandas`¶

But `pandas` is not so cute...¶

Enter `babypandas`!¶

DataFrames in `babypandas` 🐼¶

Step 1 – Getting the `'Population'` column¶

Steps 2 and 3 – Getting the `'Land Area'` column and dividing element-wise¶