import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

pd.set_option('display.max_rows', 9)
plt.style.use('ggplot')


# Build the CSV path with os.path.join so the path separator matches the host OS,
# then load the table into a DataFrame and preview its first rows.
schools_path = os.path.join('data', 'california_universities.csv')
schools = pd.read_csv(schools_path)
schools.head()


schools.head()


schools.loc[1:5]


schools.iloc[1:5]


schools.sort_values('Founded', ascending=False).iloc[0]['Name']

'University of California, Merced'


schools.sort_values('Founded', ascending=False).loc[0]['Name']

'Humboldt State University'


schools.head()


schools.assign(Age=2023 - schools['Founded'])


schools.head()


# Chain two .assign calls to add both columns in a single expression.
# Both right-hand sides are computed from the original `schools` (not from the
# intermediate result of the first .assign), and the result is not stored, so
# `schools` itself is left unchanged.
(
    schools
    .assign(Age=(2023 - schools['Founded']))
    .assign(is_UC=schools['Name'].str.contains('University of California'))
)


# A column name containing spaces is not a valid Python keyword argument, so it
# is passed to .assign by unpacking a one-entry dict with **.
(
    schools
    .assign(**{'Years since Founding': 2023 - schools['Founded']})
)


# By default, .copy() returns a deep copy of the object it is called on,
# meaning that if you change the copy the original remains unmodified.
schools_copy = schools.copy()
schools_copy.head()


schools_copy['Age'] = 2023 - schools_copy['Founded']


schools_copy['Name'] = schools_copy['Name'].str.replace('University of California,', 'UC')


schools_copy.head()


schools.head()


def calculate_age(df):
    """Add an ``Age`` column (2023 minus ``Founded``) to ``df`` in place.

    This version writes directly into the DataFrame it is given, so the
    caller's object gains (or has overwritten) an ``Age`` column. The same
    object is then returned.
    """
    founded = df['Founded']
    df['Age'] = 2023 - founded
    return df


calculate_age(schools)


schools.head()


def calculate_age(df):
    """Return a copy of ``df`` with an ``Age`` column (2023 minus ``Founded``).

    The DataFrame passed in is not modified: the new column is written to a
    separate copy, and that copy is what gets returned.
    """
    # Work on a separate object so the column assignment below cannot leak
    # back into the caller's DataFrame.
    aged = df.copy()
    aged['Age'] = 2023 - aged['Founded']
    return aged


# .iloc is positional, so -1 addresses the last row: this overwrites the
# existing final row of schools_copy rather than adding a new one.
# The six values are assigned, in order, across that row's columns.
# NOTE(review): Enrollment is given as the string '80' — presumably because the
# column still holds comma-formatted strings at this point; confirm.
schools_copy.iloc[-1, :] = ['University of California, La Jolla', 
                            'La Jolla', 
                            'San Diego', 
                            '80', 
                            2023, 
                            0]
schools_copy.tail()


# .loc is label-based, so -1 here is an index *label*, not "the last row".
# NOTE(review): assuming no row is currently labeled -1 (the outputs suggest a
# default 0..31 index), this assignment adds a new row labeled -1 instead of
# overwriting an existing one — confirm by inspecting the .tail() output.
schools_copy.loc[-1, :] = ['La Jolla State University', 
                           'La Jolla', 
                           'San Diego', 
                           '10', 
                           2023, 
                           0]
schools_copy.tail()


A = pd.DataFrame({
    'A': [1, 4],
    'B': [2, 5],
    'C': [3, 6]
})
A


A.sum(axis=0)

A    5
B    7
C    9
dtype: int64


A.sum(axis=1)

0     6
1    15
dtype: int64

A


A.sum()

A    5
B    7
C    9
dtype: int64


schools.head()


# The maximum element in each column.
schools.max()

Name          University of California, Santa Cruz
City                                       Vallejo
County                                        Yolo
Enrollment                                   9,201
Founded                                       2005
Age                                            166
dtype: object


# The number of unique values in each column.
schools.nunique()

Name          32
City          30
County        23
Enrollment    32
Founded       25
Age           25
dtype: int64


# Why is this meaningless?
schools[['Founded', 'Age']].mean(axis=1)

0     1011.5
1     1011.5
2     1011.5
3     1011.5
       ...  
28    1011.5
29    1011.5
30    1011.5
31    1011.5
Length: 32, dtype: float64


# describe doesn't accept an axis argument; it works on every numeric column in the DataFrame it is called on.
schools.describe()


schools.head()


arr = np.array([4, 2, 9, 15, -1])
arr

array([ 4,  2,  9, 15, -1])


ser = pd.Series(arr, index=['a', 'b', 'c', 'd', 'e'])
ser

a     4
b     2
c     9
d    15
e    -1
dtype: int64


conv = ser.to_numpy()
conv

array([ 4,  2,  9, 15, -1])


# Mutate the array that .to_numpy() returned. In the output recorded below,
# `ser` shows 100 at label 'c' as well, i.e. the array shared memory with the
# Series in this run.
# NOTE(review): whether .to_numpy() hands back a view or a copy depends on the
# dtype and on the pandas version/settings — do not rely on this aliasing.
conv[2] = 100
conv

array([  4,   2, 100,  15,  -1])

ser

a      4
b      2
c    100
d     15
e     -1
dtype: int64


N = 1000
x_arr = np.random.random(N)
y_arr = np.random.random(N)

coordinates = pd.DataFrame({'x': x_arr, 'y': y_arr})
coordinates.head()


coordinates.plot(kind='scatter', x='x', y='y');


def distances(df):
    """Return a list with each row's Euclidean distance from the origin.

    For every label in ``df.index``, the ``'x'`` and ``'y'`` entries are looked
    up individually with ``.loc`` and combined as ``sqrt(x**2 + y**2)``. The
    work is done one row at a time in Python rather than as a single
    vectorized expression.
    """
    return [
        (df.loc[label, 'x'] ** 2 + df.loc[label, 'y'] ** 2) ** 0.5
        for label in df.index
    ]

distances(coordinates)[:5]

[0.936504204228518,
 1.0176141363645677,
 0.24073093040462956,
 1.0038440141341902,
 0.8929297176488756]


%timeit distances(coordinates)

11.1 ms ± 23.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


%timeit (coordinates['x'] ** 2 + coordinates['y'] ** 2) ** 0.5

134 µs ± 359 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


schools.head()


schools.dtypes

Name          object
City          object
County        object
Enrollment    object
Founded        int64
Age            int64
dtype: object


schools['Founded'].dtypes

dtype('int64')


schools['Founded'] ** 7

0    -7565314117704225911
1     4171128846908943813
2    -6937220880839396315
3    -7155102599657796480
             ...         
28    4171128846908943813
29    8892165444775224960
30   -7645595510875556467
31   -4969729423693371399
Name: Founded, Length: 32, dtype: int64


np.array(['a', 1])

array(['a', '1'], dtype='<U21')


pd.Series(['a', 1])

0    a
1    1
dtype: object


pd.Series(['a', 1]).values

array(['a', 1], dtype=object)


np.array(['a', 1], dtype=object)

array(['a', 1], dtype=object)


pd.Series([1, 1.0])

0    1.0
1    1.0
dtype: float64


schools.head()


schools.dtypes

Name          object
City          object
County        object
Enrollment    object
Founded        int64
Age            int64
dtype: object


# Enrollment was read in as strings such as '7,774' (dtype object above).
# Strip the thousands separators and cast to int so the column is numeric.
schools['Enrollment'] = schools['Enrollment'].str.replace(',', '').astype(int)
schools.head()


schools.dtypes

Name          object
City          object
County        object
Enrollment     int64
Founded        int64
Age            int64
dtype: object


# Draw one million integers from 0..7 — values small enough to fit in one byte.
data = np.random.choice(np.arange(8), 10 ** 6)


# Store them with an explicit uint8 dtype and report the memory footprint.
ser1 = pd.Series(data, dtype=np.uint8).to_frame()
ser1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 1 columns):
 #   Column  Non-Null Count    Dtype
---  ------  --------------    -----
 0   0       1000000 non-null  uint8
dtypes: uint8(1)
memory usage: 976.7 KB


# Same data without specifying a dtype (int64 in the output recorded below),
# so the memory footprint can be compared against the uint8 version.
ser2 = pd.Series(data).to_frame()
ser2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 1 columns):
 #   Column  Non-Null Count    Dtype
---  ------  --------------    -----
 0   0       1000000 non-null  int64
dtypes: int64(1)
memory usage: 7.6 MB


schools['Founded'].std()

38.95468831778126


np.std(schools['Founded'])

38.34119031798569


schools['Founded'].std()

38.95468831778126


schools['Founded'].std(ddof=1)

38.95468831778126


schools['Founded'].std(ddof=0)

38.34119031798569


schools.head()


schools.dtypes

Name          object
City          object
County        object
Enrollment     int64
Founded        int64
Age            int64
dtype: object


schools['Enrollment'].describe()

count       32.000000
mean     23649.781250
std      12477.750595
min       1017.000000
25%      13506.500000
50%      23812.000000
75%      34965.750000
max      45428.000000
Name: Enrollment, dtype: float64


schools.plot(kind='scatter', x='Founded', y='Enrollment', figsize=(10, 5));


!pip install plotly

Requirement already satisfied: plotly in /Users/surajrampure/opt/anaconda3/lib/python3.9/site-packages (5.4.0)
Requirement already satisfied: tenacity>=6.2.0 in /Users/surajrampure/opt/anaconda3/lib/python3.9/site-packages (from plotly) (8.0.1)
Requirement already satisfied: six in /Users/surajrampure/opt/anaconda3/lib/python3.9/site-packages (from plotly) (1.16.0)


import plotly.express as px


# Interactive scatter plot of Enrollment against Founded; hovering a point shows
# the school's Name. Points are colored by a boolean Series marking whether the
# name contains 'University of California'.
px.scatter(schools, 
           x='Founded', 
           y='Enrollment', 
           hover_name='Name', 
           color=schools['Name'].str.contains('University of California')
           )


pd.options.plotting.backend = 'plotly'


schools.plot(kind='scatter', 
             x='Founded', 
             y='Enrollment', 
             hover_name='Name')

	Name	City	County	Enrollment	Founded
0	Humboldt State University	Arcata	Humboldt	7,774	1913
1	California State University, Bakersfield	Bakersfield	Kern	10,493	1965
2	University of California, Berkeley	Berkeley	Alameda	42,519	1869
3	California State University Channel Islands	Camarillo	Ventura	7,095	2002
4	California State University, Dominguez Hills	Carson	Los Angeles	15,741	1960

	Name	City	County	Enrollment	Founded
0	Humboldt State University	Arcata	Humboldt	7,774	1913
1	California State University, Bakersfield	Bakersfield	Kern	10,493	1965
2	University of California, Berkeley	Berkeley	Alameda	42,519	1869
3	California State University Channel Islands	Camarillo	Ventura	7,095	2002
4	California State University, Dominguez Hills	Carson	Los Angeles	15,741	1960

	Name	City	County	Enrollment	Founded
1	California State University, Bakersfield	Bakersfield	Kern	10,493	1965
2	University of California, Berkeley	Berkeley	Alameda	42,519	1869
3	California State University Channel Islands	Camarillo	Ventura	7,095	2002
4	California State University, Dominguez Hills	Carson	Los Angeles	15,741	1960
5	California State University, Chico	Chico	Butte	17,488	1887

	Name	City	County	Enrollment	Founded
1	California State University, Bakersfield	Bakersfield	Kern	10,493	1965
2	University of California, Berkeley	Berkeley	Alameda	42,519	1869
3	California State University Channel Islands	Camarillo	Ventura	7,095	2002
4	California State University, Dominguez Hills	Carson	Los Angeles	15,741	1960

	Name	City	County	Enrollment	Founded
0	Humboldt State University	Arcata	Humboldt	7,774	1913
1	California State University, Bakersfield	Bakersfield	Kern	10,493	1965
2	University of California, Berkeley	Berkeley	Alameda	42,519	1869
3	California State University Channel Islands	Camarillo	Ventura	7,095	2002
4	California State University, Dominguez Hills	Carson	Los Angeles	15,741	1960

Lecture 3 – More DataFrame Fundamentals¶

DSC 80, Winter 2023¶

Announcements 📣¶

Agenda¶

Recap: `loc` and `iloc`¶

Example: Universities in California 📚¶

`loc` and `iloc` with the default index¶

Adding and modifying columns¶

Adding and modifying columns, using a copy¶

Adding and modifying columns, in-place¶

Mutability¶

What about rows?¶

Axes¶

Axes¶

DataFrame methods with `axis`¶

DataFrame methods with `axis`¶

Discussion Question¶

`pandas` and `numpy`¶

`numpy`¶

`pandas` is built upon `numpy`¶

The dangers of `for`-loops¶

Aside: Generating data¶

`pandas` data types¶

`pandas` data types¶

⚠️ Warning: `numpy` and `pandas` don't always make the same decisions!¶

Type conversion¶

Performance and memory management¶

Aside: `std`¶

Aside: `std`¶

Extra: Data cleaning and `plotly`¶

Example: Universities in California 📚¶

Enrollment vs. year founded¶

`plotly`¶

Enrollment vs. year founded, but interactive¶

Summary, next time¶

Summary, next time¶

	Name	City	County	Enrollment	Founded	Age
27	UC Santa Barbara	Santa Barbara	Santa Barbara	24,346	1891	132
28	UC Santa Cruz	Santa Cruz	Santa Cruz	19,700	1965	58
29	California State University, Monterey Bay	Seaside-Marina	Monterey	7,079	1994	29
30	California State University, Stanislaus	Turlock	Stanislaus	10,214	1957	66
31	University of California, La Jolla	La Jolla	San Diego	80	2023	0

	Founded	Age
count	32.000000	32.000000
mean	1938.625000	84.375000
std	38.954688	38.954688
min	1857.000000	18.000000
25%	1904.000000	61.750000
50%	1951.500000	71.500000
75%	1961.250000	119.000000
max	2005.000000	166.000000

	x	y
0	0.335446	0.874366
1	0.952098	0.359232
2	0.137791	0.197395
3	0.909567	0.424725
4	0.608166	0.653802

Pandas dtype	Python type	NumPy type	SQL type	Usage
int64	int	int_, int8,...,int64, uint8,...,uint64	INT, BIGINT	Integer numbers
float64	float	float_, float16, float32, float64	FLOAT	Floating point numbers
bool	bool	bool_	BOOL	True/False values
datetime64	NA	datetime64[ns]	DATETIME	Date and time values
timedelta64[ns]	NA	NA	NA	Differences between two datetimes
category	NA	NA	ENUM	Finite list of text values
object	str	string, unicode	NA	Text
object	NA	object	NA	Mixed types

Lecture 3 – More DataFrame Fundamentals¶

DSC 80, Winter 2023¶

Announcements 📣¶

Agenda¶

Recap: loc and iloc¶

Example: Universities in California 📚¶

loc and iloc with the default index¶

Adding and modifying columns¶

Adding and modifying columns, using a copy¶

Adding and modifying columns, in-place¶

Mutability¶

What about rows?¶

Axes¶

Axes¶

DataFrame methods with axis¶

DataFrame methods with axis¶

Discussion Question¶

pandas and numpy¶

numpy¶

pandas is built upon numpy¶

The dangers of for-loops¶

Aside: Generating data¶

pandas data types¶

pandas data types¶

⚠️ Warning: numpy and pandas don't always make the same decisions!¶

Type conversion¶

Performance and memory management¶

Aside: std¶

Aside: std¶

Extra: Data cleaning and plotly¶

Example: Universities in California 📚¶

Enrollment vs. year founded¶

plotly¶

Enrollment vs. year founded, but interactive¶

Summary, next time¶

Summary, next time¶

Recap: `loc` and `iloc`¶

`loc` and `iloc` with the default index¶

DataFrame methods with `axis`¶

DataFrame methods with `axis`¶

`pandas` and `numpy`¶

`numpy`¶

`pandas` is built upon `numpy`¶

The dangers of `for`-loops¶

`pandas` data types¶

`pandas` data types¶

⚠️ Warning: `numpy` and `pandas` don't always make the same decisions!¶

Aside: `std`¶

Aside: `std`¶

Extra: Data cleaning and `plotly`¶

`plotly`¶