import pandas as pd
import numpy as np
import os
Good resource: the pandas User Guide.
By default, pd.concat
stacks DataFrames row-wise, i.e. on top of one another.
section_A = pd.DataFrame({
'Name': ['Annie', 'Billy', 'Sally', 'Tommy'],
'Midterm': [98, 82, 23, 45],
'Final': [88, 100, 99, 67]
})
section_A
Name | Midterm | Final | |
---|---|---|---|
0 | Annie | 98 | 88 |
1 | Billy | 82 | 100 |
2 | Sally | 23 | 99 |
3 | Tommy | 45 | 67 |
section_B = pd.DataFrame({
'Name': ['Junior', 'Rex', 'Flash'],
'Midterm': [70, 99, 81],
'Final': [42, 25, 90]
})
section_B
Name | Midterm | Final | |
---|---|---|---|
0 | Junior | 70 | 42 |
1 | Rex | 99 | 25 |
2 | Flash | 81 | 90 |
Let's use pd.concat
on a list of the above two DataFrames.
pd.concat([section_A, section_B])
Name | Midterm | Final | |
---|---|---|---|
0 | Annie | 98 | 88 |
1 | Billy | 82 | 100 |
2 | Sally | 23 | 99 |
3 | Tommy | 45 | 67 |
0 | Junior | 70 | 42 |
1 | Rex | 99 | 25 |
2 | Flash | 81 | 90 |
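Notice that the row labels 0, 1, and 2 each appear twice above, since pd.concat keeps each input DataFrame's original index. If you'd rather have a fresh index, pd.concat accepts an ignore_index argument; a quick sketch:

```python
# ignore_index=True relabels the rows 0 through n-1,
# instead of keeping each input DataFrame's original index.
pd.concat([section_A, section_B], ignore_index=True)
```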
- pd.concat returns a copy; it does not modify any of the input DataFrames.
- Never use pd.concat in a loop, as it has terrible time and space efficiency; each iteration copies everything concatenated so far:

total = pd.DataFrame()
for df in dataframes:
    total = pd.concat([total, df])  # anti-pattern: don't do this!

- Instead, call pd.concat(dataframes) once, where dataframes is a list of DataFrames.

Suppose we have two DataFrames, exams and assignments, which both contain different attributes for the same individuals.
exams = section_A.copy()
exams
Name | Midterm | Final | |
---|---|---|---|
0 | Annie | 98 | 88 |
1 | Billy | 82 | 100 |
2 | Sally | 23 | 99 |
3 | Tommy | 45 | 67 |
assignments = exams[['Name']].assign(Homeworks=[99, 45, 23, 81],
Labs=[100, 100, 99, 100])
assignments
Name | Homeworks | Labs | |
---|---|---|---|
0 | Annie | 99 | 100 |
1 | Billy | 45 | 100 |
2 | Sally | 23 | 99 |
3 | Tommy | 81 | 100 |
If we try to combine these DataFrames with pd.concat
, we don't quite get what we're looking for.
pd.concat([exams, assignments])
Name | Midterm | Final | Homeworks | Labs | |
---|---|---|---|---|---|
0 | Annie | 98.0 | 88.0 | NaN | NaN |
1 | Billy | 82.0 | 100.0 | NaN | NaN |
2 | Sally | 23.0 | 99.0 | NaN | NaN |
3 | Tommy | 45.0 | 67.0 | NaN | NaN |
0 | Annie | NaN | NaN | 99.0 | 100.0 |
1 | Billy | NaN | NaN | 45.0 | 100.0 |
2 | Sally | NaN | NaN | 23.0 | 99.0 |
3 | Tommy | NaN | NaN | 81.0 | 100.0 |
But that's where the axis argument comes in handy.
Remember, most pandas
operations default to axis=0
, but here we want to concatenate the columns of exams
to the columns of assignments
, so we should use axis=1
.
pd.concat([exams, assignments], axis=1)
Name | Midterm | Final | Name | Homeworks | Labs | |
---|---|---|---|---|---|---|
0 | Annie | 98 | 88 | Annie | 99 | 100 |
1 | Billy | 82 | 100 | Billy | 45 | 100 |
2 | Sally | 23 | 99 | Sally | 23 | 99 |
3 | Tommy | 45 | 67 | Tommy | 81 | 100 |
Note that the 'Name' column appears twice! To avoid this, set 'Name' as the index in both DataFrames before calling pd.concat with axis=1, as we do below.
Note that the call to pd.concat below works as expected, even though the orders of the names in exams_by_name and assignments_by_name are different.
# .loc[::-1] reverses the rows of the DataFrame
exams_by_name = exams.set_index('Name').loc[::-1]
exams_by_name
Midterm | Final | |
---|---|---|
Name | ||
Tommy | 45 | 67 |
Sally | 23 | 99 |
Billy | 82 | 100 |
Annie | 98 | 88 |
assignments_by_name = assignments.set_index('Name')
assignments_by_name
Homeworks | Labs | |
---|---|---|
Name | ||
Annie | 99 | 100 |
Billy | 45 | 100 |
Sally | 23 | 99 |
Tommy | 81 | 100 |
pd.concat([exams_by_name, assignments_by_name], axis=1)
Midterm | Final | Homeworks | Labs | |
---|---|---|---|---|
Name | ||||
Tommy | 45 | 67 | 81 | 100 |
Sally | 23 | 99 | 23 | 99 |
Billy | 82 | 100 | 45 | 100 |
Annie | 98 | 88 | 99 | 100 |
Remember that pd.concat
only looks at the index when combining rows, not at any other columns.
exams_reversed = exams.loc[::-1].reset_index(drop=True)
exams_reversed
Name | Midterm | Final | |
---|---|---|---|
0 | Tommy | 45 | 67 |
1 | Sally | 23 | 99 |
2 | Billy | 82 | 100 |
3 | Annie | 98 | 88 |
assignments
Name | Homeworks | Labs | |
---|---|---|---|
0 | Annie | 99 | 100 |
1 | Billy | 45 | 100 |
2 | Sally | 23 | 99 |
3 | Tommy | 81 | 100 |
pd.concat([exams_reversed, assignments], axis=1)
Name | Midterm | Final | Name | Homeworks | Labs | |
---|---|---|---|---|---|---|
0 | Tommy | 45 | 67 | Annie | 99 | 100 |
1 | Sally | 23 | 99 | Billy | 45 | 100 |
2 | Billy | 82 | 100 | Sally | 23 | 99 |
3 | Annie | 98 | 88 | Tommy | 81 | 100 |
If we concatenate two DataFrames whose row indexes don't fully overlap, NaNs are added in the rows that aren't shared.
exams_extra = exams.copy()
exams_extra.loc[4] = ['Junior', 100, 100]
exams_extra
Name | Midterm | Final | |
---|---|---|---|
0 | Annie | 98 | 88 |
1 | Billy | 82 | 100 |
2 | Sally | 23 | 99 |
3 | Tommy | 45 | 67 |
4 | Junior | 100 | 100 |
assignments
Name | Homeworks | Labs | |
---|---|---|---|
0 | Annie | 99 | 100 |
1 | Billy | 45 | 100 |
2 | Sally | 23 | 99 |
3 | Tommy | 81 | 100 |
pd.concat([exams_extra, assignments], axis=1)
Name | Midterm | Final | Name | Homeworks | Labs | |
---|---|---|---|---|---|---|
0 | Annie | 98 | 88 | Annie | 99.0 | 100.0 |
1 | Billy | 82 | 100 | Billy | 45.0 | 100.0 |
2 | Sally | 23 | 99 | Sally | 23.0 | 99.0 |
3 | Tommy | 45 | 67 | Tommy | 81.0 | 100.0 |
4 | Junior | 100 | 100 | NaN | NaN | NaN |
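If you'd rather drop the rows that aren't shared instead of filling them with NaNs, pd.concat also accepts a join argument (the default is join='outer'); a minimal sketch:

```python
# join='inner' keeps only the row labels present in every input,
# so Junior's row (index 4) is dropped instead of padded with NaNs.
pd.concat([exams_extra, assignments], axis=1, join='inner')
```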
Summary: pd.concat
- pd.concat "stitches" two or more DataFrames together.
- With axis=0 (the default), the DataFrames are concatenated vertically based on column names (rows on top of rows).
- With axis=1, the DataFrames are concatenated horizontally based on row indexes (columns next to columns).
- In short, pd.concat with axis=1 combines DataFrames horizontally.
The merge method
- The merge DataFrame method joins two tables by columns or indexes.
- "Merge" is pandas' word for "join".
- merge is also a pandas function.
Let's work with a small example.
temps = pd.DataFrame({
'City': ['San Diego', 'Toronto', 'Rome'],
'Temperature': [76, 28, 56]
})
temps
City | Temperature | |
---|---|---|
0 | San Diego | 76 |
1 | Toronto | 28 |
2 | Rome | 56 |
countries = pd.DataFrame({
'City': ['Toronto', 'Shanghai', 'San Diego'],
'Country': ['Canada', 'China', 'USA']
})
countries
City | Country | |
---|---|---|
0 | Toronto | Canada |
1 | Shanghai | China |
2 | San Diego | USA |
temps.merge(countries)
City | Temperature | Country | |
---|---|---|---|
0 | San Diego | 76 | USA |
1 | Toronto | 28 | Canada |
We didn't specify which columns to merge on, so it defaulted to 'City'.
Note that 'Rome' and 'Shanghai' do not appear in the merged DataFrame, because there is no 'Rome' in the second DataFrame and no 'Shanghai' in the first DataFrame. The type of join that merge performs by default is an inner join, which keeps the intersection of the join keys.
There are four types of joins: inner, outer, left, and right. We choose the type of join with the how argument (the default is how='inner').
The left DataFrame is the one we call .merge on, and the right DataFrame is the argument to merge. There is also a pd.merge function, in which the first argument is the left DataFrame and the second argument is the right DataFrame.
temps
City | Temperature | |
---|---|---|
0 | San Diego | 76 |
1 | Toronto | 28 |
2 | Rome | 56 |
countries
City | Country | |
---|---|---|
0 | Toronto | Canada |
1 | Shanghai | China |
2 | San Diego | USA |
Let's try an outer join.
temps.merge(countries, how='outer')
City | Temperature | Country | |
---|---|---|---|
0 | San Diego | 76.0 | USA |
1 | Toronto | 28.0 | Canada |
2 | Rome | 56.0 | NaN |
3 | Shanghai | NaN | China |
# merge is also a pandas function
pd.merge(temps, countries, how='outer')
City | Temperature | Country | |
---|---|---|---|
0 | San Diego | 76.0 | USA |
1 | Toronto | 28.0 | Canada |
2 | Rome | 56.0 | NaN |
3 | Shanghai | NaN | China |
Note the NaNs in the rows for 'Rome' and 'Shanghai'.
Also note that an outer join is what pd.concat
does by default, when there are no duplicated keys in either DataFrame.
pd.concat([temps.set_index('City'), countries.set_index('City')], axis=1)
Temperature | Country | |
---|---|---|
City | ||
San Diego | 76.0 | USA |
Toronto | 28.0 | Canada |
Rome | 56.0 | NaN |
Shanghai | NaN | China |
Let's try left and right joins.
temps
City | Temperature | |
---|---|---|
0 | San Diego | 76 |
1 | Toronto | 28 |
2 | Rome | 56 |
countries
City | Country | |
---|---|---|
0 | Toronto | Canada |
1 | Shanghai | China |
2 | San Diego | USA |
First, a left join.
temps.merge(countries, how='left')
City | Temperature | Country | |
---|---|---|---|
0 | San Diego | 76 | USA |
1 | Toronto | 28 | Canada |
2 | Rome | 56 | NaN |
How about a right join?
temps.merge(countries, how='right')
City | Temperature | Country | |
---|---|---|---|
0 | Toronto | 28.0 | Canada |
1 | Shanghai | NaN | China |
2 | San Diego | 76.0 | USA |
Note that a.merge(b, how='left')
is the same as b.merge(a, how='right')
. The only difference is the order of the columns in the result.
countries.merge(temps, how='left')
City | Country | Temperature | |
---|---|---|---|
0 | Toronto | Canada | 28.0 |
1 | Shanghai | China | NaN |
2 | San Diego | USA | 76.0 |
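To convince ourselves of the claim above, a hedged sketch that compares the two results after aligning their column order:

```python
# After reordering columns, the right join on temps should match the
# left join on countries exactly (rows follow countries' order in both).
cols = ['City', 'Temperature', 'Country']
temps.merge(countries, how='right')[cols].equals(countries.merge(temps, how='left')[cols])
```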
pandas defaults to using the shared column(s) as join keys. To pick the keys yourself:
- Use the on argument if the desired column(s) have the same names in both DataFrames.
- Otherwise, use the left_on or left_index argument AND the right_on or right_index argument.
exams
Name | Midterm | Final | |
---|---|---|---|
0 | Annie | 98 | 88 |
1 | Billy | 82 | 100 |
2 | Sally | 23 | 99 |
3 | Tommy | 45 | 67 |
overall = pd.DataFrame({
'PID': ['A15253545', 'A10348245', 'A13349069', 'A18485824', 'A10094857'],
'Student': ['Billy', 'Sally', 'Annie', 'Larry', 'Johnny'],
'Final': [88, 64, 91, 45, 89]
})
overall
PID | Student | Final | |
---|---|---|---|
0 | A15253545 | Billy | 88 |
1 | A10348245 | Sally | 64 |
2 | A13349069 | Annie | 91 |
3 | A18485824 | Larry | 45 |
4 | A10094857 | Johnny | 89 |
This is not what we're looking for: the only column name shared by exams and overall is 'Final', so pandas joined on exam scores rather than on students.
exams.merge(overall)
Name | Midterm | Final | PID | Student | |
---|---|---|---|---|---|
0 | Annie | 98 | 88 | A15253545 | Billy |
Instead, we need to tell pandas to look in the 'Name' column of exams and the 'Student' column of overall.
exams.merge(overall, left_on='Name', right_on='Student')
Name | Midterm | Final_x | PID | Student | Final_y | |
---|---|---|---|---|---|---|
0 | Annie | 98 | 88 | A13349069 | Annie | 91 |
1 | Billy | 82 | 100 | A15253545 | Billy | 88 |
2 | Sally | 23 | 99 | A10348245 | Sally | 64 |
If the two DataFrames you are merging share column names that you are not using as join keys, by default '_x' and '_y' are appended to those names. The suffixes argument lets you choose your own:
exams.merge(overall, left_on='Name', right_on='Student', suffixes=('_Exam', '_Overall'))
Name | Midterm | Final_Exam | PID | Student | Final_Overall | |
---|---|---|---|---|---|---|
0 | Annie | 98 | 88 | A13349069 | Annie | 91 |
1 | Billy | 82 | 100 | A15253545 | Billy | 88 |
2 | Sally | 23 | 99 | A10348245 | Sally | 64 |
If the desired join key is in the index, assign left_index
or right_index
to True
.
exams
Name | Midterm | Final | |
---|---|---|---|
0 | Annie | 98 | 88 |
1 | Billy | 82 | 100 |
2 | Sally | 23 | 99 |
3 | Tommy | 45 | 67 |
overall_by_student = overall.set_index('Student')
overall_by_student
PID | Final | |
---|---|---|
Student | ||
Billy | A15253545 | 88 |
Sally | A10348245 | 64 |
Annie | A13349069 | 91 |
Larry | A18485824 | 45 |
Johnny | A10094857 | 89 |
exams.merge(overall_by_student, left_on='Name', right_index=True, suffixes=('_Exam', '_Overall'))
Name | Midterm | Final_Exam | PID | Final_Overall | |
---|---|---|---|---|---|
0 | Annie | 98 | 88 | A13349069 | 91 |
1 | Billy | 82 | 100 | A15253545 | 88 |
2 | Sally | 23 | 99 | A10348245 | 64 |
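As an aside, when the right DataFrame's join key is its index, the join DataFrame method offers a shorthand for the same merge; a sketch, mirroring the suffix names from above:

```python
# .join matches exams['Name'] against overall_by_student's index;
# how='inner' and the suffixes mirror the merge call above.
exams.join(overall_by_student, on='Name', how='inner',
           lsuffix='_Exam', rsuffix='_Overall')
```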
profs = pd.DataFrame(
[['Brad', 'UCB', 8],
['Janine', 'UCSD', 7],
['Marina', 'UIC', 6],
['Justin', 'OSU', 4],
['Aaron', 'UCB', 4],
['Soohyun', 'UCSD', 1],
['Suraj', 'UCB', 1]],
columns=['Name', 'School', 'Years']
)
profs
Name | School | Years | |
---|---|---|---|
0 | Brad | UCB | 8 |
1 | Janine | UCSD | 7 |
2 | Marina | UIC | 6 |
3 | Justin | OSU | 4 |
4 | Aaron | UCB | 4 |
5 | Soohyun | UCSD | 1 |
6 | Suraj | UCB | 1 |
schools = pd.DataFrame({
'Abr': ['UCSD', 'UCLA', 'UCB', 'UIC'],
'Full': ['University of California, San Diego', 'University of California, Los Angeles', 'University of California, Berkeley', 'University of Illinois Chicago']
})
schools
Abr | Full | |
---|---|---|
0 | UCSD | University of California, San Diego |
1 | UCLA | University of California, Los Angeles |
2 | UCB | University of California, Berkeley |
3 | UIC | University of Illinois Chicago |
Note that when merging profs and schools, the information from schools is duplicated ('University of California, San Diego' appears twice and 'University of California, Berkeley' appears three times). This is a one-to-many join: one school row can match many professor rows.
profs.merge(schools, left_on='School', right_on='Abr', how='left')
Name | School | Years | Abr | Full | |
---|---|---|---|---|---|
0 | Brad | UCB | 8 | UCB | University of California, Berkeley |
1 | Janine | UCSD | 7 | UCSD | University of California, San Diego |
2 | Marina | UIC | 6 | UIC | University of Illinois Chicago |
3 | Justin | OSU | 4 | NaN | NaN |
4 | Aaron | UCB | 4 | UCB | University of California, Berkeley |
5 | Soohyun | UCSD | 1 | UCSD | University of California, San Diego |
6 | Suraj | UCB | 1 | UCB | University of California, Berkeley |
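Since the 'School' and 'Abr' columns are now redundant, you may want to drop one of them after merging; for instance:

```python
# Drop the duplicated abbreviation column that came from schools.
profs.merge(schools, left_on='School', right_on='Abr', how='left').drop(columns='Abr')
```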
Many-to-many joins are joins where both DataFrames have duplicate values in the join key.
profs
Name | School | Years | |
---|---|---|---|
0 | Brad | UCB | 8 |
1 | Janine | UCSD | 7 |
2 | Marina | UIC | 6 |
3 | Justin | OSU | 4 |
4 | Aaron | UCB | 4 |
5 | Soohyun | UCSD | 1 |
6 | Suraj | UCB | 1 |
programs = pd.DataFrame({
'uni': ['UCSD', 'UCSD', 'UCSD', 'UCB', 'OSU', 'OSU'],
'dept': ['Math', 'HDSI', 'COGS', 'CS', 'Math', 'CS'],
'grad_students': [205, 54, 281, 439, 304, 193]
})
programs
uni | dept | grad_students | |
---|---|---|---|
0 | UCSD | Math | 205 |
1 | UCSD | HDSI | 54 |
2 | UCSD | COGS | 281 |
3 | UCB | CS | 439 |
4 | OSU | Math | 304 |
5 | OSU | CS | 193 |
Before running the following cell, try predicting the number of rows in the output.
profs.merge(programs, left_on='School', right_on='uni')
Name | School | Years | uni | dept | grad_students | |
---|---|---|---|---|---|---|
0 | Brad | UCB | 8 | UCB | CS | 439 |
1 | Aaron | UCB | 4 | UCB | CS | 439 |
2 | Suraj | UCB | 1 | UCB | CS | 439 |
3 | Janine | UCSD | 7 | UCSD | Math | 205 |
4 | Janine | UCSD | 7 | UCSD | HDSI | 54 |
5 | Janine | UCSD | 7 | UCSD | COGS | 281 |
6 | Soohyun | UCSD | 1 | UCSD | Math | 205 |
7 | Soohyun | UCSD | 1 | UCSD | HDSI | 54 |
8 | Soohyun | UCSD | 1 | UCSD | COGS | 281 |
9 | Justin | OSU | 4 | OSU | Math | 304 |
10 | Justin | OSU | 4 | OSU | CS | 193 |
merge stitched together every UCSD row in profs with every UCSD row in programs. Since there are 2 UCSD rows in profs and 3 in programs, there are $2 \cdot 3 = 6$ UCSD rows in the output. The same applies to all other schools.
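One way to sanity-check the size of a many-to-many join is to multiply the per-key row counts; a quick sketch:

```python
# For each school, the join produces (rows in profs) * (rows in programs)
# output rows; keys missing from either side contribute nothing.
counts = profs['School'].value_counts() * programs['uni'].value_counts()
int(counts.dropna().sum())  # 11, matching the output above
```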
Our next example combines data that is spread across multiple files. os.listdir(dirname) returns a list of the names of the files in the folder dirname.
os.listdir('data')
['orders.csv', '.DS_Store', 'stops_2017.csv', 'military.csv', 'stops_2016.csv', 'race_codes.csv']
We only want the files whose names match a pattern (here, the stops CSVs); the pathlib library allows you to do this.
import pathlib
file_list = list(pathlib.Path().glob('data/stops*.csv')) # glob allows for pattern matching
file_list
[PosixPath('data/stops_2017.csv'), PosixPath('data/stops_2016.csv')]
You can also put ! in front of a command in a Jupyter Notebook cell to run it on the command line.
!ls data/stops*.csv
data/stops_2016.csv data/stops_2017.csv
file_list
[PosixPath('data/stops_2017.csv'), PosixPath('data/stops_2016.csv')]
list_of_dfs = [pd.read_csv(file) for file in file_list]
for df in list_of_dfs:
display(df.head())
stop_id | stop_cause | service_area | subject_race | subject_sex | subject_age | timestamp | stop_date | stop_time | sd_resident | arrested | searched | obtained_consent | contraband_found | property_seized | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1444799 | Moving Violation | 120 | I | M | 37 | 2017-01-01 00:03:00 | 2017-01-01 | 00:03:00 | N | N | N | NaN | NaN | NaN |
1 | 1444821 | Equipment Violation | 520 | W | M | 22 | 2017-01-01 00:25:00 | 2017-01-01 | 00:25:00 | N | N | N | NaN | NaN | NaN |
2 | 1447102 | Moving Violation | 520 | W | M | 29 | 2017-01-01 01:45:00 | 2017-01-01 | 01:45:00 | N | N | N | NaN | NaN | NaN |
3 | 1444801 | Equipment Violation | 720 | H | F | 61 | 2017-01-01 03:10:00 | 2017-01-01 | 03:10:00 | N | N | N | NaN | NaN | NaN |
4 | 1444802 | Equipment Violation | 120 | H | M | 24 | 2017-01-01 03:30:00 | 2017-01-01 | 03:30:00 | Y | N | N | NaN | NaN | NaN |
stop_id | stop_cause | service_area | subject_race | subject_sex | subject_age | timestamp | stop_date | stop_time | sd_resident | arrested | searched | obtained_consent | contraband_found | property_seized | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1308198 | Equipment Violation | 530 | W | M | 28 | 2016-01-01 00:06:00 | 2016-01-01 | 0:06 | Y | N | N | N | N | N |
1 | 1308172 | Moving Violation | 520 | B | M | 25 | 2016-01-01 00:10:00 | 2016-01-01 | 0:10 | N | N | N | NaN | NaN | NaN |
2 | 1308171 | Moving Violation | 110 | H | F | 31 | 2016-01-01 00:14:00 | 2016-01-01 | 0:14 | NaN | NaN | NaN | NaN | NaN | NaN |
3 | 1308170 | Moving Violation | Unknown | W | F | 29 | 2016-01-01 00:16:00 | 2016-01-01 | 0:16 | N | N | N | NaN | NaN | NaN |
4 | 1308197 | Moving Violation | 230 | W | M | 52 | 2016-01-01 00:30:00 | 2016-01-01 | 0:30 | N | N | N | NaN | NaN | NaN |
We need to concatenate these two DataFrames vertically.
stops = pd.concat(list_of_dfs, ignore_index=True)
stops.head()
stop_id | stop_cause | service_area | subject_race | subject_sex | subject_age | timestamp | stop_date | stop_time | sd_resident | arrested | searched | obtained_consent | contraband_found | property_seized | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1444799 | Moving Violation | 120 | I | M | 37 | 2017-01-01 00:03:00 | 2017-01-01 | 00:03:00 | N | N | N | NaN | NaN | NaN |
1 | 1444821 | Equipment Violation | 520 | W | M | 22 | 2017-01-01 00:25:00 | 2017-01-01 | 00:25:00 | N | N | N | NaN | NaN | NaN |
2 | 1447102 | Moving Violation | 520 | W | M | 29 | 2017-01-01 01:45:00 | 2017-01-01 | 01:45:00 | N | N | N | NaN | NaN | NaN |
3 | 1444801 | Equipment Violation | 720 | H | F | 61 | 2017-01-01 03:10:00 | 2017-01-01 | 03:10:00 | N | N | N | NaN | NaN | NaN |
4 | 1444802 | Equipment Violation | 120 | H | M | 24 | 2017-01-01 03:30:00 | 2017-01-01 | 03:30:00 | Y | N | N | NaN | NaN | NaN |
len(stops)
206413
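If you need to remember which file each row came from, pd.concat's keys argument tags each input with a label, producing a MultiIndex; a sketch, assuming file_list keeps the order shown above:

```python
# keys labels each input DataFrame; the outer index level records
# the source file's year.
pd.concat(list_of_dfs, keys=['2017', '2016']).head()
```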
Right now, 'subject_race' is stored as a single character. What does 'I' mean? What about 'H'?
stops.head()
stop_id | stop_cause | service_area | subject_race | subject_sex | subject_age | timestamp | stop_date | stop_time | sd_resident | arrested | searched | obtained_consent | contraband_found | property_seized | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1444799 | Moving Violation | 120 | I | M | 37 | 2017-01-01 00:03:00 | 2017-01-01 | 00:03:00 | N | N | N | NaN | NaN | NaN |
1 | 1444821 | Equipment Violation | 520 | W | M | 22 | 2017-01-01 00:25:00 | 2017-01-01 | 00:25:00 | N | N | N | NaN | NaN | NaN |
2 | 1447102 | Moving Violation | 520 | W | M | 29 | 2017-01-01 01:45:00 | 2017-01-01 | 01:45:00 | N | N | N | NaN | NaN | NaN |
3 | 1444801 | Equipment Violation | 720 | H | F | 61 | 2017-01-01 03:10:00 | 2017-01-01 | 03:10:00 | N | N | N | NaN | NaN | NaN |
4 | 1444802 | Equipment Violation | 120 | H | M | 24 | 2017-01-01 03:30:00 | 2017-01-01 | 03:30:00 | Y | N | N | NaN | NaN | NaN |
stops['subject_race'].value_counts(normalize=True)
W    0.411358
H    0.309249
B    0.112620
O    0.086896
A    0.046397
F    0.011923
V    0.005768
C    0.004899
I    0.004123
X    0.001446
K    0.001222
P    0.000990
J    0.000912
Z    0.000558
D    0.000543
L    0.000461
S    0.000349
G    0.000189
U    0.000097
Name: subject_race, dtype: float64
Fortunately, we have access to another dataset that describes each of the race codes.
races = pd.read_csv('data/race_codes.csv')
races
Race Code | Description | |
---|---|---|
0 | A | OTHER ASIAN |
1 | B | BLACK |
2 | C | CHINESE |
3 | D | CAMBODIAN |
4 | F | FILIPINO |
5 | G | GUAMANIAN |
6 | H | HISPANIC |
7 | I | INDIAN |
8 | J | JAPANESE |
9 | K | KOREAN |
10 | L | LAOTIAN |
11 | O | OTHER |
12 | P | PACIFIC ISLANDER |
13 | S | SAMOAN |
14 | U | HAWAIIAN |
15 | V | VIETNAMESE |
16 | W | WHITE |
17 | Z | ASIAN INDIAN |
Let's join the distribution of races with the DataFrame of race codes.
Question: Is this a one-to-one join?
race_percentages = stops['subject_race'].value_counts(normalize=True).rename('Proportion').to_frame()
race_percentages.merge(races, left_index=True, right_on='Race Code')
Proportion | Race Code | Description | |
---|---|---|---|
16 | 0.411358 | W | WHITE |
6 | 0.309249 | H | HISPANIC |
1 | 0.112620 | B | BLACK |
11 | 0.086896 | O | OTHER |
0 | 0.046397 | A | OTHER ASIAN |
4 | 0.011923 | F | FILIPINO |
15 | 0.005768 | V | VIETNAMESE |
2 | 0.004899 | C | CHINESE |
7 | 0.004123 | I | INDIAN |
9 | 0.001222 | K | KOREAN |
12 | 0.000990 | P | PACIFIC ISLANDER |
8 | 0.000912 | J | JAPANESE |
17 | 0.000558 | Z | ASIAN INDIAN |
3 | 0.000543 | D | CAMBODIAN |
10 | 0.000461 | L | LAOTIAN |
13 | 0.000349 | S | SAMOAN |
5 | 0.000189 | G | GUAMANIAN |
14 | 0.000097 | U | HAWAIIAN |
The level of granularity of the races in our data right now seems inconsistent. For instance, 'WHITE' and 'BLACK' are much broader than 'FILIPINO', 'JAPANESE', and 'GUAMANIAN'.
Let's try to adjust our race data so that we have a consistent level of granularity. Here's what we want to create:
Race Code | Description | Race_Category | |
---|---|---|---|
0 | A | OTHER ASIAN | Asian |
1 | B | BLACK | Black |
2 | C | CHINESE | Asian |
3 | D | CAMBODIAN | Asian |
4 | F | FILIPINO | Asian |
We can do this by manually defining a mapping between race codes and desired categories.
race_dict = {'A':'Asian',
'B':'Black',
'C':'Asian',
'D':'Asian',
'F':'Asian',
'G':'Asian',
'H':'Hispanic',
'I':'Native American',
'J':'Asian',
'K':'Asian',
'L':'Asian',
'O':'Other',
'P':'Asian',
'S':'Asian',
'U':'Hawaiian',
'V':'Asian',
'W':'White',
'Z':'Asian'
}
There are two ways to replace all of the 'Race Code's in races with the above categories:
- using the replace method, or
- converting race_dict to a DataFrame and joining it with races.
Joining requires sorting, whereas replacing does not. Let's go with the first option.
races['Race_Category'] = races['Race Code'].replace(race_dict)
races
Race Code | Description | Race_Category | |
---|---|---|---|
0 | A | OTHER ASIAN | Asian |
1 | B | BLACK | Black |
2 | C | CHINESE | Asian |
3 | D | CAMBODIAN | Asian |
4 | F | FILIPINO | Asian |
5 | G | GUAMANIAN | Asian |
6 | H | HISPANIC | Hispanic |
7 | I | INDIAN | Native American |
8 | J | JAPANESE | Asian |
9 | K | KOREAN | Asian |
10 | L | LAOTIAN | Asian |
11 | O | OTHER | Other |
12 | P | PACIFIC ISLANDER | Asian |
13 | S | SAMOAN | Asian |
14 | U | HAWAIIAN | Hawaiian |
15 | V | VIETNAMESE | Asian |
16 | W | WHITE | White |
17 | Z | ASIAN INDIAN | Asian |
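For comparison, a sketch of the second option, which builds a DataFrame out of race_dict and joins it with races (the column names here mirror the ones above):

```python
# Convert the mapping to a two-column DataFrame, then merge on 'Race Code'.
categories = pd.DataFrame(list(race_dict.items()),
                          columns=['Race Code', 'Race_Category'])
races[['Race Code', 'Description']].merge(categories, on='Race Code', how='left')
```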
Now, we need to join stops
with races
. An important question is, what type of join should we use (inner, outer, left, right)?
stops.head()
stop_id | stop_cause | service_area | subject_race | subject_sex | subject_age | timestamp | stop_date | stop_time | sd_resident | arrested | searched | obtained_consent | contraband_found | property_seized | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1444799 | Moving Violation | 120 | I | M | 37 | 2017-01-01 00:03:00 | 2017-01-01 | 00:03:00 | N | N | N | NaN | NaN | NaN |
1 | 1444821 | Equipment Violation | 520 | W | M | 22 | 2017-01-01 00:25:00 | 2017-01-01 | 00:25:00 | N | N | N | NaN | NaN | NaN |
2 | 1447102 | Moving Violation | 520 | W | M | 29 | 2017-01-01 01:45:00 | 2017-01-01 | 01:45:00 | N | N | N | NaN | NaN | NaN |
3 | 1444801 | Equipment Violation | 720 | H | F | 61 | 2017-01-01 03:10:00 | 2017-01-01 | 03:10:00 | N | N | N | NaN | NaN | NaN |
4 | 1444802 | Equipment Violation | 120 | H | M | 24 | 2017-01-01 03:30:00 | 2017-01-01 | 03:30:00 | Y | N | N | NaN | NaN | NaN |
stops['subject_race'].isna().mean()
0.0012547659304404277
So that we don't discard the individuals whose races we don't have, we will use a left join.
stops_merged = stops.merge(races, left_on='subject_race', right_on='Race Code', how='left')
stops_merged.head()
stop_id | stop_cause | service_area | subject_race | subject_sex | subject_age | timestamp | stop_date | stop_time | sd_resident | arrested | searched | obtained_consent | contraband_found | property_seized | Race Code | Description | Race_Category | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1444799 | Moving Violation | 120 | I | M | 37 | 2017-01-01 00:03:00 | 2017-01-01 | 00:03:00 | N | N | N | NaN | NaN | NaN | I | INDIAN | Native American |
1 | 1444821 | Equipment Violation | 520 | W | M | 22 | 2017-01-01 00:25:00 | 2017-01-01 | 00:25:00 | N | N | N | NaN | NaN | NaN | W | WHITE | White |
2 | 1447102 | Moving Violation | 520 | W | M | 29 | 2017-01-01 01:45:00 | 2017-01-01 | 01:45:00 | N | N | N | NaN | NaN | NaN | W | WHITE | White |
3 | 1444801 | Equipment Violation | 720 | H | F | 61 | 2017-01-01 03:10:00 | 2017-01-01 | 03:10:00 | N | N | N | NaN | NaN | NaN | H | HISPANIC | Hispanic |
4 | 1444802 | Equipment Violation | 120 | H | M | 24 | 2017-01-01 03:30:00 | 2017-01-01 | 03:30:00 | Y | N | N | NaN | NaN | NaN | H | HISPANIC | Hispanic |
Now we can compute a more meaningful distribution of races.
dist = stops_merged['Race_Category'].value_counts(normalize=True)
dist
White              0.411953
Hispanic           0.309697
Black              0.112783
Other              0.087022
Asian              0.074319
Native American    0.004129
Hawaiian           0.000097
Name: Race_Category, dtype: float64
dist.plot(kind='bar', figsize=(10, 5));
That's a bit more helpful.
This read-then-concatenate pattern comes up often, e.g. when you have one .csv file per day for 1 year: read each file into a DataFrame, then pd.concat the DataFrames together.
Next, let's look at how time data is handled in pandas. When working with time data, you will see two different kinds of "times":
- timestamps, which refer to particular moments in time, and
- timedeltas, which refer to durations (amounts of time).
The datetime module
Python has a built-in datetime module, which contains datetime and timedelta types. These are much more convenient to deal with than strings that contain times.
import datetime
datetime.datetime.now()
datetime.datetime(2022, 4, 15, 13, 3, 11, 143648)
datetime.datetime.now() + datetime.timedelta(days=3, hours=5)
datetime.datetime(2022, 4, 18, 18, 3, 11, 147079)
Recall, Unix timestamps count the number of seconds since January 1st, 1970.
datetime.datetime.now().timestamp()
1650052991.150548
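The conversion also works in reverse:

```python
# fromtimestamp converts a Unix timestamp (seconds since January 1st, 1970,
# interpreted here in the local timezone) back into a datetime object.
datetime.datetime.fromtimestamp(1650052991.150548)
```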
Timestamps in pandas
- pd.Timestamp is the pandas equivalent of datetime.
- pd.to_datetime converts strings to pd.Timestamp objects.
pd.Timestamp(year=1998, month=11, day=26)
Timestamp('1998-11-26 00:00:00')
final_start = pd.to_datetime('June 4th, 2022, 11:30AM')
final_start
Timestamp('2022-06-04 11:30:00')
final_finish = pd.to_datetime('June 4th, 2022, 2:30PM')
final_finish
Timestamp('2022-06-04 14:30:00')
Timestamps have time-related attributes, e.g. dayofweek, hour, minute, and second.
final_finish.dayofweek
5
final_finish.year
2022
Subtracting timestamps yields pd.Timedelta
objects.
final_finish - final_start
Timedelta('0 days 03:00:00')
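pd.Timedelta objects can be converted to plain numbers when needed; for instance:

```python
# total_seconds() converts a Timedelta to a float number of seconds.
(final_finish - final_start).total_seconds()  # 10800.0, i.e. 3 hours
```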
When you use pd.to_datetime to convert the values in a Series or DataFrame column, pandas stores them as yet another type: np.datetime64. It works like pd.Timestamp, but is optimized for memory and speed efficiency. When you access an individual value, you get a pd.Timestamp back.
times = pd.DataFrame({'finish': pd.to_datetime(['Sun, Jan 01, 1989',
'2022-04-15T11:00',
'1/1/1970'])})
times
finish | |
---|---|
0 | 1989-01-01 00:00:00 |
1 | 2022-04-15 11:00:00 |
2 | 1970-01-01 00:00:00 |
times.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   finish  3 non-null      datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 152.0 bytes
times.iloc[0, 0]
Timestamp('1989-01-01 00:00:00')
times.sort_values('finish')
finish | |
---|---|
2 | 1970-01-01 00:00:00 |
0 | 1989-01-01 00:00:00 |
1 | 2022-04-15 11:00:00 |
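Once a column is stored as datetime64, the .dt accessor exposes the same time-related attributes element-wise; a quick sketch:

```python
# Extract the year of every timestamp in the 'finish' column at once.
times['finish'].dt.year
```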
Below, we have the Final Exam starting and ending times for two sections of a course.
times_A = pd.DataFrame({
'Name': ['Annie', 'Billy', 'Sally', 'Tommy'],
'start_exam': ['15:00', '15:02', '15:01', '15:00'],
'finish_exam': ['16:00', '17:58', '17:05', '16:55']
})
times_B = pd.DataFrame({
'Name': ['Junior', 'Rex', 'Flash'],
'start_exam': ['18:00', '18:06', '19:07'],
'finish_exam': ['20:00', '20:50', '20:59']
})
display(times_A)
display(times_B)
Name | start_exam | finish_exam | |
---|---|---|---|
0 | Annie | 15:00 | 16:00 |
1 | Billy | 15:02 | 17:58 |
2 | Sally | 15:01 | 17:05 |
3 | Tommy | 15:00 | 16:55 |
Name | start_exam | finish_exam | |
---|---|---|---|
0 | Junior | 18:00 | 20:00 |
1 | Rex | 18:06 | 20:50 |
2 | Flash | 19:07 | 20:59 |
Question: Who finished the exam the fastest amongst all students in the course?
Approach:
1. Concatenate the two DataFrames vertically.
2. Convert the 'start_exam' and 'finish_exam' columns to pd.Timestamp.
3. Compute the difference between 'finish_exam' and 'start_exam'.
4. Sort by that difference.
5. Take the 'Name' in the first row.
# Step 1
both_versions = pd.concat([times_A, times_B])
both_versions
Name | start_exam | finish_exam | |
---|---|---|---|
0 | Annie | 15:00 | 16:00 |
1 | Billy | 15:02 | 17:58 |
2 | Sally | 15:01 | 17:05 |
3 | Tommy | 15:00 | 16:55 |
0 | Junior | 18:00 | 20:00 |
1 | Rex | 18:06 | 20:50 |
2 | Flash | 19:07 | 20:59 |
# Step 2
both_versions = both_versions.assign(
start_exam=pd.to_datetime(both_versions['start_exam']),
finish_exam=pd.to_datetime(both_versions['finish_exam'])
)
both_versions.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 7 entries, 0 to 2
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   Name         7 non-null      object
 1   start_exam   7 non-null      datetime64[ns]
 2   finish_exam  7 non-null      datetime64[ns]
dtypes: datetime64[ns](2), object(1)
memory usage: 224.0+ bytes
# Step 3
both_versions = both_versions.assign(
elapsed=both_versions['finish_exam'] - both_versions['start_exam']
)
both_versions
Name | start_exam | finish_exam | elapsed | |
---|---|---|---|---|
0 | Annie | 2022-04-15 15:00:00 | 2022-04-15 16:00:00 | 0 days 01:00:00 |
1 | Billy | 2022-04-15 15:02:00 | 2022-04-15 17:58:00 | 0 days 02:56:00 |
2 | Sally | 2022-04-15 15:01:00 | 2022-04-15 17:05:00 | 0 days 02:04:00 |
3 | Tommy | 2022-04-15 15:00:00 | 2022-04-15 16:55:00 | 0 days 01:55:00 |
0 | Junior | 2022-04-15 18:00:00 | 2022-04-15 20:00:00 | 0 days 02:00:00 |
1 | Rex | 2022-04-15 18:06:00 | 2022-04-15 20:50:00 | 0 days 02:44:00 |
2 | Flash | 2022-04-15 19:07:00 | 2022-04-15 20:59:00 | 0 days 01:52:00 |
# Steps 4 and 5
both_versions.sort_values('elapsed').iloc[0].loc['Name']
'Annie'
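An equivalent shortcut, in case sorting feels roundabout: idxmin returns the index label of the smallest value.

```python
# With 'Name' as the index, idxmin on 'elapsed' gives the fastest student.
both_versions.set_index('Name')['elapsed'].idxmin()  # 'Annie'
```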
Summary
- pd.concat "stitches" two or more DataFrames together, either vertically or horizontally.
- The merge DataFrame method joins two DataFrames together based on a shared column, called a join key. There are four types of joins: inner, outer, left, and right; keys that appear in only one of the DataFrames are either dropped or matched with NaNs, depending on the type of join.
- Time data in pandas are stored using pd.Timestamp and pd.Timedelta objects.