import pandas as pd
import numpy as np
import os


import datetime


datetime.datetime.now()

datetime.datetime(2023, 1, 23, 1, 35, 46, 972404)


datetime.datetime.now() + datetime.timedelta(days=3, hours=5)

datetime.datetime(2023, 1, 26, 6, 35, 46, 980282)


datetime.datetime.now().timestamp()

1674466546.983726


pd.Timestamp(year=1998, month=11, day=26)

Timestamp('1998-11-26 00:00:00')


final_start = pd.to_datetime('March 22nd, 2023, 11:30AM')
final_start

Timestamp('2023-03-22 11:30:00')


final_finish = pd.to_datetime('March 22nd, 2023, 2:30PM')
final_finish

Timestamp('2023-03-22 14:30:00')


# 0 is Monday, 1 is Tuesday, etc.
final_finish.dayofweek

2


final_finish.year

2023


final_finish - final_start

Timedelta('0 days 03:00:00')


exam_times = pd.read_csv(os.path.join('data', 'exam-times.csv'))
exam_times


# Step 1: Convert the time columns to timestamps, using pd.to_datetime.
# The raw values are clock times with no date (e.g. '15:00'). In the displayed
# output, every parsed timestamp carries the same date (2023-01-23), so the date
# cancels out when the two columns are subtracted in Step 2.
exam_times['start_exam'] = pd.to_datetime(exam_times['start_exam'])
exam_times['finish_exam'] = pd.to_datetime(exam_times['finish_exam'])
exam_times


# Note that datetime64[ns] is the data type pandas uses to store timestamps in a Series/DataFrame.
exam_times.dtypes

name                   object
start_exam     datetime64[ns]
finish_exam    datetime64[ns]
section                object
dtype: object


# Step 2: Find the difference between the two time columns.
exam_times['difference'] = exam_times['finish_exam'] - exam_times['start_exam']
exam_times


exam_times.dtypes

name                    object
start_exam      datetime64[ns]
finish_exam     datetime64[ns]
section                 object
difference     timedelta64[ns]
dtype: object


# Step 3: Sort by the difference in descending order and take the first row.
exam_times.sort_values('difference', ascending=False)['name'].iloc[0]

'Billy'


# Run this cell to set up the next example.

# Sections A and B: one (Name, Midterm, Final) record per student.
section_A = pd.DataFrame(
    [
        ('Annie', 98, 88),
        ('Billy', 82, 100),
        ('Sally', 23, 99),
        ('Tommy', 45, 67),
    ],
    columns=['Name', 'Midterm', 'Final'],
)

section_B = pd.DataFrame(
    [
        ('Junior', 70, 42),
        ('Rex', 99, 25),
        ('Flash', 81, 90),
    ],
    columns=['Name', 'Midterm', 'Final'],
)

# Section C has no 'Midterm' column.
section_C = pd.DataFrame(
    [('Justin', 98), ('Marina', 52)],
    columns=['Name', 'Final'],
)

# Section D has no 'Final' column, and lists 'Midterm' before 'Name'.
section_D = pd.DataFrame(
    [(10, 'Janine'), (30, 'Sooh'), (80, 'Suraj')],
    columns=['Midterm', 'Name'],
)


section_A


section_B


section_A


section_B


pd.concat([section_A, section_B])


pd.concat([section_A, section_B], ignore_index=True)


combined = pd.concat([section_A, section_B], keys=['Section A', 'Section B'])
combined


combined.loc['Section A']


new_row_data = {'Name': 'King Triton', 'Midterm': 21, 'Final': 94}
new_row_df = pd.DataFrame([new_row_data]) # Note the list!
new_row_df


pd.concat([section_A, new_row_df])


section_C


section_D


# Note that the 'Name' columns were combined, despite not being in the same position!
pd.concat([section_C, section_D])


import os
os.listdir('data')

['.DS_Store', 'exam-times.csv']


os.listdir('../')

['.DS_Store', 'src']


!ls ../

src


# Run this cell to set up the next example.

# Exam scores: a copy of section_A.
exams = section_A.copy()

# Assignment scores: the 'Name' column of `exams` plus two new score columns,
# in the same row order as `exams`.
assignments = exams[['Name']].copy()
assignments['Homeworks'] = [99, 45, 23, 81]
assignments['Labs'] = [100, 100, 99, 100]

# Overall records: one (PID, Student, Final) record per student.
overall = pd.DataFrame(
    [
        ('A15253545', 'Billy', 88),
        ('A10348245', 'Sally', 64),
        ('A13349069', 'Annie', 91),
        ('A18485824', 'Larry', 45),
        ('A10094857', 'Johnny', 89),
    ],
    columns=['PID', 'Student', 'Final'],
)


exams


assignments


pd.concat([exams, assignments])


pd.concat([exams, assignments], axis=1)


# .iloc[::-1] reverses the rows of the DataFrame.
exams_by_name = exams.set_index('Name').iloc[::-1]
exams_by_name


assignments_by_name = assignments.set_index('Name')
assignments_by_name


pd.concat([exams_by_name, assignments_by_name], axis=1)


exams_reversed = exams.iloc[::-1].reset_index(drop=True)
exams_reversed


assignments


pd.concat([exams_reversed, assignments], axis=1)


# Run these two cells to set up the next example.

# One (City, Temperature) record per city.
temps = pd.DataFrame(
    [
        ('San Diego', 76),
        ('Toronto', 28),
        ('Rome', 56),
    ],
    columns=['City', 'Temperature'],
)

# One (City, Country) record per city; only some cities also appear in `temps`.
countries = pd.DataFrame(
    [
        ('Toronto', 'Canada'),
        ('Shanghai', 'China'),
        ('San Diego', 'USA'),
    ],
    columns=['City', 'Country'],
)


%reload_ext pandas_tutor


temps


countries


%%pt

temps.merge(countries)


temps


countries


# The default value of how is 'inner'.
temps.merge(countries, how='inner')


# Note the NaNs!
temps.merge(countries, how='left')


temps.merge(countries, how='right')


%%pt

temps.merge(countries, how='outer')


pd.concat([temps.set_index('City'), countries.set_index('City')], axis=1)


temps.merge(countries, how='left')


countries.merge(temps, how='right')


exams


overall


exams.merge(overall)


exams.merge(overall, left_on='Name', right_on='Student')


exams.merge(overall, left_on='Name', right_on='Student', suffixes=('_Exam', '_Overall'))


# Run this cell to set up the next example.

# Professors, with each one's school abbreviation and years of experience.
profs = pd.DataFrame({
    'Name': ['Brad', 'Janine', 'Marina', 'Justin', 'Soohyun', 'Suraj'],
    'School': ['UCB', 'UCSD', 'UIC', 'OSU', 'UCSD', 'UCB'],
    'Years': [9, 8, 7, 5, 2, 2],
})

# School abbreviations paired with full names, one (Abr, Full) record per school.
schools = pd.DataFrame(
    [
        ('UCSD', 'University of California, San Diego'),
        ('UCLA', 'University of California, Los Angeles'),
        ('UCB', 'University of California, Berkeley'),
        ('UIC', 'University of Illinois Chicago'),
    ],
    columns=['Abr', 'Full'],
)

# Departments, one (uni, dept, grad_students) record each; a school may appear
# in several rows.
programs = pd.DataFrame(
    [
        ('UCSD', 'Math', 205),
        ('UCSD', 'HDSI', 54),
        ('UCSD', 'COGS', 281),
        ('UCB', 'CS', 439),
        ('OSU', 'Math', 304),
        ('OSU', 'CS', 193),
    ],
    columns=['uni', 'dept', 'grad_students'],
)


profs


schools


# Why is a left merge most appropriate here?
profs.merge(schools, left_on='School', right_on='Abr', how='left')


profs


programs


%%pt

profs.merge(programs, left_on='School', right_on='uni')

	name	start_exam	finish_exam	section
0	Annie	15:00	16:00	A
1	Billy	15:02	17:58	A
2	Sally	15:01	17:05	A
3	Tommy	15:00	16:55	A
4	Junior	18:00	20:00	B
5	Rex	18:06	20:50	B
6	Flash	19:07	20:59	B

	name	start_exam	finish_exam	section
0	Annie	2023-01-23 15:00:00	2023-01-23 16:00:00	A
1	Billy	2023-01-23 15:02:00	2023-01-23 17:58:00	A
2	Sally	2023-01-23 15:01:00	2023-01-23 17:05:00	A
3	Tommy	2023-01-23 15:00:00	2023-01-23 16:55:00	A
4	Junior	2023-01-23 18:00:00	2023-01-23 20:00:00	B
5	Rex	2023-01-23 18:06:00	2023-01-23 20:50:00	B
6	Flash	2023-01-23 19:07:00	2023-01-23 20:59:00	B

	name	start_exam	finish_exam	section	difference
0	Annie	2023-01-23 15:00:00	2023-01-23 16:00:00	A	0 days 01:00:00
1	Billy	2023-01-23 15:02:00	2023-01-23 17:58:00	A	0 days 02:56:00
2	Sally	2023-01-23 15:01:00	2023-01-23 17:05:00	A	0 days 02:04:00
3	Tommy	2023-01-23 15:00:00	2023-01-23 16:55:00	A	0 days 01:55:00
4	Junior	2023-01-23 18:00:00	2023-01-23 20:00:00	B	0 days 02:00:00
5	Rex	2023-01-23 18:06:00	2023-01-23 20:50:00	B	0 days 02:44:00
6	Flash	2023-01-23 19:07:00	2023-01-23 20:59:00	B	0 days 01:52:00

Lecture 6 – Concatenating and Merging¶

DSC 80, Winter 2023¶

Announcements¶

Agenda¶

Aside: Working with time series data¶

Time series – why now?¶

Datetime types¶

The `datetime` module¶

Times in `pandas`¶

Example: Exam speeds¶

Concatenating vertically¶

Example: Grades¶

Concatenating vertically¶

Example: Grades¶

Adding a single row¶

Missing columns?¶

⚠️ Warning: No loops!¶

Aside: Accessing file names programmatically¶

Concatenating horizontally¶

Example: Grades (again)¶

Concatenating horizontally¶

Summary: `pd.concat`¶

Merging¶

Joining¶

The `merge` method¶

Join types: inner joins¶

Different join types¶

Different join types handle mismatches differently¶

Symmetry¶

Specifying join keys¶

Many-to-one & many-to-many joins¶

One-to-one joins¶

Many-to-one joins¶

Many-to-many joins¶

Summary, next time¶

Summary¶

Next time¶

		Name	Midterm	Final
Section A	0	Annie	98	88
	1	Billy	82	100
	2	Sally	23	99
	3	Tommy	45	67
Section B	0	Junior	70	42
	1	Rex	99	25
	2	Flash	81	90

	Name	Final	Midterm
0	Justin	98.0	NaN
1	Marina	52.0	NaN
0	Janine	NaN	10.0
1	Sooh	NaN	30.0
2	Suraj	NaN	80.0

	Name	Midterm	Final	Homeworks	Labs
0	Annie	98.0	88.0	NaN	NaN
1	Billy	82.0	100.0	NaN	NaN
2	Sally	23.0	99.0	NaN	NaN
3	Tommy	45.0	67.0	NaN	NaN
0	Annie	NaN	NaN	99.0	100.0
1	Billy	NaN	NaN	45.0	100.0
2	Sally	NaN	NaN	23.0	99.0
3	Tommy	NaN	NaN	81.0	100.0

	Temperature	Country
City
San Diego	76.0	USA
Toronto	28.0	Canada
Rome	56.0	NaN
Shanghai	NaN	China

	PID	Student	Final
0	A15253545	Billy	88
1	A10348245	Sally	64
2	A13349069	Annie	91
3	A18485824	Larry	45
4	A10094857	Johnny	89

	Name	School	Years
0	Brad	UCB	9
1	Janine	UCSD	8
2	Marina	UIC	7
3	Justin	OSU	5
4	Soohyun	UCSD	2
5	Suraj	UCB	2

	Abr	Full
0	UCSD	University of California, San Diego
1	UCLA	University of California, Los Angeles
2	UCB	University of California, Berkeley
3	UIC	University of Illinois Chicago

	uni	dept	grad_students
0	UCSD	Math	205
1	UCSD	HDSI	54
2	UCSD	COGS	281
3	UCB	CS	439
4	OSU	Math	304
5	OSU	CS	193

Lecture 6 – Concatenating and Merging¶

DSC 80, Winter 2023¶

Announcements¶

Agenda¶

Aside: Working with time series data¶

Time series – why now?¶

Datetime types¶

The datetime module¶

Times in pandas¶

Example: Exam speeds¶

Concatenating vertically¶

Example: Grades¶

Concatenating vertically¶

Example: Grades¶

Adding a single row¶

Missing columns?¶

⚠️ Warning: No loops!¶

Aside: Accessing file names programmatically¶

Concatenating horizontally¶

Example: Grades (again)¶

Concatenating horizontally¶

Summary: pd.concat¶

Merging¶

Joining¶

The merge method¶

Join types: inner joins¶

Different join types¶

Different join types handle mismatches differently¶

Symmetry¶

Specifying join keys¶

Many-to-one & many-to-many joins¶

One-to-one joins¶

Many-to-one joins¶

Many-to-many joins¶

Summary, next time¶

Summary¶

Next time¶

The `datetime` module¶

Times in `pandas`¶

Summary: `pd.concat`¶

The `merge` method¶