import re
import pandas as pd
import numpy as np
import util
Remember to refer to dsc80.com/resources/#regular-expressions.
Recall the log string from a few lectures ago.
s = '''132.249.20.188 - - [05/May/2022:14:26:15 -0800] "GET /my/home/ HTTP/1.1" 200 2585'''
Let's use our new regex syntax (including capturing groups) to extract the day, month, year, and time from the log string s.
exp = r'\[(.+)\/(.+)\/(.+):(.+):(.+):(.+) .+\]'
re.findall(exp, s)
[('05', 'May', '2022', '14', '26', '15')]
While the above regex works, it is not very specific: it also matches incorrectly formatted log strings.
other_s = '[adr/jduy/wffsdffs:r4s4:4wsgdfd:asdf 7]'
re.findall(exp, other_s)
[('adr', 'jduy', 'wffsdffs', 'r4s4', '4wsgdfd', 'asdf')]
Be as specific in your pattern matching as possible – you don't want to match and extract strings that don't fit the pattern you care about.
.* matches every possible string, but we don't use it very often.
A better date extraction regex:
\[(\d{2})\/([A-Z]{1}[a-z]{2})\/(\d{4}):(\d{2}):(\d{2}):(\d{2}) -\d{4}\]
\d{2} matches any 2-digit number.
[A-Z]{1} matches any single uppercase letter.
[a-z]{2} matches any 2 consecutive lowercase letters.
Special characters ([, ], /) need to be escaped with \.
s
'132.249.20.188 - - [05/May/2022:14:26:15 -0800] "GET /my/home/ HTTP/1.1" 200 2585'
new_exp = r'\[(\d{2})\/([A-Z]{1}[a-z]{2})\/(\d{4}):(\d{2}):(\d{2}):(\d{2}) -\d{4}\]'
re.findall(new_exp, s)
[('05', 'May', '2022', '14', '26', '15')]
A benefit of new_exp over exp is that it doesn't capture anything when the string doesn't follow the format we specified.
other_s
'[adr/jduy/wffsdffs:r4s4:4wsgdfd:asdf 7]'
re.findall(new_exp, other_s)
[]
The \b character class refers to "word boundaries". It matches the (zero-width) position between a word character (a letter, digit, or underscore) and a non-word character.
re.findall('\\b\w+\\b', 'hello, my name is billy')
['hello', 'my', 'name', 'is', 'billy']
re.findall('\\b\w+\\b', 'hello-my-name-is-bil_ly!!')
['hello', 'my', 'name', 'is', 'bil_ly']
Remember, the \w
character class refers to letters, digits, and underscores, i.e. "word" characters.
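For instance, a quick illustration of what \w matches on a made-up string:

```python
import re

# \w matches a single letter, digit, or underscore;
# '-' and '!' are not word characters, so they are skipped.
re.findall(r'\w', 'a1_-!')  # ['a', '1', '_']
```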
Question: What's with the \\?
Regular expressions use \ to escape special characters and to denote character classes (like \w and \b).
Python uses \ to demarcate special strings as well.
print('ho\ney')
ho
ey
Sometimes, the regex meaning and Python meaning of a special string clash.
'hi\billy'
'hi\x08illy'
print('hi\billy') # \b means "backspace" in Python
hiilly
print('hi\\billy')
hi\billy
To prevent Python from interpreting \b
, \n
, etc. as its own special strings, and to keep them in their "raw" form, use raw strings.
To create a raw string, add the character r
right before the quotes.
r'hi\billy'
'hi\\billy'
print(r'hi\billy')
hi\billy
Raw strings can help us avoid misinterpretations like the one below.
re.findall('\b\w+\b', 'hello, my name is billy')
[]
re.findall(r'\b\w+\b', 'hello, my name is billy')
['hello', 'my', 'name', 'is', 'billy']
If you don't want to use a raw string, you'd instead have to escape the \b
with another \
, as we did on the previous slide:
re.findall('\\b\w+\\b', 'hello, my name is billy')
['hello', 'my', 'name', 'is', 'billy']
Writing a regular expression is like writing a program.
Regular expressions are terrible at certain types of problems. Examples:
There is a famous regular expression that validates email addresses in Perl, which is far too long to reproduce here. See this article for more details.
StackOverflow crashed due to regex! See this article for the details.
If pandas string methods work for your task, you can still use those.
Recall that in Lectures 1 and 2, we worked with a dataset of San Diego city employee salaries in 2020.
# 2021 data is now actually available, but we will use 2020 data as we did earlier in the quarter
salaries = pd.read_csv('https://transcal.s3.amazonaws.com/public/export/san-diego-2020.csv')
util.anonymize_names(salaries)
salaries.head()
Employee Name | Job Title | Base Pay | Overtime Pay | Other Pay | Benefits | Total Pay | Pension Debt | Total Pay & Benefits | Year | Notes | Agency | Status | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Michael Xxxx | Police Officer | 117691.0 | 187290.0 | 13331.00 | 36380.0 | 318312.0 | NaN | 354692.0 | 2020 | NaN | San Diego | FT |
1 | Gary Xxxx | Police Officer | 117691.0 | 160062.0 | 42946.00 | 31795.0 | 320699.0 | NaN | 352494.0 | 2020 | NaN | San Diego | FT |
2 | Eric Xxxx | Fire Engineer | 35698.0 | 204462.0 | 69121.00 | 38362.0 | 309281.0 | NaN | 347643.0 | 2020 | NaN | San Diego | PT |
3 | Gregg Xxxx | Retirement Administrator | 305000.0 | 0.0 | 12814.00 | 24792.0 | 317814.0 | NaN | 342606.0 | 2020 | NaN | San Diego | FT |
4 | Joseph Xxxx | Fire Battalion Chief | 94451.0 | 157778.0 | 48151.00 | 42096.0 | 300380.0 | NaN | 342476.0 | 2020 | NaN | San Diego | FT |
We asked the question, "Does gender influence pay?"
A followup question to that was "Do men and women make similar salaries amongst those with similar jobs?" – but what makes two jobs similar?
jobtitles = salaries['Job Title']
jobtitles.head()
0 Police Officer 1 Police Officer 2 Fire Engineer 3 Retirement Administrator 4 Fire Battalion Chief Name: Job Title, dtype: object
How many job titles are there in the dataset? How many unique job titles are there?
jobtitles.shape[0], jobtitles.nunique()
(12605, 647)
What are the most common job titles?
jobtitles.value_counts().iloc[:100]
Police Officer 2129 Fire Fighter 2 325 Asst Eng-Civil 279 Grounds Maint Wrkr 2 278 Rec Leader 1 260 ... Librarian 4 25 Asst Fleet Technician 25 Public Works Supv 25 Custodian 2 25 Asst Deputy Director 25 Name: Job Title, Length: 100, dtype: int64
jobtitles.value_counts().iloc[:25].sort_values().plot(kind='barh', figsize=(8, 6));
Are two different-looking titles actually the same job ('Assistant Fire Chief' vs. 'Asst. Fire Chief')? Are two different jobs similar ('Civil Eng.' vs. 'Mechanical engineer')?
Run the cell below repeatedly to get a feel for the "messiness" of job titles in their current state.
jobtitles.sample(10)
6444 Plant Tech 3 1293 Deputy City Atty 11624 Equip Tech 2 1928 Water Sys District Mgr 196 Police Officer 8598 Deputy City Atty 11663 Rec Leader 1 3259 Fire Engineer 764 Police Officer 1794 Police Officer Name: Job Title, dtype: object
Let's try to canonicalize job titles. To do this, we'll look at:
Are there job titles with unnecessary punctuation that we can remove?
To find out, we can write a regular expression that looks for characters other than letters, numbers, and spaces.
We can use regular expressions with the .str
methods we learned earlier in the quarter just by using regex=True
.
jobtitles.str.contains(r'[^A-Za-z0-9 ]', regex=True).sum()
1133
jobtitles[jobtitles.str.contains(r'[^A-Za-z0-9 ]', regex=True)].head()
67 Fire Captain-Mast 189 Park & Recreation Director 217 Deputy City Atty - Unrep 261 Fire Captain-Mast 283 Deputy City Atty - Unrep Name: Job Title, dtype: object
It seems like we should replace these pieces of punctuation with a single space.
Are there job titles with "glue" words in the middle, such as Assistant to the Chief?
To figure out if any titles contain the word 'to'
, we can't just do the following, because it will evaluate to True
for job titles that have 'to'
anywhere in them, even if not as a standalone word.
# Why are we converting to lowercase?
jobtitles.str.lower().str.contains('to').sum()
833
jobtitles[jobtitles.str.lower().str.contains('to')]
3 Retirement Administrator 22 Department Director 31 Asst Retirement Administrator 55 Asst City Attorney 62 Department Director ... 12484 Storekeeper 1 12502 Storm Water Inspctr 3 12504 Equip Operator 2 12532 Supv Storm Water Inspctr 12552 Sr Customer Srvs Rep Name: Job Title, Length: 833, dtype: object
Instead, we need to look for 'to'
separated by word boundaries.
jobtitles.str.lower().str.contains(r'\bto\b', regex=True).sum()
11
jobtitles[jobtitles.str.lower().str.contains(r'\bto\b', regex=True)]
723 Principal Asst To City Atty 738 Asst To The Fire Chief 2087 Asst To The Director 5221 Conf Secretary To Chief Oper Ofcr 5884 Asst To The Director 5945 Conf Secretary To Police Chief 6454 Conf Secretary To City Atty 11481 Conf Secretary To Mayor 12061 Asst To The Director 12120 Conf Secretary To City Atty 12207 Conf Secretary To Mayor Name: Job Title, dtype: object
We can look for other filler words too, like 'the'
and 'for'
.
jobtitles[jobtitles.str.lower().str.contains(r'\bthe\b', regex=True)]
738 Asst To The Fire Chief 2087 Asst To The Director 5884 Asst To The Director 12061 Asst To The Director Name: Job Title, dtype: object
jobtitles[jobtitles.str.lower().str.contains(r'\bfor\b', regex=True)]
3352 Asst For Community Outreach 6356 Asst For Community Outreach Name: Job Title, dtype: object
We should probably remove these "glue" words.
To canonicalize job titles, we'll start by converting them to lowercase, removing the "glue" words 'to', 'the', and 'for', replacing punctuation with spaces, and collapsing repeated spaces.
jobtitles = (
jobtitles
.str.lower()
.str.replace(r'\bto\b|\bthe\b|\bfor\b', '', regex=True)
.str.replace('[^A-Za-z0-9 ]', ' ', regex=True)
.str.replace(' +', ' ', regex=True) # ' +' matches 1 or more occurrences of a space
.str.strip() # Removes leading/trailing spaces if present
)
jobtitles.sample(10)
6494 water sys tech 4 9944 library assistant 3 9708 library assistant 3 9368 clerical asst 2 10492 lifeguard 1 3412 council rep 2 a 3238 police officer 11803 asoc commctns eng 8654 laboratory technician 7804 code compliance ofcr Name: Job Title, dtype: object
Which job titles are inconsistently described? Let's look at three categories – librarians, engineers, and directors.
jobtitles[jobtitles.str.contains('libr')].value_counts()
library assistant 1 179 library assistant 2 143 library assistant 3 101 librarian 2 66 librarian 3 32 librarian 4 25 supv librarian 7 library technician 7 librarian 1 6 deputy library dir 2 city librarian 1 librarian 3 law librn 1 sr library tech 1 Name: Job Title, dtype: int64
jobtitles[jobtitles.str.contains('eng')].value_counts()
asst eng civil 279 asoc eng civil 210 fire engineer 208 sr civil engineer 71 jr engineer civil 70 principal engrng aide 60 asst eng traffic 33 asoc eng traffic 28 structural engrng asoc 23 student engineer 20 sr engineering aide 16 sr traffic engineer 13 asoc eng electrical 13 auto messenger 1 9 structural engrng sr 9 asst eng civil cntrct spec 9 asoc eng civil sr cntrct spec 9 auto messenger 2 8 asst eng electrical 7 asoc eng fire protection 5 asoc eng mechanical 5 asoc commctns eng 5 fire engineer mast 4 sr civil engineer princ cntrc spec 3 sr electrical engineer 3 sr mechanical engineer 3 asoc eng civil asoc eng geol 3 asoc eng corrosion 3 asst eng mechanical 2 prin corrosion engineering aide 2 sr engineer fire protection 2 jr engineer civil student 2 sr electrical engineer sr cntrl sys eng 1 jr engineer electrical 1 structural engrng asst 1 sr engineering geologist 1 sr commctns engineer 1 Name: Job Title, dtype: int64
jobtitles[jobtitles.str.contains('dir')].value_counts()
deputy director 78 asst rec ctr dir 45 rec cntr dir 3 30 asst deputy director 25 department director 20 rec cntr dir 2 14 asst department director 10 executive director 9 rec cntr dir 1 9 asst development services dir 3 asst director 3 deputy library dir 2 real estate assets dir 2 governmental rel dir 2 deputy pers director 2 risk management director 1 development services dir 1 deputy planning director 1 asst environmental services dir 1 personnel director 1 planning director 1 asst planning director 1 environmental services dir 1 asst metro wstwtr dir 1 public utilities director 1 park recreation director 1 asst pers director 1 Name: Job Title, dtype: int64
Many titles use abbreviations (e.g. 'asst' for 'assistant'), but how do we find them?
Let's create a "counts" matrix, such that the entry in row title and column word is the number of occurrences of word in title.
Such a matrix might look like:
senior | lecturer | teaching | professor | assistant | associate | |
---|---|---|---|---|---|---|
senior lecturer | 1 | 1 | 0 | 0 | 0 | 0 |
assistant teaching professor | 0 | 0 | 1 | 1 | 1 | 0 |
associate professor | 0 | 0 | 0 | 1 | 0 | 1 |
senior assistant to the assistant professor | 1 | 0 | 0 | 1 | 2 | 0 |
First, we need to determine all words that are used across all job titles.
jobtitles.str.split()
0 [police, officer] 1 [police, officer] 2 [fire, engineer] 3 [retirement, administrator] 4 [fire, battalion, chief] ... 12600 [asst, eng, civil] 12601 [police, officer] 12602 [asst, planner] 12603 [project, ofcr, 1] 12604 [utility, worker, 2] Name: Job Title, Length: 12605, dtype: object
all_words = jobtitles.str.split().sum()
all_words[:10]
['police', 'officer', 'police', 'officer', 'fire', 'engineer', 'retirement', 'administrator', 'fire', 'battalion']
Next, we need to find a list of all unique words used in titles. (We can do this with np.unique
, but value_counts
shows us the distribution, which is interesting.)
unique_words = pd.Series(all_words).value_counts()
unique_words.head(10)
2 2438 police 2329 officer 2150 1 1589 fire 1067 asst 721 civil 656 eng 615 3 612 asoc 563 dtype: int64
len(unique_words)
435
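As mentioned above, np.unique would find the same set of unique words as value_counts; a quick check of that equivalence on a made-up word list:

```python
import numpy as np
import pandas as pd

words = ['police', 'officer', 'police', 'fire']

# Both approaches yield the same set of unique words; value_counts
# additionally tells us how often each word appears.
via_np = set(np.unique(words))
via_vc = set(pd.Series(words).value_counts().index)
via_np == via_vc  # True
```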
For each of the 435 unique words that are used in job titles, we can count the number of occurrences of the word in each job title.
'assistant fire chief'
contains the word 'assistant'
once, the word 'fire'
once, and the word 'chief'
once.'assistant managers assistant'
contains the word 'assistant'
twice and the word 'managers'
once.# Created using a dictionary to avoid a "DataFrame is highly fragmented" warning.
counts_dict = {}
for word in unique_words.index:
re_pat = fr'\b{word}\b'
counts_dict[word] = jobtitles.str.count(re_pat).astype(int).tolist()
counts_df = pd.DataFrame(counts_dict)
counts_df.head()
2 | police | officer | 1 | fire | asst | civil | eng | 3 | asoc | ... | motive | metro | sign | stores | sec | law | librn | risk | medical | african | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 435 columns
counts_df
has one row for all 12605 job titles (employees), and one column for each unique word that is used in a job title.
counts_df.shape
(12605, 435)
To put into context what the numbers in counts_df
mean, we can show the actual job title for each row.
counts_df = pd.concat([jobtitles.to_frame(), counts_df], axis=1).set_index('Job Title')
counts_df.head()
2 | police | officer | 1 | fire | asst | civil | eng | 3 | asoc | ... | motive | metro | sign | stores | sec | law | librn | risk | medical | african | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Job Title | |||||||||||||||||||||
police officer | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
police officer | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
fire engineer | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
retirement administrator | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
fire battalion chief | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 435 columns
The first row tells us that the first job title contains 'police'
once and 'officer'
once. The fifth row tells us that the fifth job title contains 'fire'
once.
counts_df.head()
2 | police | officer | 1 | fire | asst | civil | eng | 3 | asoc | ... | motive | metro | sign | stores | sec | law | librn | risk | medical | african | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Job Title | |||||||||||||||||||||
police officer | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
police officer | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
fire engineer | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
retirement administrator | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
fire battalion chief | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 435 columns
The Series below describes the 20 most common words used in job titles, along with the number of times they appeared in all job titles (including repeats). We will call these words "top 20".
counts_df.iloc[:, :20].sum()
2 2438 police 2329 officer 2150 1 1589 fire 1067 asst 721 civil 656 eng 615 3 612 asoc 563 assistant 535 sr 529 supv 501 anlyst 477 tech 454 fighter 449 rec 444 library 433 maint 430 engineer 402 dtype: int64
The Series below describes the number of top 20 words used in each job title.
counts_df.iloc[:, :20].sum(axis=1)
Job Title police officer 2 police officer 2 fire engineer 2 retirement administrator 0 fire battalion chief 1 .. asst eng civil 3 police officer 2 asst planner 1 project ofcr 1 1 utility worker 2 1 Length: 12605, dtype: int64
Which job titles are most similar to 'asst fire chief'?
In counts_df, we have a (row) vector for each job title.
To start, let's compare 'asst fire chief' to 'fire battalion chief'.
afc = counts_df.loc['asst fire chief'].iloc[0]
afc
2 0 police 0 officer 0 1 0 fire 1 .. law 0 librn 0 risk 0 medical 0 african 0 Name: asst fire chief, Length: 435, dtype: int64
fbc = counts_df.loc['fire battalion chief'].iloc[0]
fbc
2 0 police 0 officer 0 1 0 fire 1 .. law 0 librn 0 risk 0 medical 0 african 0 Name: fire battalion chief, Length: 435, dtype: int64
We can stack these two vectors horizontally.
pair_counts = (
pd.concat([afc, fbc], axis=1)
.sort_values(by=['asst fire chief', 'fire battalion chief'], ascending=False)
.head(10)
.T
)
pair_counts
fire | chief | asst | battalion | 2 | police | officer | 1 | civil | eng | |
---|---|---|---|---|---|---|---|---|---|---|
asst fire chief | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
fire battalion chief | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
One way to measure how similar the above two vectors are is through their dot product.
np.sum(pair_counts.iloc[0] * pair_counts.iloc[1])
2
Here, since both vectors consist only of 1s and 0s, the dot product is equal to the number of shared words between the two job titles.
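We can verify this on a pair of made-up 0/1 word vectors (the vocabulary and values here are hypothetical, chosen to mirror the two titles above):

```python
import numpy as np

# Hypothetical 0/1 word vectors over the vocabulary
# ['fire', 'chief', 'asst', 'battalion'].
afc_vec = np.array([1, 1, 1, 0])  # 'asst fire chief'
fbc_vec = np.array([1, 1, 0, 1])  # 'fire battalion chief'

# For 0/1 vectors, the dot product counts the positions where both
# vectors have a 1, i.e. the number of shared words.
shared = np.dot(afc_vec, fbc_vec)
shared  # 2 — both titles contain 'fire' and 'chief'
```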
To find the job title that is most similar to 'asst fire chief'
, we can compute the dot product of the 'asst fire chief'
word vector with all other titles' word vectors, and find the title with the highest dot product.
counts_df.head()
2 | police | officer | 1 | fire | asst | civil | eng | 3 | asoc | ... | motive | metro | sign | stores | sec | law | librn | risk | medical | african | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Job Title | |||||||||||||||||||||
police officer | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
police officer | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
fire engineer | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
retirement administrator | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
fire battalion chief | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 435 columns
afc
2 0 police 0 officer 0 1 0 fire 1 .. law 0 librn 0 risk 0 medical 0 african 0 Name: asst fire chief, Length: 435, dtype: int64
To do so, we can apply np.dot
to each row that doesn't correspond to 'asst fire chief'
.
dots = (
counts_df[counts_df.index != 'asst fire chief']
.apply(lambda s: np.dot(s, afc), axis=1)
.sort_values(ascending=False)
)
dots
Job Title asst deputy chief oper ofcr 2 fire battalion chief 2 deputy fire chief 2 fire battalion chief 2 deputy fire chief 2 .. lifeguard 3 0 water sys tech 3 0 sr commctns tech 0 sr life safety inspector 0 utility worker 2 0 Length: 12601, dtype: int64
The unique job titles that are most similar to 'asst fire chief'
are given below.
np.unique(dots.index[dots == dots.max()])
array(['asst chief oper ofcr', 'asst deputy chief oper ofcr', 'asst fire marshal civ', 'deputy fire chief', 'fire battalion chief', 'fire chief'], dtype=object)
Note that they all share two words in common with 'asst fire chief'
.
Note: To truly use the dot product as a measure of similarity, we should normalize by the lengths of the word vectors. More on this soon.
To measure the similarity between two word vectors, we compute their normalized dot product, known as their cosine similarity:
$$\cos \theta = \frac{\vec{a} \cdot \vec{b}}{|\vec{a}| \, |\vec{b}|}$$
If $\cos \theta$ is large, the two word vectors are similar. It is important to normalize by the lengths of the vectors; otherwise, texts with more words will have artificially high similarities with other texts.
Note: Sometimes, you will see the cosine distance being used. It is the complement of cosine similarity:
$$\text{dist}(\vec{a}, \vec{b}) = 1 - \cos \theta$$
If $\text{dist}(\vec{a}, \vec{b})$ is small, the two word vectors are similar.
Given a set of texts, to find the most similar text to one text $T$ in particular: represent each text as a word vector, compute the cosine similarity between $T$'s word vector and every other text's word vector, and take the text whose vector has the largest cosine similarity with $T$'s.
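Under some assumptions (a counts DataFrame with one row per uniquely-labeled text; the function name and toy data here are made up for illustration), a sketch of this procedure might look like:

```python
import numpy as np
import pandas as pd

def most_similar(counts, query_label):
    """Return the label of the text whose word vector has the highest
    cosine similarity with the text labeled `query_label`."""
    q = counts.loc[query_label].to_numpy()
    others = counts.drop(index=query_label)
    # Cosine similarity of each remaining row with q:
    # dot product divided by the product of the vectors' lengths.
    sims = (others.to_numpy() @ q) / (
        np.linalg.norm(others.to_numpy(), axis=1) * np.linalg.norm(q)
    )
    return others.index[np.argmax(sims)]

# Tiny made-up counts matrix: three "texts" over a three-word vocabulary.
counts = pd.DataFrame(
    [[1, 1, 0], [1, 0, 1], [0, 1, 1]],
    index=['a b', 'a c', 'b c'],
    columns=['a', 'b', 'c'],
)
most_similar(counts, 'a b')
```

Note that 'a c' and 'b c' each share exactly one word with 'a b', so they tie; np.argmax breaks ties by position.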
Consider the following sentences.
sentences = pd.Series([
'I really want global peace',
'I must love global warming',
'We must solve climate change'
])
sentences
0 I really want global peace 1 I must love global warming 2 We must solve climate change dtype: object
Let's represent each sentence using the bag of words model.
unique_words = pd.Series(sentences.str.split().sum()).value_counts()
unique_words
I 2 global 2 must 2 really 1 want 1 peace 1 love 1 warming 1 We 1 solve 1 climate 1 change 1 dtype: int64
counts_dict = {}
for word in unique_words.index:
re_pat = fr'\b{word}\b'
counts_dict[word] = sentences.str.count(re_pat).astype(int).tolist()
counts_df = pd.DataFrame(counts_dict).set_index(sentences)
counts_df
I | global | must | really | want | peace | love | warming | We | solve | climate | change | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
I really want global peace | 1 | 1 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
I must love global warming | 1 | 1 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 |
We must solve climate change | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 1 |
Let's now find the cosine similarity between each sentence.
# There is an easier way of doing this in sklearn, as we will see soon
def sim_pair(s1, s2):
return np.dot(s1, s2) / (np.linalg.norm(s1) * np.linalg.norm(s2))
sim_pair(counts_df.iloc[0], counts_df.iloc[1])
0.3999999999999999
sim_pair(counts_df.iloc[0], counts_df.iloc[2])
0.0
sim_pair(counts_df.iloc[1], counts_df.iloc[2])
0.19999999999999996
Issue: Bag of words only encodes the words that each sentence uses, not their meanings.
Remember, the key assumption underlying the bag of words model is that two texts are similar if they share many words in common. But:
The order of words doesn't matter: 'asst fire chief' and 'chief fire asst' are treated as the same.
All words are treated as equally important: 'asst' and 'fire' have the same importance, even though 'fire' is probably more important in describing someone's job title.
Shared words don't imply shared meaning: 'I love data science' and 'I hate data science' share 75% of their words, but have very different meanings.
pandas .str methods can use regular expressions; just set regex=True.
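To make the meaning pitfall concrete, here's a small sketch (sentence pair taken from the example above; variable names made up) showing that two sentences with opposite meanings can still have a high cosine similarity under the bag of words model:

```python
import numpy as np
import pandas as pd

sents = pd.Series(['I love data science', 'I hate data science'])

# Bag of words: one count per unique word, per sentence.
vocab = pd.Series(sents.str.split().sum()).unique()
vecs = np.array([[s.split().count(w) for w in vocab] for s in sents])

# Cosine similarity: dot product divided by the vectors' lengths.
sim = np.dot(vecs[0], vecs[1]) / (np.linalg.norm(vecs[0]) * np.linalg.norm(vecs[1]))
sim  # 0.75, despite the sentences' opposite meanings
```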