import pandas as pd
import numpy as np
import re

pd.options.plotting.backend = 'plotly'

import util


import re


# re.search returns a Match for the first place the pattern occurs
# (an A, any number of B's, then another A).
re.search(
    pattern='AB*A',
    string='here is a string for you: ABBBA. here is another: ABBBBBBBA',
)

<re.Match object; span=(26, 31), match='ABBBA'>


# re.findall collects every non-overlapping match as a list of strings.
re.findall(
    pattern='AB*A',
    string='here is a string for you: ABBBA. here is another: ABBBBBBBA',
)

['ABBBA', 'ABBBBBBBA']


# re.sub swaps every match of the pattern for the replacement text.
re.sub(
    pattern='AB*A',
    repl='billy',
    string='here is a string for you: ABBBA. here is another: ABBBBBBBA',
)

'here is a string for you: billy. here is another: billy'


# Deliberately NOT a raw string: Python converts each '\b' in the literal to a
# backspace character (\x08) before `re` ever sees the pattern, so this does
# not ask for word boundaries and finds nothing in the sentence.
re.findall('\bcat\b', 'my cat is hungry')

[]


# With the r-prefix the backslashes reach the regex engine untouched, so \b
# means "word boundary" and the standalone word is found.
re.findall(pattern=r'\bcat\b', string='my cat is hungry')

['cat']


# Huh? Printing the non-raw literal shows what Python actually stored:
# each '\b' is a single backspace character (\x08), not a backslash followed
# by the letter b, so only "cat" is visible in the output.
print('\bcat\b')

cat


# The parentheses form a capture group, so findall returns only the part
# between '@' and '.edu' for each address rather than the whole match.
re.findall(
    pattern=r'\w+@(\w+)\.edu',
    string='my old email was billy@notucsd.edu, my new email is notbilly@ucsd.edu',
)

['notucsd', 'ucsd']


# Same pattern without a capture group: findall now returns each full match.
re.findall(
    pattern=r'\w+@\w+\.edu',
    string='my old email was billy@notucsd.edu, my new email is notbilly@ucsd.edu',
)

['billy@notucsd.edu', 'notbilly@ucsd.edu']


# Pattern: a doubled vowel immediately followed by exactly three digits.
# Only the digits are wanted, but the parentheses needed to group the vowel
# alternatives are also a capture group, so findall yields
# (vowels, digits) tuples instead of just the digits.
re.findall(pattern=r'(aa|ee|ii|oo|uu)(\d{3})', string='eeoo124')

[('oo', '124')]


# One sample line from a web server access log.
s = '''132.249.20.188 - - [24/Feb/2023:12:26:15 -0800] "GET /my/home/ HTTP/1.1" 200 2585'''


# Deliberately loose pattern: six capture groups separated by '/', '/', ':',
# ':', ':' inside square brackets, followed by a space and anything else.
# Written as a raw string so that `\[`, `\/` and `\]` are passed to the regex
# engine as-is instead of being treated as (invalid) Python string escapes.
exp = r'\[(.+)\/(.+)\/(.+):(.+):(.+):(.+) .+\]'
re.findall(exp, s)

[('24', 'Feb', '2023', '12', '26', '15')]


# A bracketed string with the same separators as a log timestamp but
# arbitrary text in each slot; used to probe how permissive `exp` is.
other_s = '[adr/jduy/wffsdffs:r4s4:4wsgdfd:asdf 7]'
re.findall(pattern=exp, string=other_s)

[('adr', 'jduy', 'wffsdffs', 'r4s4', '4wsgdfd', 'asdf')]

s

'132.249.20.188 - - [24/Feb/2023:12:26:15 -0800] "GET /my/home/ HTTP/1.1" 200 2585'


# Stricter timestamp pattern: 2-digit day, capitalised 3-letter month,
# 4-digit year, three 2-digit time fields, then a '-NNNN' offset.
# Written as a raw string so that `\[`, `\d`, `\/` and `\]` reach the regex
# engine unchanged instead of being treated as (invalid) Python string escapes.
new_exp = r'\[(\d{2})\/([A-Z]{1}[a-z]{2})\/(\d{4}):(\d{2}):(\d{2}):(\d{2}) -\d{4}\]'
re.findall(new_exp, s)

[('24', 'Feb', '2023', '12', '26', '15')]


other_s

'[adr/jduy/wffsdffs:r4s4:4wsgdfd:asdf 7]'


re.findall(new_exp, other_s)

[]


# Load the salary CSV from its public URL, then hand the frame to the course
# helper. The helper's return value is not used here -- presumably it edits
# `salaries` in place; confirm against util.anonymize_names.
salaries_url = 'https://transcal.s3.amazonaws.com/public/export/san-diego-2021.csv'
salaries = pd.read_csv(salaries_url)
util.anonymize_names(salaries)


salaries.head()


jobtitles = salaries['Job Title']
jobtitles.head()

0                 City Attorney
1                         Mayor
2            Investment Officer
3                Police Officer
4    Independent Budget Analyst
Name: Job Title, dtype: object


jobtitles.shape[0], jobtitles.nunique()

(12305, 588)


jobtitles.value_counts().iloc[:100]

Police Officer                   2123
Fire Fighter Ii                   331
Assistant Engineer - Civil        284
Grounds Maintenance Worker Ii     250
Fire Captain                      248
                                 ... 
Grounds Maintenance Manager        27
Electrician                        27
Executive Assistant                26
Paralegal                          26
Librarian Iv                       25
Name: Job Title, Length: 100, dtype: int64


jobtitles.value_counts().iloc[:25].sort_values().plot(kind='barh')


jobtitles.isna().sum()

2


# Discard the missing job titles; the remaining rows keep their original index labels.
jobtitles = jobtitles.dropna()


# Uses character class negation
jobtitles.str.contains(r'[^A-Za-z0-9 ]', regex=True).sum()

845


jobtitles[jobtitles.str.contains(r'[^A-Za-z0-9 ]', regex=True)].head()

281          Park & Recreation Director
539     Associate Engineer - Mechanical
1023         Associate Engineer - Civil
1376       Associate Engineer - Traffic
1460       Budget/Legislative Analyst I
Name: Job Title, dtype: object


# Why are we converting to lowercase?
jobtitles.str.lower().str.contains('to').sum()

1541


jobtitles[jobtitles.str.lower().str.contains('to')]

0                             City Attorney
10       Assistant Retirement Administrator
25                      Department Director
26                  Assistant City Attorney
27             Fire Prevention Inspector Ii
                        ...                
12162                       Test Monitor Ii
12185              Word Processing Operator
12190                       Deputy Director
12210            City Attorney Investigator
12267                       Test Monitor Ii
Name: Job Title, Length: 1541, dtype: object


jobtitles.str.lower().str.contains(r'\bto\b', regex=True).sum()

11


jobtitles[jobtitles.str.lower().str.contains(r'\bto\b', regex=True)]

664                            Assistant To The Fire Chief
1403                  Principal Assistant To City Attorney
2358                             Assistant To The Director
4336                Confidential Secretary To Police Chief
4459                             Assistant To The Director
5196     Confidential Secretary To Chief Operating Officer
5563               Confidential Secretary To City Attorney
5685                             Assistant To The Director
7544                       Confidential Secretary To Mayor
9627                  Principal Assistant To City Attorney
12061                            Assistant To The Director
Name: Job Title, dtype: object


jobtitles[jobtitles.str.lower().str.contains(r'\bthe\b', regex=True)]

664      Assistant To The Fire Chief
2358       Assistant To The Director
4459       Assistant To The Director
5685       Assistant To The Director
12061      Assistant To The Director
Name: Job Title, dtype: object


jobtitles[jobtitles.str.lower().str.contains(r'\bfor\b', regex=True)]

3676     Assistant For Community Outreach
4451     Assistant For Community Outreach
11010    Assistant For Community Outreach
Name: Job Title, dtype: object


# Canonicalize the titles: lowercase, then apply the regex clean-ups in order.
jobtitles = jobtitles.str.lower()
for pattern, repl in [
    (r'\bto\b|\bthe\b|\bfor\b', ''),   # Drop the "glue" words to / the / for.
    ('[^A-Za-z0-9 ]', ' '),            # Turn anything that is not a letter, digit or space into a space.
    (' +', ' '),                       # ' +' matches 1 or more spaces; collapse each run to one.
]:
    jobtitles = jobtitles.str.replace(pattern, repl, regex=True)
jobtitles = jobtitles.str.strip()      # Removes leading/trailing spaces if present.


jobtitles.sample(10)

7755                     paralegal
3775                police officer
11323        clerical assistant ii
9372                  greenskeeper
7221          pesticide applicator
10655    assistant center director
7010      recycling specialist iii
11046                  lifeguard i
8452         library assistant iii
10363         library assistant ii
Name: Job Title, dtype: object


jobtitles[jobtitles.str.contains('asst')].value_counts()

Series([], Name: Job Title, dtype: int64)


jobtitles[jobtitles.str.contains('assistant')].value_counts().head()

assistant engineer civil    284
library assistant i         127
library assistant ii        116
library assistant iii       107
clerical assistant ii       100
Name: Job Title, dtype: int64


jobtitles.str.split()

0                      [city, attorney]
1                               [mayor]
2                 [investment, officer]
3                     [police, officer]
4        [independent, budget, analyst]
                      ...              
12300           [recreation, leader, i]
12301               [fire, fighter, ii]
12302                   [fire, captain]
12303       [fleet, repair, supervisor]
12304                  [fire, engineer]
Name: Job Title, Length: 12303, dtype: object


# Flatten the per-title word lists into one list containing every word of
# every title, in title order. A nested comprehension appends in linear time,
# whereas summing the lists builds a brand-new list for every title, and it
# yields [] (not the integer 0) if there are no titles.
all_words = [word for words in jobtitles.str.split() for word in words]
all_words[:10]

['city',
 'attorney',
 'mayor',
 'investment',
 'officer',
 'police',
 'officer',
 'independent',
 'budget',
 'analyst']


unique_words = pd.Series(all_words).value_counts()
unique_words.head(10)

officer       2343
ii            2305
police        2294
i             1449
assistant     1193
fire          1158
engineer      1032
civil          667
iii            625
technician     616
dtype: int64


len(unique_words)

327


# Build the counts matrix: one column per distinct word, one row per title,
# each cell holding how many times that word occurs in that title.
# Created using a dictionary to avoid a "DataFrame is highly fragmented" warning.
counts_dict = {}
for word in unique_words.index:
    # \b on both sides restricts the count to whole-word occurrences.
    # re.escape keeps the word literal even if it ever contains a regex
    # metacharacter (it is a no-op for purely alphanumeric words).
    re_pat = fr'\b{re.escape(word)}\b'
    counts_dict[word] = jobtitles.str.count(re_pat).astype(int).tolist()

counts_df = pd.DataFrame(counts_dict)


counts_df.head()


counts_df.shape

(12303, 327)


counts_df = counts_df.set_index(jobtitles)
counts_df


counts_df.head()


# Remember, the columns of counts_df are ordered by number of occurrences.
counts_df.iloc[:, :20].sum()

officer       2343
ii            2305
police        2294
i             1449
assistant     1193
fire          1158
engineer      1032
civil          667
iii            625
technician     616
senior         567
associate      558
analyst        527
worker         496
fighter        470
management     409
manager        393
operator       380
recreation     371
supervisor     362
dtype: int64


counts_df.iloc[:, :20].sum(axis=1)

Job Title
city attorney                 0
mayor                         0
investment officer            1
police officer                2
independent budget analyst    1
                             ..
recreation leader i           2
fire fighter ii               3
fire captain                  1
fleet repair supervisor       1
fire engineer                 2
Length: 12303, dtype: int64


# Word-count vector for 'deputy fire chief'.
# Indexing with a one-element list makes .loc always return a DataFrame, so
# .iloc[0] is always the first matching row. With a bare label, a title that
# occurs only once would come back as a Series and .iloc[0] would then pick
# a single count instead of the whole row.
dfc = counts_df.loc[['deputy fire chief']].iloc[0]
dfc

officer      0
ii           0
police       0
i            0
assistant    0
            ..
security     0
geologist    0
utilities    0
gardener     0
principle    0
Name: deputy fire chief, Length: 327, dtype: int64


# Word-count vector for 'fire battalion chief'.
# Indexing with a one-element list makes .loc always return a DataFrame, so
# .iloc[0] is always the first matching row. With a bare label, a title that
# occurs only once would come back as a Series and .iloc[0] would then pick
# a single count instead of the whole row.
fbc = counts_df.loc[['fire battalion chief']].iloc[0]
fbc

officer      0
ii           0
police       0
i            0
assistant    0
            ..
security     0
geologist    0
utilities    0
gardener     0
principle    0
Name: fire battalion chief, Length: 327, dtype: int64


# Put the two count vectors side by side (one column per title), move the
# words with the largest counts to the top, keep ten of them, and transpose
# so that each title becomes a row again.
side_by_side = pd.concat([dfc, fbc], axis=1)
ranked = side_by_side.sort_values(
    by=['deputy fire chief', 'fire battalion chief'],
    ascending=False,
)
pair_counts = ranked.head(10).T

pair_counts


np.sum(pair_counts.iloc[0] * pair_counts.iloc[1])

2


counts_df.head()

dfc

officer      0
ii           0
police       0
i            0
assistant    0
            ..
security     0
geologist    0
utilities    0
gardener     0
principle    0
Name: deputy fire chief, Length: 327, dtype: int64


# Dot product of every other title's count vector with the
# 'deputy fire chief' vector, largest first. DataFrame.dot computes all the
# row-wise dot products in one vectorized call (aligning the columns of
# counts_df with the index of dfc) instead of invoking a Python lambda per row.
dots = (
    counts_df[counts_df.index != 'deputy fire chief']
    .dot(dfc)
    .sort_values(ascending=False)
)

dots

Job Title
fire battalion chief                           2
fire battalion chief                           2
assistant fire chief                           2
fire battalion chief                           2
fire battalion chief                           2
                                              ..
finance analyst iii                            0
associate engineer traffic                     0
supervising procurement contracting officer    0
sanitation driver ii                           0
city attorney                                  0
Length: 12292, dtype: int64


np.unique(dots.index[dots == dots.max()])

array(['assistant deputy chief operating officer', 'assistant fire chief',
       'deputy chief operating officer', 'fire battalion chief',
       'fire chief'], dtype=object)

operation	example	matches ✅	does not match ❌
escape character	`ucsd\.edu`	`'ucsd.edu'`	`'ucsd!edu'`
beginning of line	`^ark`	`'ark two'` `'ark o ark'`	`'dark'`
end of line	`ark$`	`'dark'` `'ark o ark'`	`'ark two'`
zero or one	`cat?`	`'ca'` `'cat'`	`'cart'` (matches `'ca'` only)
built-in character classes*	`\w+` `\d+`	`'billy'` `'231231'`	`'this person'` `'858 people'`
character class negation	`[^a-z]+`	`'KINGTRITON551'` `'1721$$'`	`'porch'` `'billy.edu'`

	Employee Name	Job Title	Base Pay	Other Pay	Benefits	Total Pay	Pension Debt	Total Pay & Benefits	Year	Notes	Agency	Status
0	Mara Xxxx	City Attorney	218759.0	-2560.00	108652.0	216199.0	427749.18	752600.18	2021	NaN	San Diego	FT
1	Todd Xxxx	Mayor	218759.0	-81.00	95549.0	218678.0	427749.18	741976.18	2021	NaN	San Diego	FT
2	Elizabeth Xxxx	Investment Officer	259732.0	-870.00	71438.0	258862.0	221041.09	551341.09	2021	NaN	San Diego	FT
3	Terence Xxxx	Police Officer	212837.0	39683.00	56569.0	252520.0	222375.06	531464.06	2021	NaN	San Diego	FT
4	Andrea Xxxx	Independent Budget Analyst	224312.0	59819.00	54213.0	284131.0	192126.79	530470.79	2021	NaN	San Diego	FT

	senior	lecturer	teaching	professor	assistant	associate
senior lecturer	1	1	0	0	0	0
assistant teaching professor	0	0	1	1	1	0
associate professor	0	0	0	1	0	1
senior assistant to the assistant professor	1	0	0	1	2	0

	officer	police	...
0	0	0	...
1	0	0	...
2	1	0	...
3	1	1	...
4	0	0	...

	officer	police	...
0	0	0	...
1	0	0	...
2	1	0	...
3	1	1	...
4	0	0	...

Lecture 18 – Regular Expressions, Bag of Words¶

DSC 80, Winter 2023¶

📣 Announcements¶

Agenda¶

More regular expressions¶

Even more regex syntax¶

Example (built-in character classes)¶

Exercise¶

Regex in Python¶

re in Python¶

Raw strings¶

Capture groups¶

Example: Log parsing¶

The more specific, the better!¶

Limitations¶

Limitations of regexes¶

Text features¶

Review: Regression and features¶

Moving forward¶

Text features¶

Example: San Diego employee salaries¶

Aside on privacy and ethics¶

Goal: Quantifying similarity¶

Exploring job titles¶

Canonicalization¶

Punctuation¶

"Glue" words¶

Fixing punctuation and removing "glue" words¶

Possible issue: inconsistent representations¶

Bag of words 💰¶

Text similarity¶

A counts matrix¶

Creating a counts matrix¶

Interpreting the counts matrix¶

Question: What job titles are most similar to 'deputy fire chief'?¶

Aside: Dot product¶

Computing similarities¶

Bag of words¶

Aside: Interactive bag of words demo¶

Summary, next time¶

Summary¶

Next time¶

`re` in Python¶

Question: What job titles are most similar to `'deputy fire chief'`?¶

	officer	police	...
0	0	0	...
1	0	0	...
2	1	0	...
3	1	1	...
4	0	0	...