from dsc80_utils import *

salaries = pd.read_csv('https://transcal.s3.amazonaws.com/public/export/san-diego-2022.csv')
salaries['Employee Name'] = salaries['Employee Name'].str.split().str[0] + ' Xxxx'

salaries.head()

jobtitles = salaries['Job Title']
jobtitles.head()

0                         City Attorney
1                                 Mayor
2                Assistant Police Chief
3                       Police Sergeant
4    Assistant Retirement Administrator
Name: Job Title, dtype: object

jobtitles.shape[0], jobtitles.nunique()

(12831, 611)

jobtitles.value_counts().iloc[:100]

Job Title
Police Officer Ii               1082
Police Sergeant                  311
Fire Fighter Ii                  306
                                ... 
Public Works Supervisor           29
Project Assistant                 29
Associate Engineer - Traffic      29
Name: count, Length: 100, dtype: int64

jobtitles.value_counts().iloc[:10].sort_values().plot(kind='barh')

jobtitles.isna().sum()

np.int64(0)

# Uses character class negation.
jobtitles.str.contains(r'[^A-Za-z0-9 ]', regex=True).sum()

np.int64(922)

jobtitles[jobtitles.str.contains(r'[^A-Za-z0-9 ]', regex=True)].head()

137          Park & Recreation Director
248     Associate Engineer - Mechanical
734     Associate Engineer - Electrical
882        Associate Engineer - Traffic
1045         Associate Engineer - Civil
Name: Job Title, dtype: object

# Why are we converting to lowercase?
jobtitles.str.lower().str.contains('to').sum()

np.int64(1577)

jobtitles[jobtitles.str.lower().str.contains('to')]

0                             City Attorney
4        Assistant Retirement Administrator
8                  Retirement Administrator
                        ...                
12778                        Test Monitor I
12812                           Custodian I
12826              Word Processing Operator
Name: Job Title, Length: 1577, dtype: object

jobtitles.str.lower().str.contains(r'\bto\b', regex=True).sum()

np.int64(10)

jobtitles[jobtitles.str.lower().str.contains(r'\bto\b', regex=True)]

1638              Assistant To The Chief Operating Officer
2183                  Principal Assistant To City Attorney
2238                             Assistant To The Director
                               ...                        
6594     Confidential Secretary To Chief Operating Officer
6832                       Confidential Secretary To Mayor
11028                      Confidential Secretary To Mayor
Name: Job Title, Length: 10, dtype: object

jobtitles[jobtitles.str.lower().str.contains(r'\bthe\b', regex=True)]

1638    Assistant To The Chief Operating Officer
2238                   Assistant To The Director
5609                   Assistant To The Director
6544                   Assistant To The Director
Name: Job Title, dtype: object

jobtitles[jobtitles.str.lower().str.contains(r'\bfor\b', regex=True)]

3449     Assistant For Community Outreach
6889     Assistant For Community Outreach
10810    Assistant For Community Outreach
Name: Job Title, dtype: object

jobtitles[jobtitles.str.lower().str.contains(r'\bi+v?\b', regex=True)]

5                   Police Officer Ii
10                  Police Officer Ii
48       Fire Prevention Inspector Ii
                     ...             
12822           Clerical Assistant Ii
12828               Police Officer Ii
12830                Police Officer I
Name: Job Title, Length: 6087, dtype: object

# Canonicalize the job titles: lowercase, drop "glue" words, punctuation, and
# roman numerals, then tidy up whitespace. Each step runs on the output of the
# previous one.
jobtitles = (
    jobtitles
    # Lowercase first: the glue-word and roman-numeral patterns below only match lowercase text.
    .str.lower()
    # Delete 'to', 'the', and 'for' when they appear as whole words (\b is a word boundary).
    .str.replace(r'\bto\b|\bthe\b|\bfor\b', '', regex=True)
    # Replace every character that is not a letter, digit, or space with a space.
    .str.replace(r'[^A-Za-z0-9 ]', ' ', regex=True)
    # Delete whole-word tokens made of one or more 'i', optionally followed by 'v' (e.g. 'i', 'ii', 'iv').
    .str.replace(r'\bi+v?\b', '', regex=True)
    .str.replace(r' +', ' ', regex=True)               # ' +' matches 1 or more occurrences of a space.
    .str.strip()                                       # Removes leading/trailing spaces if present.
)

jobtitles.sample(5)

7255              finance analyst
10760           library assistant
4350               police officer
3902     senior personnel analyst
6170                  electrician
Name: Job Title, dtype: object

(jobtitles == 'police officer').sum()

np.int64(1378)

jobtitles.str.split()

0                  [city, attorney]
1                           [mayor]
2        [assistant, police, chief]
                    ...            
12828             [police, officer]
12829          [police, dispatcher]
12830             [police, officer]
Name: Job Title, Length: 12831, dtype: object

# The .explode method flattens the lists: each list element gets its own row, and the original index label is repeated.
all_words = jobtitles.str.split().explode()
all_words

0              city
0          attorney
1             mayor
            ...    
12829    dispatcher
12830        police
12830       officer
Name: Job Title, Length: 30057, dtype: object

unique_words = all_words.value_counts()
unique_words

Job Title
police       2299
officer      1608
assistant    1267
             ... 
relations       1
dna             1
gardener        1
Name: count, Length: 330, dtype: int64

# Created using a dictionary to avoid a "DataFrame is highly fragmented" warning.
# One entry per vocabulary word: how many times it occurs, as a whole word,
# in each job title.
counts_dict = {
    word: jobtitles.str.count(fr'\b{word}\b').astype(int).tolist()
    for word in unique_words.index
}

# Build every column in a single step, then label the rows with the job titles.
counts_df = pd.DataFrame(counts_dict).set_index(jobtitles)

counts_df.head()

counts_df.shape

(12831, 330)

dfc = counts_df.loc['deputy fire chief'].iloc[0]
dfc

police       0
officer      0
assistant    0
            ..
relations    0
dna          0
gardener     0
Name: deputy fire chief, Length: 330, dtype: int64

fbc = counts_df.loc['fire battalion chief'].iloc[0]
fbc

police       0
officer      0
assistant    0
            ..
relations    0
dna          0
gardener     0
Name: fire battalion chief, Length: 330, dtype: int64

# Place the two word-count vectors side by side, one column per job title.
pair_counts = pd.concat([dfc, fbc], axis=1)
# Sort so the largest counts come first, then keep the first ten words.
pair_counts = pair_counts.sort_values(
    by=['deputy fire chief', 'fire battalion chief'],
    ascending=False,
).head(10)
# Transpose: one row per job title, one column per word.
pair_counts = pair_counts.T

pair_counts

np.dot(pair_counts.iloc[0], pair_counts.iloc[1])

np.int64(2)

sentences = pd.Series([
    'I really really want global peace',
    'I must enjoy global warming',
    'We must solve climate change'
])

sentences

0    I really really want global peace
1          I must enjoy global warming
2         We must solve climate change
dtype: object

unique_words = sentences.str.split().explode().value_counts()
unique_words

I          2
really     2
global     2
          ..
solve      1
climate    1
change     1
Name: count, Length: 12, dtype: int64

# One entry per vocabulary word: how many times it occurs, as a whole word,
# in each sentence.
counts_dict = {
    word: sentences.str.count(fr'\b{word}\b').astype(int).tolist()
    for word in unique_words.index
}

# Build every column in a single step, then label the rows with the sentences.
counts_df = pd.DataFrame(counts_dict).set_index(sentences)

counts_df

counts_df

def sim_pair(s1, s2):
    """Return the cosine similarity of two vectors.

    This is the dot product of `s1` and `s2` divided by the product of
    their Euclidean norms.
    """
    norm_product = np.linalg.norm(s1) * np.linalg.norm(s2)
    shared = np.dot(s1, s2)
    return shared / norm_product

# Look at the documentation of the .corr method to see how this works!
counts_df.T.corr(sim_pair)

sentences

0    I really really want global peace
1          I must enjoy global warming
2         We must solve climate change
dtype: object

tf = sentences.iloc[1].count('global') / len(sentences.iloc[1].split())
tf

0.2

idf = np.log(len(sentences) / sentences.str.contains('global').sum())
idf

np.float64(0.4054651081081644)

tf * idf

np.float64(0.08109302162163289)

sentences

0    I really really want global peace
1          I must enjoy global warming
2         We must solve climate change
dtype: object

unique_words = np.unique(sentences.str.split().explode())
unique_words

array(['I', 'We', 'change', 'climate', 'enjoy', 'global', 'must', 'peace',
       'really', 'solve', 'want', 'warming'], dtype=object)

# Compute the TF-IDF of every unique word in every sentence, one column per word.
tfidf_dict = {}

for word in unique_words:
    # \b...\b makes the pattern match `word` only as a whole word.
    re_pat = fr'\b{word}\b'
    # Term frequency: occurrences of the word in each sentence divided by that
    # sentence's number of whitespace-separated tokens (a Series, one value per sentence).
    tf = sentences.str.count(re_pat) / sentences.str.split().str.len()
    # Inverse document frequency: log of (number of sentences / number of
    # sentences that contain the word). A single number per word.
    idf = np.log(len(sentences) / sentences.str.contains(re_pat).sum())
    tfidf_dict[word] = tf * idf
    
# Rows are labeled by the sentences themselves; columns are the words.
tfidf = pd.DataFrame(tfidf_dict).set_index(sentences)

tfidf

display_df(tfidf, cols=12)

display_df(tfidf, cols=12)

tfidf.idxmax(axis=1)

I really really want global peace    really
I must enjoy global warming           enjoy
We must solve climate change             We
dtype: object

from IPython.display import YouTubeVideo
YouTubeVideo('gzcBTUvVp7M')

from pathlib import Path
sotu_txt = Path('data') / 'stateoftheunion1790-2023.txt'
sotu = sotu_txt.read_text()

len(sotu)

10577941

print(sotu[:1600])

The Project Gutenberg EBook of Complete State of the Union Addresses,
from 1790 to the Present. Speeches beginning in 2002 are from UCSB The American Presidency Project.
Speeches from 2018-2023 were manually downloaded from whitehouse.gov.

Character set encoding: UTF8

The addresses are separated by three asterisks


CONTENTS

  George Washington, State of the Union Address, January 8, 1790
  George Washington, State of the Union Address, December 8, 1790
  George Washington, State of the Union Address, October 25, 1791
  George Washington, State of the Union Address, November 6, 1792
  George Washington, State of the Union Address, December 3, 1793
  George Washington, State of the Union Address, November 19, 1794
  George Washington, State of the Union Address, December 8, 1795
  George Washington, State of the Union Address, December 7, 1796
  John Adams, State of the Union Address, November 22, 1797
  John Adams, State of the Union Address, December 8, 1798
  John Adams, State of the Union Address, December 3, 1799
  John Adams, State of the Union Address, November 11, 1800
  Thomas Jefferson, State of the Union Address, December 8, 1801
  Thomas Jefferson, State of the Union Address, December 15, 1802
  Thomas Jefferson, State of the Union Address, October 17, 1803
  Thomas Jefferson, State of the Union Address, November 8, 1804
  Thomas Jefferson, State of the Union Address, December 3, 1805
  Thomas Jefferson, State of the Union Address, December 2, 1806
  Thomas Jefferson, State of the Union Address, October 27, 1807
  Thomas Jefferson, State of the Union Address,

speeches = sotu.split('\n***\n')[1:]

len(speeches)

233

print(speeches[-1][:1000])

State of the Union Address
Joseph R. Biden Jr.  
February 7, 2023

  Mr. Speaker. Madam Vice President. Our First Lady and Second
Gentleman. Members of Congress and the Cabinet. Leaders of our
military. Mr. Chief Justice, Associate Justices, and retired Justices
of the Supreme Court. And you, my fellow Americans.
  I start tonight by congratulating the members of the 118th Congress
and the new Speaker of the House, Kevin McCarthy. Mr. Speaker, I look
forward to working together.
  I also want to congratulate the new leader of the House Democrats and
the first Black House Minority Leader in history, Hakeem Jeffries.
  Congratulations to the longest serving Senate leader in history,
Mitch McConnell.
  And congratulations to Chuck Schumer for another term as Senate
Majority Leader, this time with an even bigger majority.
  And I want to give special recognition to someone who I think will be
considered the greatest Speaker in the history of this country, Nancy
Pelosi.
  The story of Amer

import re
def extract_struct(speech):
    """Split one raw speech into its header fields and cleaned body text.

    After surrounding whitespace is removed, the first three lines of
    `speech` are taken as the title, the president, and the date; everything
    after them is the body.

    Returns a dict with the keys 'speech', 'president', 'date', and
    'contents'. The three header values have surrounding whitespace removed.
    In 'contents', every character other than a letter, apostrophe, or space
    (newlines included) is replaced by a space, and the text is lowercased.
    If `speech` has fewer than four lines, the missing fields are empty
    strings.
    """
    fields = speech.strip().split('\n', maxsplit=3)
    # Pad so that a header-only chunk yields empty fields instead of an IndexError.
    fields += [''] * (4 - len(fields))
    # Header lines can carry stray trailing spaces (e.g. 'Joseph R. Biden Jr.  ').
    header = [field.strip() for field in fields[:3]]
    contents = re.sub(r"[^A-Za-z' ]", ' ', fields[3]).lower()
    return dict(zip(['speech', 'president', 'date', 'contents'], [*header, contents]))

speeches_df = pd.DataFrame(list(map(extract_struct, speeches)))
speeches_df

speeches_df

unique_words = speeches_df['contents'].str.split().explode().value_counts()
# Take the top 500 most common words for speed
unique_words = unique_words.iloc[:500].index
unique_words

Index(['the', 'of', 'to', 'and', 'in', 'a', 'that', 'for', 'be', 'our',
       ...
       'desire', 'call', 'submitted', 'increasing', 'months', 'point', 'trust',
       'throughout', 'set', 'object'],
      dtype='object', name='contents', length=500)

from tqdm.notebook import tqdm

# Compute the TF-IDF of each word in `unique_words` for every speech, one column per word.
tfidf_dict = {}
# Number of whitespace-separated tokens in each speech; computed once because
# it does not depend on the word.
tf_denom = speeches_df['contents'].str.split().str.len()

# Wrap the sequence with `tqdm()` to display a progress bar
for word in tqdm(unique_words):
    # The pattern requires a literal space on both sides of the word, so an
    # occurrence at the very start or end of a speech is not matched, and two
    # adjacent repeats that share a single space are counted once.
    re_pat = fr' {word} ' # Imperfect pattern for speed.
    # Term frequency: matches of the pattern divided by the speech's token count.
    tf = speeches_df['contents'].str.count(re_pat) / tf_denom
    # Inverse document frequency: log of (number of speeches / number of speeches matching the pattern).
    # NOTE(review): if a word never matches the pattern, the denominator is 0 — confirm
    # this cannot happen for the chosen vocabulary.
    idf = np.log(len(speeches_df) / speeches_df['contents'].str.contains(re_pat).sum())
    tfidf_dict[word] =  tf * idf

  0%|          | 0/500 [00:00<?, ?it/s]

tfidf = pd.DataFrame(tfidf_dict)
tfidf.head()

summaries = tfidf.idxmax(axis=1)
summaries

0          object
1      convention
2       provision
          ...    
230          it's
231       tonight
232          it's
Length: 233, dtype: object

def five_largest(row):
    """Return the labels of the five largest values in `row`, joined by ', '.

    The labels are listed in ascending order of value, so the label of the
    largest value comes last. Missing values (NaN) are ignored; if fewer than
    five values remain, all of their labels are returned.
    """
    # Drop NaNs before ranking: Series.argsort does not place them among the
    # smallest values, so a NaN score could otherwise show up as a "largest" label.
    valid = row.dropna()
    order = valid.to_numpy().argsort()
    return ', '.join(valid.index[order[-5:]])

# Summarize each speech (each row of `tfidf`) with `five_largest`.
keywords = tfidf.apply(five_largest, axis=1)

# Line the summaries up with the president and date of each speech.
keywords_df = pd.concat(
    [*(speeches_df[col] for col in ('president', 'date')), keywords],
    axis=1,
)

keywords_df

# display_df(keywords_df, rows=233)

# TF-IDF variant in which the inverse document frequency is the raw ratio (no logarithm).
tfidf_nl_dict = {}
# Number of whitespace-separated tokens in each speech.
tf_denom = speeches_df['contents'].str.split().str.len()

for word in tqdm(unique_words):
    # The pattern requires a literal space on both sides of the word, so an
    # occurrence at the very start or end of a speech is not matched, and two
    # adjacent repeats that share a single space are counted once.
    re_pat = fr' {word} ' # Imperfect pattern for speed.
    # Term frequency: matches of the pattern divided by the speech's token count.
    tf = speeches_df['contents'].str.count(re_pat) / tf_denom
    # Number of speeches divided by the number of speeches matching the pattern; no log is applied.
    idf_nl = len(speeches_df) / speeches_df['contents'].str.contains(re_pat).sum()
    tfidf_nl_dict[word] =  tf * idf_nl

  0%|          | 0/500 [00:00<?, ?it/s]

tfidf_nl = pd.DataFrame(tfidf_nl_dict)
tfidf_nl.head()

# Summarize each speech (each row of `tfidf_nl`) with `five_largest`.
keywords_nl = tfidf_nl.apply(five_largest, axis=1)

# Line the summaries up with the president and date of each speech.
keywords_nl_df = pd.concat(
    [*(speeches_df[col] for col in ('president', 'date')), keywords_nl],
    axis=1,
)
keywords_nl_df

(1000 / 999)

1.001001001001001

np.log(1000 / 999)

np.float64(0.001000500333583622)

(50 / 2)

25.0

(500 / 2)

250.0

np.log(50 / 2)

np.float64(3.2188758248682006)

np.log(500 / 2)

np.float64(5.521460917862246)

	Employee Name	Job Title	Base Pay	Overtime Pay	...	Year	Notes	Agency	Status
0	Mara Xxxx	City Attorney	227441.53	0.00	...	2022	NaN	San Diego	FT
1	Todd Xxxx	Mayor	227441.53	0.00	...	2022	NaN	San Diego	FT
2	Terence Xxxx	Assistant Police Chief	227224.32	0.00	...	2022	NaN	San Diego	FT
3	Esmeralda Xxxx	Police Sergeant	124604.40	162506.54	...	2022	NaN	San Diego	FT
4	Marcelle Xxxx	Assistant Retirement Administrator	279868.04	0.00	...	2022	NaN	San Diego	FT

	senior	lecturer	teaching	professor	assistant	associate
senior lecturer	1	1	0	0	0	0
assistant teaching professor	0	0	1	1	1	0
associate professor	0	0	0	1	0	1
senior assistant to the assistant professor	1	0	0	1	2	0

	big	data	science
big big big big data	4	1	0
big data science	1	1	1
science big data	1	1	1

Pair	Dot Product	Cosine Similarity
big data science and big big big big data	5	0.7001
big data science and science big data	3	1

	the	of	to	and	...	trust	throughout	set	object
0	0.09	0.06	0.05	0.04	...	1.47e-03	0.00e+00	0.00e+00	5.78e-03
1	0.09	0.06	0.03	0.03	...	0.00e+00	0.00e+00	0.00e+00	2.99e-03
2	0.11	0.07	0.04	0.03	...	1.39e-03	0.00e+00	1.30e-03	1.82e-03
3	0.09	0.07	0.04	0.03	...	2.29e-03	7.53e-04	0.00e+00	2.01e-03
4	0.09	0.07	0.04	0.02	...	8.12e-04	1.60e-03	0.00e+00	1.07e-03

	I	really	global	must	...	We	solve	climate	change
I really really want global peace	1	2	1	0	...	0	0	0	0
I must enjoy global warming	1	0	1	1	...	0	0	0	0
We must solve climate change	0	0	0	1	...	1	1	1	1

	I really really want global peace	I must enjoy global warming	We must solve climate change
I really really want global peace	1.00	0.32	0.0
I must enjoy global warming	0.32	1.00	0.2
We must solve climate change	0.00	0.20	1.0

	speech	president	date	contents
0	State of the Union Address	George Washington	January 8, 1790	fellow citizens of the senate and house of re...
1	State of the Union Address	George Washington	December 8, 1790	fellow citizens of the senate and house of re...
2	State of the Union Address	George Washington	October 25, 1791	fellow citizens of the senate and house of re...
...	...	...	...	...
230	State of the Union Address	Joseph R. Biden Jr.	April 28, 2021	thank you thank you thank you good to be b...
231	State of the Union Address	Joseph R. Biden Jr.	March 1, 2022	madam speaker madam vice president and our ...
232	State of the Union Address	Joseph R. Biden Jr.	February 7, 2023	mr speaker madam vice president our firs...

	...	trust	throughout	set	object
0	...	4.29e-04	0.00e+00	0.00e+00	2.04e-03
1	...	0.00e+00	0.00e+00	0.00e+00	1.06e-03
2	...	4.06e-04	0.00e+00	3.48e-04	6.44e-04
3	...	6.70e-04	2.17e-04	0.00e+00	7.09e-04
4	...	2.38e-04	4.62e-04	0.00e+00	3.77e-04

	president	date	0
0	George Washington	January 8, 1790	your, proper, regard, ought, object
1	George Washington	December 8, 1790	case, established, object, commerce, convention
2	George Washington	October 25, 1791	community, upon, lands, proper, provision
...	...	...	...
230	Joseph R. Biden Jr.	April 28, 2021	get, americans, percent, jobs, it's
231	Joseph R. Biden Jr.	March 1, 2022	let, jobs, americans, get, tonight
232	Joseph R. Biden Jr.	February 7, 2023	down, percent, jobs, tonight, it's

	president	date	0
0	George Washington	January 8, 1790	a, and, to, of, the
1	George Washington	December 8, 1790	in, and, to, of, the
2	George Washington	October 25, 1791	a, and, to, of, the
...	...	...	...
230	Joseph R. Biden Jr.	April 28, 2021	of, it's, and, to, the
231	Joseph R. Biden Jr.	March 1, 2022	we, of, to, and, the
232	Joseph R. Biden Jr.	February 7, 2023	a, of, and, to, the

Lecture 12 – Text Features¶

DSC 80, Winter 2025¶

Announcements 📣¶

Agenda 📆¶

Question 🤔 (Answer at dsc80.com/q)

Text features¶

Review: Regression and features¶

Moving forward¶

Text features¶

Example: San Diego employee salaries¶

Aside on privacy and ethics¶

Goal: Quantifying similarity¶

Exploring job titles¶

Canonicalization¶

Punctuation¶

"Glue" words¶

Roman numerals (e.g. "Ii")¶

Fixing punctuation and removing "glue" words and roman numerals¶

Bag of words 💰¶

Text similarity¶

A counts matrix¶

Creating a counts matrix¶

Bag of words¶

Cosine similarity¶

Question: What job titles are most similar to 'deputy fire chief'?¶

Counting shared words¶

Recall: The dot product¶

Cosine similarity and bag of words¶

Normalizing¶

A recipe for computing similarities¶

Example: Global warming 🌎¶

Pitfalls of the bag of words model¶

Question 🤔 (Answer at dsc80.com/q)

TF-IDF¶

The importance of words¶

Term frequency¶

Inverse document frequency¶

Intuition¶

Term frequency-inverse document frequency¶

Computing TF-IDF¶

TF-IDF of all words in all documents¶

Interpreting TF-IDFs¶

Example: State of the Union addresses 🎤¶

State of the Union addresses¶

The data¶

Finding the most important words in each speech¶

💡 Pro-Tip: Using tqdm¶

Summarizing speeches¶

Aside: What if we remove the $\log$ from $\text{idf}(t)$?¶

The role of $\log$ in $\text{idf}(t)$¶

Question 🤔 (Answer at dsc80.com/q)

Summary, next time¶

Summary¶

Next time¶

Question: What job titles are most similar to `'deputy fire chief'`?¶

💡 Pro-Tip: Using `tqdm`¶