In [1]:
from dsc80_utils import *
In [2]:
salaries = pd.read_csv('https://transcal.s3.amazonaws.com/public/export/san-diego-2021.csv')
salaries['Employee Name'] = salaries['Employee Name'].str.split().str[0] + ' Xxxx'

jobtitles = salaries['Job Title']
jobtitles = jobtitles[jobtitles.notna()]
jobtitles = (
    jobtitles
    .str.lower()
    .str.replace(r'\bto\b|\bthe\b|\bfor\b', '', regex=True)
    .str.replace('[^A-Za-z0-9 ]', ' ', regex=True)
    .str.replace(' +', ' ', regex=True)               # ' +' matches 1 or more occurrences of a space.
    .str.strip()                                      # Removes leading/trailing spaces if present.
)

unique_words = pd.Series(jobtitles.str.split().sum()).value_counts()

# Created using a dictionary to avoid a "DataFrame is highly fragmented" warning.
counts_dict = {}
for word in unique_words.index:
    re_pat = fr'\b{word}\b'
    counts_dict[word] = jobtitles.str.count(re_pat).astype(int).tolist()
    
counts_df = pd.DataFrame(counts_dict).set_index(jobtitles)

Lecture 12 – Text Features¶

DSC 80, Fall 2023¶

📣 Announcements 📣¶

  • Project 3 due Friday.
  • Lab 7 out, due Monday, Nov 20.
  • 116 / 153 = 76% of students filled out the Mid-quarter survey. Thanks for your responses!

📆 Agenda¶

  • Bag of words
  • Cosine similarity
  • TF-IDF
    • Example: State of the Union addresses 🎤.

📝 Mid-Quarter Survey¶

  • Overall high satisfaction with the course so far.
    • Higher workload, but not beyond what we expected going in.
  • Assignment wording could be clearer (e.g. Lab 4).
    • We have a long list of changes we want to make based on your feedback during OH, and we're working on improving the assignments for next quarter.
  • 25% of class can't attend any scheduled OH.
    • Will ask staff to see whether other times are available.
  • Lecture pace is fast!
    • DSC 80 covers a LOT of material.
    • I'll walk through more small examples, and sketch out the approach to the code before writing it.
    • I'll include a Slido for every lecture for Q&A; send in questions to slow me down!

🙋🙋🏽‍♀️ Slido¶

https://app.sli.do/event/2LZSnXWNpGPiuVnCZMa5J8


Bag of words 💰¶

Example: San Diego employee salaries¶

Recall, we're working with a (real) dataset of salary data for all San Diego city employees.

In [3]:
jobtitles
Out[3]:
0                  city attorney
1                          mayor
2             investment officer
                  ...           
12302               fire captain
12303    fleet repair supervisor
12304              fire engineer
Name: Job Title, Length: 12303, dtype: object

Text similarity¶

Recall, our idea is to measure the similarity of two job titles by counting the number of shared words between the job titles. How do we actually do that, for all of the job titles we have?

A counts matrix¶

Let's create a "counts" matrix, such that:

  • there is 1 row per job title,
  • there is 1 column per unique word that is used in job titles, and
  • the value in row title and column word is the number of occurrences of word in title.

Such a matrix might look like:

                                              senior  lecturer  teaching  professor  assistant  associate
senior lecturer                                    1         1         0          0          0          0
assistant teaching professor                       0         0         1          1          1          0
associate professor                                0         0         0          1          0          1
senior assistant to the assistant professor        1         0         0          1          2          0

Creating a counts matrix¶

First, we need to determine all words that are used across all job titles.

In [4]:
jobtitles.str.split()
Out[4]:
0                   [city, attorney]
1                            [mayor]
2              [investment, officer]
                    ...             
12302                [fire, captain]
12303    [fleet, repair, supervisor]
12304               [fire, engineer]
Name: Job Title, Length: 12303, dtype: object
In [5]:
# The .explode method turns each element of a list into its own row.
all_words = jobtitles.str.split().explode()
all_words
Out[5]:
0              city
0          attorney
1             mayor
            ...    
12303    supervisor
12304          fire
12304      engineer
Name: Job Title, Length: 33304, dtype: object

Next, to determine the columns of our matrix, we need to find a list of all unique words used in titles. We can do this with np.unique, but value_counts shows us the distribution, which is interesting.

In [7]:
unique_words = all_words.value_counts()
unique_words
Out[7]:
officer      2343
ii           2305
police       2294
             ... 
utilities       1
gardener        1
principle       1
Name: Job Title, Length: 327, dtype: int64

Note that in unique_words.index, words are sorted by number of occurrences, from most to least common!

For each of the 327 unique words that are used in job titles, we can count the number of occurrences of the word in each job title.

  • 'deputy fire chief' contains the word 'deputy' once, the word 'fire' once, and the word 'chief' once.
  • 'assistant managers assistant' contains the word 'assistant' twice and the word 'managers' once.
In [8]:
# Created using a dictionary to avoid a "DataFrame is highly fragmented" warning.
counts_dict = {}
for word in unique_words.index:
    re_pat = fr'\b{word}\b'
    counts_dict[word] = jobtitles.str.count(re_pat).astype(int).tolist()
    
counts_df = pd.DataFrame(counts_dict)
In [9]:
counts_df.head()
Out[9]:
   officer  ii  police  i  ...  geologist  utilities  gardener  principle
0        0   0       0  0  ...          0          0         0          0
1        0   0       0  0  ...          0          0         0          0
2        1   0       0  0  ...          0          0         0          0
3        1   0       1  0  ...          0          0         0          0
4        0   0       0  0  ...          0          0         0          0

5 rows × 327 columns

counts_df has one row for each of the 12303 job titles, and one column for each of the 327 unique words used in job titles.

In [10]:
counts_df.shape
Out[10]:
(12303, 327)

To put into context what the numbers in counts_df mean, we can show the actual job title for each row.

In [11]:
counts_df = counts_df.set_index(jobtitles)
counts_df.head()
Out[11]:
                            officer  ii  police  i  ...  geologist  utilities  gardener  principle
Job Title
city attorney                     0   0       0  0  ...          0          0         0          0
mayor                             0   0       0  0  ...          0          0         0          0
investment officer                1   0       0  0  ...          0          0         0          0
police officer                    1   0       1  0  ...          0          0         0          0
independent budget analyst        0   0       0  0  ...          0          0         0          0

5 rows × 327 columns

The fourth row tells us that the fourth job title contains 'police' once and 'officer' once.

Interpreting the counts matrix¶

In [12]:
counts_df.head()
Out[12]:
                            officer  ii  police  i  ...  geologist  utilities  gardener  principle
Job Title
city attorney                     0   0       0  0  ...          0          0         0          0
mayor                             0   0       0  0  ...          0          0         0          0
investment officer                1   0       0  0  ...          0          0         0          0
police officer                    1   0       1  0  ...          0          0         0          0
independent budget analyst        0   0       0  0  ...          0          0         0          0

5 rows × 327 columns

The Series below describes the 20 most common words used in job titles, along with the number of times they appeared in all job titles (including repeats). We will call these words "top 20" words.

In [13]:
# Remember, the columns of counts_df are ordered by number of occurrences.
counts_df.iloc[:, :20].sum()
Out[13]:
officer       2343
ii            2305
police        2294
              ... 
operator       380
recreation     371
supervisor     362
Length: 20, dtype: int64

The Series below describes the number of top 20 words used in each job title.

In [14]:
counts_df.iloc[:, :20].sum(axis=1)
Out[14]:
Job Title
city attorney              0
mayor                      0
investment officer         1
                          ..
fire captain               1
fleet repair supervisor    1
fire engineer              2
Length: 12303, dtype: int64

Question: What job titles are most similar to 'deputy fire chief'?¶

  • Remember, our idea was to count the number of shared words between two job titles.

  • We now have access to counts_df, which contains a row vector for each job title.

  • How can we use it to count the number of shared words between two job titles, i.e. the similarity of two job titles?

To start, let's compare the row vectors for 'deputy fire chief' and 'fire battalion chief'.

In [15]:
dfc = counts_df.loc['deputy fire chief'].iloc[0]
dfc
Out[15]:
officer      0
ii           0
police       0
            ..
utilities    0
gardener     0
principle    0
Name: deputy fire chief, Length: 327, dtype: int64
In [16]:
fbc = counts_df.loc['fire battalion chief'].iloc[0]
fbc
Out[16]:
officer      0
ii           0
police       0
            ..
utilities    0
gardener     0
principle    0
Name: fire battalion chief, Length: 327, dtype: int64

We can stack these two vectors side-by-side, keeping only the 10 words with the largest counts.

In [17]:
pair_counts = (
    pd.concat([dfc, fbc], axis=1)
    .sort_values(by=['deputy fire chief', 'fire battalion chief'], ascending=False)
    .head(10)
    .T
)

pair_counts
Out[17]:
                      fire  chief  deputy  battalion  ...  police  i  assistant  engineer
deputy fire chief        1      1       1          0  ...       0  0          0         0
fire battalion chief     1      1       0          1  ...       0  0          0         0

2 rows × 10 columns

One way to measure how similar the above two vectors are is through their dot product.

In [18]:
np.sum(pair_counts.iloc[0] * pair_counts.iloc[1])
Out[18]:
2

Here, since both vectors consist only of 1s and 0s, the dot product is equal to the number of shared words between the two job titles.

The dot product¶

  • Recall, if $\vec{a} = \begin{bmatrix} a_1 & a_2 & ... & a_n \end{bmatrix}^T$ and $\vec{b} = \begin{bmatrix} b_1 & b_2 & ... & b_n \end{bmatrix}^T$ are two vectors, then their dot product $\vec{a} \cdot \vec{b}$ is defined as:
$$\vec{a} \cdot \vec{b} = a_1b_1 + a_2b_2 + ... + a_nb_n$$
  • The dot product also has a geometric interpretation. If $|\vec{a}|$ and $|\vec{b}|$ are the $L_2$ norms (lengths) of $\vec{a}$ and $\vec{b}$, and $\theta$ is the angle between $\vec{a}$ and $\vec{b}$, then:
$$\vec{a} \cdot \vec{b} = |\vec{a}| |\vec{b}| \cos \theta$$
  • $\cos \theta$ is equal to its maximum value (1) when $\theta = 0$, i.e. when $\vec{a}$ and $\vec{b}$ point in the same direction.

  • 🚨 Key idea: The more similar two unit vectors are, the larger their dot product is!
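
As a quick sanity check of the two definitions (a sketch with made-up 0/1 word vectors, not the lecture's code):

import numpy as np

a = np.array([1, 1, 1, 0])   # e.g. contains 'deputy', 'fire', 'chief'
b = np.array([1, 1, 0, 1])   # e.g. contains 'deputy', 'fire', 'battalion'

# Coordinate-wise definition: a1*b1 + a2*b2 + ... + an*bn.
dot = np.sum(a * b)          # 2 – for 0/1 vectors, the number of shared words

# Rearranging the geometric definition gives the cosine of the angle between them.
cos_theta = dot / (np.linalg.norm(a) * np.linalg.norm(b))
cos_theta                    # 2/3 ≈ 0.67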

Computing similarities¶

To find the job title that is most similar to 'deputy fire chief', we can compute the dot product of the 'deputy fire chief' word vector with all other titles' word vectors, and find the title with the highest dot product.

In [19]:
counts_df.head()
Out[19]:
                            officer  ii  police  i  ...  geologist  utilities  gardener  principle
Job Title
city attorney                     0   0       0  0  ...          0          0         0          0
mayor                             0   0       0  0  ...          0          0         0          0
investment officer                1   0       0  0  ...          0          0         0          0
police officer                    1   0       1  0  ...          0          0         0          0
independent budget analyst        0   0       0  0  ...          0          0         0          0

5 rows × 327 columns

In [20]:
dfc
Out[20]:
officer      0
ii           0
police       0
            ..
utilities    0
gardener     0
principle    0
Name: deputy fire chief, Length: 327, dtype: int64

To do so, we can apply np.dot to each row that doesn't correspond to 'deputy fire chief'.

In [21]:
dots = (
    counts_df[counts_df.index != 'deputy fire chief']
    .apply(lambda s: np.dot(s, dfc), axis=1)
    .sort_values(ascending=False)
)

dots
Out[21]:
Job Title
fire battalion chief                           2
fire battalion chief                           2
assistant fire chief                           2
                                              ..
supervising procurement contracting officer    0
sanitation driver ii                           0
city attorney                                  0
Length: 12292, dtype: int64

The unique job titles that are most similar to 'deputy fire chief' are given below.

In [22]:
np.unique(dots.index[dots == dots.max()])
Out[22]:
array(['assistant deputy chief operating officer', 'assistant fire chief',
       'deputy chief operating officer', 'fire battalion chief',
       'fire chief'], dtype=object)

Note that they all share two words in common with 'deputy fire chief'.

Note: To truly use the dot product as a measure of similarity, we should normalize by the lengths of the word vectors. More on this soon.

🙋🙋🏽‍♀️ Questions?¶

https://app.sli.do/event/2LZSnXWNpGPiuVnCZMa5J8


Bag of words¶

  • The bag of words model represents texts (e.g. job titles, sentences, documents) as vectors of word counts.
    • The "counts" matrices we have worked with so far were created using the bag of words model.
    • The bag of words model defines a vector space in $\mathbb{R}^{\text{number of unique words}}$.
  • It is called "bag of words" because it doesn't consider order.
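
For instance (a quick illustration, not part of the lecture code), two titles that use the same words in a different order produce identical bags:

from collections import Counter

# The bag of words representation keeps only word counts; order is discarded.
Counter('deputy fire chief'.split()) == Counter('chief fire deputy'.split())   # True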

Aside: Interactive bag of words demo¶

Check this site out – it automatically generates a bag of words matrix for you!


Cosine similarity¶

Cosine similarity and bag of words¶

To measure the similarity between two word vectors, we compute their normalized dot product, also known as their cosine similarity.

$$\cos \theta = \boxed{\frac{\vec{a} \cdot \vec{b}}{|\vec{a}| | \vec{b}|}}$$

If $\cos \theta$ is large, the two word vectors are similar. It is important to normalize by the lengths of the vectors, otherwise texts with more words will have artificially high similarities with other texts.

Note: Sometimes, you will see the cosine distance being used. It is the complement of cosine similarity:

$$\text{dist}(\vec{a}, \vec{b}) = 1 - \cos \theta$$

If $\text{dist}(\vec{a}, \vec{b})$ is small, the two word vectors are similar.
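
To see why the normalization matters, here's a sketch with hypothetical vectors: a text and a "doubled" copy of it have a raw dot product that grows with length, but a cosine similarity of exactly 1.

import numpy as np

def cosine_sim(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

a = np.array([1, 1, 0])   # hypothetical word vector
b = 2 * a                 # the same text, repeated twice

np.dot(a, b)              # 4 – inflated by the longer text
cosine_sim(a, b)          # 1.0 – identical direction, maximal similarity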

A recipe for computing similarities¶

Given a set of documents, to find the most similar text to one document $d$ in particular:

  • Use the bag of words model to create a counts matrix, in which:

    • there is 1 row per document,
    • there is 1 column per unique word that is used across documents, and
    • the value in row doc and column word is the number of occurrences of word in doc.
  • Compute the cosine similarity between $d$'s row vector and all other documents' row vectors.

  • The other document with the greatest cosine similarity is the most similar, under the bag of words model.
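
Putting the recipe together, here's a minimal sketch (assuming docs is a Series of cleaned strings, like jobtitles; most_similar is our name for this helper, not a library function):

def most_similar(docs, d):
    # Step 1: bag of words counts matrix – 1 row per document, 1 column per unique word.
    words = pd.Series(docs.str.split().sum()).unique()
    counts = pd.DataFrame({w: docs.str.count(fr'\b{w}\b') for w in words})

    # Step 2: cosine similarity between d's row vector and every row vector.
    vec = counts[(docs == d).values].iloc[0]
    sims = (counts @ vec) / (np.linalg.norm(counts, axis=1) * np.linalg.norm(vec))

    # Step 3: the other documents with the greatest cosine similarity.
    others = (docs != d).values
    return docs[others & (sims == sims[others].max()).values]

For larger corpora, libraries like sklearn (e.g. sklearn.feature_extraction.text.CountVectorizer) build the counts matrix far more efficiently than this loop over words.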

Example: Global warming 🌎¶

Consider the following three documents.

In [23]:
sentences = pd.Series([
    'I really really want global peace',
    'I must enjoy global warming',
    'We must solve climate change'
])

sentences
Out[23]:
0    I really really want global peace
1          I must enjoy global warming
2         We must solve climate change
dtype: object

Let's represent each document using the bag of words model.

In [24]:
unique_words = pd.Series(sentences.str.split().sum()).value_counts()
unique_words
Out[24]:
I          2
really     2
global     2
          ..
solve      1
climate    1
change     1
Length: 12, dtype: int64
In [25]:
counts_dict = {}
for word in unique_words.index:
    re_pat = fr'\b{word}\b'
    counts_dict[word] = sentences.str.count(re_pat).astype(int).tolist()
    
counts_df = pd.DataFrame(counts_dict).set_index(sentences)
In [26]:
counts_df
Out[26]:
                                    I  really  global  must  ...  We  solve  climate  change
I really really want global peace   1       2       1     0  ...   0      0        0       0
I must enjoy global warming         1       0       1     1  ...   0      0        0       0
We must solve climate change        0       0       0     1  ...   1      1        1       1

3 rows × 12 columns

Let's now find the cosine similarity between each document.

In [27]:
counts_df
Out[27]:
                                    I  really  global  must  ...  We  solve  climate  change
I really really want global peace   1       2       1     0  ...   0      0        0       0
I must enjoy global warming         1       0       1     1  ...   0      0        0       0
We must solve climate change        0       0       0     1  ...   1      1        1       1

3 rows × 12 columns

In [28]:
def sim_pair(s1, s2):
    return np.dot(s1, s2) / (np.linalg.norm(s1) * np.linalg.norm(s2))
In [29]:
# Look at the documentation of the .corr method to see how this works!
counts_df.T.corr(sim_pair)
Out[29]:
                                   I really really want global peace  I must enjoy global warming  We must solve climate change
I really really want global peace                                1.00                          0.32                           0.0
I must enjoy global warming                                      0.32                          1.00                           0.2
We must solve climate change                                     0.00                          0.20                           1.0
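
Under the hood, .corr(sim_pair) applies sim_pair to every pair of rows of counts_df, something like this explicit (and slower) sketch:

sims = pd.DataFrame(
    [[sim_pair(counts_df.iloc[i], counts_df.iloc[j]) for j in range(len(counts_df))]
     for i in range(len(counts_df))],
    index=counts_df.index,
    columns=counts_df.index,
)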

Issue: Bag of words only encodes the words that each document uses, not their meanings.

  • "I really really want global peace" and "We must solve climate change" have similar meanings, but have no shared words, and thus a low cosine similarity.
  • "I really really want global peace" and "I must enjoy global warming" have very different meanings, but a relatively high cosine similarity.

Pitfalls of the bag of words model¶

Remember, the key assumption underlying the bag of words model is that two documents are similar if they share many words in common.

  • The bag of words model doesn't consider order.

    • The job titles 'deputy fire chief' and 'chief fire deputy' are treated as the same.
  • The bag of words model doesn't consider the meaning of words.

    • 'I love data science' and 'I hate data science' share 75% of their words, but have very different meanings.
  • The bag of words model treats all words as being equally important.

    • 'deputy' and 'fire' have the same importance, even though 'fire' is probably more important in describing someone's job title.
    • Let's address this point.

🙋🙋🏽‍♀️ Questions?¶

https://app.sli.do/event/2LZSnXWNpGPiuVnCZMa5J8


TF-IDF¶

The importance of words¶

Issue: The bag of words model doesn't know which words are "important" in a document. Consider the following document:

"my brother has a friend named billy who has an uncle named billy"

How do we determine which words are important?

  • The most common words ("the", "has") often don't have much meaning!
  • The very rare words are also less important!

Goal: Find a way of quantifying the importance of a word in a document by balancing the above two factors, i.e. find the word that best summarizes a document.

Term frequency¶

  • The term frequency of a word (term) $t$ in a document $d$, denoted $\text{tf}(t, d)$, is the proportion of words in document $d$ that are equal to $t$.
$$\text{tf}(t, d) = \frac{\text{# of occurrences of $t$ in $d$}}{\text{total # of words in $d$}}$$
  • Example: What is the term frequency of "billy" in the following document?
"my brother has a friend named billy who has an uncle named billy"
  • Answer: $\frac{2}{13}$.

  • Intuition: Words that occur often within a document are important to the document's meaning.

    • If $\text{tf}(t, d)$ is large, then word $t$ occurs often in $d$.
    • If $\text{tf}(t, d)$ is small, then word $t$ does not occur often in $d$.
  • Issue: "has" also has a TF of $\frac{2}{13}$, but it seems less important than "billy".
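
We can verify both term frequencies in code (a quick sketch, not part of the lecture):

doc = 'my brother has a friend named billy who has an uncle named billy'
words = doc.split()

words.count('billy') / len(words)   # 2/13 ≈ 0.154
words.count('has') / len(words)     # also 2/13 – the same TF, hence the issue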

Inverse document frequency¶

  • The inverse document frequency of a word $t$ in a set of documents $d_1, d_2, ...$ is
$$\text{idf}(t) = \log \left(\frac{\text{total # of documents}}{\text{# of documents in which $t$ appears}} \right)$$
  • Example: What is the inverse document frequency of "billy" in the following three documents?

    • "my brother has a friend named billy who has an uncle named billy"
    • "my favorite artist is named jilly boel"
    • "why does he talk about someone named billy so often"
  • Answer: $\log \left(\frac{3}{2}\right) \approx 0.4055$.

  • Intuition: If a word appears in every document (like "the" or "has"), it is probably not a good summary of any one document.

    • If $\text{idf}(t)$ is large, then $t$ is rarely found in documents.
    • If $\text{idf}(t)$ is small, then $t$ is commonly found in documents.
    • Think of $\text{idf}(t)$ as the "rarity factor" of $t$ across documents – the larger $\text{idf}(t)$ is, the more rare $t$ is.
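
Again, a quick sketch to verify the computation (using the three documents above):

import numpy as np

docs = [
    'my brother has a friend named billy who has an uncle named billy',
    'my favorite artist is named jilly boel',
    'why does he talk about someone named billy so often',
]

# 'billy' appears in 2 of the 3 documents.
np.log(len(docs) / sum('billy' in d.split() for d in docs))   # log(3/2) ≈ 0.4055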

Intuition¶

$$\text{tf}(t, d) = \frac{\text{# of occurrences of $t$ in $d$}}{\text{total # of words in $d$}}$$

$$\text{idf}(t) = \log \left(\frac{\text{total # of documents}}{\text{# of documents in which $t$ appears}} \right)$$

Goal: Quantify how well word $t$ summarizes document $d$.

  • If $\text{tf}(t, d)$ is small, then $t$ doesn't occur very often in $d$, so $t$ can't be a good summary of $d$.

  • If $\text{idf}(t)$ is small, then $t$ occurs often amongst all documents, and so it is not a good summary of any one document.

  • If $\text{tf}(t, d)$ and $\text{idf}(t)$ are both large, then $t$ occurs often in $d$ but rarely overall. This makes $t$ a good summary of document $d$.

Term frequency-inverse document frequency¶

The term frequency-inverse document frequency (TF-IDF) of word $t$ in document $d$ is the product:

$$ \begin{align*}\text{tfidf}(t, d) &= \text{tf}(t, d) \cdot \text{idf}(t) \\ &= \frac{\text{# of occurrences of $t$ in $d$}}{\text{total # of words in $d$}} \cdot \log \left(\frac{\text{total # of documents}}{\text{# of documents in which $t$ appears}} \right) \end{align*} $$
  • If $\text{tfidf}(t, d)$ is large, then $t$ is a good summary of $d$, because $t$ occurs often in $d$ but rarely across all documents.

  • TF-IDF is a heuristic – it has no probabilistic justification.

  • To know if $\text{tfidf}(t, d)$ is large for one particular word $t$, we need to compare it to $\text{tfidf}(t_i, d)$, for several different words $t_i$.

Computing TF-IDF¶

Question: What is the TF-IDF of "global" in the second sentence?

In [30]:
sentences
Out[30]:
0    I really really want global peace
1          I must enjoy global warming
2         We must solve climate change
dtype: object

Answer

In [31]:
tf = sentences.iloc[1].count('global') / len(sentences.iloc[1].split())
tf
Out[31]:
0.2
In [32]:
idf = np.log(len(sentences) / sentences.str.contains('global').sum())
idf
Out[32]:
0.4054651081081644
In [33]:
tf * idf
Out[33]:
0.08109302162163289

Question: Is this big or small? Is "global" the best summary of the second sentence?

TF-IDF of all words in all documents¶

On its own, the TF-IDF of a word in a document doesn't really tell us anything; we must compare it to TF-IDFs of other words in that same document.

In [34]:
sentences
Out[34]:
0    I really really want global peace
1          I must enjoy global warming
2         We must solve climate change
dtype: object
In [35]:
unique_words = np.unique(sentences.str.split().sum())
unique_words
Out[35]:
array(['I', 'We', 'change', 'climate', 'enjoy', 'global', 'must', 'peace',
       'really', 'solve', 'want', 'warming'], dtype='<U7')
In [36]:
tfidf_dict = {}

for word in unique_words:
    re_pat = fr'\b{word}\b'
    tf = sentences.str.count(re_pat) / sentences.str.split().str.len()
    idf = np.log(len(sentences) / sentences.str.contains(re_pat).sum())
    tfidf_dict[word] = tf * idf
    
tfidf = pd.DataFrame(tfidf_dict).set_index(sentences)
In [37]:
tfidf
Out[37]:
                                      I    We  change  climate  ...  really  solve  want  warming
I really really want global peace  0.07  0.00    0.00     0.00  ...    0.37   0.00  0.18     0.00
I must enjoy global warming        0.08  0.00    0.00     0.00  ...    0.00   0.00  0.00     0.22
We must solve climate change       0.00  0.22    0.22     0.22  ...    0.00   0.22  0.00     0.00

3 rows × 12 columns

Interpreting TF-IDFs¶

In [38]:
tfidf
Out[38]:
                                      I    We  change  climate  ...  really  solve  want  warming
I really really want global peace  0.07  0.00    0.00     0.00  ...    0.37   0.00  0.18     0.00
I must enjoy global warming        0.08  0.00    0.00     0.00  ...    0.00   0.00  0.00     0.22
We must solve climate change       0.00  0.22    0.22     0.22  ...    0.00   0.22  0.00     0.00

3 rows × 12 columns

The above DataFrame tells us that:

  • the TF-IDF of 'peace' in the first sentence is 0.183102,
  • the TF-IDF of 'climate' in the second sentence is 0.

Note that there are two ways that $\text{tfidf}(t, d) = \text{tf}(t, d) \cdot \text{idf}(t)$ can be 0:

  • If $t$ appears in every document, because then $\text{idf}(t) = \log (\frac{\text{# documents}}{\text{# documents}}) = \log(1) = 0$.
  • If $t$ does not appear in document $d$, because then $\text{tf}(t, d) = \frac{0}{\text{len}(d)} = 0$.

The word that best summarizes a document is the word with the highest TF-IDF for that document:

In [39]:
display_df(tfidf, cols=12)
                                      I    We  change  climate  enjoy  global  must  peace  really  solve  want  warming
I really really want global peace  0.07  0.00    0.00     0.00   0.00    0.07  0.00   0.18    0.37   0.00  0.18     0.00
I must enjoy global warming        0.08  0.00    0.00     0.00   0.22    0.08  0.08   0.00    0.00   0.00  0.00     0.22
We must solve climate change       0.00  0.22    0.22     0.22   0.00    0.00  0.08   0.00    0.00   0.22  0.00     0.00
In [40]:
tfidf.idxmax(axis=1)
Out[40]:
I really really want global peace    really
I must enjoy global warming           enjoy
We must solve climate change             We
dtype: object

Look closely at the rows of tfidf – in documents 2 and 3, the max TF-IDF is not unique!

🙋🙋🏽‍♀️ Questions?¶

https://app.sli.do/event/2LZSnXWNpGPiuVnCZMa5J8


Example: State of the Union addresses 🎤¶

State of the Union addresses¶

The 2023 State of the Union address was on February 7th, 2023.

In [41]:
from IPython.display import YouTubeVideo
YouTubeVideo('gzcBTUvVp7M')
Out[41]:

The data¶

In [42]:
sotu = open('data/stateoftheunion1790-2023.txt').read()
In [43]:
len(sotu)
Out[43]:
10577941

The entire corpus (another word for "set of documents") is over 10 million characters long... let's not display it in our notebook.

In [45]:
print(sotu[:1600])
The Project Gutenberg EBook of Complete State of the Union Addresses,
from 1790 to the Present. Speeches beginning in 2002 are from UCSB The American Presidency Project.
Speeches from 2018-2023 were manually downloaded from whitehouse.gov.

Character set encoding: UTF8

The addresses are separated by three asterisks


CONTENTS

  George Washington, State of the Union Address, January 8, 1790
  George Washington, State of the Union Address, December 8, 1790
  George Washington, State of the Union Address, October 25, 1791
  George Washington, State of the Union Address, November 6, 1792
  George Washington, State of the Union Address, December 3, 1793
  George Washington, State of the Union Address, November 19, 1794
  George Washington, State of the Union Address, December 8, 1795
  George Washington, State of the Union Address, December 7, 1796
  John Adams, State of the Union Address, November 22, 1797
  John Adams, State of the Union Address, December 8, 1798
  John Adams, State of the Union Address, December 3, 1799
  John Adams, State of the Union Address, November 11, 1800
  Thomas Jefferson, State of the Union Address, December 8, 1801
  Thomas Jefferson, State of the Union Address, December 15, 1802
  Thomas Jefferson, State of the Union Address, October 17, 1803
  Thomas Jefferson, State of the Union Address, November 8, 1804
  Thomas Jefferson, State of the Union Address, December 3, 1805
  Thomas Jefferson, State of the Union Address, December 2, 1806
  Thomas Jefferson, State of the Union Address, October 27, 1807
  Thomas Jefferson, State of the Union Address, 

Each speech is separated by '***'.

In [46]:
speeches = sotu.split('\n***\n')[1:]
In [47]:
len(speeches)
Out[47]:
233

Note that each "speech" currently contains other information, like the name of the president and the date of the address.

In [48]:
print(speeches[-1][:1000])
State of the Union Address
Joseph R. Biden Jr.  
February 7, 2023

  Mr. Speaker. Madam Vice President. Our First Lady and Second
Gentleman. Members of Congress and the Cabinet. Leaders of our
military. Mr. Chief Justice, Associate Justices, and retired Justices
of the Supreme Court. And you, my fellow Americans.
  I start tonight by congratulating the members of the 118th Congress
and the new Speaker of the House, Kevin McCarthy. Mr. Speaker, I look
forward to working together.
  I also want to congratulate the new leader of the House Democrats and
the first Black House Minority Leader in history, Hakeem Jeffries.
  Congratulations to the longest serving Senate leader in history,
Mitch McConnell.
  And congratulations to Chuck Schumer for another term as Senate
Majority Leader, this time with an even bigger majority.
  And I want to give special recognition to someone who I think will be
considered the greatest Speaker in the history of this country, Nancy
Pelosi.
  The story of Amer

Let's extract just the speech text.

In [49]:
import re
def extract_struct(speech):
    # Split off the first three lines – title, president, and date; the rest is the speech.
    L = speech.strip().split('\n', maxsplit=3)
    # Keep only letters, apostrophes, and spaces in the speech text, then lowercase it.
    L[3] = re.sub(r"[^A-Za-z' ]", ' ', L[3]).lower()
    return dict(zip(['speech', 'president', 'date', 'contents'], L))
In [50]:
speeches_df = pd.DataFrame(list(map(extract_struct, speeches)))
speeches_df
Out[50]:
speech president date contents
0 State of the Union Address George Washington January 8, 1790 fellow citizens of the senate and house of re...
1 State of the Union Address George Washington December 8, 1790 fellow citizens of the senate and house of re...
2 State of the Union Address George Washington October 25, 1791 fellow citizens of the senate and house of re...
... ... ... ... ...
230 State of the Union Address Joseph R. Biden Jr. April 28, 2021 thank you thank you thank you good to be b...
231 State of the Union Address Joseph R. Biden Jr. March 1, 2022 madam speaker madam vice president and our ...
232 State of the Union Address Joseph R. Biden Jr. February 7, 2023 mr speaker madam vice president our firs...

233 rows × 4 columns

Finding the most important words in each speech¶

Here, a "document" is a speech. We have 233 documents.

In [51]:
speeches_df.head()
Out[51]:
speech president date contents
0 State of the Union Address George Washington January 8, 1790 fellow citizens of the senate and house of re...
1 State of the Union Address George Washington December 8, 1790 fellow citizens of the senate and house of re...
2 State of the Union Address George Washington October 25, 1791 fellow citizens of the senate and house of re...
3 State of the Union Address George Washington November 6, 1792 fellow citizens of the senate and house of re...
4 State of the Union Address George Washington December 3, 1793 fellow citizens of the senate and house of re...

A rough sketch of what we'll compute:

for each word t:
    for each speech d:
        compute tfidf(t, d)
In [52]:
unique_words = pd.Series(speeches_df['contents'].str.split().sum()).value_counts()
# Take the top 500 most common words for speed
unique_words = unique_words.iloc[:500].index
unique_words
Out[52]:
Index(['the', 'of', 'to', 'and', 'in', 'a', 'that', 'for', 'be', 'our',
       ...
       'desire', 'call', 'submitted', 'increasing', 'months', 'point', 'trust',
       'throughout', 'set', 'object'],
      dtype='object', length=500)

💡 Pro-Tip: Using tqdm¶

This code takes a while to run, so we'll use the tqdm package to track its progress. (Install with pip install tqdm if needed.)

In [53]:
from tqdm.notebook import tqdm

tfidf_dict = {}
tf_denom = speeches_df['contents'].str.split().str.len()

# Wrap the sequence with `tqdm()` to display a progress bar
for word in tqdm(unique_words):
    re_pat = fr' {word} ' # Imperfect pattern for speed; misses words at the very start or end of a speech.
    tf = speeches_df['contents'].str.count(re_pat) / tf_denom
    idf = np.log(len(speeches_df) / speeches_df['contents'].str.contains(re_pat).sum())
    tfidf_dict[word] =  tf * idf
  0%|          | 0/500 [00:00<?, ?it/s]
In [54]:
tfidf = pd.DataFrame(tfidf_dict)
tfidf.head()
Out[54]:
   the   of   to  and  ...     trust  throughout       set    object
0  0.0  0.0  0.0  0.0  ...  4.29e-04    0.00e+00  0.00e+00  2.04e-03
1  0.0  0.0  0.0  0.0  ...  0.00e+00    0.00e+00  0.00e+00  1.06e-03
2  0.0  0.0  0.0  0.0  ...  4.06e-04    0.00e+00  3.48e-04  6.44e-04
3  0.0  0.0  0.0  0.0  ...  6.70e-04    2.17e-04  0.00e+00  7.09e-04
4  0.0  0.0  0.0  0.0  ...  2.38e-04    4.62e-04  0.00e+00  3.77e-04

5 rows × 500 columns

Note that the TF-IDFs of many common words are all 0!

Summarizing speeches¶

By using idxmax, we can find the word with the highest TF-IDF in each speech.

In [55]:
summaries = tfidf.idxmax(axis=1)
summaries
Out[55]:
0          object
1      convention
2       provision
          ...    
230          it's
231       tonight
232          it's
Length: 233, dtype: object

What if we want to see the 5 words with the highest TF-IDFs, for each speech?

In [56]:
def five_largest(row):
    return list(row.index[row.argsort()][-5:])
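
An equivalent, more pandas-idiomatic sketch uses Series.nlargest (note: ties may be broken differently than with argsort):

def five_largest_alt(row):
    # nlargest returns the 5 largest values in descending order; reverse to match five_largest.
    return list(row.nlargest(5).index[::-1])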
In [57]:
keywords = tfidf.apply(five_largest, axis=1)
keywords_df = pd.concat([
    speeches_df['president'],
    speeches_df['date'],
    keywords
], axis=1)

Run the cell below to see every single row of keywords_df.

In [58]:
display_df(keywords_df, rows=233)
president date 0
0 George Washington January 8, 1790 [your, proper, regard, ought, object]
1 George Washington December 8, 1790 [case, established, object, commerce, convention]
2 George Washington October 25, 1791 [community, upon, lands, proper, provision]
3 George Washington November 6, 1792 [subject, upon, information, proper, provision]
4 George Washington December 3, 1793 [having, vessels, executive, shall, ought]
5 George Washington November 19, 1794 [too, army, let, ought, constitution]
6 George Washington December 8, 1795 [army, prevent, object, provision, treaty]
7 George Washington December 7, 1796 [republic, treaty, britain, ought, object]
8 John Adams November 22, 1797 [spain, british, claims, treaty, vessels]
9 John Adams December 8, 1798 [st, minister, treaty, spain, commerce]
10 John Adams December 3, 1799 [civil, period, british, minister, treaty]
11 John Adams November 11, 1800 [experience, protection, navy, commerce, ought]
12 Thomas Jefferson December 8, 1801 [consideration, shall, object, vessels, subject]
13 Thomas Jefferson December 15, 1802 [shall, debt, naval, duties, vessels]
14 Thomas Jefferson October 17, 1803 [debt, vessels, sum, millions, friendly]
15 Thomas Jefferson November 8, 1804 [received, convention, having, due, friendly]
16 Thomas Jefferson December 3, 1805 [families, convention, sum, millions, vessels]
17 Thomas Jefferson December 2, 1806 [due, consideration, millions, shall, spain]
18 Thomas Jefferson October 27, 1807 [whether, army, british, vessels, shall]
19 Thomas Jefferson November 8, 1808 [thus, british, millions, commerce, her]
20 James Madison November 29, 1809 [cases, having, due, british, minister]
21 James Madison December 5, 1810 [provisions, view, minister, commerce, british]
22 James Madison November 5, 1811 [britain, provisions, commerce, minister, brit...
23 James Madison November 4, 1812 [nor, subject, provisions, britain, british]
24 James Madison December 7, 1813 [number, having, naval, britain, british]
25 James Madison September 20, 1814 [naval, vessels, britain, his, british]
26 James Madison December 5, 1815 [debt, treasury, millions, establishment, sum]
27 James Madison December 3, 1816 [constitution, annual, sum, treasury, british]
28 James Monroe December 12, 1817 [improvement, territory, indian, millions, lands]
29 James Monroe November 16, 1818 [minister, object, territory, her, spain]
30 James Monroe December 7, 1819 [parties, friendly, minister, treaty, spain]
31 James Monroe November 14, 1820 [amount, minister, extent, vessels, spain]
32 James Monroe December 3, 1821 [powers, duties, revenue, spain, vessels]
33 James Monroe December 3, 1822 [object, proper, vessels, spain, convention]
34 James Monroe December 2, 1823 [th, department, object, minister, spain]
35 James Monroe December 7, 1824 [spain, governments, convention, parties, object]
36 John Quincy Adams December 6, 1825 [officers, commerce, condition, upon, improvem...
37 John Quincy Adams December 5, 1826 [commercial, upon, vessels, british, duties]
38 John Quincy Adams December 4, 1827 [lands, british, receipts, upon, th]
39 John Quincy Adams December 2, 1828 [duties, revenue, upon, commercial, britain]
40 Andrew Jackson December 8, 1829 [attention, subject, her, upon, duties]
41 Andrew Jackson December 6, 1830 [general, subject, character, vessels, upon]
42 Andrew Jackson December 6, 1831 [indian, commerce, claims, treaty, minister]
43 Andrew Jackson December 4, 1832 [general, subject, duties, lands, commerce]
44 Andrew Jackson December 3, 1833 [treasury, convention, minister, spain, duties]
45 Andrew Jackson December 1, 1834 [bill, treaty, minister, claims, upon]
46 Andrew Jackson December 7, 1835 [treaty, upon, claims, subject, minister]
47 Andrew Jackson December 5, 1836 [upon, treasury, duties, revenue, banks]
48 Martin van Buren December 5, 1837 [price, subject, upon, banks, lands]
49 Martin van Buren December 3, 1838 [subject, upon, indian, banks, court]
50 Martin van Buren December 2, 1839 [duties, treasury, extent, institutions, banks]
51 Martin van Buren December 5, 1840 [general, revenue, upon, extent, having]
52 John Tyler December 7, 1841 [banks, britain, amount, duties, treasury]
53 John Tyler December 6, 1842 [claims, minister, thus, amount, treasury]
54 John Tyler December 6, 1843 [treasury, british, her, minister, mexico]
55 John Tyler December 3, 1844 [minister, upon, treaty, her, mexico]
56 James Polk December 2, 1845 [british, convention, territory, duties, mexico]
57 James Polk December 8, 1846 [army, territory, minister, her, mexico]
58 James Polk December 7, 1847 [amount, treaty, her, army, mexico]
59 James Polk December 5, 1848 [tariff, upon, bill, constitution, mexico]
60 Zachary Taylor December 4, 1849 [territory, treaty, recommend, minister, mexico]
61 Millard Fillmore December 2, 1850 [recommend, claims, upon, mexico, duties]
62 Millard Fillmore December 2, 1851 [department, annual, fiscal, subject, mexico]
63 Millard Fillmore December 6, 1852 [duties, navy, mexico, subject, her]
64 Franklin Pierce December 5, 1853 [commercial, regard, upon, construction, subject]
65 Franklin Pierce December 4, 1854 [character, duties, naval, minister, property]
66 Franklin Pierce December 31, 1855 [constitution, british, territory, convention,...
67 Franklin Pierce December 2, 1856 [institutions, property, condition, thus, terr...
68 James Buchanan December 8, 1857 [treaty, constitution, territory, convention, ...
69 James Buchanan December 6, 1858 [june, mexico, minister, constitution, territory]
70 James Buchanan December 19, 1859 [minister, th, fiscal, mexico, june]
71 James Buchanan December 3, 1860 [minister, duties, claims, convention, constit...
72 Abraham Lincoln December 3, 1861 [army, claims, labor, capital, court]
73 Abraham Lincoln December 1, 1862 [upon, population, shall, per, sum]
74 Abraham Lincoln December 8, 1863 [upon, receipts, subject, navy, naval]
75 Abraham Lincoln December 6, 1864 [condition, secretary, naval, treasury, navy]
76 Andrew Johnson December 4, 1865 [form, commerce, powers, general, constitution]
77 Andrew Johnson December 3, 1866 [thus, june, constitution, mexico, condition]
78 Andrew Johnson December 3, 1867 [june, value, department, upon, constitution]
79 Andrew Johnson December 9, 1868 [millions, amount, expenditures, june, per]
80 Ulysses S. Grant December 6, 1869 [subject, upon, receipts, per, spain]
81 Ulysses S. Grant December 5, 1870 [her, convention, vessels, spain, british]
82 Ulysses S. Grant December 4, 1871 [object, powers, treaty, desire, recommend]
83 Ulysses S. Grant December 2, 1872 [territory, line, her, britain, treaty]
84 Ulysses S. Grant December 1, 1873 [consideration, banks, subject, amount, claims]
85 Ulysses S. Grant December 7, 1874 [duties, upon, attention, claims, convention]
86 Ulysses S. Grant December 7, 1875 [parties, territory, court, spain, claims]
87 Ulysses S. Grant December 5, 1876 [subject, court, per, commission, claims]
88 Rutherford B. Hayes December 3, 1877 [upon, sum, fiscal, commercial, value]
89 Rutherford B. Hayes December 2, 1878 [per, secretary, fiscal, june, indian]
90 Rutherford B. Hayes December 1, 1879 [subject, territory, june, commission, indian]
91 Rutherford B. Hayes December 6, 1880 [subject, office, relations, attention, commer...
92 Chester A. Arthur December 6, 1881 [spain, international, british, relations, fri...
93 Chester A. Arthur December 4, 1882 [territory, establishment, mexico, internation...
94 Chester A. Arthur December 4, 1883 [total, convention, mexico, commission, treaty]
95 Chester A. Arthur December 1, 1884 [treaty, territory, commercial, secretary, ves...
96 Grover Cleveland December 8, 1885 [duties, vessels, treaty, condition, upon]
97 Grover Cleveland December 6, 1886 [mexico, claims, subject, convention, fiscal]
98 Grover Cleveland December 6, 1887 [condition, sum, thus, price, tariff]
99 Grover Cleveland December 3, 1888 [secretary, treaty, upon, per, june]
100 Benjamin Harrison December 3, 1889 [general, commission, indian, upon, lands]
101 Benjamin Harrison December 1, 1890 [receipts, subject, upon, per, tariff]
102 Benjamin Harrison December 9, 1891 [court, tariff, indian, upon, per]
103 Benjamin Harrison December 6, 1892 [tariff, secretary, upon, value, per]
104 William McKinley December 6, 1897 [conditions, upon, international, territory, s...
105 William McKinley December 5, 1898 [navy, commission, naval, june, spain]
106 William McKinley December 5, 1899 [treaty, officers, commission, international, ...
107 William McKinley December 3, 1900 [settlement, civil, shall, convention, commiss...
108 Theodore Roosevelt December 3, 1901 [army, commercial, conditions, navy, man]
109 Theodore Roosevelt December 2, 1902 [upon, man, navy, conditions, tariff]
110 Theodore Roosevelt December 7, 1903 [june, lands, territory, property, treaty]
111 Theodore Roosevelt December 6, 1904 [cases, conditions, indian, labor, man]
112 Theodore Roosevelt December 5, 1905 [upon, conditions, commission, cannot, man]
113 Theodore Roosevelt December 3, 1906 [upon, navy, tax, court, man]
114 Theodore Roosevelt December 3, 1907 [conditions, navy, upon, army, man]
115 Theodore Roosevelt December 8, 1908 [man, officers, labor, control, banks]
116 William H. Taft December 7, 1909 [convention, banks, court, department, tariff]
117 William H. Taft December 6, 1910 [department, court, commercial, international,...
118 William H. Taft December 5, 1911 [mexico, department, per, tariff, court]
119 William H. Taft December 3, 1912 [tariff, upon, army, per, department]
120 Woodrow Wilson December 2, 1913 [how, shall, upon, mexico, ought]
121 Woodrow Wilson December 8, 1914 [shall, convention, ought, matter, upon]
122 Woodrow Wilson December 7, 1915 [her, navy, millions, economic, cannot]
123 Woodrow Wilson December 5, 1916 [commerce, shall, upon, commission, bill]
124 Woodrow Wilson December 4, 1917 [purpose, her, know, settlement, shall]
125 Woodrow Wilson December 2, 1918 [shall, go, men, upon, back]
126 Woodrow Wilson December 2, 1919 [economic, her, budget, labor, conditions]
127 Woodrow Wilson December 7, 1920 [expenditures, receipts, treasury, budget, upon]
128 Warren Harding December 6, 1921 [capital, ought, problems, conditions, tariff]
129 Warren Harding December 8, 1922 [responsibility, republic, problems, ought, per]
130 Calvin Coolidge December 6, 1923 [conditions, production, commission, ought, co...
131 Calvin Coolidge December 3, 1924 [navy, international, desire, economic, court]
132 Calvin Coolidge December 8, 1925 [international, budget, economic, ought, court]
133 Calvin Coolidge December 7, 1926 [tax, federal, reduction, tariff, ought]
134 Calvin Coolidge December 6, 1927 [construction, banks, per, program, property]
135 Calvin Coolidge December 4, 1928 [federal, department, production, program, per]
136 Herbert Hoover December 3, 1929 [commission, federal, construction, tariff, per]
137 Herbert Hoover December 2, 1930 [about, budget, economic, per, construction]
138 Herbert Hoover December 8, 1931 [upon, construction, federal, economic, banks]
139 Herbert Hoover December 6, 1932 [health, june, value, economic, banks]
140 Franklin D. Roosevelt January 3, 1934 [labor, permanent, problems, cannot, banks]
141 Franklin D. Roosevelt January 4, 1935 [private, work, local, program, cannot]
142 Franklin D. Roosevelt January 3, 1936 [income, shall, let, say, today]
143 Franklin D. Roosevelt January 6, 1937 [powers, convention, needs, help, problems]
144 Franklin D. Roosevelt January 3, 1938 [budget, business, economic, today, income]
145 Franklin D. Roosevelt January 4, 1939 [labor, cannot, capital, income, billion]
146 Franklin D. Roosevelt January 3, 1940 [world, domestic, cannot, economic, today]
147 Franklin D. Roosevelt January 6, 1941 [freedom, problems, cannot, program, today]
148 Franklin D. Roosevelt January 6, 1942 [him, today, know, forces, production]
149 Franklin D. Roosevelt January 7, 1943 [pacific, get, cannot, americans, production]
150 Franklin D. Roosevelt January 11, 1944 [individual, total, know, economic, cannot]
151 Franklin D. Roosevelt January 6, 1945 [cannot, production, army, forces, jobs]
152 Harry S. Truman January 21, 1946 [fiscal, program, billion, million, dollars]
153 Harry S. Truman January 6, 1947 [commission, budget, economic, labor, program]
154 Harry S. Truman January 7, 1948 [tax, billion, today, program, economic]
155 Harry S. Truman January 5, 1949 [economic, price, program, cannot, production]
156 Harry S. Truman January 4, 1950 [income, today, program, programs, economic]
157 Harry S. Truman January 8, 1951 [help, program, production, strength, economic]
158 Harry S. Truman January 9, 1952 [defense, working, program, help, production]
159 Harry S. Truman January 7, 1953 [republic, free, cannot, world, economic]
160 Dwight D. Eisenhower February 2, 1953 [federal, labor, budget, economic, programs]
161 Dwight D. Eisenhower January 7, 1954 [federal, programs, economic, budget, program]
162 Dwight D. Eisenhower January 6, 1955 [problems, federal, economic, programs, program]
163 Dwight D. Eisenhower January 5, 1956 [billion, federal, problems, economic, program]
164 Dwight D. Eisenhower January 10, 1957 [cannot, programs, human, program, economic]
165 Dwight D. Eisenhower January 9, 1958 [program, strength, today, programs, economic]
166 Dwight D. Eisenhower January 9, 1959 [growth, help, billion, programs, economic]
167 Dwight D. Eisenhower January 7, 1960 [freedom, cannot, today, economic, help]
168 Dwight D. Eisenhower January 12, 1961 [million, percent, billion, program, programs]
169 John F. Kennedy January 30, 1961 [budget, programs, problems, economic, program]
170 John F. Kennedy January 11, 1962 [billion, help, program, jobs, cannot]
171 John F. Kennedy January 14, 1963 [help, cannot, tax, percent, billion]
172 Lyndon B. Johnson January 8, 1964 [help, billion, americans, budget, million]
173 Lyndon B. Johnson January 4, 1965 [americans, man, programs, tonight, help]
174 Lyndon B. Johnson January 12, 1966 [program, percent, help, billion, tonight]
175 Lyndon B. Johnson January 10, 1967 [programs, americans, billion, tonight, percent]
176 Lyndon B. Johnson January 17, 1968 [programs, million, budget, tonight, billion]
177 Lyndon B. Johnson January 14, 1969 [americans, program, billion, budget, tonight]
178 Richard Nixon January 22, 1970 [billion, percent, america, today, programs]
179 Richard Nixon January 22, 1971 [federal, americans, budget, tonight, let]
180 Richard Nixon January 20, 1972 [america, program, programs, today, help]
181 Richard Nixon February 2, 1973 [economic, help, americans, working, programs]
182 Richard Nixon January 30, 1974 [program, americans, today, energy, tonight]
183 Gerald R. Ford January 15, 1975 [program, percent, billion, programs, energy]
184 Gerald R. Ford January 19, 1976 [federal, americans, budget, jobs, programs]
185 Gerald R. Ford January 12, 1977 [programs, today, percent, jobs, energy]
186 Jimmy Carter January 19, 1978 [cannot, economic, tonight, jobs, it's]
187 Jimmy Carter January 25, 1979 [cannot, budget, tonight, americans, it's]
188 Jimmy Carter January 21, 1980 [help, america, energy, tonight, it's]
189 Jimmy Carter January 16, 1981 [percent, economic, energy, program, programs]
190 Ronald Reagan January 26, 1982 [jobs, help, program, billion, programs]
191 Ronald Reagan January 25, 1983 [problems, programs, americans, economic, perc...
192 Ronald Reagan January 25, 1984 [budget, help, americans, tonight, it's]
193 Ronald Reagan February 6, 1985 [help, tax, jobs, tonight, it's]
194 Ronald Reagan February 4, 1986 [america, cannot, it's, budget, tonight]
195 Ronald Reagan January 27, 1987 [percent, let, budget, tonight, it's]
196 Ronald Reagan January 25, 1988 [let, americans, it's, budget, tonight]
197 George H.W. Bush February 9, 1989 [help, ask, it's, budget, tonight]
198 George H.W. Bush January 31, 1990 [percent, budget, today, tonight, it's]
199 George H.W. Bush January 29, 1991 [jobs, budget, americans, know, tonight]
200 George H.W. Bush January 28, 1992 [know, get, tonight, help, it's]
201 William J. Clinton February 17, 1993 [tax, budget, percent, tonight, jobs]
202 William J. Clinton January 25, 1994 [americans, it's, health, get, jobs]
203 William J. Clinton January 24, 1995 [jobs, americans, get, tonight, it's]
204 William J. Clinton January 23, 1996 [tonight, families, working, americans, children]
205 William J. Clinton February 4, 1997 [america, children, budget, americans, tonight]
206 William J. Clinton January 27, 1998 [ask, americans, children, help, tonight]
207 William J. Clinton January 19, 1999 [children, budget, help, americans, tonight]
208 William J. Clinton January 27, 2000 [families, help, children, americans, tonight]
209 George W. Bush February 27, 2001 [help, tax, percent, tonight, budget]
210 George W. Bush September 20, 2001 [freedom, america, ask, americans, tonight]
211 George W. Bush January 29, 2002 [americans, budget, tonight, america, jobs]
212 George W. Bush January 28, 2003 [america, help, million, americans, tonight]
213 George W. Bush January 20, 2004 [children, america, americans, help, tonight]
214 George W. Bush February 2, 2005 [freedom, tonight, help, social, americans]
215 George W. Bush January 31, 2006 [reform, jobs, americans, america, tonight]
216 George W. Bush January 23, 2007 [children, health, americans, tonight, help]
217 George W. Bush January 29, 2008 [america, americans, trust, tonight, help]
218 Barack Obama February 24, 2009 [know, budget, jobs, tonight, it's]
219 Barack Obama January 27, 2010 [get, tonight, americans, jobs, it's]
220 Barack Obama January 25, 2011 [percent, get, tonight, jobs, it's]
221 Barack Obama January 24, 2012 [americans, tonight, get, it's, jobs]
222 Barack Obama February 12, 2013 [families, it's, get, tonight, jobs]
223 Barack Obama January 28, 2014 [get, tonight, help, it's, jobs]
224 Barack Obama January 20, 2015 [families, americans, tonight, jobs, it's]
225 Barack Obama January 12, 2016 [tonight, jobs, americans, get, it's]
226 Donald J. Trump February 27, 2017 [america, jobs, americans, it's, tonight]
227 Donald J. Trump January 30, 2018 [tax, get, it's, americans, tonight]
228 Donald J. Trump February 5, 2019 [get, jobs, americans, it's, tonight]
229 Donald J. Trump February 4, 2020 [jobs, it's, americans, percent, tonight]
230 Joseph R. Biden Jr. April 28, 2021 [get, americans, percent, jobs, it's]
231 Joseph R. Biden Jr. March 1, 2022 [let, jobs, americans, get, tonight]
232 Joseph R. Biden Jr. February 7, 2023 [down, percent, jobs, tonight, it's]

Aside: What if we remove the $\log$ from $\text{idf}(t)$?¶

Let's try it and see what happens.

In [59]:
tfidf_nl_dict = {}
tf_denom = speeches_df['contents'].str.split().str.len()

for word in tqdm(unique_words):
    re_pat = fr' {word} ' # Imperfect pattern for speed; misses words at the very start or end of a speech.
    tf = speeches_df['contents'].str.count(re_pat) / tf_denom
    idf_nl = len(speeches_df) / speeches_df['contents'].str.contains(re_pat).sum()
    tfidf_nl_dict[word] =  tf * idf_nl
  0%|          | 0/500 [00:00<?, ?it/s]
In [60]:
tfidf_nl = pd.DataFrame(tfidf_nl_dict)
tfidf_nl.head()
Out[60]:
    the    of    to   and  ...     trust  throughout       set    object
0  0.09  0.06  0.05  0.04  ...  1.47e-03    0.00e+00  0.00e+00  5.78e-03
1  0.09  0.06  0.03  0.03  ...  0.00e+00    0.00e+00  0.00e+00  2.99e-03
2  0.11  0.07  0.04  0.03  ...  1.39e-03    0.00e+00  1.30e-03  1.82e-03
3  0.09  0.07  0.04  0.03  ...  2.29e-03    7.53e-04  0.00e+00  2.01e-03
4  0.09  0.07  0.04  0.02  ...  8.12e-04    1.60e-03  0.00e+00  1.07e-03

5 rows × 500 columns

In [61]:
keywords_nl = tfidf_nl.apply(five_largest, axis=1)
keywords_nl_df = pd.concat([
    speeches_df['president'],
    speeches_df['date'],
    keywords_nl
], axis=1)
keywords_nl_df
Out[61]:
president date 0
0 George Washington January 8, 1790 [a, and, to, of, the]
1 George Washington December 8, 1790 [in, and, to, of, the]
2 George Washington October 25, 1791 [a, and, to, of, the]
... ... ... ...
230 Joseph R. Biden Jr. April 28, 2021 [of, it's, and, to, the]
231 Joseph R. Biden Jr. March 1, 2022 [we, of, to, and, the]
232 Joseph R. Biden Jr. February 7, 2023 [a, of, and, to, the]

233 rows × 3 columns

The role of $\log$ in $\text{idf}(t)$¶

$$ \begin{align*}\text{tfidf}(t, d) &= \text{tf}(t, d) \cdot \text{idf}(t) \\ &= \frac{\text{# of occurrences of $t$ in $d$}}{\text{total # of words in $d$}} \cdot \log \left(\frac{\text{total # of documents}}{\text{# of documents in which $t$ appears}} \right) \end{align*} $$
  • Remember, for any positive input $x$, $\log(x)$ is (much) smaller than $x$.
  • In $\text{idf}(t)$, the $\log$ "dampens" the impact of the ratio $\frac{\text{# documents}}{\text{# documents with $t$}}$.
  • If a word is very common, the ratio will be close to 1. The log of the ratio will be close to 0.
In [62]:
(1000 / 999)
Out[62]:
1.001001001001001
In [63]:
np.log(1000 / 999)
Out[63]:
0.001000500333583622
  • If a word is very common (e.g. 'the'), its ratio is close to 1 but its log is close to 0, so removing the log gives common words far more weight.
  • If a word is very rare, the ratio will be very large. However, for instance, a word being seen in 2 out of 50 documents is not very different than being seen in 2 out of 500 documents (it is very rare in both cases), and so $\text{idf}(t)$ should be similar in both cases.
In [64]:
(50 / 2)
Out[64]:
25.0
In [65]:
(500 / 2)
Out[65]:
250.0
In [66]:
np.log(50 / 2)
Out[66]:
3.2188758248682006
In [67]:
np.log(500 / 2)
Out[67]:
5.521460917862246

🙋🙋🏽‍♀️ Questions?¶

https://app.sli.do/event/2LZSnXWNpGPiuVnCZMa5J8


Summary, next time¶

Summary¶

  • One way to turn documents, like 'deputy fire chief', into feature vectors is to count the number of occurrences of each word in the document, ignoring order. This is done using the bag of words model.
  • To measure the similarity of two documents under the bag of words model, compute the cosine similarity of their two word vectors.
  • Term frequency-inverse document frequency (TF-IDF) is a statistic that tries to quantify how important a word (term) is to a document. It balances:
    • how often a word appears in a particular document, $\text{tf}(t, d)$, with
    • how often a word appears across documents, $\text{idf}(t)$.
  • For a given document, the word with the highest TF-IDF is thought to "best summarize" that document.

Next time¶

Modeling and feature engineering.

In [ ]: