import re
import pandas as pd
import numpy as np
from IPython.display import YouTubeVideo, display


# Load the san-diego-2020 salary export and keep only the job-title column,
# which is the text analysed in the cells that follow.
salaries = pd.read_csv(
    'https://transcal.s3.amazonaws.com/public/export/san-diego-2020.csv'
)
jobtitles = salaries.loc[:, 'Job Title']

# Normalise the job titles so that word counts are comparable across rows.
jobtitles = (
    jobtitles
    .str.lower()
    # Drop the stop words "to", "the" and "for" as whole words only.  Without
    # the trailing \b the pattern also eats the start of longer words
    # (e.g. "forensic" -> "ensic", "tool" -> "ol").
    .str.replace(r'\b(?:to|the|for)\b', '', regex=True)
    # Turn punctuation into spaces so adjacent words stay separated.
    .str.replace('[^A-Za-z0-9 ]', ' ', regex=True)
    # Collapse runs of spaces left behind by the two replacements above.
    .str.replace(' +', ' ', regex=True)
    # Trim leading/trailing spaces.
    .str.strip()
)

# Flatten every title into one list of words.  `explode` does this in a single
# pass; summing the per-title lists concatenates them one by one, which
# re-copies the growing list each time (quadratic in the number of titles).
# Titles that are empty after cleaning explode to NaN, hence the dropna().
all_words = jobtitles.str.split().explode().dropna().tolist()
# Word -> number of occurrences across all titles, most frequent first.
unique_words = pd.Series(all_words).value_counts()

# Bag-of-words matrix: one column per vocabulary word, holding how many times
# that word (matched as a whole word) occurs in each job title.
counts_dict = {
    word: jobtitles.str.count(fr'\b{word}\b').astype(int).tolist()
    for word in unique_words.index
}

# Label each row with the job title it was computed from.
counts_df = pd.DataFrame(counts_dict).set_index(jobtitles)


jobtitles.head()

0              police officer
1              police officer
2               fire engineer
3    retirement administrator
4        fire battalion chief
Name: Job Title, dtype: object


counts_df


# Word-count vector for the title 'asst fire chief'.  counts_df is indexed by
# job title and titles repeat across employees, so .loc can return several
# rows for one title; .iloc[0] keeps the first of them.
afc = counts_df.loc['asst fire chief'].iloc[0]
afc

2          0
police     0
officer    0
1          0
fire       1
          ..
law        0
librn      0
risk       0
medical    0
african    0
Name: asst fire chief, Length: 435, dtype: int64


# Word-count vector for the title 'fire battalion chief'.  counts_df is
# indexed by job title and titles repeat across employees, so .loc can return
# several rows for one title; .iloc[0] keeps the first of them.
fbc = counts_df.loc['fire battalion chief'].iloc[0]
fbc

2          0
police     0
officer    0
1          0
fire       1
          ..
law        0
librn      0
risk       0
medical    0
african    0
Name: fire battalion chief, Length: 435, dtype: int64


# Put the two count vectors side by side (one column per title), order the
# words by their counts in 'asst fire chief' and then 'fire battalion chief'
# (largest first), keep the top ten words, and transpose so that each title
# becomes a row and each word a column.
side_by_side = pd.concat([afc, fbc], axis=1)
ranked = side_by_side.sort_values(
    by=['asst fire chief', 'fire battalion chief'],
    ascending=False,
)
pair_counts = ranked.head(10).T

pair_counts


# Dot product of the two titles' word-count rows.
np.dot(pair_counts.iloc[0], pair_counts.iloc[1])

2


counts_df.head()

afc

2          0
police     0
officer    0
1          0
fire       1
          ..
law        0
librn      0
risk       0
medical    0
african    0
Name: asst fire chief, Length: 435, dtype: int64


# Dot product of every other title's word-count vector with the vector for
# 'asst fire chief', largest first.  DataFrame.dot computes all of them in one
# vectorised matrix-vector product instead of one Python-level np.dot call
# per row; the values and the index are the same as with the row-wise apply.
dots = (
    counts_df[counts_df.index != 'asst fire chief']
    .dot(afc)
    .sort_values(ascending=False)
)

dots

Job Title
asst deputy chief oper ofcr    2
fire battalion chief           2
deputy fire chief              2
fire battalion chief           2
deputy fire chief              2
                              ..
lifeguard 3                    0
water sys tech 3               0
sr commctns tech               0
sr life safety inspector       0
utility worker 2               0
Length: 12601, dtype: int64


# Distinct job titles whose dot product with 'asst fire chief' equals the
# maximum; np.unique removes the repeats caused by duplicated titles.
np.unique(dots.index[dots == dots.max()])

array(['asst chief oper ofcr', 'asst deputy chief oper ofcr',
       'asst fire marshal civ', 'deputy fire chief',
       'fire battalion chief', 'fire chief'], dtype=object)


# Toy corpus of three short sentences used for the cosine-similarity and
# TF-IDF examples below.
sentences = pd.Series([
    'I really want global peace',
    'I must love love global warming',
    'I must solve climate change'
])

sentences

0         I really want global peace
1    I must love love global warming
2        I must solve climate change
dtype: object


# Vocabulary of the toy corpus with the total number of occurrences of each
# word, most frequent first.
tokens = sentences.str.split().explode().tolist()
unique_words = pd.Series(tokens).value_counts()
unique_words

I          3
global     2
must       2
love       2
really     1
want       1
peace      1
warming    1
solve      1
climate    1
change     1
dtype: int64


# Bag-of-words matrix for the toy corpus: one column per vocabulary word,
# holding how many times that word (as a whole word) occurs in each sentence.
counts_dict = {
    word: sentences.str.count(fr'\b{word}\b').astype(int).tolist()
    for word in unique_words.index
}

# Label each row with the sentence it was computed from.
counts_df = pd.DataFrame(counts_dict).set_index(sentences)


counts_df


# There is an easier way of doing this in sklearn, as we will see soon
def sim_pair(s1, s2):
    """Return the cosine similarity of two equal-length numeric vectors.

    The result is the dot product of ``s1`` and ``s2`` divided by the product
    of their Euclidean norms.
    """
    numerator = np.dot(s1, s2)
    denominator = np.linalg.norm(s1) * np.linalg.norm(s2)
    return numerator / denominator


sim_pair(counts_df.iloc[0], counts_df.iloc[1])

0.31622776601683794


sim_pair(counts_df.iloc[0], counts_df.iloc[2])

0.19999999999999996


sim_pair(counts_df.iloc[1], counts_df.iloc[2])

0.31622776601683794


sentences

0         I really want global peace
1    I must love love global warming
2        I must solve climate change
dtype: object


# Term frequency of 'global' in the second sentence: occurrences of the word
# divided by the number of words in the sentence.  Counting over the split
# tokens (rather than str.count on the raw string) counts whole words only,
# so a longer word such as 'globalization' would not be counted as 'global'.
tokens = sentences.iloc[1].split()
tf = tokens.count('global') / len(tokens)
tf

0.16666666666666666


# Inverse document frequency of 'global': log of (number of sentences divided
# by the number of sentences containing the word).  The \b anchors make this a
# whole-word match, so a longer word that merely contains 'global' does not
# count as an occurrence.
idf = np.log(len(sentences) / sentences.str.contains(r'\bglobal\b').sum())
idf

0.4054651081081644


tf * idf

0.06757751801802739


sentences

0         I really want global peace
1    I must love love global warming
2        I must solve climate change
dtype: object


# Distinct words across all sentences, as a sorted NumPy array of strings.
unique_words = np.unique(sentences.str.split().sum())
unique_words

array(['I', 'change', 'climate', 'global', 'love', 'must', 'peace',
       'really', 'solve', 'want', 'warming'], dtype='<U7')


# TF-IDF of every vocabulary word in every sentence.
tfidf_dict = {}

# The number of words per sentence does not depend on the word being scored,
# so compute it once instead of on every loop iteration.
words_per_sentence = sentences.str.split().str.len()

for word in unique_words:
    # Match the word only as a whole word.
    re_pat = fr'\b{word}\b'
    # Term frequency: share of each sentence's words that are `word`.
    tf = sentences.str.count(re_pat) / words_per_sentence
    # Inverse document frequency: log(#sentences / #sentences containing word).
    idf = np.log(len(sentences) / sentences.str.contains(re_pat).sum())
    tfidf_dict[word] = tf * idf

# One row per sentence (labelled by its text), one column per word.
tfidf = pd.DataFrame(tfidf_dict).set_index(sentences)


tfidf


tfidf


tfidf.idxmax(axis=1)

I really want global peace          peace
I must love love global warming      love
I must solve climate change        change
dtype: object


YouTubeVideo('mVIXLQrC9rE')


# Read the whole corpus into memory.  The context manager closes the file
# handle as soon as the read finishes, and the encoding is pinned to UTF-8
# (the file's own header declares "Character set encoding: UTF8") rather than
# relying on the platform default.
with open('data/stateoftheunion1790-2022.txt', encoding='utf-8') as sotu_file:
    sotu = sotu_file.read()


len(sotu)

10535675


print(sotu[:20000])

The Project Gutenberg EBook of Complete State of the Union Addresses,
from 1790 to the Present. Speeches beginning in 2002 are from UCSB The American Presidency Project.
Speeches from 2018-2022 were manually downloaded from whitehouse.gov.

Character set encoding: UTF8

The addresses are separated by three asterisks


CONTENTS

  George Washington, State of the Union Address, January 8, 1790
  George Washington, State of the Union Address, December 8, 1790
  George Washington, State of the Union Address, October 25, 1791
  George Washington, State of the Union Address, November 6, 1792
  George Washington, State of the Union Address, December 3, 1793
  George Washington, State of the Union Address, November 19, 1794
  George Washington, State of the Union Address, December 8, 1795
  George Washington, State of the Union Address, December 7, 1796
  John Adams, State of the Union Address, November 22, 1797
  John Adams, State of the Union Address, December 8, 1798
  John Adams, State of the Union Address, December 3, 1799
  John Adams, State of the Union Address, November 11, 1800
  Thomas Jefferson, State of the Union Address, December 8, 1801
  Thomas Jefferson, State of the Union Address, December 15, 1802
  Thomas Jefferson, State of the Union Address, October 17, 1803
  Thomas Jefferson, State of the Union Address, November 8, 1804
  Thomas Jefferson, State of the Union Address, December 3, 1805
  Thomas Jefferson, State of the Union Address, December 2, 1806
  Thomas Jefferson, State of the Union Address, October 27, 1807
  Thomas Jefferson, State of the Union Address, November 8, 1808
  James Madison, State of the Union Address, November 29, 1809
  James Madison, State of the Union Address, December 5, 1810
  James Madison, State of the Union Address, November 5, 1811
  James Madison, State of the Union Address, November 4, 1812
  James Madison, State of the Union Address, December 7, 1813
  James Madison, State of the Union Address, September 20, 1814
  James Madison, State of the Union Address, December 5, 1815
  James Madison, State of the Union Address, December 3, 1816
  James Monroe, State of the Union Address, December 12, 1817
  James Monroe, State of the Union Address, November 16, 1818
  James Monroe, State of the Union Address, December 7, 1819
  James Monroe, State of the Union Address, November 14, 1820
  James Monroe, State of the Union Address, December 3, 1821
  James Monroe, State of the Union Address, December 3, 1822
  James Monroe, State of the Union Address, December 2, 1823
  James Monroe, State of the Union Address, December 7, 1824
  John Quincy Adams, State of the Union Address, December 6, 1825
  John Quincy Adams, State of the Union Address, December 5, 1826
  John Quincy Adams, State of the Union Address, December 4, 1827
  John Quincy Adams, State of the Union Address, December 2, 1828
  Andrew Jackson, State of the Union Address, December 8, 1829
  Andrew Jackson, State of the Union Address, December 6, 1830
  Andrew Jackson, State of the Union Address, December 6, 1831
  Andrew Jackson, State of the Union Address, December 4, 1832
  Andrew Jackson, State of the Union Address, December 3, 1833
  Andrew Jackson, State of the Union Address, December 1, 1834
  Andrew Jackson, State of the Union Address, December 7, 1835
  Andrew Jackson, State of the Union Address, December 5, 1836
  Martin van Buren, State of the Union Address, December 5, 1837
  Martin van Buren, State of the Union Address, December 3, 1838
  Martin van Buren, State of the Union Address, December 2, 1839
  Martin van Buren, State of the Union Address, December 5, 1840
  John Tyler, State of the Union Address, December 7, 1841
  John Tyler, State of the Union Address, December 6, 1842
  John Tyler, State of the Union Address, December 6, 1843
  John Tyler, State of the Union Address, December 3, 1844
  James Polk, State of the Union Address, December 2, 1845
  James Polk, State of the Union Address, December 8, 1846
  James Polk, State of the Union Address, December 7, 1847
  James Polk, State of the Union Address, December 5, 1848
  Zachary Taylor, State of the Union Address, December 4, 1849
  Millard Fillmore, State of the Union Address, December 2, 1850
  Millard Fillmore, State of the Union Address, December 2, 1851
  Millard Fillmore, State of the Union Address, December 6, 1852
  Franklin Pierce, State of the Union Address, December 5, 1853
  Franklin Pierce, State of the Union Address, December 4, 1854
  Franklin Pierce, State of the Union Address, December 31, 1855
  Franklin Pierce, State of the Union Address, December 2, 1856
  James Buchanan, State of the Union Address, December 8, 1857
  James Buchanan, State of the Union Address, December 6, 1858
  James Buchanan, State of the Union Address, December 19, 1859
  James Buchanan, State of the Union Address, December 3, 1860
  Abraham Lincoln, State of the Union Address, December 3, 1861
  Abraham Lincoln, State of the Union Address, December 1, 1862
  Abraham Lincoln, State of the Union Address, December 8, 1863
  Abraham Lincoln, State of the Union Address, December 6, 1864
  Andrew Johnson, State of the Union Address, December 4, 1865
  Andrew Johnson, State of the Union Address, December 3, 1866
  Andrew Johnson, State of the Union Address, December 3, 1867
  Andrew Johnson, State of the Union Address, December 9, 1868
  Ulysses S. Grant, State of the Union Address, December 6, 1869
  Ulysses S. Grant, State of the Union Address, December 5, 1870
  Ulysses S. Grant, State of the Union Address, December 4, 1871
  Ulysses S. Grant, State of the Union Address, December 2, 1872
  Ulysses S. Grant, State of the Union Address, December 1, 1873
  Ulysses S. Grant, State of the Union Address, December 7, 1874
  Ulysses S. Grant, State of the Union Address, December 7, 1875
  Ulysses S. Grant, State of the Union Address, December 5, 1876
  Rutherford B. Hayes, State of the Union Address, December 3, 1877
  Rutherford B. Hayes, State of the Union Address, December 2, 1878
  Rutherford B. Hayes, State of the Union Address, December 1, 1879
  Rutherford B. Hayes, State of the Union Address, December 6, 1880
  Chester A. Arthur, State of the Union Address, December 6, 1881
  Chester A. Arthur, State of the Union Address, December 4, 1882
  Chester A. Arthur, State of the Union Address, December 4, 1883
  Chester A. Arthur, State of the Union Address, December 1, 1884
  Grover Cleveland, State of the Union Address, December 8, 1885
  Grover Cleveland, State of the Union Address, December 6, 1886
  Grover Cleveland, State of the Union Address, December 6, 1887
  Grover Cleveland, State of the Union Address, December 3, 1888
  Benjamin Harrison, State of the Union Address, December 3, 1889
  Benjamin Harrison, State of the Union Address, December 1, 1890
  Benjamin Harrison, State of the Union Address, December 9, 1891
  Benjamin Harrison, State of the Union Address, December 6, 1892
  William McKinley, State of the Union Address, December 6, 1897
  William McKinley, State of the Union Address, December 5, 1898
  William McKinley, State of the Union Address, December 5, 1899
  William McKinley, State of the Union Address, December 3, 1900
  Theodore Roosevelt, State of the Union Address, December 3, 1901
  Theodore Roosevelt, State of the Union Address, December 2, 1902
  Theodore Roosevelt, State of the Union Address, December 7, 1903
  Theodore Roosevelt, State of the Union Address, December 6, 1904
  Theodore Roosevelt, State of the Union Address, December 5, 1905
  Theodore Roosevelt, State of the Union Address, December 3, 1906
  Theodore Roosevelt, State of the Union Address, December 3, 1907
  Theodore Roosevelt, State of the Union Address, December 8, 1908
  William H. Taft, State of the Union Address, December 7, 1909
  William H. Taft, State of the Union Address, December 6, 1910
  William H. Taft, State of the Union Address, December 5, 1911
  William H. Taft, State of the Union Address, December 3, 1912
  Woodrow Wilson, State of the Union Address, December 2, 1913
  Woodrow Wilson, State of the Union Address, December 8, 1914
  Woodrow Wilson, State of the Union Address, December 7, 1915
  Woodrow Wilson, State of the Union Address, December 5, 1916
  Woodrow Wilson, State of the Union Address, December 4, 1917
  Woodrow Wilson, State of the Union Address, December 2, 1918
  Woodrow Wilson, State of the Union Address, December 2, 1919
  Woodrow Wilson, State of the Union Address, December 7, 1920
  Warren Harding, State of the Union Address, December 6, 1921
  Warren Harding, State of the Union Address, December 8, 1922
  Calvin Coolidge, State of the Union Address, December 6, 1923
  Calvin Coolidge, State of the Union Address, December 3, 1924
  Calvin Coolidge, State of the Union Address, December 8, 1925
  Calvin Coolidge, State of the Union Address, December 7, 1926
  Calvin Coolidge, State of the Union Address, December 6, 1927
  Calvin Coolidge, State of the Union Address, December 4, 1928
  Herbert Hoover, State of the Union Address, December 3, 1929
  Herbert Hoover, State of the Union Address, December 2, 1930
  Herbert Hoover, State of the Union Address, December 8, 1931
  Herbert Hoover, State of the Union Address, December 6, 1932
  Franklin D. Roosevelt, State of the Union Address, January 3, 1934
  Franklin D. Roosevelt, State of the Union Address, January 4, 1935
  Franklin D. Roosevelt, State of the Union Address, January 3, 1936
  Franklin D. Roosevelt, State of the Union Address, January 6, 1937
  Franklin D. Roosevelt, State of the Union Address, January 3, 1938
  Franklin D. Roosevelt, State of the Union Address, January 4, 1939
  Franklin D. Roosevelt, State of the Union Address, January 3, 1940
  Franklin D. Roosevelt, State of the Union Address, January 6, 1941
  Franklin D. Roosevelt, State of the Union Address, January 6, 1942
  Franklin D. Roosevelt, State of the Union Address, January 7, 1943
  Franklin D. Roosevelt, State of the Union Address, January 11, 1944
  Franklin D. Roosevelt, State of the Union Address, January 6, 1945
  Harry S. Truman, State of the Union Address, January 21, 1946
  Harry S. Truman, State of the Union Address, January 6, 1947
  Harry S. Truman, State of the Union Address, January 7, 1948
  Harry S. Truman, State of the Union Address, January 5, 1949
  Harry S. Truman, State of the Union Address, January 4, 1950
  Harry S. Truman, State of the Union Address, January 8, 1951
  Harry S. Truman, State of the Union Address, January 9, 1952
  Harry S. Truman, State of the Union Address, January 7, 1953
  Dwight D. Eisenhower, State of the Union Address, February 2, 1953
  Dwight D. Eisenhower, State of the Union Address, January 7, 1954
  Dwight D. Eisenhower, State of the Union Address, January 6, 1955
  Dwight D. Eisenhower, State of the Union Address, January 5, 1956
  Dwight D. Eisenhower, State of the Union Address, January 10, 1957
  Dwight D. Eisenhower, State of the Union Address, January 9, 1958
  Dwight D. Eisenhower, State of the Union Address, January 9, 1959
  Dwight D. Eisenhower, State of the Union Address, January 7, 1960
  Dwight D. Eisenhower, State of the Union Address, January 12, 1961
  John F. Kennedy, State of the Union Address, January 30, 1961
  John F. Kennedy, State of the Union Address, January 11, 1962
  John F. Kennedy, State of the Union Address, January 14, 1963
  Lyndon B. Johnson, State of the Union Address, January 8, 1964
  Lyndon B. Johnson, State of the Union Address, January 4, 1965
  Lyndon B. Johnson, State of the Union Address, January 12, 1966
  Lyndon B. Johnson, State of the Union Address, January 10, 1967
  Lyndon B. Johnson, State of the Union Address, January 17, 1968
  Lyndon B. Johnson, State of the Union Address, January 14, 1969
  Richard Nixon, State of the Union Address, January 22, 1970
  Richard Nixon, State of the Union Address, January 22, 1971
  Richard Nixon, State of the Union Address, January 20, 1972
  Richard Nixon, State of the Union Address, February 2, 1973
  Richard Nixon, State of the Union Address, January 30, 1974
  Gerald R. Ford, State of the Union Address, January 15, 1975
  Gerald R. Ford, State of the Union Address, January 19, 1976
  Gerald R. Ford, State of the Union Address, January 12, 1977
  Jimmy Carter, State of the Union Address, January 19, 1978
  Jimmy Carter, State of the Union Address, January 25, 1979
  Jimmy Carter, State of the Union Address, January 21, 1980
  Jimmy Carter, State of the Union Address, January 16, 1981
  Ronald Reagan, State of the Union Address, January 26, 1982
  Ronald Reagan, State of the Union Address, January 25, 1983
  Ronald Reagan, State of the Union Address, January 25, 1984
  Ronald Reagan, State of the Union Address, February 6, 1985
  Ronald Reagan, State of the Union Address, February 4, 1986
  Ronald Reagan, State of the Union Address, January 27, 1987
  Ronald Reagan, State of the Union Address, January 25, 1988
  George H.W. Bush, Address on Administration Goals, February 9, 1989
  George H.W. Bush, State of the Union Address, January 31, 1990
  George H.W. Bush, State of the Union Address, January 29, 1991
  George H.W. Bush, State of the Union Address, January 28, 1992
  William J. Clinton, Address on Administration Goals, February 17, 1993
  William J. Clinton, State of the Union Address, January 25, 1994
  William J. Clinton, State of the Union Address, January 24, 1995
  William J. Clinton, State of the Union Address, January 23, 1996
  William J. Clinton, State of the Union Address, February 4, 1997
  William J. Clinton, State of the Union Address, January 27, 1998
  William J. Clinton, State of the Union Address, January 19, 1999
  William J. Clinton, State of the Union Address, January 27, 2000
  George W. Bush, Address on Administration Goals (Budget Message), February 27, 2001
  George W. Bush, State of the Union Address, September 20, 2001
  George W. Bush, State of the Union Address, January 29, 2002
  George W. Bush, State of the Union Address, January 28, 2003
  George W. Bush, State of the Union Address, January 20, 2004
  George W. Bush, State of the Union Address, February 2, 2005
  George W. Bush, State of the Union Address, January 31, 2006
  George W. Bush, State of the Union Address, January 23, 2007
  George W. Bush, State of the Union Address, January 31, 2008
  Barack Obama, Address Before a Joint Session of Congress, February 24, 2009
  Barack Obama, State of the Union Address, January 27, 2010
  Barack Obama, State of the Union Address, January 25, 2011
  Barack Obama, State of the Union Address, January 24, 2012
  Barack Obama, State of the Union Address, February 12, 2013
  Barack Obama, State of the Union Address, January 28, 2014
  Barack Obama, State of the Union Address, January 20, 2015
  Barack Obama, State of the Union Address, January 12, 2016
  Donald J. Trump, Address Before a Joint Session of Congress, February 27, 2017
  Donald J. Trump, Address Before a Joint Session of Congress, January 30, 2018
  Donald J. Trump, Address Before a Joint Session of Congress, February 5, 2019
  Donald J. Trump, Address Before a Joint Session of Congress, February 4, 2020
  Joseph R. Biden Jr., Address Before a Joint Session of Congress, April 28, 2021
  Joseph R. Biden Jr., Address Before a Joint Session of Congress, March 1, 2022

***

State of the Union Address
George Washington
January 8, 1790

Fellow-Citizens of the Senate and House of Representatives:

I embrace with great satisfaction the opportunity which now presents itself
of congratulating you on the present favorable prospects of our public
affairs. The recent accession of the important state of North Carolina to
the Constitution of the United States (of which official information has
been received), the rising credit and respectability of our country, the
general and increasing good will toward the government of the Union, and
the concord, peace, and plenty with which we are blessed are circumstances
auspicious in an eminent degree to our national prosperity.

In resuming your consultations for the general good you can not but derive
encouragement from the reflection that the measures of the last session
have been as satisfactory to your constituents as the novelty and
difficulty of the work allowed you to hope. Still further to realize their
expectations and to secure the blessings which a gracious Providence has
placed within our reach will in the course of the present important session
call for the cool and deliberate exertion of your patriotism, firmness, and
wisdom.

Among the many interesting objects which will engage your attention that of
providing for the common defense will merit particular regard. To be
prepared for war is one of the most effectual means of preserving peace.

A free people ought not only to be armed, but disciplined; to which end a
uniform and well-digested plan is requisite; and their safety and interest
require that they should promote such manufactories as tend to render them
independent of others for essential, particularly military, supplies.

The proper establishment of the troops which may be deemed indispensable
will be entitled to mature consideration. In the arrangements which may be
made respecting it it will be of importance to conciliate the comfortable
support of the officers and soldiers with a due regard to economy.

There was reason to hope that the pacific measures adopted with regard to
certain hostile tribes of Indians would have relieved the inhabitants of
our southern and western frontiers from their depredations, but you will
perceive from the information contained in the papers which I shall direct
to be laid before you (comprehending a communication from the Commonwealth
of Virginia) that we ought to be prepared to afford protection to those
parts of the Union, and, if necessary, to punish aggressors.

The interests of the United States require that our intercourse with other
nations should be facilitated by such provisions as will enable me to
fulfill my duty in that respect in the manner which circumstances may
render most conducive to the public good, and to this end that the
compensation to be made to the persons who may be employed should,
according to the nature of their appointments, be defined by law, and a
competent fund designated for defraying the expenses incident to the
conduct of foreign affairs.

Various considerations also render it expedient that the terms on which
foreigners may be admitted to the rights of citizens should be speedily
ascertained by a uniform rule of naturalization.

Uniformity in the currency, weights, and measures of the United States is
an object of great importance, and will, I am persuaded, be duly attended
to.

The advancement of agriculture, commerce, and manufactures by all proper
means will not, I trust, need recommendation; but I can not forbear
intimating to you the expediency of giving effectual encouragement as well
to the introduction of new and useful inventions from abroad as to the
exertions of skill and genius in producing them at home, and of
facilitating the intercourse between the distant parts of our country by a
due attention to the post-office and post-roads.

Nor am I less persuaded that you will agree with me in opinion that there
is nothing which can better deserve your patronage than the promotion of
science and literature. Knowledge is in every country the surest basis of
public happiness. In one in which the measures of government receive their
impressions so immediately from the sense of the community as in ours it is
proportionably essential.

To the security of a free constitution it contributes in various ways--by
convincing those who are intrusted with the public administration that
every valuable end of government is best answered by the enlightened
confidence of the people, and by teaching the people themselves to know and
to value their own rights; to discern and provide ag


# The file separates addresses with a line of three asterisks.  The first
# chunk is the preamble and table of contents, not a speech, so drop it.
speeches = sotu.split('\n***\n')[1:]


len(speeches)

232


print(speeches[-1][:1000])

State of the Union Address
Joseph R. Biden Jr.  
March 1, 2022

Madam Speaker, Madam Vice President, and our First Lady and Second Gentleman, members of Congress and the Cabinet, Justices of the Supreme Court, my fellow Americans: Last year, COVID-19 kept us apart. This year, we’re finally together again.

Tonight — tonight we meet as Democrats, Republicans, and independents, but, most importantly, as Americans with a duty to one another, to America, to the American people, and to the Constitution, and an unwavering resolve that freedom will always triumph over tyranny.

Six — thank you. Six days ago, Russia’s Vladimir Putin sought to shake the very foundations of the free world, thinking he could make it bend to his menacing ways. But he badly miscalculated. He thought he could roll into Ukraine and the world would roll over. Instead, he met with a wall of strength he never anticipated or imagined. He met the Ukrainian people.

UKRAINE

From President Zelenskyy to every Ukrainian, th


def extract_struct(speech):
    """Split one raw address into its header fields and cleaned body text.

    After surrounding whitespace is removed, ``speech`` is expected to start
    with three header lines (title, president, date) followed by the body.

    Returns a dict with the keys ``'speech'``, ``'president'``, ``'date'`` and
    ``'contents'``.  The header fields are stripped of surrounding whitespace
    (some source lines carry trailing spaces, which would otherwise make the
    same president appear under different spellings).  ``'contents'`` is the
    body, lower-cased, with every character other than a letter, apostrophe
    or space replaced by a space.

    Raises IndexError if ``speech`` has fewer than four lines.
    """
    parts = speech.strip().split('\n', maxsplit=3)
    header = [field.strip() for field in parts[:3]]
    # Replace rather than delete, so words separated only by punctuation or
    # newlines do not run together.
    contents = re.sub(r"[^A-Za-z' ]", ' ', parts[3]).lower()
    return dict(zip(['speech', 'president', 'date', 'contents'], header + [contents]))


# One row per address, with the columns produced by extract_struct.
speeches_df = pd.DataFrame([extract_struct(speech) for speech in speeches])


speeches_df


speeches_df.head()


# Count every word across all speeches, most frequent first.  `explode`
# flattens the per-speech word lists in one pass, whereas `.sum()` on a Series
# of lists re-copies the growing list for every speech.
speech_words = speeches_df['contents'].str.split().explode().dropna().tolist()
unique_words = pd.Series(speech_words).value_counts()
unique_words

the           146704
of             94207
to             60350
and            60308
in             38073
               ...  
wonderland         1
policed            1
dallying           1
dilly              1
em                 1
Length: 24103, dtype: int64


# Restrict the vocabulary to the 500 most frequent words.
unique_words = unique_words.iloc[:500].index

tfidf_dict = {}
contents = speeches_df['contents']
# Words per speech; independent of the word being scored, so computed once.
tf_denom = contents.str.split().str.len()
for word in unique_words:
    re_pat = fr' {word} ' # Imperfect pattern for speed
    # Scan the corpus once per word: the number of speeches containing the
    # word is how many of the per-speech counts are non-zero, so a second
    # regex pass with str.contains is unnecessary.
    counts = contents.str.count(re_pat)
    tf = counts / tf_denom
    idf = np.log(len(speeches_df) / (counts > 0).sum())
    tfidf_dict[word] =  tf * idf


# Assemble the per-word TF-IDF columns into a frame: one row per speech, one
# column per word.
tfidf = pd.DataFrame(tfidf_dict)


tfidf.head()


# For each speech, the word with the highest TF-IDF score.
summaries = tfidf.idxmax(axis=1)
summaries

0          object
1      convention
2       provision
3       provision
4           ought
          ...    
227       tonight
228       tonight
229       tonight
230          jobs
231       tonight
Length: 232, dtype: object


def five_largest(row):
    """Return the labels of the five largest values in `row` as a list.

    Labels are ordered by ascending value, so the label of the largest
    value comes last. If `row` has fewer than five entries, all of its
    labels are returned.
    """
    # argsort gives the positions that would sort the values ascending;
    # applying them to the index lines the labels up in that same order.
    labels_by_value = row.index[row.argsort()]
    return list(labels_by_value[-5:])


# Top-five word labels for every row of tfidf, shown next to the president
# and date of the corresponding speech.
keywords = tfidf.apply(five_largest, axis='columns')
keywords_df = pd.concat(
    [speeches_df[['president', 'date']], keywords],
    axis=1,
)


# Temporarily allow up to 300 rows to be rendered; the setting reverts when
# the context exits.
show_up_to_300_rows = pd.option_context('display.max_rows', 300)
with show_up_to_300_rows:
    display(keywords_df)


# Same computation as TF-IDF, except the inverse document frequency is used
# as a raw ratio with no logarithm applied.
tf_denom = speeches_df['contents'].str.split().str.len()

tfidf_nl_dict = {}
for word in unique_words:
    # The pattern needs a literal space on both sides of the word, so an
    # occurrence at the very start or end of a speech is not matched.
    re_pat = fr' {word} ' # Imperfect pattern for speed
    # Term frequency: pattern matches in a speech over that speech's length.
    tf = speeches_df['contents'].str.count(re_pat) / tf_denom
    # Number of speeches divided by the number of speeches with a match.
    n_matching_speeches = speeches_df['contents'].str.contains(re_pat).sum()
    idf_nl = len(speeches_df) / n_matching_speeches
    tfidf_nl_dict[word] = tf * idf_nl


# One row per speech, one column per word.
tfidf_nl = pd.DataFrame(tfidf_nl_dict)


tfidf_nl


# Top-five word labels for every row of tfidf_nl, shown next to the president
# and date of the corresponding speech.
keywords_nl = tfidf_nl.apply(five_largest, axis='columns')
keywords_nl_df = pd.concat(
    [speeches_df[['president', 'date']], keywords_nl],
    axis=1,
)


keywords_nl_df


(1000 / 999)

1.001001001001001


np.log(1000 / 999)

0.001000500333583622


(50 / 2)

25.0


(500 / 2)

250.0


np.log(50 / 2)

3.2188758248682006


np.log(500 / 2)

5.521460917862246

	change	climate	global	love	must	peace	really	solve	want	warming
I really want global peace	0.000000	0.000000	0.081093	0.000000	0.000000	0.219722	0.219722	0.000000	0.219722	0.000000
I must love love global warming	0.000000	0.000000	0.067578	0.366204	0.067578	0.000000	0.000000	0.000000	0.000000	0.183102
I must solve climate change	0.219722	0.219722	0.000000	0.000000	0.081093	0.000000	0.000000	0.219722	0.000000	0.000000

	change	climate	global	love	must	peace	really	solve	want	warming
I really want global peace	0.000000	0.000000	0.081093	0.000000	0.000000	0.219722	0.219722	0.000000	0.219722	0.000000
I must love love global warming	0.000000	0.000000	0.067578	0.366204	0.067578	0.000000	0.000000	0.000000	0.000000	0.183102
I must solve climate change	0.219722	0.219722	0.000000	0.000000	0.081093	0.000000	0.000000	0.219722	0.000000	0.000000

	the	of	to	and	in	a	that	for	be	our	...	submitted	did	increasing	throughout	point	months	set	object	agreement	almost
0	0.089073	0.063361	0.051423	0.037649	0.018365	0.019284	0.013774	0.006428	0.018365	0.009183	...	0.000000	0.000000	0.001392	0.000000	0.000000	0.000000	0.000000	0.005758	0.000000	0.000000
1	0.086957	0.063435	0.034925	0.032074	0.019244	0.014968	0.012117	0.011404	0.012830	0.012117	...	0.001312	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.002979	0.000000	0.000000
2	0.105035	0.069010	0.038194	0.031684	0.017795	0.018229	0.013889	0.009549	0.014757	0.002170	...	0.000799	0.000000	0.000658	0.000000	0.000000	0.000000	0.001291	0.001814	0.000000	0.000000
3	0.093212	0.066444	0.042065	0.026769	0.022945	0.015296	0.011472	0.014340	0.013862	0.005258	...	0.000000	0.000000	0.000000	0.000749	0.000000	0.000000	0.000000	0.001998	0.000000	0.000000
4	0.091603	0.067176	0.037659	0.024936	0.013232	0.017303	0.006107	0.011705	0.021883	0.008142	...	0.000000	0.000772	0.000000	0.001595	0.000000	0.000000	0.000000	0.001064	0.000000	0.000000
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
227	0.040610	0.023646	0.033756	0.041981	0.017992	0.017135	0.011652	0.007882	0.005312	0.017820	...	0.000000	0.000520	0.000000	0.000000	0.000253	0.001343	0.000510	0.000000	0.000000	0.000834
228	0.049582	0.025413	0.029145	0.034654	0.020970	0.017949	0.011374	0.011374	0.004443	0.017416	...	0.000000	0.002156	0.000000	0.000000	0.000263	0.001114	0.000793	0.000000	0.001309	0.002307
229	0.047201	0.023920	0.026631	0.035242	0.018338	0.016425	0.011162	0.010046	0.003987	0.015149	...	0.000000	0.000242	0.000000	0.000250	0.000236	0.001250	0.000237	0.000000	0.000587	0.000776
230	0.053181	0.026402	0.035831	0.033819	0.023007	0.014835	0.010686	0.008423	0.004652	0.009806	...	0.000000	0.001906	0.000191	0.000591	0.000557	0.000197	0.000374	0.000000	0.000231	0.000816
231	0.044309	0.023458	0.033759	0.039221	0.017004	0.017376	0.012908	0.009433	0.004716	0.008316	...	0.000000	0.001129	0.000188	0.000389	0.000550	0.000584	0.000554	0.000000	0.000000	0.000604

Lecture 19 – Text as Data, Continued¶

DSC 80, Spring 2022¶

Announcements¶

Agenda¶

Bag of words 💰¶

Recap¶

Question: What job titles are most similar to `'asst fire chief'`?¶

Aside: dot product¶

Computing similarities¶

Bag of words¶

Cosine similarity and bag of words¶

A recipe for computing similarities¶

Example: Global warming 🌎¶

Pitfalls of the bag of words model¶

TF-IDF¶

The importance of words¶

Term frequency¶

Inverse document frequency¶

Intuition¶

Term frequency-inverse document frequency¶

Computing TF-IDF¶

TF-IDF of all words in all documents¶

Example: State of the Union addresses 🎤¶

The data¶

Finding the most important words in each speech¶

Summarizing speeches¶

Aside: What if we remove the $\log$ from $\text{idf}(t)$?¶

The role of $\log$ in $\text{idf}(t)$¶

Summary, next time¶

Summary¶

	2	police	officer	1	fire	asst	civil	eng	3	asoc	...	motive	metro	sign	stores	sec	law	librn	risk	medical	african
Job Title
police officer	0	1	1	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
police officer	0	1	1	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
fire engineer	0	0	0	0	1	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
retirement administrator	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
fire battalion chief	0	0	0	0	1	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
asst eng civil	0	0	0	0	0	1	1	1	0	0	...	0	0	0	0	0	0	0	0	0	0
police officer	0	1	1	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
asst planner	0	0	0	0	0	1	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
project ofcr 1	0	0	0	1	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
utility worker 2	1	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0

	speech	president	date	contents
0	State of the Union Address	George Washington	January 8, 1790	fellow citizens of the senate and house of re...
1	State of the Union Address	George Washington	December 8, 1790	fellow citizens of the senate and house of re...
2	State of the Union Address	George Washington	October 25, 1791	fellow citizens of the senate and house of re...
3	State of the Union Address	George Washington	November 6, 1792	fellow citizens of the senate and house of re...
4	State of the Union Address	George Washington	December 3, 1793	fellow citizens of the senate and house of re...
...	...	...	...	...
227	State of the Union Address	Donald J. Trump	January 30, 2018	mr speaker mr vice president members of c...
228	State of the Union Address	Donald J. Trump	February 5, 2019	madam speaker mr vice president members of...
229	State of the Union Address	Donald J. Trump	February 4, 2020	thank you very much thank you thank you ver...
230	State of the Union Address	Joseph R. Biden Jr.	April 28, 2021	thank you thank you thank you good to be b...
231	State of the Union Address	Joseph R. Biden Jr.	March 1, 2022	madam speaker madam vice president and our ...

	...	submitted	did	increasing	throughout	set	object
0	...	0.000000	0.000000	0.000382	0.000000	0.000000	0.002031
1	...	0.000435	0.000000	0.000000	0.000000	0.000000	0.001051
2	...	0.000265	0.000000	0.000181	0.000000	0.000345	0.000640
3	...	0.000000	0.000000	0.000000	0.000215	0.000000	0.000705
4	...	0.000000	0.000212	0.000000	0.000458	0.000000	0.000375

	president	date	0
0	George Washington	January 8, 1790	[proper, your, regard, ought, object]
1	George Washington	December 8, 1790	[case, established, object, commerce, convention]
2	George Washington	October 25, 1791	[upon, community, lands, proper, provision]
3	George Washington	November 6, 1792	[subject, upon, information, proper, provision]
4	George Washington	December 3, 1793	[having, vessels, executive, shall, ought]
5	George Washington	November 19, 1794	[too, army, let, ought, constitution]
6	George Washington	December 8, 1795	[representatives, prevent, object, provision, ...
7	George Washington	December 7, 1796	[republic, treaty, britain, ought, object]
8	John Adams	November 22, 1797	[spain, british, claims, treaty, vessels]
9	John Adams	December 8, 1798	[st, minister, treaty, spain, commerce]
10	John Adams	December 3, 1799	[period, civil, british, minister, treaty]
11	John Adams	November 11, 1800	[experience, protection, navy, commerce, ought]
12	Thomas Jefferson	December 8, 1801	[consideration, shall, object, vessels, subject]
13	Thomas Jefferson	December 15, 1802	[sum, debt, naval, duties, vessels]
14	Thomas Jefferson	October 17, 1803	[debt, vessels, sum, millions, friendly]
15	Thomas Jefferson	November 8, 1804	[received, convention, due, having, friendly]
16	Thomas Jefferson	December 3, 1805	[families, convention, sum, millions, vessels]
17	Thomas Jefferson	December 2, 1806	[due, consideration, millions, shall, spain]
18	Thomas Jefferson	October 27, 1807	[whether, army, british, vessels, shall]
19	Thomas Jefferson	November 8, 1808	[thus, british, millions, commerce, her]
20	James Madison	November 29, 1809	[cases, having, due, british, minister]
21	James Madison	December 5, 1810	[provisions, view, minister, commerce, british]
22	James Madison	November 5, 1811	[britain, provisions, commerce, minister, brit...
23	James Madison	November 4, 1812	[nor, subject, provisions, britain, british]
24	James Madison	December 7, 1813	[number, having, naval, britain, british]
25	James Madison	September 20, 1814	[naval, vessels, britain, his, british]
26	James Madison	December 5, 1815	[debt, treasury, millions, establishment, sum]
27	James Madison	December 3, 1816	[constitution, annual, sum, treasury, british]
28	James Monroe	December 12, 1817	[improvement, territory, indian, millions, lands]
29	James Monroe	November 16, 1818	[minister, object, territory, her, spain]
30	James Monroe	December 7, 1819	[parties, friendly, minister, treaty, spain]
31	James Monroe	November 14, 1820	[amount, minister, extent, vessels, spain]
32	James Monroe	December 3, 1821	[powers, duties, revenue, spain, vessels]
33	James Monroe	December 3, 1822	[object, proper, vessels, spain, convention]
34	James Monroe	December 2, 1823	[th, department, object, minister, spain]
35	James Monroe	December 7, 1824	[spain, governments, convention, parties, object]
36	John Quincy Adams	December 6, 1825	[officers, commerce, condition, upon, improvem...
37	John Quincy Adams	December 5, 1826	[upon, commercial, vessels, british, duties]
38	John Quincy Adams	December 4, 1827	[lands, british, receipts, upon, th]
39	John Quincy Adams	December 2, 1828	[duties, upon, revenue, commercial, britain]
40	Andrew Jackson	December 8, 1829	[court, subject, upon, her, duties]
41	Andrew Jackson	December 6, 1830	[general, subject, vessels, character, upon]
42	Andrew Jackson	December 6, 1831	[indian, commerce, claims, treaty, minister]
43	Andrew Jackson	December 4, 1832	[general, subject, duties, lands, commerce]
44	Andrew Jackson	December 3, 1833	[treasury, convention, minister, spain, duties]
45	Andrew Jackson	December 1, 1834	[bill, treaty, minister, claims, upon]
46	Andrew Jackson	December 7, 1835	[treaty, upon, claims, subject, minister]
47	Andrew Jackson	December 5, 1836	[property, treasury, duties, revenue, banks]
48	Martin van Buren	December 5, 1837	[price, subject, upon, banks, lands]
49	Martin van Buren	December 3, 1838	[subject, upon, indian, banks, court]
50	Martin van Buren	December 2, 1839	[duties, treasury, extent, institutions, banks]
51	Martin van Buren	December 5, 1840	[general, revenue, upon, extent, having]
52	John Tyler	December 7, 1841	[banks, britain, amount, duties, treasury]
53	John Tyler	December 6, 1842	[claims, minister, thus, amount, treasury]
54	John Tyler	December 6, 1843	[treasury, british, her, minister, mexico]
55	John Tyler	December 3, 1844	[minister, upon, treaty, her, mexico]
56	James Polk	December 2, 1845	[british, convention, territory, duties, mexico]
57	James Polk	December 8, 1846	[army, territory, minister, her, mexico]
58	James Polk	December 7, 1847	[amount, treaty, her, army, mexico]
59	James Polk	December 5, 1848	[tariff, upon, bill, constitution, mexico]
60	Zachary Taylor	December 4, 1849	[territory, treaty, recommend, minister, mexico]
61	Millard Fillmore	December 2, 1850	[recommend, upon, claims, mexico, duties]
62	Millard Fillmore	December 2, 1851	[department, annual, fiscal, subject, mexico]
63	Millard Fillmore	December 6, 1852	[duties, navy, mexico, subject, her]
64	Franklin Pierce	December 5, 1853	[commercial, regard, upon, construction, subject]
65	Franklin Pierce	December 4, 1854	[character, duties, naval, minister, property]
66	Franklin Pierce	December 31, 1855	[constitution, british, territory, convention,...
67	Franklin Pierce	December 2, 1856	[institutions, property, condition, thus, terr...
68	James Buchanan	December 8, 1857	[treaty, constitution, territory, convention, ...
69	James Buchanan	December 6, 1858	[june, mexico, minister, constitution, territory]
70	James Buchanan	December 19, 1859	[republic, th, fiscal, mexico, june]
71	James Buchanan	December 3, 1860	[minister, duties, claims, convention, constit...
72	Abraham Lincoln	December 3, 1861	[army, claims, labor, capital, court]
73	Abraham Lincoln	December 1, 1862	[upon, shall, population, per, sum]
74	Abraham Lincoln	December 8, 1863	[lands, receipts, subject, navy, naval]
75	Abraham Lincoln	December 6, 1864	[condition, secretary, naval, treasury, navy]
76	Andrew Johnson	December 4, 1865	[form, commerce, powers, general, constitution]
77	Andrew Johnson	December 3, 1866	[thus, june, constitution, mexico, condition]
78	Andrew Johnson	December 3, 1867	[june, value, department, upon, constitution]
79	Andrew Johnson	December 9, 1868	[millions, amount, expenditures, june, per]
80	Ulysses S. Grant	December 6, 1869	[subject, upon, receipts, per, spain]
81	Ulysses S. Grant	December 5, 1870	[her, convention, vessels, spain, british]
82	Ulysses S. Grant	December 4, 1871	[object, powers, treaty, desire, recommend]
83	Ulysses S. Grant	December 2, 1872	[territory, line, her, britain, treaty]
84	Ulysses S. Grant	December 1, 1873	[consideration, banks, subject, amount, claims]
85	Ulysses S. Grant	December 7, 1874	[duties, upon, attention, claims, convention]
86	Ulysses S. Grant	December 7, 1875	[parties, territory, court, spain, claims]
87	Ulysses S. Grant	December 5, 1876	[subject, court, per, commission, claims]
88	Rutherford B. Hayes	December 3, 1877	[upon, sum, fiscal, commercial, value]
89	Rutherford B. Hayes	December 2, 1878	[per, secretary, fiscal, june, indian]
90	Rutherford B. Hayes	December 1, 1879	[subject, territory, june, commission, indian]
91	Rutherford B. Hayes	December 6, 1880	[subject, relations, office, attention, commer...
92	Chester A. Arthur	December 6, 1881	[spain, international, british, relations, fri...
93	Chester A. Arthur	December 4, 1882	[territory, establishment, mexico, internation...
94	Chester A. Arthur	December 4, 1883	[total, convention, mexico, commission, treaty]
95	Chester A. Arthur	December 1, 1884	[treaty, territory, commercial, secretary, ves...
96	Grover Cleveland	December 8, 1885	[duties, vessels, treaty, upon, condition]
97	Grover Cleveland	December 6, 1886	[mexico, claims, subject, convention, fiscal]
98	Grover Cleveland	December 6, 1887	[condition, sum, thus, price, tariff]
99	Grover Cleveland	December 3, 1888	[secretary, treaty, upon, per, june]
100	Benjamin Harrison	December 3, 1889	[general, commission, indian, upon, lands]
101	Benjamin Harrison	December 1, 1890	[receipts, subject, upon, per, tariff]
102	Benjamin Harrison	December 9, 1891	[court, tariff, indian, upon, per]
103	Benjamin Harrison	December 6, 1892	[secretary, tariff, upon, value, per]
104	William McKinley	December 6, 1897	[upon, international, agreement, territory, sp...
105	William McKinley	December 5, 1898	[navy, commission, naval, june, spain]
106	William McKinley	December 5, 1899	[treaty, officers, commission, international, ...
107	William McKinley	December 3, 1900	[settlement, civil, shall, convention, commiss...
108	Theodore Roosevelt	December 3, 1901	[army, commercial, conditions, navy, man]
109	Theodore Roosevelt	December 2, 1902	[upon, man, navy, conditions, tariff]
110	Theodore Roosevelt	December 7, 1903	[june, lands, territory, property, treaty]
111	Theodore Roosevelt	December 6, 1904	[cases, conditions, indian, labor, man]
112	Theodore Roosevelt	December 5, 1905	[business, conditions, commission, cannot, man]
113	Theodore Roosevelt	December 3, 1906	[upon, navy, tax, court, man]
114	Theodore Roosevelt	December 3, 1907	[conditions, navy, upon, army, man]
115	Theodore Roosevelt	December 8, 1908	[man, officers, labor, control, banks]
116	William H. Taft	December 7, 1909	[convention, banks, court, department, tariff]
117	William H. Taft	December 6, 1910	[department, court, commercial, international,...
118	William H. Taft	December 5, 1911	[mexico, department, per, tariff, court]
119	William H. Taft	December 3, 1912	[tariff, republic, army, per, department]
120	Woodrow Wilson	December 2, 1913	[how, shall, upon, mexico, ought]
121	Woodrow Wilson	December 8, 1914	[shall, convention, ought, upon, matter]
122	Woodrow Wilson	December 7, 1915	[navy, her, millions, economic, cannot]
123	Woodrow Wilson	December 5, 1916	[commerce, shall, upon, commission, bill]
124	Woodrow Wilson	December 4, 1917	[purpose, her, know, settlement, shall]
125	Woodrow Wilson	December 2, 1918	[shall, go, men, upon, back]
126	Woodrow Wilson	December 2, 1919	[economic, her, budget, labor, conditions]
127	Woodrow Wilson	December 7, 1920	[expenditures, receipts, treasury, budget, upon]
128	Warren Harding	December 6, 1921	[capital, ought, problems, conditions, tariff]
129	Warren Harding	December 8, 1922	[responsibility, republic, problems, ought, per]
130	Calvin Coolidge	December 6, 1923	[conditions, production, commission, ought, co...
131	Calvin Coolidge	December 3, 1924	[navy, international, desire, economic, court]
132	Calvin Coolidge	December 8, 1925	[international, budget, economic, ought, court]
133	Calvin Coolidge	December 7, 1926	[tax, federal, reduction, tariff, ought]
134	Calvin Coolidge	December 6, 1927	[construction, banks, per, program, property]
135	Calvin Coolidge	December 4, 1928	[federal, department, production, program, per]
136	Herbert Hoover	December 3, 1929	[commission, federal, construction, tariff, per]
137	Herbert Hoover	December 2, 1930	[about, budget, economic, per, construction]
138	Herbert Hoover	December 8, 1931	[upon, construction, federal, economic, banks]
139	Herbert Hoover	December 6, 1932	[health, june, value, economic, banks]
140	Franklin D. Roosevelt	January 3, 1934	[labor, permanent, problems, cannot, banks]
141	Franklin D. Roosevelt	January 4, 1935	[private, work, local, program, cannot]
142	Franklin D. Roosevelt	January 3, 1936	[income, shall, let, say, today]
143	Franklin D. Roosevelt	January 6, 1937	[powers, convention, needs, help, problems]
144	Franklin D. Roosevelt	January 3, 1938	[budget, business, economic, today, income]
145	Franklin D. Roosevelt	January 4, 1939	[labor, cannot, capital, income, billion]
146	Franklin D. Roosevelt	January 3, 1940	[world, domestic, cannot, economic, today]
147	Franklin D. Roosevelt	January 6, 1941	[freedom, problems, cannot, program, today]
148	Franklin D. Roosevelt	January 6, 1942	[him, today, know, forces, production]
149	Franklin D. Roosevelt	January 7, 1943	[pacific, get, cannot, americans, production]
150	Franklin D. Roosevelt	January 11, 1944	[program, total, know, economic, cannot]
151	Franklin D. Roosevelt	January 6, 1945	[cannot, production, army, forces, jobs]
152	Harry S. Truman	January 21, 1946	[fiscal, program, billion, million, dollars]
153	Harry S. Truman	January 6, 1947	[commission, budget, economic, labor, program]
154	Harry S. Truman	January 7, 1948	[tax, billion, today, program, economic]
155	Harry S. Truman	January 5, 1949	[economic, price, program, cannot, production]
156	Harry S. Truman	January 4, 1950	[income, today, program, programs, economic]
157	Harry S. Truman	January 8, 1951	[help, program, production, strength, economic]
158	Harry S. Truman	January 9, 1952	[defense, working, program, help, production]
159	Harry S. Truman	January 7, 1953	[republic, free, cannot, world, economic]
160	Dwight D. Eisenhower	February 2, 1953	[federal, labor, budget, economic, programs]
161	Dwight D. Eisenhower	January 7, 1954	[federal, programs, economic, budget, program]
162	Dwight D. Eisenhower	January 6, 1955	[problems, federal, economic, programs, program]
163	Dwight D. Eisenhower	January 5, 1956	[billion, federal, problems, economic, program]
164	Dwight D. Eisenhower	January 10, 1957	[cannot, programs, human, program, economic]
165	Dwight D. Eisenhower	January 9, 1958	[program, strength, today, programs, economic]
166	Dwight D. Eisenhower	January 9, 1959	[growth, help, billion, programs, economic]
167	Dwight D. Eisenhower	January 7, 1960	[freedom, cannot, today, economic, help]
168	Dwight D. Eisenhower	January 12, 1961	[million, percent, billion, program, programs]
169	John F. Kennedy	January 30, 1961	[million, programs, problems, economic, program]
170	John F. Kennedy	January 11, 1962	[billion, help, program, jobs, cannot]
171	John F. Kennedy	January 14, 1963	[help, cannot, tax, percent, billion]
172	Lyndon B. Johnson	January 8, 1964	[help, billion, americans, budget, million]
173	Lyndon B. Johnson	January 4, 1965	[americans, man, programs, tonight, help]
174	Lyndon B. Johnson	January 12, 1966	[program, percent, help, billion, tonight]
175	Lyndon B. Johnson	January 10, 1967	[programs, americans, billion, tonight, percent]
176	Lyndon B. Johnson	January 17, 1968	[programs, million, budget, tonight, billion]
177	Lyndon B. Johnson	January 14, 1969	[americans, program, billion, budget, tonight]
178	Richard Nixon	January 22, 1970	[billion, percent, america, today, programs]
179	Richard Nixon	January 22, 1971	[federal, budget, americans, tonight, let]
180	Richard Nixon	January 20, 1972	[america, program, programs, today, help]
181	Richard Nixon	February 2, 1973	[economic, help, americans, working, programs]
182	Richard Nixon	January 30, 1974	[program, americans, today, energy, tonight]
183	Gerald R. Ford	January 15, 1975	[program, percent, billion, programs, energy]
184	Gerald R. Ford	January 19, 1976	[federal, americans, budget, jobs, programs]
185	Gerald R. Ford	January 12, 1977	[programs, today, percent, jobs, energy]
186	Jimmy Carter	January 19, 1978	[tax, cannot, economic, tonight, jobs]
187	Jimmy Carter	January 25, 1979	[help, cannot, budget, tonight, americans]
188	Jimmy Carter	January 21, 1980	[economic, help, america, energy, tonight]
189	Jimmy Carter	January 16, 1981	[percent, economic, energy, program, programs]
190	Ronald Reagan	January 26, 1982	[jobs, help, program, billion, programs]
191	Ronald Reagan	January 25, 1983	[problems, programs, americans, economic, perc...
192	Ronald Reagan	January 25, 1984	[budget, percent, help, americans, tonight]
193	Ronald Reagan	February 6, 1985	[growth, help, tax, jobs, tonight]
194	Ronald Reagan	February 4, 1986	[families, america, cannot, budget, tonight]
195	Ronald Reagan	January 27, 1987	[america, percent, let, budget, tonight]
196	Ronald Reagan	January 25, 1988	[let, americans, agreement, budget, tonight]
197	George H.W. Bush	February 9, 1989	[program, help, ask, budget, tonight]
198	George H.W. Bush	January 31, 1990	[america, percent, budget, today, tonight]
199	George H.W. Bush	January 29, 1991	[jobs, budget, americans, know, tonight]
200	George H.W. Bush	January 28, 1992	[jobs, know, get, tonight, help]
201	William J. Clinton	February 17, 1993	[tax, budget, percent, tonight, jobs]
202	William J. Clinton	January 25, 1994	[care, americans, health, get, jobs]
203	William J. Clinton	January 24, 1995	[programs, jobs, americans, get, tonight]
204	William J. Clinton	January 23, 1996	[tonight, families, working, americans, children]
205	William J. Clinton	February 4, 1997	[america, children, budget, americans, tonight]
206	William J. Clinton	January 27, 1998	[ask, americans, children, help, tonight]
207	William J. Clinton	January 19, 1999	[children, budget, help, americans, tonight]
208	William J. Clinton	January 27, 2000	[families, help, children, americans, tonight]
209	George W. Bush	February 27, 2001	[help, tax, percent, tonight, budget]
210	George W. Bush	September 20, 2001	[freedom, america, ask, americans, tonight]
211	George W. Bush	January 29, 2002	[americans, budget, tonight, america, jobs]
212	George W. Bush	January 28, 2003	[america, help, million, americans, tonight]
213	George W. Bush	January 20, 2004	[children, america, americans, help, tonight]
214	George W. Bush	February 2, 2005	[freedom, tonight, help, social, americans]
215	George W. Bush	January 31, 2006	[reform, jobs, americans, america, tonight]
216	George W. Bush	January 23, 2007	[children, health, americans, tonight, help]
217	George W. Bush	January 29, 2008	[ask, america, americans, tonight, help]
218	Barack Obama	February 24, 2009	[banks, know, budget, jobs, tonight]
219	Barack Obama	January 27, 2010	[families, get, tonight, americans, jobs]
220	Barack Obama	January 25, 2011	[americans, percent, get, tonight, jobs]
221	Barack Obama	January 24, 2012	[energy, americans, tonight, get, jobs]
222	Barack Obama	February 12, 2013	[energy, families, get, tonight, jobs]
223	Barack Obama	January 28, 2014	[americans, get, tonight, help, jobs]
224	Barack Obama	January 20, 2015	[get, families, americans, tonight, jobs]
225	Barack Obama	January 12, 2016	[want, tonight, jobs, americans, get]
226	Donald J. Trump	February 27, 2017	[down, america, jobs, americans, tonight]
227	Donald J. Trump	January 30, 2018	[jobs, tax, get, americans, tonight]
228	Donald J. Trump	February 5, 2019	[members, get, jobs, americans, tonight]
229	Donald J. Trump	February 4, 2020	[million, jobs, americans, percent, tonight]
230	Joseph R. Biden Jr.	April 28, 2021	[america, get, americans, percent, jobs]
231	Joseph R. Biden Jr.	March 1, 2022	[let, jobs, americans, get, tonight]

	president	date	0
0	George Washington	January 8, 1790	[a, and, to, of, the]
1	George Washington	December 8, 1790	[in, and, to, of, the]
2	George Washington	October 25, 1791	[a, and, to, of, the]
3	George Washington	November 6, 1792	[in, and, to, of, the]
4	George Washington	December 3, 1793	[be, and, to, of, the]
...	...	...	...
227	Donald J. Trump	January 30, 2018	[we, of, to, the, and]
228	Donald J. Trump	February 5, 2019	[in, of, to, and, the]
229	Donald J. Trump	February 4, 2020	[in, of, to, and, the]
230	Joseph R. Biden Jr.	April 28, 2021	[in, of, and, to, the]
231	Joseph R. Biden Jr.	March 1, 2022	[we, of, to, and, the]

Lecture 19 – Text as Data, Continued¶

DSC 80, Spring 2022¶

Announcements¶

Agenda¶

Bag of words 💰¶

Recap¶

Question: What job titles are most similar to 'asst fire chief'?¶

Aside: dot product¶

Computing similarities¶

Bag of words¶

Cosine similarity and bag of words¶

A recipe for computing similarities¶

Example: Global warming 🌎¶

Pitfalls of the bag of words model¶

TF-IDF¶

The importance of words¶

Term frequency¶

Inverse document frequency¶

Intuition¶

Term frequency-inverse document frequency¶

Computing TF-IDF¶

TF-IDF of all words in all documents¶

Example: State of the Union addresses 🎤¶

The data¶

Finding the most important words in each speech¶

Summarizing speeches¶

Aside: What if we remove the $\log$ from $\text{idf}(t)$?¶

The role of $\log$ in $\text{idf}(t)$¶

Summary, next time¶

Summary¶

Question: What job titles are most similar to `'asst fire chief'`?¶