from dsc80_utils import *
Lecture 10 – Web Scraping¶
Agenda 📆¶
- Review: Accessing HTML.
- HTML basics.
- Parsing HTML using BeautifulSoup.
- Example: Scraping quotes.
- Example: Scraping the HDSI faculty page.
Review: Accessing HTML¶
Making requests¶
Goal: Access information about HDSI faculty members from the HDSI Faculty page.
Let's start by making a GET request to the HDSI Faculty page and see what the resulting HTML looks like.
import requests
fac_response = requests.get('https://datascience.ucsd.edu/faculty/', verify=False)
fac_response
fac_text = fac_response.text
len(fac_text)
print(fac_text[:1000])
Wow, that is gross looking! 😰
- It is raw HTML, which web browsers use to display websites.
- The information we are looking for – faculty information – is in there somewhere, but we have to search for it and extract it, which we wouldn't have to do if we had an API.
- We'll now look at how HTML documents are structured and how to extract information from them.
Best practices for scraping¶
- Send requests slowly and be upfront about what you are doing!
- Respect the policy published in the page's robots.txt file.
  - Many sites have a robots.txt file in their root directory, which contains a policy that allows or disallows automatic access to their site.
  - If there isn't one, like in Project 3, use a 0.5 second delay between requests.
- Don't spoof your User-agent (i.e. don't try to trick the server into thinking you are a person).
- Read the Terms of Service for the site and follow them.
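To make these practices concrete, here's a minimal sketch that checks a site's robots.txt using Python's built-in urllib.robotparser, identifies itself honestly, and pauses between requests. (The User-Agent string and contact email below are placeholders, and quotes.toscrape.com is just a friendly practice site.)
import time
import requests
from urllib.robotparser import RobotFileParser

# Check the site's robots.txt policy before making any requests.
rp = RobotFileParser('https://quotes.toscrape.com/robots.txt')
rp.read()

# Identify ourselves honestly; the contact email is a placeholder.
headers = {'User-Agent': 'DSC 80 student scraper (contact@example.com)'}

for i in range(1, 4):
    url = f'https://quotes.toscrape.com/page/{i}'
    if rp.can_fetch(headers['User-Agent'], url):
        resp = requests.get(url, headers=headers)  # Use resp.text downstream.
    time.sleep(0.5)  # Wait between requests so we don't overload the server.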
Consequences of irresponsible scraping¶
If you make too many requests:
- The server may block your IP Address.
- You may take down the website.
- A journalist scraped and accidentally took down the Cook County Inmate Locator.
- As a result, inmates' families weren't able to contact them while the site was down.
Summary: APIs vs. scraping¶
- APIs are made by organizations that host data.
- For example, X (formerly known as Twitter) has an API.
- APIs provide a code-friendly way to access data.
- Usually, APIs give us back data as JSON objects.
- Scraping is the act of emulating a web browser to access its source code.
- As you'll see in Lab 5, it's not technically supported by most organizations.
- When scraping, you get back data as HTML and have to parse that HTML to extract the information you want.
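To make the contrast concrete, here's a small sketch. (The PokéAPI URL below is just an example of a public API; it isn't used elsewhere in this lecture.)
# An API gives us back structured JSON, which is easy to work with directly.
r_api = requests.get('https://pokeapi.co/api/v2/pokemon/squirtle')
r_api.json()['name']

# Scraping gives us back raw HTML, which we have to parse ourselves.
r_page = requests.get('https://quotes.toscrape.com/')
r_page.text[:100]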
The anatomy of HTML documents¶
What is HTML?¶
- HTML (HyperText Markup Language) is the basic building block of the internet.
- It defines the content and layout of a webpage, and as such, it is what you get back when you scrape a webpage.
- See this tutorial for more details.
For instance, here's the content of a very basic webpage.
!cat data/lec10_ex1.html
Using IPython.display.HTML, we can render it directly in our notebook.
from IPython.display import HTML
HTML(filename=Path('data') / 'lec10_ex1.html')
The anatomy of HTML documents¶
HTML document: The totality of markup that makes up a webpage.
Document Object Model (DOM): The internal representation of an HTML document as a hierarchical tree structure.
HTML element: An object in the DOM, such as a paragraph, header, or title.
HTML tags: Markers that denote the start and end of an element, such as <p> and </p>.

Useful tags to know¶
| Element | Description |
|---|---|
| <html> | the document |
| <head> | the header |
| <body> | the body |
| <div> | a logical division of the document |
| <span> | an inline logical division |
| <p> | a paragraph |
| <a> | an anchor (hyperlink) |
| <h1>, <h2>, ... | header(s) |
| <img> | an image |
There are many, many more, but these are by far the most common. See this article for examples.
Example: Images and hyperlinks¶
Tags can have attributes, which further specify how to display information on a webpage.
For instance, <img> tags have src and alt attributes (among others):
<img src="king-selfie.png" alt="A photograph of King Triton." width=500>
Hyperlinks have href attributes:
Click <a href="https://example.com/">this link</a> to see an example.
What do you think this webpage looks like?
!cat data/lec10_ex2.html
The <div> tag¶
<div style="background-color:lightblue">
<h3>This is a heading</h3>
<p>This is a paragraph.</p>
</div>
- The <div> tag defines a division or a "section" of an HTML document.
  - Think of a <div> as a "cell" in a Jupyter Notebook.
- The <div> element is often used as a container for other HTML elements, to style them with CSS or to perform operations involving them using JavaScript.
- <div> elements often have attributes, which are important when scraping!
Document trees¶
Under the document object model (DOM), HTML documents are trees. In DOM trees, child nodes are ordered.
What does the DOM tree look like for this document?

Parsing HTML using Beautiful Soup¶
Beautiful Soup 🍜¶
- Beautiful Soup 4 is a Python HTML parser.
- To "parse" means to "extract meaning from a sequence of symbols".
- Warning: Beautiful Soup 4 and Beautiful Soup 3 work differently, so make sure you are using and looking at documentation for Beautiful Soup 4.
Example HTML document¶
To start, we'll work with the source code for an HTML page with the DOM tree shown below:

The string html_string contains an HTML "document".
html_string = '''
<html>
    <body>
        <div id="content">
            <h1>Heading here</h1>
            <p>My First paragraph</p>
            <p>My <em>second</em> paragraph</p>
            <hr>
        </div>
        <div id="nav">
            <ul>
                <li>item 1</li>
                <li>item 2</li>
                <li>item 3</li>
            </ul>
        </div>
    </body>
</html>
'''.strip()
HTML(html_string)
BeautifulSoup objects¶
bs4.BeautifulSoup takes in a string or file-like object representing HTML (markup) and returns a parsed document.
import bs4
bs4.BeautifulSoup?
Normally, we pass the result of a GET request to bs4.BeautifulSoup, but here we will pass our hand-crafted html_string.
soup = bs4.BeautifulSoup(html_string)
soup
type(soup)
BeautifulSoup objects have several useful attributes, e.g. text:
print(soup.text)
Traversing through descendants¶
The descendants attribute traverses a BeautifulSoup tree using depth-first traversal.
Why depth-first? Elements closer to one another on a page are more likely to be related than elements further away.

soup.descendants
for child in soup.descendants:
    # print(child)  # What would happen if we ran this instead?
    if isinstance(child, str):
        continue
    print(child.name)
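To see the depth-first order even more clearly, here's a minimal sketch; print_tree is a hypothetical helper, not part of the lecture code.
# A hypothetical helper: recursively print the DOM tree, indenting each
# tag according to its depth, which makes the DFS order visible.
def print_tree(node, depth=0):
    for child in node.children:
        if isinstance(child, str):
            continue
        print('  ' * depth + child.name)
        print_tree(child, depth + 1)

print_tree(soup)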
Finding elements in a tree¶
Practically speaking, you will not use the descendants attribute (or the related children attribute) directly very often. Instead, you will use the following methods:
- soup.find(tag), which finds the first instance of a tag (the first one on the page, i.e. the first one that DFS sees).
  - More general: soup.find(name=None, attrs={}, recursive=True, text=None, **kwargs).
- soup.find_all(tag), which finds all instances of a tag.
find finds tags!
soup.find('div')

Let's try and find the <div> element that has an id attribute equal to 'nav'.
soup.find('div', attrs={'id': 'nav'})
find will return the first occurrence of a tag, regardless of its depth in the tree.
# The ul child is not at the top of the tree, but we can still find it.
soup.find('ul')
Using find_all¶
find_all returns a list of all matches.
soup.find_all('div')
soup.find_all('li')
[x.text for x in soup.find_all('li')]
Node attributes¶
- The text attribute of a tag element gets the text between the opening and closing tags.
- The attrs attribute of a tag element lists all of its attributes.
- The get method of a tag element gets the value of an attribute.
soup.find('p')
soup.find('p').text
soup.find('div')
soup.find('div').text
soup.find('div').attrs
soup.find('div').get('id')
The get method must be called directly on the node that contains the attribute you're looking for.
soup
# While there are multiple 'id' attributes, none of them are in the <html> tag at the top.
soup.get('id')
soup.find('div').get('id')
Question 🤔
Consider the following HTML document, which represents a webpage containing the top few songs with the most streams on Spotify today in Canada.
<head>
    <title>3*Canada-2022-06-04</title>
</head>
<body>
    <h1>Spotify Top 3 - Canada</h1>
    <table>
        <tr class='heading'>
            <th>Rank</th>
            <th>Artist(s)</th>
            <th>Song</th>
        </tr>
        <tr class=1>
            <td>1</td>
            <td>Harry Styles</td>
            <td>As It Was</td>
        </tr>
        <tr class=2>
            <td>2</td>
            <td>Jack Harlow</td>
            <td>First Class</td>
        </tr>
        <tr class=3>
            <td>3</td>
            <td>Kendrick Lamar</td>
            <td>N95</td>
        </tr>
    </table>
</body>
Part 1: How many leaf nodes are there in the DOM tree of the previous document — that is, how many nodes have no children?
Part 2: What does the following line of code evaluate to?
len(soup.find_all("td"))
Part 3: What does the following line of code evaluate to?
soup.find("tr").get("class")
Example: Scraping quotes¶
Consider quotes.toscrape.com.

Goal: Extract quotes (and relevant metadata) into a DataFrame.
Specifically, let's try to make a DataFrame that looks like the one below:
| | quote | author | author_url | tags |
|---|---|---|---|---|
| 0 | “The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.” | Albert Einstein | https://quotes.toscrape.com/author/Albert-Einstein | change,deep-thoughts,thinking,world |
| 1 | “It is our choices, Harry, that show what we truly are, far more than our abilities.” | J.K. Rowling | https://quotes.toscrape.com/author/J-K-Rowling | abilities,choices |
| 2 | “There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.” | Albert Einstein | https://quotes.toscrape.com/author/Albert-Einstein | inspirational,life,live,miracle,miracles |
Question 🤔
Ask an LLM to write code to scrape the first ten pages of quotes from https://quotes.toscrape.com/ into a DataFrame called quotes_llm. The first three rows of quotes_llm should have the three quotes above. The last row of quotes_llm should contain a quote from George R.R. Martin.
After having an LLM write code, paste it below and see if it works. If it doesn't work, try to adjust your prompt until it does. Once you have something that works, submit your final prompt and generated code.
The plan¶
Eventually, we will create a single function – make_quote_df – which takes in an integer n and returns a DataFrame with the quotes on the first n pages of quotes.toscrape.com.
To do this, we will define several helper functions:
- download_page(i), which downloads a single page (page i) and returns a BeautifulSoup object of the response.
- process_quote(div), which takes in a <div> tree corresponding to a single quote and returns a dictionary containing all of the relevant information for that quote.
- process_page(divs), which takes in a list of <div> trees corresponding to a single page and returns a DataFrame containing all of the relevant information for all quotes on that page.
Key principle: some of our helper functions will make requests, and others will parse, but none will do both!
- Easier to debug and catch errors.
- Avoids unnecessary requests.
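One way to avoid unnecessary requests while you iterate on your parsing code is to cache downloads. Here's a hedged sketch; download_page_cached is a hypothetical helper, not part of the lecture code.
from functools import lru_cache

# A hypothetical helper: cache each page's HTML so that re-running
# parsing code during development doesn't re-request the same page.
@lru_cache(maxsize=None)
def download_page_cached(i):
    return requests.get(f'https://quotes.toscrape.com/page/{i}').text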
Downloading a single page¶
def download_page(i):
    url = f'https://quotes.toscrape.com/page/{i}'
    request = requests.get(url)
    return bs4.BeautifulSoup(request.text)
In make_quote_df, we will call download_page repeatedly – once for i=1, once for i=2, ..., i=n. For now, we will work with just page 1 (chosen arbitrarily).
soup = download_page(1)
Parsing a single page¶
Let's look at the page's source code (right click the page and click "Inspect" in Chrome) to find where the quotes in the page are located.
divs = soup.find_all('div', class_='quote')
# Shortcut for the following, just for when the attribute key is class:
# divs = soup.find_all('div', attrs={'class': 'quote'})
divs[0]
From this <div>, we can extract the quote, author name, author's URL, and tags.
divs[0]
# The quote.
divs[0].find('span', class_='text').text
# The author.
divs[0].find('small', class_='author').text
# The URL for the author.
divs[0].find('a').get('href')
# The quote's tags.
divs[0].find('meta', class_='keywords').get('content')
Let's implement our next function, process_quote, which takes in a <div> corresponding to a single quote and returns a dictionary containing the quote's information.
Why use a dictionary? Passing pd.DataFrame a list of dictionaries is an easy way to create a DataFrame.
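For instance, here's a tiny illustration with made-up data:
# Each dictionary becomes one row; its keys become the column names.
pd.DataFrame([
    {'quote': 'A', 'author': 'X'},
    {'quote': 'B', 'author': 'Y'},
])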
def process_quote(div):
    quote = div.find('span', class_='text').text
    author = div.find('small', class_='author').text
    author_url = 'https://quotes.toscrape.com' + div.find('a').get('href')
    tags = div.find('meta', class_='keywords').get('content')
    return {'quote': quote, 'author': author, 'author_url': author_url, 'tags': tags}
process_quote(divs[-1])
Our last helper function will take in a list of <div>s, call process_quote on each <div> in the list, and return a DataFrame.
def process_page(divs):
    return pd.DataFrame([process_quote(div) for div in divs])
process_page(divs)
Putting it all together¶
def make_quote_df(n):
    '''Returns a DataFrame containing the quotes on the first n pages of https://quotes.toscrape.com/.'''
    dfs = []
    for i in range(1, n + 1):
        # Download page i and create a BeautifulSoup object.
        soup = download_page(i)
        # Create a DataFrame using the information on that page.
        divs = soup.find_all('div', class_='quote')
        df = process_page(divs)
        # Append the DataFrame to dfs.
        dfs.append(df)
    # Stitch all DataFrames together.
    return pd.concat(dfs).reset_index(drop=True)
quotes = make_quote_df(3)
quotes.head()
quotes[quotes['author'] == 'Albert Einstein']
The elements in the 'tags' column are all strings, but they look like lists. This is not ideal, as we will see shortly.
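One possible fix, sketched here, is to split each comma-separated string into a list of tags:
# Split each comma-separated string of tags into a list of tags.
quotes['tags'].str.split(',')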
Example: Scraping the HDSI faculty page¶
Let's try and extract a list of HDSI Faculty from datascience.ucsd.edu/faculty.
- As usual, we start by opening the page, right clicking somewhere on the page, and clicking "Inspect" in Chrome.
- As we can see, the HTML is much more complicated – this is usually the case for websites in the wild.
fac_response = requests.get('https://datascience.ucsd.edu/faculty/', verify=False)
fac_response
soup = bs4.BeautifulSoup(fac_response.text)
It's not easy identifying which <div>s we want. The Inspect tool makes this easier, but it's good to verify that find_all is finding the right number of elements.
divs = soup.find_all(
    class_='vc_grid-item',
)
len(divs)
Within here, we need to extract each faculty member's name. It seems like names are stored as text within the <h4> tag.
divs[0]
divs[0].find('h4').text
We can also extract job titles:
divs[0].find(class_='field').text
Let's create a DataFrame consisting of names and job titles for each faculty member.
names = [div.find('h4').text for div in divs]
names[:10]
titles = [div.find(class_='field').text for div in divs]
titles[:10]
faculty = pd.DataFrame({
'name': names,
'title': titles,
})
faculty.head()
Now we have a DataFrame!
faculty[faculty['title'].str.contains('Teaching') | faculty['title'].str.contains('Lecturer')]
What if we want to get faculty members' pictures?
from IPython.display import Image, display
def show_picture(name):
    idx = faculty[faculty['name'].str.lower().str.contains(name.lower())].index[0]
    display(Image(url=divs[idx].find('img')['src'], width=200, height=200))
show_picture('marina')
Question 🤔
Consider the following HTML document, which represents a webpage containing the top few songs with the most streams on Spotify today in Canada.
<head>
    <title>3*Canada-2022-06-04</title>
</head>
<body>
    <h1>Spotify Top 3 - Canada</h1>
    <table>
        <tr class='heading'>
            <th>Rank</th>
            <th>Artist(s)</th>
            <th>Song</th>
        </tr>
        <tr class=1>
            <td>1</td>
            <td>Harry Styles</td>
            <td>As It Was</td>
        </tr>
        <tr class=2>
            <td>2</td>
            <td>Jack Harlow</td>
            <td>First Class</td>
        </tr>
        <tr class=3>
            <td>3</td>
            <td>Kendrick Lamar</td>
            <td>N95</td>
        </tr>
    </table>
</body>
Part 4: Complete the implementation of the function top_nth, which takes in a positive integer n and returns the name of the n-th ranked song in the HTML document. For instance, top_nth(2) should evaluate to "First Class" (n=1 corresponds to the top song).
Note: Your implementation should work in the case that the page contains more than 3 songs.
def top_nth(n):
    return soup.find("tr", attrs=__(a)__).find_all("td")__(b)__
Web data in practice¶
The spread of true and false news online by Vosoughi et al. compared how true and false news spreads via Twitter:
There is worldwide concern over false news and the possibility that it can influence political, economic, and social well-being. To understand how false news spreads, Vosoughi et al. used a data set of rumor cascades on Twitter from 2006 to 2017. About 126,000 rumors were spread by ∼3 million people. False news reached more people than the truth; the top 1% of false news cascades diffused to between 1000 and 100,000 people, whereas the truth rarely diffused to more than 1000 people. Falsehood also diffused faster than the truth. The degree of novelty and the emotional reactions of recipients may be responsible for the differences observed.
To conduct this study, the authors used the Twitter API for accessing tweets and web-scraped fact-checking websites to verify whether news was false or not.
Summary, next time¶
- Beautiful Soup is an HTML parser that allows us to (somewhat) easily extract information from HTML documents.
  - soup.find and soup.find_all are the functions you will use most often.
- When writing scraping code:
- Use "inspect element" to identify the names of tags and attributes that are relevant to the information you want to extract.
- Separate your logic for making requests and for parsing.
Next time¶
Regular expressions!