In [1]:
from dsc80_utils import *
This notebook contains code (e.g. the answers to exercises) that was written live during Lecture 10. If you haven't already watched and worked through this lecture, you might find it more beneficial to look at the "blank" version of this lecture and answer the exercises yourself.

Lecture 10 – Web Scraping¶

DSC 80, Winter 2024¶

Announcements 📣¶

  • Project 2 is due today.
    • You can use up to 3 slip days on it, and everyone now has 7 slip days.
  • Lab 6 will be released soon, and will be due on Wednesday, February 21st at 5PM (no slip days!).
    • Remember that next Monday is a holiday.
  • Project 3 will be released later this week.
  • If at least 80% of the class fills out the Mid-Quarter Survey by Saturday at 11:59PM, everyone will earn 2 extra points on the Midterm Exam!

Agenda 📆¶

  • Review: Accessing HTML.
  • HTML basics.
  • Parsing HTML using BeautifulSoup.
  • Example: Scraping quotes.
  • Example: Scraping the HDSI faculty page.

Question 🤔 (Answer at q.dsc80.com)

How many slip days do you plan on using on Project 2?

  • A. 0: I already submitted it!
  • B. 0: I plan on submitting it today.
  • C. 1.
  • D. 2.
  • E. 3.

Review: Accessing HTML¶

Making requests¶

Goal: Access information about HDSI faculty members from the HDSI Faculty page.

Let's start by making a GET request to the HDSI Faculty page and see what the resulting HTML looks like.

In [2]:
import requests
In [3]:
fac_response = requests.get('https://datascience.ucsd.edu/faculty/')
fac_response
Out[3]:
<Response [200]>
In [4]:
fac_text = fac_response.text
len(fac_text)
Out[4]:
277166
In [5]:
print(fac_text[:10000])
<!DOCTYPE html>
<html lang="en-US">
<head>
    <meta charset="UTF-8">
	<meta name="viewport" content="width=device-width, initial-scale=1.0" />
	<meta http-equiv="X-UA-Compatible" content="IE=edge">
	<link rel="profile" href="https://gmpg.org/xfn/11" />
    <title>Faculty &#8211; Halıcıoğlu Data Science Institute &#8211; UC San Diego</title>
                        <script>
                            /* You can add more configuration options to webfontloader by previously defining the WebFontConfig with your options */
                            if ( typeof WebFontConfig === "undefined" ) {
                                WebFontConfig = new Object();
                            }
                            WebFontConfig['google'] = {families: ['Jost:400,700', 'Roboto:400,500']};

                            (function() {
                                var wf = document.createElement( 'script' );
                                wf.src = 'https://ajax.googleapis.com/ajax/libs/webfont/1.5.3/webfont.js';
                                wf.type = 'text/javascript';
                                wf.async = 'true';
                                var s = document.getElementsByTagName( 'script' )[0];
                                s.parentNode.insertBefore( wf, s );
                            })();
                        </script>
                        <meta name='robots' content='max-image-preview:large' />
<link rel='dns-prefetch' href='//kit.fontawesome.com' />
<link rel='dns-prefetch' href='//platform-api.sharethis.com' />
<link rel='dns-prefetch' href='//fonts.googleapis.com' />
<link rel='dns-prefetch' href='//use.fontawesome.com' />
<link rel="alternate" type="application/rss+xml" title="Halıcıoğlu Data Science Institute - UC San Diego &raquo; Feed" href="https://datascience.ucsd.edu/feed/" />
<link rel="alternate" type="text/calendar" title="Halıcıoğlu Data Science Institute - UC San Diego &raquo; iCal Feed" href="https://datascience.ucsd.edu/events/?ical=1" />
		<!-- This site uses the Google Analytics by MonsterInsights plugin v8.23.1 - Using Analytics tracking - https://www.monsterinsights.com/ -->
							<script src="//www.googletagmanager.com/gtag/js?id=G-ZWZDLH05C7"  data-cfasync="false" data-wpfc-render="false" type="text/javascript" async></script>
			<script data-cfasync="false" data-wpfc-render="false" type="text/javascript">
				var mi_version = '8.23.1';
				var mi_track_user = true;
				var mi_no_track_reason = '';
				
								var disableStrs = [
										'ga-disable-G-ZWZDLH05C7',
									];

				/* Function to detect opted out users */
				function __gtagTrackerIsOptedOut() {
					for (var index = 0; index < disableStrs.length; index++) {
						if (document.cookie.indexOf(disableStrs[index] + '=true') > -1) {
							return true;
						}
					}

					return false;
				}

				/* Disable tracking if the opt-out cookie exists. */
				if (__gtagTrackerIsOptedOut()) {
					for (var index = 0; index < disableStrs.length; index++) {
						window[disableStrs[index]] = true;
					}
				}

				/* Opt-out function */
				function __gtagTrackerOptout() {
					for (var index = 0; index < disableStrs.length; index++) {
						document.cookie = disableStrs[index] + '=true; expires=Thu, 31 Dec 2099 23:59:59 UTC; path=/';
						window[disableStrs[index]] = true;
					}
				}

				if ('undefined' === typeof gaOptout) {
					function gaOptout() {
						__gtagTrackerOptout();
					}
				}
								window.dataLayer = window.dataLayer || [];

				window.MonsterInsightsDualTracker = {
					helpers: {},
					trackers: {},
				};
				if (mi_track_user) {
					function __gtagDataLayer() {
						dataLayer.push(arguments);
					}

					function __gtagTracker(type, name, parameters) {
						if (!parameters) {
							parameters = {};
						}

						if (parameters.send_to) {
							__gtagDataLayer.apply(null, arguments);
							return;
						}

						if (type === 'event') {
														parameters.send_to = monsterinsights_frontend.v4_id;
							var hookName = name;
							if (typeof parameters['event_category'] !== 'undefined') {
								hookName = parameters['event_category'] + ':' + name;
							}

							if (typeof MonsterInsightsDualTracker.trackers[hookName] !== 'undefined') {
								MonsterInsightsDualTracker.trackers[hookName](parameters);
							} else {
								__gtagDataLayer('event', name, parameters);
							}
							
						} else {
							__gtagDataLayer.apply(null, arguments);
						}
					}

					__gtagTracker('js', new Date());
					__gtagTracker('set', {
						'developer_id.dZGIzZG': true,
											});
										__gtagTracker('config', 'G-ZWZDLH05C7', {"forceSSL":"true","link_attribution":"true"} );
															window.gtag = __gtagTracker;										(function () {
						/* https://developers.google.com/analytics/devguides/collection/analyticsjs/ */
						/* ga and __gaTracker compatibility shim. */
						var noopfn = function () {
							return null;
						};
						var newtracker = function () {
							return new Tracker();
						};
						var Tracker = function () {
							return null;
						};
						var p = Tracker.prototype;
						p.get = noopfn;
						p.set = noopfn;
						p.send = function () {
							var args = Array.prototype.slice.call(arguments);
							args.unshift('send');
							__gaTracker.apply(null, args);
						};
						var __gaTracker = function () {
							var len = arguments.length;
							if (len === 0) {
								return;
							}
							var f = arguments[len - 1];
							if (typeof f !== 'object' || f === null || typeof f.hitCallback !== 'function') {
								if ('send' === arguments[0]) {
									var hitConverted, hitObject = false, action;
									if ('event' === arguments[1]) {
										if ('undefined' !== typeof arguments[3]) {
											hitObject = {
												'eventAction': arguments[3],
												'eventCategory': arguments[2],
												'eventLabel': arguments[4],
												'value': arguments[5] ? arguments[5] : 1,
											}
										}
									}
									if ('pageview' === arguments[1]) {
										if ('undefined' !== typeof arguments[2]) {
											hitObject = {
												'eventAction': 'page_view',
												'page_path': arguments[2],
											}
										}
									}
									if (typeof arguments[2] === 'object') {
										hitObject = arguments[2];
									}
									if (typeof arguments[5] === 'object') {
										Object.assign(hitObject, arguments[5]);
									}
									if ('undefined' !== typeof arguments[1].hitType) {
										hitObject = arguments[1];
										if ('pageview' === hitObject.hitType) {
											hitObject.eventAction = 'page_view';
										}
									}
									if (hitObject) {
										action = 'timing' === arguments[1].hitType ? 'timing_complete' : hitObject.eventAction;
										hitConverted = mapArgs(hitObject);
										__gtagTracker('event', action, hitConverted);
									}
								}
								return;
							}

							function mapArgs(args) {
								var arg, hit = {};
								var gaMap = {
									'eventCategory': 'event_category',
									'eventAction': 'event_action',
									'eventLabel': 'event_label',
									'eventValue': 'event_value',
									'nonInteraction': 'non_interaction',
									'timingCategory': 'event_category',
									'timingVar': 'name',
									'timingValue': 'value',
									'timingLabel': 'event_label',
									'page': 'page_path',
									'location': 'page_location',
									'title': 'page_title',
									'referrer' : 'page_referrer',
								};
								for (arg in args) {
																		if (!(!args.hasOwnProperty(arg) || !gaMap.hasOwnProperty(arg))) {
										hit[gaMap[arg]] = args[arg];
									} else {
										hit[arg] = args[arg];
									}
								}
								return hit;
							}

							try {
								f.hitCallback();
							} catch (ex) {
							}
						};
						__gaTracker.create = newtracker;
						__gaTracker.getByName = newtracker;
						__gaTracker.getAll = function () {
							return [];
						};
						__gaTracker.remove = noopfn;
						__gaTracker.loaded = true;
						window['__gaTracker'] = __gaTracker;
					})();
									} else {
										console.log("");
					(function () {
						function __gtagTracker() {
							return null;
						}

						window['__gtagTracker'] = __gtagTracker;
						window['gtag'] = __gtagTracker;
					})();
									}
			</script>
				<!-- / Google Analytics by MonsterInsights -->
		<script type="text/javascript">
/* <![CDATA[ */
window._wpemojiSettings = {"baseUrl":"https:\/\/s.w.org\/images\/core\/emoji\/14.0.0\/72x72\/","ext":".png","svgUrl":"https:\/\/s.w.org\/images\/core\/emoji\/14.0.0\/svg\/","svgExt":".svg","source":{"concatemoji":"https:\/\/datascience.ucsd.edu\/wp-includes\/js\/wp-emoji-release.min.js?ver=6.4.3"}};
/*! This file is auto-generated */
!function(i,n){var o,s,e;function c(e){try{var t={supportTests:e,timestamp:(new Date).valueOf()};sessionStorage.setItem(o,JSON.stringify(t))}catch(e){}}function p(e,t,n){e.clearRect(0,0,e.canvas.width,e.canvas.height),e.fillText(t,0,0);var t=new Uint32Array(e.getImageData(0,0,e.canvas.width,e.canvas.height).data),r=(e.clearRect(0,0,e.canvas.width,e.canvas.height),e.fillText(n,0,0),new Uint32Array(e.getImageData(0,0,e.canvas.width,e.canvas.height).data));return t.every(function(e,t){return e===r[t]})}function u(e,t,n){switch(t){case"flag":return n(e,"\ud83c\udff3\ufe0f\u200d\u26a7\ufe0f","\ud83c\udff3\ufe0f\u200b\u26a7\ufe0f")?!1:!n(e,"\ud83c\uddfa\ud83c\uddf3","\ud83c\uddfa\u200b\ud83c\uddf3")&&!n(e,"\ud83c\udff4\udb40\udc67\udb40\udc62\udb40\udc65\udb40\udc6e\udb40\udc67\udb40\udc7f","\ud83c\udff4\u200b\udb40\udc67\u200b\udb40\udc62\u200b\udb40\udc65\u200b\udb40\udc6e\u200b\udb40\udc67\u200b\udb40\udc7f");case"emoji":return!n(e,"\ud83e\udef1\ud83c\udffb\u200d\ud83e\udef2\ud83c\udfff","\ud83e\udef1\ud83c\udffb\u200b\ud83e\udef2\ud83c\udfff")}return!1}function f(e,t,n){var r="undefined"!=typeof WorkerGlobalScope&&self instanceof WorkerGlobalScope?new OffscreenC

Wow, that is gross looking! 😰

  • It is raw HTML, which web browsers use to display websites.
  • The information we are looking for – faculty information – is in there somewhere, but we have to search for it and extract it, which we wouldn't have to do if we had an API.
  • We'll now look at how HTML documents are structured and how to extract information from them.

Best practices for scraping¶

  1. Send requests slowly and be upfront about what you are doing!
  2. Respect the policy published in the page's robots.txt file.
    • Many sites have a robots.txt file in their root directory, which contains a policy that allows or disallows automatic access to their site.
    • If there isn't one, like in Project 3, use a 0.5 second delay between requests.
  3. Don't spoof your User-agent (i.e. don't try to trick the server into thinking you are a person).
  4. Read the site's Terms of Service and follow them. (See the sketch below for what these practices look like in code.)
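
For instance, here's a minimal sketch of what "polite" request code might look like (the page list, delay, and User-Agent string below are placeholders; adjust them to the site's robots.txt and Terms of Service):

import time
import requests

# Hypothetical list of pages we're allowed to scrape.
urls = [f'https://quotes.toscrape.com/page/{i}' for i in range(1, 4)]

pages = []
for url in urls:
    # Identify yourself honestly instead of pretending to be a regular browser.
    resp = requests.get(url, headers={'User-Agent': 'DSC 80 student (your_email@ucsd.edu)'})
    pages.append(resp.text)
    time.sleep(0.5)  # Pause between requests so we don't overwhelm the server.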

Consequences of irresponsible scraping¶

If you make too many requests:

  • The server may block your IP Address.
  • You may take down the website.
    • A journalist scraped and accidentally took down the Cook County Inmate Locator.
    • As a result, inmates' families weren't able to contact them while the site was down.

Summary: APIs vs. scraping¶

  • APIs are made by organizations that host data.
    • For example, X (formerly known as Twitter) has an API.
    • APIs provide a code-friendly way to access data.
    • Usually, APIs give us back data as JSON objects.
  • Scraping is the act of emulating a web browser to access its source code.
    • As you saw in Lab 5, it's not technically supported by most organizations.
    • When scraping, you get back data as HTML and have to parse that HTML to extract the information you want (see the sketch below).
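
As a rough sketch of the difference (both URLs here are illustrative placeholders, not real endpoints):

import requests
import bs4

# API: the response body is JSON, which .json() converts into Python objects directly.
# data = requests.get('https://api.example.com/resource').json()

# Scraping: the response body is HTML, which we have to parse ourselves.
# soup = bs4.BeautifulSoup(requests.get('https://example.com/page').text)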

The anatomy of HTML documents¶

What is HTML?¶

  • HTML (HyperText Markup Language) is the basic building block of the internet.
  • It defines the content and layout of a webpage, and as such, it is what you get back when you scrape a webpage.
  • See this tutorial for more details.

For instance, here's the content of a very basic webpage.

In [6]:
!cat data/lec10_ex1.html
<html>
  <head>
    <title>Page title</title>
  </head>

  <body>
    <h1>This is a heading</h1>
    <p>This is a paragraph.</p>
    <p>This is <b>another</b> paragraph.</p>
  </body>
</html>

Using IPython.display.HTML, we can render it directly in our notebook.

In [7]:
from IPython.display import HTML
HTML(filename=Path('data') / 'lec10_ex1.html')
Out[7]:
Page title

This is a heading

This is a paragraph.

This is another paragraph.

The anatomy of HTML documents¶

  • HTML document: The totality of markup that makes up a webpage.

  • Document Object Model (DOM): The internal representation of an HTML document as a hierarchical tree structure.

  • HTML element: An object in the DOM, such as a paragraph, header, or title.

  • HTML tags: Markers that denote the start and end of an element, such as <p> and </p>.

[image]
(source)

Useful tags to know¶

Element Description
<html> the document
<head> the header
<body> the body
<div> a logical division of the document
<span> an inline logical division
<p> a paragraph
<a> an anchor (hyperlink)
<h1>, <h2>, ... header(s)
<img> an image

There are many, many more. See this article for examples.

Example: Images and hyperlinks¶

Tags can have attributes, which further specify how to display information on a webpage.

For instance, <img> tags have src and alt attributes (among others):

<img src="king-selfie.png" alt="A photograph of King Triton." width=500>

Hyperlinks have href attributes:

Click <a href="https://practice.dsc80.com">this link</a> to access past exams.

The line above renders as: "Click this link to access past exams."

What do you think this webpage looks like?

In [8]:
!cat data/lec10_ex2.html
<html>
  <head>
    <title>Project 4A and 4B - DSC 80, Winter 2024</title>
    <link
      href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0-alpha1/dist/css/bootstrap.min.css"
      rel="stylesheet"
    />
  </head>

  <body>
    <h1>Project Overview</h1>
    <img src="../imgs/platter.png" width="200" alt="My dinner last night." />
    <p>
      When the project is released, you can start it by
      <a href="https://github.com/dsc-courses/dsc80-2024-wi/"
        >public GitHub repo</a
      >.
    </p>
    <center>
      <h3>
        Note that you'll have to submit your notebook as a PDF and a link to
        your website.
      </h3>
    </center>
  </body>
</html>

The <div> tag¶

<div style="background-color:lightblue">
  <h3>This is a heading</h3>
  <p>This is a paragraph.</p>
</div>
  • The <div> tag defines a division or a "section" of an HTML document.

    • Think of a <div> as a "cell" in a Jupyter Notebook.
  • The <div> element is often used as a container for other HTML elements to style them with CSS or to perform operations involving them using JavaScript.

  • <div> elements often have attributes, which are important when scraping!

Document trees¶

Under the document object model (DOM), HTML documents are trees. In DOM trees, child nodes are ordered.

[image: a DOM tree]

What does the DOM tree look like for this document?

[image: the corresponding DOM tree]

Parsing HTML using Beautiful Soup¶

Beautiful Soup 🍜¶

  • Beautiful Soup 4 is a Python HTML parser.
    • To "parse" means to "extract meaning from a sequence of symbols".
  • Warning: Beautiful Soup 4 and Beautiful Soup 3 work differently, so make sure you are using and looking at documentation for Beautiful Soup 4.
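
A quick note before we start (assuming a standard setup): Beautiful Soup 4 is installed as the beautifulsoup4 package, but imported under the name bs4. A minimal usage sketch:

# pip install beautifulsoup4    (run in a terminal if it isn't already installed)
import bs4

bs4.BeautifulSoup('<p>Hello, <b>world</b>!</p>').find('b').text    # 'world'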

Example HTML document¶

To start, we'll work with the source code for an HTML page with the DOM tree shown below:

[image: DOM tree of the example page]

The string html_string contains an HTML "document".

In [9]:
html_string = '''
<html>
    <body>
      <div id="content">
        <h1>Heading here</h1>
        <p>My First paragraph</p>
        <p>My <em>second</em> paragraph</p>
        <hr>
      </div>
      <div id="nav">
        <ul>
          <li>item 1</li>
          <li>item 2</li>
          <li>item 3</li>
        </ul>
      </div>
    </body>
</html>
'''.strip()
In [10]:
HTML(html_string)
Out[10]:

Heading here

My First paragraph

My second paragraph


  • item 1
  • item 2
  • item 3

BeautifulSoup objects¶

bs4.BeautifulSoup takes in a string or file-like object representing HTML (markup) and returns a parsed document.

In [11]:
import bs4
In [12]:
bs4.BeautifulSoup?

Normally, we pass the text of a GET request's response to bs4.BeautifulSoup, but here we will pass our hand-crafted html_string.

In [13]:
soup = bs4.BeautifulSoup(html_string)
soup
Out[13]:
<html>
<body>
<div id="content">
<h1>Heading here</h1>
<p>My First paragraph</p>
<p>My <em>second</em> paragraph</p>
<hr/>
</div>
<div id="nav">
<ul>
<li>item 1</li>
<li>item 2</li>
<li>item 3</li>
</ul>
</div>
</body>
</html>
In [14]:
type(soup)
Out[14]:
bs4.BeautifulSoup

BeautifulSoup objects have several useful attributes, e.g. text:

In [15]:
soup
Out[15]:
<html>
<body>
<div id="content">
<h1>Heading here</h1>
<p>My First paragraph</p>
<p>My <em>second</em> paragraph</p>
<hr/>
</div>
<div id="nav">
<ul>
<li>item 1</li>
<li>item 2</li>
<li>item 3</li>
</ul>
</div>
</body>
</html>
In [16]:
print(soup.text)


Heading here
My First paragraph
My second paragraph




item 1
item 2
item 3




Traversing through descendants¶

The descendants attribute traverses a BeautifulSoup tree using depth-first traversal.

Why depth-first? Elements closer to one another on a page are more likely to be related than elements further away.

[image]
In [17]:
soup.descendants
Out[17]:
<generator object Tag.descendants at 0x133e79ac0>
In [18]:
for child in soup.descendants:
#     print(child) # What would happen if we ran this instead?
    if isinstance(child, str):
        continue
    print(child.name)
html
body
div
h1
p
p
em
hr
div
ul
li
li
li

Finding elements in a tree¶

Practically speaking, you will not use the descendants attribute (or the related children attribute) directly very often. Instead, you will use the following methods:

  • soup.find(tag), which finds the first instance of a tag (the first one on the page, i.e. the first one that DFS sees).
    • More general: soup.find(name=None, attrs={}, recursive=True, text=None, **kwargs).
  • soup.find_all(tag) will find all instances of a tag.

find finds tags!

Using find¶

Let's try and extract the first <div> subtree.

[image]
In [19]:
soup.find('div')
Out[19]:
<div id="content">
<h1>Heading here</h1>
<p>My First paragraph</p>
<p>My <em>second</em> paragraph</p>
<hr/>
</div>
[image]

Let's try and find the <div> element that has an id attribute equal to 'nav'.

In [20]:
len(soup.find_all('div'))
Out[20]:
2
In [21]:
soup
Out[21]:
<html>
<body>
<div id="content">
<h1>Heading here</h1>
<p>My First paragraph</p>
<p>My <em>second</em> paragraph</p>
<hr/>
</div>
<div id="nav">
<ul>
<li>item 1</li>
<li>item 2</li>
<li>item 3</li>
</ul>
</div>
</body>
</html>
In [22]:
soup.find('div', attrs={'id': 'nav'})
Out[22]:
<div id="nav">
<ul>
<li>item 1</li>
<li>item 2</li>
<li>item 3</li>
</ul>
</div>

find will return the first occurrence of a tag, regardless of its depth in the tree.

In [23]:
soup
Out[23]:
<html>
<body>
<div id="content">
<h1>Heading here</h1>
<p>My First paragraph</p>
<p>My <em>second</em> paragraph</p>
<hr/>
</div>
<div id="nav">
<ul>
<li>item 1</li>
<li>item 2</li>
<li>item 3</li>
</ul>
</div>
</body>
</html>
In [24]:
# The <li> elements are nested deep in the tree, but find can still locate the first one.
soup.find('li')
Out[24]:
<li>item 1</li>

Using find_all¶

find_all returns a list of all matches.

In [25]:
soup.find_all('div')
Out[25]:
[<div id="content">
 <h1>Heading here</h1>
 <p>My First paragraph</p>
 <p>My <em>second</em> paragraph</p>
 <hr/>
 </div>,
 <div id="nav">
 <ul>
 <li>item 1</li>
 <li>item 2</li>
 <li>item 3</li>
 </ul>
 </div>]
In [26]:
soup.find_all('li')
Out[26]:
[<li>item 1</li>, <li>item 2</li>, <li>item 3</li>]
In [27]:
soup.find('ucsd')
In [28]:
soup.find_all('li')[0]
Out[28]:
<li>item 1</li>
In [29]:
[x.text for x in soup.find_all('li')]
Out[29]:
['item 1', 'item 2', 'item 3']

Node attributes¶

  • The text attribute of a tag element gets the text between the opening and closing tags.
  • The attrs attribute of a tag element lists all of its attributes.
  • The get method of a tag element gets the value of an attribute.
In [30]:
soup
Out[30]:
<html>
<body>
<div id="content">
<h1>Heading here</h1>
<p>My First paragraph</p>
<p>My <em>second</em> paragraph</p>
<hr/>
</div>
<div id="nav">
<ul>
<li>item 1</li>
<li>item 2</li>
<li>item 3</li>
</ul>
</div>
</body>
</html>
In [31]:
soup.find('p')
Out[31]:
<p>My First paragraph</p>
In [32]:
soup.find('p').text
Out[32]:
'My First paragraph'
In [33]:
soup.find('div')
Out[33]:
<div id="content">
<h1>Heading here</h1>
<p>My First paragraph</p>
<p>My <em>second</em> paragraph</p>
<hr/>
</div>
In [34]:
soup.find('div').text
Out[34]:
'\nHeading here\nMy First paragraph\nMy second paragraph\n\n'
In [35]:
soup.find('div').attrs
Out[35]:
{'id': 'content'}
In [39]:
# soup.find('img').get('src') # if this page had an <img>, this would get the image's URL, for example.
In [40]:
[div.attrs for div in soup.find_all('div')]
Out[40]:
[{'id': 'content'}, {'id': 'nav'}]
In [41]:
soup.find('div').get('id')
Out[41]:
'content'

The get method must be called directly on the node that contains the attribute you're looking for.

In [42]:
soup
Out[42]:
<html>
<body>
<div id="content">
<h1>Heading here</h1>
<p>My First paragraph</p>
<p>My <em>second</em> paragraph</p>
<hr/>
</div>
<div id="nav">
<ul>
<li>item 1</li>
<li>item 2</li>
<li>item 3</li>
</ul>
</div>
</body>
</html>
In [43]:
# While multiple elements have 'id' attributes, the top-level <html> tag does not, so this returns None.
soup.get('id')
In [44]:
soup.find('div').attrs['id']
Out[44]:
'content'
In [45]:
soup.find('div').get('id')
Out[45]:
'content'

Exercise

Consider the following HTML document, which represents a webpage containing the top few songs with the most streams on Spotify today in Canada.

<head>
    <title>3*Canada-2022-06-04</title>
</head>
<body>
    <h1>Spotify Top 3 - Canada</h1>
    <table>
        <tr class='heading'>
            <th>Rank</th>
            <th>Artist(s)</th> 
            <th>Song</th>
        </tr>
        <tr class=1>
            <td>1</td>
            <td>Harry Styles</td> 
            <td>As It Was</td>
        </tr>
        <tr class=2>
            <td>2</td>
            <td>Jack Harlow</td> 
            <td>First Class</td>
        </tr>
        <tr class=3>
            <td>3</td>
            <td>Kendrick Lamar</td> 
            <td>N95</td>
        </tr>
    </table>
</body>

Part 1: How many leaf nodes are there in the DOM tree of the previous document — that is, how many nodes have no children?

Answer: 14.

Part 2: What does the following line of code evaluate to?

len(soup.find_all("td"))

Answer: 9.

Part 3: What does the following line of code evaluate to?

soup.find("tr").get("class")

Answer: ['heading']. (Since class is a multi-valued attribute, Beautiful Soup returns its value as a list.)
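
These answers can be checked programmatically. Here's a sketch, assuming spotify_html is a string containing the document above:

spotify_soup = bs4.BeautifulSoup(spotify_html)  # spotify_html: the document above, stored as a string (assumed).

# Part 1: elements with no child elements are the leaves of the element tree.
len([tag for tag in spotify_soup.find_all() if tag.find() is None])    # 14

# Part 2: one <td> per cell in the three data rows.
len(spotify_soup.find_all('td'))                                        # 9

# Part 3: class is a multi-valued attribute, so its value comes back as a list.
spotify_soup.find('tr').get('class')                                    # ['heading']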

Example: Scraping quotes¶

Consider quotes.toscrape.com.

[image: screenshot of quotes.toscrape.com]

Goal: Extract quotes (and relevant metadata) into a DataFrame.

Specifically, let's try to make a DataFrame that looks like the one below:

quote author author_url tags
0 “The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.” Albert Einstein https://quotes.toscrape.com/author/Albert-Einstein change,deep-thoughts,thinking,world
1 “It is our choices, Harry, that show what we truly are, far more than our abilities.” J.K. Rowling https://quotes.toscrape.com/author/J-K-Rowling abilities,choices
2 “There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.” Albert Einstein https://quotes.toscrape.com/author/Albert-Einstein inspirational,life,live,miracle,miracles

The plan¶

Eventually, we will create a single function – make_quote_df – which takes in an integer n and returns a DataFrame with the quotes on the first n pages of quotes.toscrape.com.

To do this, we will define several helper functions:

  • download_page(i), which downloads a single page (page i) and returns a BeautifulSoup object of the response.

  • process_quote(div), which takes in a <div> tree corresponding to a single quote and returns a dictionary containing all of the relevant information for that quote.

  • process_page(divs), which takes in a list of <div> trees corresponding to a single page and returns a DataFrame containing all of the relevant information for all quotes on that page.

Key principle: some of our helper functions will make requests, and others will parse, but none will do both!

  • Easier to debug and catch errors.
  • Avoids unnecessary requests.

Downloading a single page¶

In [46]:
def download_page(i):
    url = f'https://quotes.toscrape.com/page/{i}'
    request = requests.get(url)
    return bs4.BeautifulSoup(request.text)

In make_quote_df, we will call download_page repeatedly – once for i=1, once for i=2, ..., i=n. For now, we will work with just page 1 (chosen arbitrarily).

In [47]:
soup = download_page(1)

Parsing a single page¶

Let's look at the page's source code (right click the page and click "Inspect" in Chrome) to find where the quotes in the page are located.

In [48]:
soup.find_all('div', attrs={'class': 'quote'})
Out[48]:
[<div class="quote" itemscope="" itemtype="http://schema.org/CreativeWork">
 <span class="text" itemprop="text">“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”</span>
 <span>by <small class="author" itemprop="author">Albert Einstein</small>
 <a href="/author/Albert-Einstein">(about)</a>
 </span>
 <div class="tags">
             Tags:
             <meta class="keywords" content="change,deep-thoughts,thinking,world" itemprop="keywords"/>
 <a class="tag" href="/tag/change/page/1/">change</a>
 <a class="tag" href="/tag/deep-thoughts/page/1/">deep-thoughts</a>
 <a class="tag" href="/tag/thinking/page/1/">thinking</a>
 <a class="tag" href="/tag/world/page/1/">world</a>
 </div>
 </div>,
 <div class="quote" itemscope="" itemtype="http://schema.org/CreativeWork">
 <span class="text" itemprop="text">“It is our choices, Harry, that show what we truly are, far more than our abilities.”</span>
 <span>by <small class="author" itemprop="author">J.K. Rowling</small>
 <a href="/author/J-K-Rowling">(about)</a>
 </span>
 <div class="tags">
             Tags:
             <meta class="keywords" content="abilities,choices" itemprop="keywords"/>
 <a class="tag" href="/tag/abilities/page/1/">abilities</a>
 <a class="tag" href="/tag/choices/page/1/">choices</a>
 </div>
 </div>,
 <div class="quote" itemscope="" itemtype="http://schema.org/CreativeWork">
 <span class="text" itemprop="text">“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”</span>
 <span>by <small class="author" itemprop="author">Albert Einstein</small>
 <a href="/author/Albert-Einstein">(about)</a>
 </span>
 <div class="tags">
             Tags:
             <meta class="keywords" content="inspirational,life,live,miracle,miracles" itemprop="keywords"/>
 <a class="tag" href="/tag/inspirational/page/1/">inspirational</a>
 <a class="tag" href="/tag/life/page/1/">life</a>
 <a class="tag" href="/tag/live/page/1/">live</a>
 <a class="tag" href="/tag/miracle/page/1/">miracle</a>
 <a class="tag" href="/tag/miracles/page/1/">miracles</a>
 </div>
 </div>,
 <div class="quote" itemscope="" itemtype="http://schema.org/CreativeWork">
 <span class="text" itemprop="text">“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”</span>
 <span>by <small class="author" itemprop="author">Jane Austen</small>
 <a href="/author/Jane-Austen">(about)</a>
 </span>
 <div class="tags">
             Tags:
             <meta class="keywords" content="aliteracy,books,classic,humor" itemprop="keywords"/>
 <a class="tag" href="/tag/aliteracy/page/1/">aliteracy</a>
 <a class="tag" href="/tag/books/page/1/">books</a>
 <a class="tag" href="/tag/classic/page/1/">classic</a>
 <a class="tag" href="/tag/humor/page/1/">humor</a>
 </div>
 </div>,
 <div class="quote" itemscope="" itemtype="http://schema.org/CreativeWork">
 <span class="text" itemprop="text">“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”</span>
 <span>by <small class="author" itemprop="author">Marilyn Monroe</small>
 <a href="/author/Marilyn-Monroe">(about)</a>
 </span>
 <div class="tags">
             Tags:
             <meta class="keywords" content="be-yourself,inspirational" itemprop="keywords"/>
 <a class="tag" href="/tag/be-yourself/page/1/">be-yourself</a>
 <a class="tag" href="/tag/inspirational/page/1/">inspirational</a>
 </div>
 </div>,
 <div class="quote" itemscope="" itemtype="http://schema.org/CreativeWork">
 <span class="text" itemprop="text">“Try not to become a man of success. Rather become a man of value.”</span>
 <span>by <small class="author" itemprop="author">Albert Einstein</small>
 <a href="/author/Albert-Einstein">(about)</a>
 </span>
 <div class="tags">
             Tags:
             <meta class="keywords" content="adulthood,success,value" itemprop="keywords"/>
 <a class="tag" href="/tag/adulthood/page/1/">adulthood</a>
 <a class="tag" href="/tag/success/page/1/">success</a>
 <a class="tag" href="/tag/value/page/1/">value</a>
 </div>
 </div>,
 <div class="quote" itemscope="" itemtype="http://schema.org/CreativeWork">
 <span class="text" itemprop="text">“It is better to be hated for what you are than to be loved for what you are not.”</span>
 <span>by <small class="author" itemprop="author">André Gide</small>
 <a href="/author/Andre-Gide">(about)</a>
 </span>
 <div class="tags">
             Tags:
             <meta class="keywords" content="life,love" itemprop="keywords"/>
 <a class="tag" href="/tag/life/page/1/">life</a>
 <a class="tag" href="/tag/love/page/1/">love</a>
 </div>
 </div>,
 <div class="quote" itemscope="" itemtype="http://schema.org/CreativeWork">
 <span class="text" itemprop="text">“I have not failed. I've just found 10,000 ways that won't work.”</span>
 <span>by <small class="author" itemprop="author">Thomas A. Edison</small>
 <a href="/author/Thomas-A-Edison">(about)</a>
 </span>
 <div class="tags">
             Tags:
             <meta class="keywords" content="edison,failure,inspirational,paraphrased" itemprop="keywords"/>
 <a class="tag" href="/tag/edison/page/1/">edison</a>
 <a class="tag" href="/tag/failure/page/1/">failure</a>
 <a class="tag" href="/tag/inspirational/page/1/">inspirational</a>
 <a class="tag" href="/tag/paraphrased/page/1/">paraphrased</a>
 </div>
 </div>,
 <div class="quote" itemscope="" itemtype="http://schema.org/CreativeWork">
 <span class="text" itemprop="text">“A woman is like a tea bag; you never know how strong it is until it's in hot water.”</span>
 <span>by <small class="author" itemprop="author">Eleanor Roosevelt</small>
 <a href="/author/Eleanor-Roosevelt">(about)</a>
 </span>
 <div class="tags">
             Tags:
             <meta class="keywords" content="misattributed-eleanor-roosevelt" itemprop="keywords"/>
 <a class="tag" href="/tag/misattributed-eleanor-roosevelt/page/1/">misattributed-eleanor-roosevelt</a>
 </div>
 </div>,
 <div class="quote" itemscope="" itemtype="http://schema.org/CreativeWork">
 <span class="text" itemprop="text">“A day without sunshine is like, you know, night.”</span>
 <span>by <small class="author" itemprop="author">Steve Martin</small>
 <a href="/author/Steve-Martin">(about)</a>
 </span>
 <div class="tags">
             Tags:
             <meta class="keywords" content="humor,obvious,simile" itemprop="keywords"/>
 <a class="tag" href="/tag/humor/page/1/">humor</a>
 <a class="tag" href="/tag/obvious/page/1/">obvious</a>
 <a class="tag" href="/tag/simile/page/1/">simile</a>
 </div>
 </div>]
In [49]:
divs = soup.find_all('div', class_='quote')
In [50]:
divs[0]
Out[50]:
<div class="quote" itemscope="" itemtype="http://schema.org/CreativeWork">
<span class="text" itemprop="text">“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”</span>
<span>by <small class="author" itemprop="author">Albert Einstein</small>
<a href="/author/Albert-Einstein">(about)</a>
</span>
<div class="tags">
            Tags:
            <meta class="keywords" content="change,deep-thoughts,thinking,world" itemprop="keywords"/>
<a class="tag" href="/tag/change/page/1/">change</a>
<a class="tag" href="/tag/deep-thoughts/page/1/">deep-thoughts</a>
<a class="tag" href="/tag/thinking/page/1/">thinking</a>
<a class="tag" href="/tag/world/page/1/">world</a>
</div>
</div>

From this <div>, we can extract the quote, author name, author's URL, and tags.

In [51]:
divs[0]
Out[51]:
<div class="quote" itemscope="" itemtype="http://schema.org/CreativeWork">
<span class="text" itemprop="text">“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”</span>
<span>by <small class="author" itemprop="author">Albert Einstein</small>
<a href="/author/Albert-Einstein">(about)</a>
</span>
<div class="tags">
            Tags:
            <meta class="keywords" content="change,deep-thoughts,thinking,world" itemprop="keywords"/>
<a class="tag" href="/tag/change/page/1/">change</a>
<a class="tag" href="/tag/deep-thoughts/page/1/">deep-thoughts</a>
<a class="tag" href="/tag/thinking/page/1/">thinking</a>
<a class="tag" href="/tag/world/page/1/">world</a>
</div>
</div>
In [52]:
# The quote.
divs[0].find('span', class_='text').text
Out[52]:
'“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”'
In [53]:
# The author.
divs[0].find('small', class_='author').text
Out[53]:
'Albert Einstein'
In [54]:
# The URL for the author.
'https://quotes.toscrape.com' + divs[0].find('a').get('href')
Out[54]:
'https://quotes.toscrape.com/author/Albert-Einstein'
In [55]:
# The quote's tags.
divs[0].find('meta', class_='keywords').get('content')
Out[55]:
'change,deep-thoughts,thinking,world'

Let's implement our next function, process_quote, which takes in a <div> corresponding to a single quote and returns a dictionary containing the quote's information.

Why use a dictionary? Passing pd.DataFrame a list of dictionaries is an easy way to create a DataFrame.
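
For instance, here's a minimal (made-up) example of that pattern:

pd.DataFrame([
    {'quote': 'quote one', 'author': 'Author A'},
    {'quote': 'quote two', 'author': 'Author B'},
])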

In [56]:
def process_quote(div):
    quote = div.find('span', class_='text').text
    author = div.find('small', class_='author').text
    author_url = 'https://quotes.toscrape.com' + div.find('a').get('href')
    tags = div.find('meta', class_='keywords').get('content')
    
    return {'quote': quote, 'author': author, 'author_url': author_url, 'tags': tags}
In [57]:
process_quote(divs[0])
Out[57]:
{'quote': '“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”',
 'author': 'Albert Einstein',
 'author_url': 'https://quotes.toscrape.com/author/Albert-Einstein',
 'tags': 'change,deep-thoughts,thinking,world'}

Our last helper function will take in a list of <div>s, call process_quote on each <div> in the list, and return a DataFrame.

In [58]:
def process_page(divs):
    return pd.DataFrame([process_quote(div) for div in divs])
In [59]:
process_page(divs)
Out[59]:
quote author author_url tags
0 “The world as we have created it is a process ... Albert Einstein https://quotes.toscrape.com/author/Albert-Eins... change,deep-thoughts,thinking,world
1 “It is our choices, Harry, that show what we t... J.K. Rowling https://quotes.toscrape.com/author/J-K-Rowling abilities,choices
2 “There are only two ways to live your life. On... Albert Einstein https://quotes.toscrape.com/author/Albert-Eins... inspirational,life,live,miracle,miracles
... ... ... ... ...
7 “I have not failed. I've just found 10,000 way... Thomas A. Edison https://quotes.toscrape.com/author/Thomas-A-Ed... edison,failure,inspirational,paraphrased
8 “A woman is like a tea bag; you never know how... Eleanor Roosevelt https://quotes.toscrape.com/author/Eleanor-Roo... misattributed-eleanor-roosevelt
9 “A day without sunshine is like, you know, nig... Steve Martin https://quotes.toscrape.com/author/Steve-Martin humor,obvious,simile

10 rows × 4 columns

Putting it all together¶

In [60]:
def make_quote_df(n):
    '''Returns a DataFrame containing the quotes on the first n pages of https://quotes.toscrape.com/.'''
    dfs = []
    for i in range(1, n+1):
        # Download page i and create a BeautifulSoup object.
        soup = download_page(i)
        
        # Create DataFrame using the information in that page.
        divs = soup.find_all('div', class_='quote')
        df = process_page(divs)
        
        # Append DataFrame to dfs.
        dfs.append(df)
        
    # Stitch all DataFrames together.
    return pd.concat(dfs, ignore_index=True)
In [61]:
quotes = make_quote_df(3)
# quotes.head()
quotes
Out[61]:
quote author author_url tags
0 “The world as we have created it is a process ... Albert Einstein https://quotes.toscrape.com/author/Albert-Eins... change,deep-thoughts,thinking,world
1 “It is our choices, Harry, that show what we t... J.K. Rowling https://quotes.toscrape.com/author/J-K-Rowling abilities,choices
2 “There are only two ways to live your life. On... Albert Einstein https://quotes.toscrape.com/author/Albert-Eins... inspirational,life,live,miracle,miracles
... ... ... ... ...
27 “It is impossible to live without failing at s... J.K. Rowling https://quotes.toscrape.com/author/J-K-Rowling
28 “Logic will get you from A to Z; imagination w... Albert Einstein https://quotes.toscrape.com/author/Albert-Eins... imagination
29 “One good thing about music, when it hits you,... Bob Marley https://quotes.toscrape.com/author/Bob-Marley music

30 rows × 4 columns

In [62]:
quotes
Out[62]:
quote author author_url tags
0 “The world as we have created it is a process ... Albert Einstein https://quotes.toscrape.com/author/Albert-Eins... change,deep-thoughts,thinking,world
1 “It is our choices, Harry, that show what we t... J.K. Rowling https://quotes.toscrape.com/author/J-K-Rowling abilities,choices
2 “There are only two ways to live your life. On... Albert Einstein https://quotes.toscrape.com/author/Albert-Eins... inspirational,life,live,miracle,miracles
... ... ... ... ...
27 “It is impossible to live without failing at s... J.K. Rowling https://quotes.toscrape.com/author/J-K-Rowling
28 “Logic will get you from A to Z; imagination w... Albert Einstein https://quotes.toscrape.com/author/Albert-Eins... imagination
29 “One good thing about music, when it hits you,... Bob Marley https://quotes.toscrape.com/author/Bob-Marley music

30 rows × 4 columns

In [63]:
quotes[quotes['author'] == 'Albert Einstein']
Out[63]:
quote author author_url tags
0 “The world as we have created it is a process ... Albert Einstein https://quotes.toscrape.com/author/Albert-Eins... change,deep-thoughts,thinking,world
2 “There are only two ways to live your life. On... Albert Einstein https://quotes.toscrape.com/author/Albert-Eins... inspirational,life,live,miracle,miracles
5 “Try not to become a man of success. Rather be... Albert Einstein https://quotes.toscrape.com/author/Albert-Eins... adulthood,success,value
12 “If you can't explain it to a six year old, yo... Albert Einstein https://quotes.toscrape.com/author/Albert-Eins... simplicity,understand
26 “If you want your children to be intelligent, ... Albert Einstein https://quotes.toscrape.com/author/Albert-Eins... children,fairy-tales
28 “Logic will get you from A to Z; imagination w... Albert Einstein https://quotes.toscrape.com/author/Albert-Eins... imagination

The elements in the 'tags' column are all strings, but they look like lists. This is not ideal, as we will see shortly.
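
If we did want actual lists, one possible fix (just a sketch) is to split each string on commas:

# Each entry of 'tags' is a comma-separated string; str.split turns it into a list.
quotes.assign(tags=quotes['tags'].str.split(','))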

Example: Scraping the HDSI faculty page¶

Let's try and extract a list of HDSI Faculty from datascience.ucsd.edu/faculty.

  • As usual, we start by opening the page, right clicking somewhere on the page, and clicking "Inspect" in Chrome.
  • As we can see, the HTML is much more complicated – this is usually the case for websites in the wild.
In [64]:
fac_response = requests.get('https://datascience.ucsd.edu/faculty/')
fac_response
Out[64]:
<Response [200]>
In [65]:
soup = bs4.BeautifulSoup(fac_response.text)

It's not easy identifying which <div>s we want. The Inspect tool makes this easier, but it's good to verify that find_all is finding the right number of elements.

In [66]:
divs = soup.find_all('div', class_='vc_grid-item')
In [67]:
len(divs)
Out[67]:
63

Within each <div>, we need to extract the faculty member's name. It seems like names are stored as text within the <h4> tag.

In [68]:
divs[0]
Out[68]:
<div class="vc_grid-item vc_clearfix col_1-5 vc_grid-item-zone-c-bottom vc_visible-item vc_grid-term-council vc_grid-term-faculty vc_grid-term-faculty-fellows">
<a class="anchor-link" id="ilkay-altintas" name="ilkay-altintas"></a>
<div class="vc_grid-item-mini vc_clearfix"><div class="vc_gitem-animated-block">
<div class="vc_gitem-zone vc_gitem-zone-a vc-gitem-zone-height-mode-auto vc-gitem-zone-height-mode-auto-1-1" style="background-image: url(https://datascience.ucsd.edu/wp-content/uploads/2022/10/Ilkay-Altintas-2.jpg) !important;"><a class="vc_gitem-link vc-zone-link" href="https://datascience.ucsd.edu/people/ilkay-altintas/"></a><img alt="Ilkay Altintas" class="vc_gitem-zone-img" decoding="async" src="https://datascience.ucsd.edu/wp-content/uploads/2022/10/Ilkay-Altintas-2.jpg"/>
<div class="vc_gitem-zone-mini"></div>
</div>
</div><div class="vc_gitem-zone vc_gitem-zone-c">
<div class="vc_gitem-zone-mini">
<div class="vc_gitem_row vc_row vc_gitem-row-position-top">
<div class="vc_col-sm-12 vc_gitem-col vc_gitem-col-align-">
<div class="vc_custom_heading vc_gitem-post-data vc_gitem-post-data-source-post_title">
<h4 style="text-align: left"><a href="https://datascience.ucsd.edu/people/ilkay-altintas/">Ilkay Altintas</a></h4>
</div>
<div class="vc_gitem-align-left fields">
<div class="field pendari_people_title">SDSC Chief Data Science Officer &amp; HDSI Founding Faculty Fellow</div>
</div>
<div class="excerpt">
</div>
<div class="terms">
                        Council Faculty Faculty Fellows 
                      </div>
</div>
</div>
</div>
</div></div></div>
In [69]:
divs[39].find('h4').text
Out[69]:
'Suraj Rampure'

We can also extract job titles:

In [70]:
divs[39].find('div', class_='pendari_people_title').text
Out[70]:
'Lecturer'

Let's create a DataFrame consisting of names and job titles for each faculty member.

In [71]:
names = [div.find('h4').text for div in divs]
names[:10]
Out[71]:
['Ilkay Altintas',
 'Tiffany Amariuta',
 'Mikio Aoi',
 'Ery Arias-Castro',
 'Vineet Bafna',
 'Mikhail Belkin',
 'Jelena Bradic',
 'Henrik Christensen',
 'Alex Cloninger',
 'Anders Dale']
In [72]:
titles = [div.find(class_='field').text for div in divs]
titles[:10]
Out[72]:
['SDSC Chief Data Science Officer & HDSI Founding Faculty Fellow',
 'Assistant Professor',
 'Assistant Professor',
 'Professor',
 'Professor',
 'Professor',
 'Professor',
 'Distinguished Scientist, Professor',
 'Associate Professor',
 'Distinguished Scientist, Professor']
In [73]:
faculty = pd.DataFrame({
    'name': names, 
    'title': titles, 
})
faculty.head()
Out[73]:
name title
0 Ilkay Altintas SDSC Chief Data Science Officer & HDSI Foundin...
1 Tiffany Amariuta Assistant Professor
2 Mikio Aoi Assistant Professor
3 Ery Arias-Castro Professor
4 Vineet Bafna Professor

Now we have a DataFrame!

In [74]:
faculty[faculty['title'].str.contains('Teaching') | faculty['title'].str.contains('Lecturer')]
Out[74]:
name title
12 Justin Eldridge Assistant Teaching Professor
13 Shannon Ellis Associate Teaching Professor
27 Marina Langlois Lecturer
... ... ...
39 Suraj Rampure Lecturer
47 Jack Silberman Lecturer
51 Janine Tiefenbruck Lecturer

9 rows × 2 columns

What if we want to get faculty members' pictures?

In [81]:
len(divs)
Out[81]:
63
In [76]:
names.index('Marina Langlois')
Out[76]:
27
In [77]:
divs[27].find('img').get('src')
Out[77]:
'https://datascience.ucsd.edu/wp-content/uploads/2022/10/Marina-Langlois-1.jpg'
In [78]:
from IPython.display import Image, display

def show_picture(name):
    idx = faculty[faculty['name'].str.lower().str.contains(name)].index[0]
    display(Image(divs[idx].find('img').get('src')))
    
show_picture('suraj')
[image output: Suraj Rampure's faculty photo]

Exercise

Consider the following HTML document, which represents a webpage containing the top few songs with the most streams on Spotify today in Canada.

<head>
    <title>3*Canada-2022-06-04</title>
</head>
<body>
    <h1>Spotify Top 3 - Canada</h1>
    <table>
        <tr class='heading'>
            <th>Rank</th>
            <th>Artist(s)</th> 
            <th>Song</th>
        </tr>
        <tr class=1>
            <td>1</td>
            <td>Harry Styles</td> 
            <td>As It Was</td>
        </tr>
        <tr class=2>
            <td>2</td>
            <td>Jack Harlow</td> 
            <td>First Class</td>
        </tr>
        <tr class=3>
            <td>3</td>
            <td>Kendrick Lamar</td> 
            <td>N95</td>
        </tr>
    </table>
</body>

Part 4: Complete the implementation of the function top_nth, which takes in a positive integer n and returns the name of the n-th ranked song in the HTML document. For instance, top_nth(2) should evaluate to "First Class" (n=1 corresponds to the top song).

Note: Your implementation should work in the case that the page contains more than 3 songs.

def top_nth(n):
    return soup.find("tr", attrs=__(a)__).find_all("td")__(b)__
In [79]:
def top_nth(n):
    # Attribute values are parsed as strings, so convert n before matching on class.
    return soup.find("tr", attrs={"class": str(n)}).find_all("td")[-1].text

Web data in practice¶

The spread of true and false news online by Vosoughi et al. compared how true and false news spreads via Twitter:

There is worldwide concern over false news and the possibility that it can influence political, economic, and social well-being. To understand how false news spreads, Vosoughi et al. used a data set of rumor cascades on Twitter from 2006 to 2017. About 126,000 rumors were spread by ∼3 million people. False news reached more people than the truth; the top 1% of false news cascades diffused to between 1000 and 100,000 people, whereas the truth rarely diffused to more than 1000 people. Falsehood also diffused faster than the truth. The degree of novelty and the emotional reactions of recipients may be responsible for the differences observed.

To conduct this study, the authors used the Twitter API for accessing tweets and web-scraped fact-checking websites to verify whether news was false or not.

Summary, next time¶

  • Beautiful Soup is an HTML parser that allows us to (somewhat) easily extract information from HTML documents.
    • soup.find and soup.find_all are the functions you will use most often.
  • When writing scraping code:
    • Use "inspect element" to identify the names of tags and attributes that are relevant to the information you want to extract.
    • Separate your logic for making requests and for parsing.

Next time¶

Regular expressions!