{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "85289f7b",
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "import dask.dataframe as dd\n",
    "from dask.distributed import Client"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1615379f",
   "metadata": {},
   "source": [
    "## Start Dask Scheduler and Workers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "e45daa5b",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2023-04-19 00:38:45,607 - distributed.nanny - WARNING - Restarting worker\n",
      "2023-04-19 00:38:45,612 - distributed.nanny - WARNING - Restarting worker\n",
      "2023-04-19 00:38:45,615 - distributed.nanny - WARNING - Restarting worker\n",
      "2023-04-19 00:38:45,619 - distributed.nanny - WARNING - Restarting worker\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "    <div style=\"width: 24px; height: 24px; background-color: #e1e1e1; border: 3px solid #9D9D9D; border-radius: 5px; position: absolute;\"> </div>\n",
       "    <div style=\"margin-left: 48px;\">\n",
       "        <h3 style=\"margin-bottom: 0px;\">Client</h3>\n",
       "        <p style=\"color: #9D9D9D; margin-bottom: 0px;\">Client-88630168-de4a-11ed-84db-817420155238</p>\n",
       "        <table style=\"width: 100%; text-align: left;\">\n",
       "\n",
       "        <tr>\n",
       "        \n",
       "            <td style=\"text-align: left;\"><strong>Connection method:</strong> Cluster object</td>\n",
       "            <td style=\"text-align: left;\"><strong>Cluster type:</strong> distributed.LocalCluster</td>\n",
       "        \n",
       "        </tr>\n",
       "\n",
       "        \n",
       "            <tr>\n",
       "                <td style=\"text-align: left;\">\n",
       "                    <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:8787/status\" target=\"_blank\">http://127.0.0.1:8787/status</a>\n",
       "                </td>\n",
       "                <td style=\"text-align: left;\"></td>\n",
       "            </tr>\n",
       "        \n",
       "\n",
       "        </table>\n",
       "\n",
       "        \n",
       "\n",
       "        \n",
       "            <details>\n",
       "            <summary style=\"margin-bottom: 20px;\"><h3 style=\"display: inline;\">Cluster Info</h3></summary>\n",
       "            <div class=\"jp-RenderedHTMLCommon jp-RenderedHTML jp-mod-trusted jp-OutputArea-output\">\n",
       "    <div style=\"width: 24px; height: 24px; background-color: #e1e1e1; border: 3px solid #9D9D9D; border-radius: 5px; position: absolute;\">\n",
       "    </div>\n",
       "    <div style=\"margin-left: 48px;\">\n",
       "        <h3 style=\"margin-bottom: 0px; margin-top: 0px;\">LocalCluster</h3>\n",
       "        <p style=\"color: #9D9D9D; margin-bottom: 0px;\">dc1fef2a</p>\n",
       "        <table style=\"width: 100%; text-align: left;\">\n",
       "            <tr>\n",
       "                <td style=\"text-align: left;\">\n",
       "                    <strong>Dashboard:</strong> <a href=\"http://127.0.0.1:8787/status\" target=\"_blank\">http://127.0.0.1:8787/status</a>\n",
       "                </td>\n",
       "                <td style=\"text-align: left;\">\n",
       "                    <strong>Workers:</strong> 4\n",
       "                </td>\n",
       "            </tr>\n",
       "            <tr>\n",
       "                <td style=\"text-align: left;\">\n",
       "                    <strong>Total threads:</strong> 4\n",
       "                </td>\n",
       "                <td style=\"text-align: left;\">\n",
       "                    <strong>Total memory:</strong> 15.62 GiB\n",
       "                </td>\n",
       "            </tr>\n",
       "            \n",
       "            <tr>\n",
       "    <td style=\"text-align: left;\"><strong>Status:</strong> running</td>\n",
       "    <td style=\"text-align: left;\"><strong>Using processes:</strong> True</td>\n",
       "</tr>\n",
       "\n",
       "            \n",
       "        </table>\n",
       "\n",
       "        <details>\n",
       "            <summary style=\"margin-bottom: 20px;\">\n",
       "                <h3 style=\"display: inline;\">Scheduler Info</h3>\n",
       "            </summary>\n",
       "\n",
       "            <div style=\"\">\n",
       "    <div>\n",
       "        <div style=\"width: 24px; height: 24px; background-color: #FFF7E5; border: 3px solid #FF6132; border-radius: 5px; position: absolute;\"> </div>\n",
       "        <div style=\"margin-left: 48px;\">\n",
       "            <h3 style=\"margin-bottom: 0px;\">Scheduler</h3>\n",
       "            <p style=\"color: #9D9D9D; margin-bottom: 0px;\">Scheduler-2867af2c-8818-4ccf-b0a0-655f92b0fd01</p>\n",
       "            <table style=\"width: 100%; text-align: left;\">\n",
       "                <tr>\n",
       "                    <td style=\"text-align: left;\">\n",
       "                        <strong>Comm:</strong> tcp://127.0.0.1:37113\n",
       "                    </td>\n",
       "                    <td style=\"text-align: left;\">\n",
       "                        <strong>Workers:</strong> 4\n",
       "                    </td>\n",
       "                </tr>\n",
       "                <tr>\n",
       "                    <td style=\"text-align: left;\">\n",
       "                        <strong>Dashboard:</strong> <a href=\"http://127.0.0.1:8787/status\" target=\"_blank\">http://127.0.0.1:8787/status</a>\n",
       "                    </td>\n",
       "                    <td style=\"text-align: left;\">\n",
       "                        <strong>Total threads:</strong> 4\n",
       "                    </td>\n",
       "                </tr>\n",
       "                <tr>\n",
       "                    <td style=\"text-align: left;\">\n",
       "                        <strong>Started:</strong> Just now\n",
       "                    </td>\n",
       "                    <td style=\"text-align: left;\">\n",
       "                        <strong>Total memory:</strong> 15.62 GiB\n",
       "                    </td>\n",
       "                </tr>\n",
       "            </table>\n",
       "        </div>\n",
       "    </div>\n",
       "\n",
       "    <details style=\"margin-left: 48px;\">\n",
       "        <summary style=\"margin-bottom: 20px;\">\n",
       "            <h3 style=\"display: inline;\">Workers</h3>\n",
       "        </summary>\n",
       "\n",
       "        \n",
       "        <div style=\"margin-bottom: 20px;\">\n",
       "            <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n",
       "            <div style=\"margin-left: 48px;\">\n",
       "            <details>\n",
       "                <summary>\n",
       "                    <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 0</h4>\n",
       "                </summary>\n",
       "                <table style=\"width: 100%; text-align: left;\">\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Comm: </strong> tcp://127.0.0.1:45535\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Total threads: </strong> 1\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:43185/status\" target=\"_blank\">http://127.0.0.1:43185/status</a>\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Memory: </strong> 3.91 GiB\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Nanny: </strong> tcp://127.0.0.1:34625\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\"></td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td colspan=\"2\" style=\"text-align: left;\">\n",
       "                            <strong>Local directory: </strong> /tmp/dask-worker-space/worker-zhmjn6se\n",
       "                        </td>\n",
       "                    </tr>\n",
       "\n",
       "                    \n",
       "\n",
       "                    \n",
       "\n",
       "                </table>\n",
       "            </details>\n",
       "            </div>\n",
       "        </div>\n",
       "        \n",
       "        <div style=\"margin-bottom: 20px;\">\n",
       "            <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n",
       "            <div style=\"margin-left: 48px;\">\n",
       "            <details>\n",
       "                <summary>\n",
       "                    <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 1</h4>\n",
       "                </summary>\n",
       "                <table style=\"width: 100%; text-align: left;\">\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Comm: </strong> tcp://127.0.0.1:44207\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Total threads: </strong> 1\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:40325/status\" target=\"_blank\">http://127.0.0.1:40325/status</a>\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Memory: </strong> 3.91 GiB\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Nanny: </strong> tcp://127.0.0.1:39325\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\"></td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td colspan=\"2\" style=\"text-align: left;\">\n",
       "                            <strong>Local directory: </strong> /tmp/dask-worker-space/worker-tsr0itf1\n",
       "                        </td>\n",
       "                    </tr>\n",
       "\n",
       "                    \n",
       "\n",
       "                    \n",
       "\n",
       "                </table>\n",
       "            </details>\n",
       "            </div>\n",
       "        </div>\n",
       "        \n",
       "        <div style=\"margin-bottom: 20px;\">\n",
       "            <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n",
       "            <div style=\"margin-left: 48px;\">\n",
       "            <details>\n",
       "                <summary>\n",
       "                    <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 2</h4>\n",
       "                </summary>\n",
       "                <table style=\"width: 100%; text-align: left;\">\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Comm: </strong> tcp://127.0.0.1:42029\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Total threads: </strong> 1\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:35501/status\" target=\"_blank\">http://127.0.0.1:35501/status</a>\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Memory: </strong> 3.91 GiB\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Nanny: </strong> tcp://127.0.0.1:44389\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\"></td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td colspan=\"2\" style=\"text-align: left;\">\n",
       "                            <strong>Local directory: </strong> /tmp/dask-worker-space/worker-r83z_24l\n",
       "                        </td>\n",
       "                    </tr>\n",
       "\n",
       "                    \n",
       "\n",
       "                    \n",
       "\n",
       "                </table>\n",
       "            </details>\n",
       "            </div>\n",
       "        </div>\n",
       "        \n",
       "        <div style=\"margin-bottom: 20px;\">\n",
       "            <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n",
       "            <div style=\"margin-left: 48px;\">\n",
       "            <details>\n",
       "                <summary>\n",
       "                    <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 3</h4>\n",
       "                </summary>\n",
       "                <table style=\"width: 100%; text-align: left;\">\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Comm: </strong> tcp://127.0.0.1:43807\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Total threads: </strong> 1\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:40331/status\" target=\"_blank\">http://127.0.0.1:40331/status</a>\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Memory: </strong> 3.91 GiB\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Nanny: </strong> tcp://127.0.0.1:44079\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\"></td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td colspan=\"2\" style=\"text-align: left;\">\n",
       "                            <strong>Local directory: </strong> /tmp/dask-worker-space/worker-7fr9za74\n",
       "                        </td>\n",
       "                    </tr>\n",
       "\n",
       "                    \n",
       "\n",
       "                    \n",
       "\n",
       "                </table>\n",
       "            </details>\n",
       "            </div>\n",
       "        </div>\n",
       "        \n",
       "\n",
       "    </details>\n",
       "</div>\n",
       "\n",
       "        </details>\n",
       "    </div>\n",
       "</div>\n",
       "            </details>\n",
       "        \n",
       "\n",
       "    </div>\n",
       "</div>"
      ],
      "text/plain": [
       "<Client: 'tcp://127.0.0.1:37113' processes=4 threads=4, memory=15.62 GiB>"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Start a local Dask cluster (a scheduler plus worker processes) and connect to it.\n",
    "# With no arguments, Client() autodetects the number of CPU cores and the amount of\n",
    "# memory on this machine and creates workers to fill them. It also starts the Dask\n",
    "# dashboard, which is useful for visualizing the state of the cluster and of\n",
    "# running computations.\n",
    "#\n",
    "# The workers have only just been started, so there is no state to clear and no\n",
    "# need to call client.restart() here: it would only kill and respawn every worker\n",
    "# (logging a 'Restarting worker' warning for each one).\n",
    "\n",
    "client = Client()\n",
    "client"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ba665269",
   "metadata": {},
   "source": [
    "## Read CSV"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "3245ba87",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div><strong>Dask DataFrame Structure:</strong></div>\n",
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>reviewerID</th>\n",
       "      <th>asin</th>\n",
       "      <th>reviewerName</th>\n",
       "      <th>helpful</th>\n",
       "      <th>reviewText</th>\n",
       "      <th>overall</th>\n",
       "      <th>summary</th>\n",
       "      <th>unixReviewTime</th>\n",
       "      <th>reviewTime</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>npartitions=55</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <td>object</td>\n",
       "      <td>object</td>\n",
       "      <td>object</td>\n",
       "      <td>object</td>\n",
       "      <td>object</td>\n",
       "      <td>float64</td>\n",
       "      <td>object</td>\n",
       "      <td>float64</td>\n",
       "      <td>object</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>\n",
       "<div>Dask Name: read-csv, 1 graph layer</div>"
      ],
      "text/plain": [
       "Dask DataFrame Structure:\n",
       "               reviewerID    asin reviewerName helpful reviewText  overall summary unixReviewTime reviewTime\n",
       "npartitions=55                                                                                              \n",
       "                   object  object       object  object     object  float64  object        float64     object\n",
       "                      ...     ...          ...     ...        ...      ...     ...            ...        ...\n",
       "...                   ...     ...          ...     ...        ...      ...     ...            ...        ...\n",
       "                      ...     ...          ...     ...        ...      ...     ...            ...        ...\n",
       "                      ...     ...          ...     ...        ...      ...     ...            ...        ...\n",
       "Dask Name: read-csv, 1 graph layer"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Read the CSV file lazily: Dask does not load the data at this point. It only reads\n",
    "# a small sample from the start of the file to infer the column names and dtypes.\n",
    "\n",
    "user_reviews_ddf = dd.read_csv('demo_data.csv')\n",
    "user_reviews_ddf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "87d39b2b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "878057262af64833bce8581c75a93e47",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "CytoscapeWidget(cytoscape_layout={'name': 'dagre', 'rankDir': 'BT', 'nodeSep': 10, 'edgeSep': 10, 'spacingFact…"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# visualize the task graph\n",
    "\n",
    "user_reviews_ddf.visualize()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b106d645",
   "metadata": {},
   "source": [
    "## Mean computation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "310966a4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dask operations are evaluated lazily: calling mean() only builds the task graph\n",
    "# for the computation; nothing is actually evaluated until a result is requested.\n",
    "\n",
    "mean_graph = user_reviews_ddf[\"overall\"].mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "03d9fb7a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "f1a03bfa18754f9b9ab3507f84825fc9",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "CytoscapeWidget(cytoscape_layout={'name': 'dagre', 'rankDir': 'BT', 'nodeSep': 10, 'edgeSep': 10, 'spacingFact…"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# visualize task graph\n",
    "\n",
    "mean_graph.visualize()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "7c11cb1c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "mean of 'overall' attribute is 4.163392099728361\n"
     ]
    }
   ],
   "source": [
    "# trigger computation to calculate mean of column\n",
    "\n",
    "result = mean_graph.compute()\n",
    "print(f\"mean of 'overall' attribute is {result}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ee22f91a",
   "metadata": {},
   "source": [
    "## Implicit compute() for len(), head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "da96abfd",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>reviewerID</th>\n",
       "      <th>asin</th>\n",
       "      <th>reviewerName</th>\n",
       "      <th>helpful</th>\n",
       "      <th>reviewText</th>\n",
       "      <th>overall</th>\n",
       "      <th>summary</th>\n",
       "      <th>unixReviewTime</th>\n",
       "      <th>reviewTime</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>A2T0RJ91B0PQ03</td>\n",
       "      <td>B0016CRVLW</td>\n",
       "      <td>Gerald DeWitt</td>\n",
       "      <td>[0, 0]</td>\n",
       "      <td>Beware!  This is NOT the original single versi...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>Poor Quality Alternate Take</td>\n",
       "      <td>1.400630e+09</td>\n",
       "      <td>05 21, 2014</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>A3TYW0XA8HSGWB</td>\n",
       "      <td>B00EKR5S0Q</td>\n",
       "      <td>Linda E. Larson</td>\n",
       "      <td>[0, 0]</td>\n",
       "      <td>This is my new most favorite k-cup coffee. I c...</td>\n",
       "      <td>5.0</td>\n",
       "      <td>Vanilla Starbucks K-cups</td>\n",
       "      <td>1.398557e+09</td>\n",
       "      <td>04 27, 2014</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>A2CME0TQU2IVVB</td>\n",
       "      <td>B001AUPJVO</td>\n",
       "      <td>L5Momma</td>\n",
       "      <td>[1, 1]</td>\n",
       "      <td>This headset is great!  It worked in our 2007 ...</td>\n",
       "      <td>5.0</td>\n",
       "      <td>Awesome!</td>\n",
       "      <td>1.355875e+09</td>\n",
       "      <td>12 19, 2012</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>A2E5IDLX7R388S</td>\n",
       "      <td>B000055Y57</td>\n",
       "      <td>Jeff Andersen</td>\n",
       "      <td>[0, 0]</td>\n",
       "      <td>Scofield is one of my favorite musicians and i...</td>\n",
       "      <td>5.0</td>\n",
       "      <td>Straight ahead Jazz with the Scofield twist</td>\n",
       "      <td>1.402358e+09</td>\n",
       "      <td>06 10, 2014</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>A3CIEMYUGV6ZMR</td>\n",
       "      <td>0545265355</td>\n",
       "      <td>Adroit</td>\n",
       "      <td>[0, 0]</td>\n",
       "      <td>Wonderful book! I cried, well teared up at a f...</td>\n",
       "      <td>5.0</td>\n",
       "      <td>Greatest Book Ever!!!</td>\n",
       "      <td>1.334102e+09</td>\n",
       "      <td>04 11, 2012</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       reviewerID        asin     reviewerName helpful   \n",
       "0  A2T0RJ91B0PQ03  B0016CRVLW    Gerald DeWitt  [0, 0]  \\\n",
       "1  A3TYW0XA8HSGWB  B00EKR5S0Q  Linda E. Larson  [0, 0]   \n",
       "2  A2CME0TQU2IVVB  B001AUPJVO          L5Momma  [1, 1]   \n",
       "3  A2E5IDLX7R388S  B000055Y57    Jeff Andersen  [0, 0]   \n",
       "4  A3CIEMYUGV6ZMR  0545265355           Adroit  [0, 0]   \n",
       "\n",
       "                                          reviewText  overall   \n",
       "0  Beware!  This is NOT the original single versi...      1.0  \\\n",
       "1  This is my new most favorite k-cup coffee. I c...      5.0   \n",
       "2  This headset is great!  It worked in our 2007 ...      5.0   \n",
       "3  Scofield is one of my favorite musicians and i...      5.0   \n",
       "4  Wonderful book! I cried, well teared up at a f...      5.0   \n",
       "\n",
       "                                       summary  unixReviewTime   reviewTime  \n",
       "0                  Poor Quality Alternate Take    1.400630e+09  05 21, 2014  \n",
       "1                     Vanilla Starbucks K-cups    1.398557e+09  04 27, 2014  \n",
       "2                                     Awesome!    1.355875e+09  12 19, 2012  \n",
       "3  Straight ahead Jazz with the Scofield twist    1.402358e+09  06 10, 2014  \n",
       "4                        Greatest Book Ever!!!    1.334102e+09  04 11, 2012  "
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Some operations, such as len() and head(), trigger a computation implicitly,\n",
    "# without an explicit compute() call.\n",
    "\n",
    "user_reviews_ddf.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "740e808f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "6158168"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Calling len() will: \n",
    "# - load actual data, (that is, load data into multiple pandas DataFrames) \n",
    "# - find length of each pandas DataFrame (also known as a partition) \n",
    "# - combine the subtotals to give you the final grand total\n",
    "\n",
    "len(user_reviews_ddf)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c9c78248",
   "metadata": {},
   "source": [
    "## Inspecting Individual Partitions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "9edb3920",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total number of partitions = 55\n"
     ]
    }
   ],
   "source": [
    "# Internally, a Dask DataFrame is split into many partitions; each partition is one pandas DataFrame.\n",
    "\n",
    "num_partitions = user_reviews_ddf.npartitions\n",
    "print(f\"Total number of partitions = {num_partitions}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "edd2558d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>reviewerID</th>\n",
       "      <th>asin</th>\n",
       "      <th>reviewerName</th>\n",
       "      <th>helpful</th>\n",
       "      <th>reviewText</th>\n",
       "      <th>overall</th>\n",
       "      <th>summary</th>\n",
       "      <th>unixReviewTime</th>\n",
       "      <th>reviewTime</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>A2T0RJ91B0PQ03</td>\n",
       "      <td>B0016CRVLW</td>\n",
       "      <td>Gerald DeWitt</td>\n",
       "      <td>[0, 0]</td>\n",
       "      <td>Beware!  This is NOT the original single versi...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>Poor Quality Alternate Take</td>\n",
       "      <td>1.400630e+09</td>\n",
       "      <td>05 21, 2014</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>A3TYW0XA8HSGWB</td>\n",
       "      <td>B00EKR5S0Q</td>\n",
       "      <td>Linda E. Larson</td>\n",
       "      <td>[0, 0]</td>\n",
       "      <td>This is my new most favorite k-cup coffee. I c...</td>\n",
       "      <td>5.0</td>\n",
       "      <td>Vanilla Starbucks K-cups</td>\n",
       "      <td>1.398557e+09</td>\n",
       "      <td>04 27, 2014</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>A2CME0TQU2IVVB</td>\n",
       "      <td>B001AUPJVO</td>\n",
       "      <td>L5Momma</td>\n",
       "      <td>[1, 1]</td>\n",
       "      <td>This headset is great!  It worked in our 2007 ...</td>\n",
       "      <td>5.0</td>\n",
       "      <td>Awesome!</td>\n",
       "      <td>1.355875e+09</td>\n",
       "      <td>12 19, 2012</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>A2E5IDLX7R388S</td>\n",
       "      <td>B000055Y57</td>\n",
       "      <td>Jeff Andersen</td>\n",
       "      <td>[0, 0]</td>\n",
       "      <td>Scofield is one of my favorite musicians and i...</td>\n",
       "      <td>5.0</td>\n",
       "      <td>Straight ahead Jazz with the Scofield twist</td>\n",
       "      <td>1.402358e+09</td>\n",
       "      <td>06 10, 2014</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>A3CIEMYUGV6ZMR</td>\n",
       "      <td>0545265355</td>\n",
       "      <td>Adroit</td>\n",
       "      <td>[0, 0]</td>\n",
       "      <td>Wonderful book! I cried, well teared up at a f...</td>\n",
       "      <td>5.0</td>\n",
       "      <td>Greatest Book Ever!!!</td>\n",
       "      <td>1.334102e+09</td>\n",
       "      <td>04 11, 2012</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>111952</th>\n",
       "      <td>A1N2ZAC86P26BF</td>\n",
       "      <td>6303823351</td>\n",
       "      <td>David</td>\n",
       "      <td>[1, 1]</td>\n",
       "      <td>Help! is probably my favorite of the Beatles m...</td>\n",
       "      <td>4.0</td>\n",
       "      <td>The best of the Beatles films</td>\n",
       "      <td>9.982656e+08</td>\n",
       "      <td>08 20, 2001</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>111953</th>\n",
       "      <td>AUFN1J7VJZL83</td>\n",
       "      <td>B002OHE20G</td>\n",
       "      <td>Amanda Banks</td>\n",
       "      <td>[0, 0]</td>\n",
       "      <td>This heater has worked out very well for a sma...</td>\n",
       "      <td>5.0</td>\n",
       "      <td>eliable, SAFE Heat</td>\n",
       "      <td>1.402099e+09</td>\n",
       "      <td>06 7, 2014</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>111954</th>\n",
       "      <td>AGZK126DNQ2FN</td>\n",
       "      <td>1401340970</td>\n",
       "      <td>Cy B. Hilterman \"Cy. Hilterman\"</td>\n",
       "      <td>[2, 2]</td>\n",
       "      <td>As a person that has made many trips to Niagar...</td>\n",
       "      <td>5.0</td>\n",
       "      <td>Romance and adventure in the Niagara Falls area</td>\n",
       "      <td>1.250035e+09</td>\n",
       "      <td>08 12, 2009</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>111955</th>\n",
       "      <td>A1LA51JOIGGD45</td>\n",
       "      <td>1400071550</td>\n",
       "      <td>E.A. West</td>\n",
       "      <td>[0, 0]</td>\n",
       "      <td>The battle between good and evil continues in ...</td>\n",
       "      <td>5.0</td>\n",
       "      <td>Heroic battle between good and evil</td>\n",
       "      <td>1.366589e+09</td>\n",
       "      <td>04 22, 2013</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>111956</th>\n",
       "      <td>A7THMVS5SV6HF</td>\n",
       "      <td>B00B7EQPNE</td>\n",
       "      <td>mavman70</td>\n",
       "      <td>[1, 2]</td>\n",
       "      <td>We ordered this fer our bluray player app and ...</td>\n",
       "      <td>5.0</td>\n",
       "      <td>GRANDSON LOVES IT!!!!</td>\n",
       "      <td>1.366330e+09</td>\n",
       "      <td>04 19, 2013</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>111957 rows × 9 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "            reviewerID        asin                     reviewerName helpful   \n",
       "0       A2T0RJ91B0PQ03  B0016CRVLW                    Gerald DeWitt  [0, 0]  \\\n",
       "1       A3TYW0XA8HSGWB  B00EKR5S0Q                  Linda E. Larson  [0, 0]   \n",
       "2       A2CME0TQU2IVVB  B001AUPJVO                          L5Momma  [1, 1]   \n",
       "3       A2E5IDLX7R388S  B000055Y57                    Jeff Andersen  [0, 0]   \n",
       "4       A3CIEMYUGV6ZMR  0545265355                           Adroit  [0, 0]   \n",
       "...                ...         ...                              ...     ...   \n",
       "111952  A1N2ZAC86P26BF  6303823351                            David  [1, 1]   \n",
       "111953   AUFN1J7VJZL83  B002OHE20G                     Amanda Banks  [0, 0]   \n",
       "111954   AGZK126DNQ2FN  1401340970  Cy B. Hilterman \"Cy. Hilterman\"  [2, 2]   \n",
       "111955  A1LA51JOIGGD45  1400071550                        E.A. West  [0, 0]   \n",
       "111956   A7THMVS5SV6HF  B00B7EQPNE                         mavman70  [1, 2]   \n",
       "\n",
       "                                               reviewText  overall   \n",
       "0       Beware!  This is NOT the original single versi...      1.0  \\\n",
       "1       This is my new most favorite k-cup coffee. I c...      5.0   \n",
       "2       This headset is great!  It worked in our 2007 ...      5.0   \n",
       "3       Scofield is one of my favorite musicians and i...      5.0   \n",
       "4       Wonderful book! I cried, well teared up at a f...      5.0   \n",
       "...                                                   ...      ...   \n",
       "111952  Help! is probably my favorite of the Beatles m...      4.0   \n",
       "111953  This heater has worked out very well for a sma...      5.0   \n",
       "111954  As a person that has made many trips to Niagar...      5.0   \n",
       "111955  The battle between good and evil continues in ...      5.0   \n",
       "111956  We ordered this fer our bluray player app and ...      5.0   \n",
       "\n",
       "                                                summary  unixReviewTime   \n",
       "0                           Poor Quality Alternate Take    1.400630e+09  \\\n",
       "1                              Vanilla Starbucks K-cups    1.398557e+09   \n",
       "2                                              Awesome!    1.355875e+09   \n",
       "3           Straight ahead Jazz with the Scofield twist    1.402358e+09   \n",
       "4                                 Greatest Book Ever!!!    1.334102e+09   \n",
       "...                                                 ...             ...   \n",
       "111952                    The best of the Beatles films    9.982656e+08   \n",
       "111953                               eliable, SAFE Heat    1.402099e+09   \n",
       "111954  Romance and adventure in the Niagara Falls area    1.250035e+09   \n",
       "111955              Heroic battle between good and evil    1.366589e+09   \n",
       "111956                            GRANDSON LOVES IT!!!!    1.366330e+09   \n",
       "\n",
       "         reviewTime  \n",
       "0       05 21, 2014  \n",
       "1       04 27, 2014  \n",
       "2       12 19, 2012  \n",
       "3       06 10, 2014  \n",
       "4       04 11, 2012  \n",
       "...             ...  \n",
       "111952  08 20, 2001  \n",
       "111953   06 7, 2014  \n",
       "111954  08 12, 2009  \n",
       "111955  04 22, 2013  \n",
       "111956  04 19, 2013  \n",
       "\n",
       "[111957 rows x 9 columns]"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "partition1 = user_reviews_ddf.partitions[0].compute()\n",
    "partition1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "4a5ae3fe",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "10"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# The number of partitions is often automatically determined based on available \n",
    "# physical memory and the number of cores, but can also be manually specified.\n",
    "\n",
    "user_reviews_repart_ddf = user_reviews_ddf.repartition(npartitions=10)  \n",
    "user_reviews_repart_ddf.npartitions"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "49f9b27d",
   "metadata": {},
   "source": [
    "## Map-Partition"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "cb1b8c9b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Apply a Python function to every partition (each partition is one pandas DataFrame).\n",
    "# Here the function takes extra positional and keyword arguments and maps a DataFrame to a Series.\n",
    "\n",
    "def myadd(df, a, b=1):\n",
    "    \"\"\"Return the `overall` column of `df` with `a` and `b` added to every value.\"\"\"\n",
    "    return df.overall + a + b\n",
    "\n",
    "# `meta` declares the name and dtype of the resulting Series up front, so Dask does not\n",
    "# have to infer them by running `myadd` on dummy data.\n",
    "res = user_reviews_ddf.map_partitions(myadd, 1, b=0, meta=('overall', 'f8'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "0943bdea",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0         2.0\n",
       "1         6.0\n",
       "2         6.0\n",
       "3         6.0\n",
       "4         6.0\n",
       "         ... \n",
       "112342    6.0\n",
       "112343    6.0\n",
       "112344    6.0\n",
       "112345    6.0\n",
       "112346    5.0\n",
       "Name: overall, Length: 6158168, dtype: float64"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "res.compute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "49864501",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>reviewerID</th>\n",
       "      <th>asin</th>\n",
       "      <th>reviewerName</th>\n",
       "      <th>helpful</th>\n",
       "      <th>reviewText</th>\n",
       "      <th>overall</th>\n",
       "      <th>summary</th>\n",
       "      <th>unixReviewTime</th>\n",
       "      <th>reviewTime</th>\n",
       "      <th>new_col</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>A2T0RJ91B0PQ03</td>\n",
       "      <td>B0016CRVLW</td>\n",
       "      <td>Gerald DeWitt</td>\n",
       "      <td>[0, 0]</td>\n",
       "      <td>Beware!  This is NOT the original single versi...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>Poor Quality Alternate Take</td>\n",
       "      <td>1.400630e+09</td>\n",
       "      <td>05 21, 2014</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>A3TYW0XA8HSGWB</td>\n",
       "      <td>B00EKR5S0Q</td>\n",
       "      <td>Linda E. Larson</td>\n",
       "      <td>[0, 0]</td>\n",
       "      <td>This is my new most favorite k-cup coffee. I c...</td>\n",
       "      <td>5.0</td>\n",
       "      <td>Vanilla Starbucks K-cups</td>\n",
       "      <td>1.398557e+09</td>\n",
       "      <td>04 27, 2014</td>\n",
       "      <td>6.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>A2CME0TQU2IVVB</td>\n",
       "      <td>B001AUPJVO</td>\n",
       "      <td>L5Momma</td>\n",
       "      <td>[1, 1]</td>\n",
       "      <td>This headset is great!  It worked in our 2007 ...</td>\n",
       "      <td>5.0</td>\n",
       "      <td>Awesome!</td>\n",
       "      <td>1.355875e+09</td>\n",
       "      <td>12 19, 2012</td>\n",
       "      <td>6.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>A2E5IDLX7R388S</td>\n",
       "      <td>B000055Y57</td>\n",
       "      <td>Jeff Andersen</td>\n",
       "      <td>[0, 0]</td>\n",
       "      <td>Scofield is one of my favorite musicians and i...</td>\n",
       "      <td>5.0</td>\n",
       "      <td>Straight ahead Jazz with the Scofield twist</td>\n",
       "      <td>1.402358e+09</td>\n",
       "      <td>06 10, 2014</td>\n",
       "      <td>6.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>A3CIEMYUGV6ZMR</td>\n",
       "      <td>0545265355</td>\n",
       "      <td>Adroit</td>\n",
       "      <td>[0, 0]</td>\n",
       "      <td>Wonderful book! I cried, well teared up at a f...</td>\n",
       "      <td>5.0</td>\n",
       "      <td>Greatest Book Ever!!!</td>\n",
       "      <td>1.334102e+09</td>\n",
       "      <td>04 11, 2012</td>\n",
       "      <td>6.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       reviewerID        asin     reviewerName helpful   \n",
       "0  A2T0RJ91B0PQ03  B0016CRVLW    Gerald DeWitt  [0, 0]  \\\n",
       "1  A3TYW0XA8HSGWB  B00EKR5S0Q  Linda E. Larson  [0, 0]   \n",
       "2  A2CME0TQU2IVVB  B001AUPJVO          L5Momma  [1, 1]   \n",
       "3  A2E5IDLX7R388S  B000055Y57    Jeff Andersen  [0, 0]   \n",
       "4  A3CIEMYUGV6ZMR  0545265355           Adroit  [0, 0]   \n",
       "\n",
       "                                          reviewText  overall   \n",
       "0  Beware!  This is NOT the original single versi...      1.0  \\\n",
       "1  This is my new most favorite k-cup coffee. I c...      5.0   \n",
       "2  This headset is great!  It worked in our 2007 ...      5.0   \n",
       "3  Scofield is one of my favorite musicians and i...      5.0   \n",
       "4  Wonderful book! I cried, well teared up at a f...      5.0   \n",
       "\n",
       "                                       summary  unixReviewTime   reviewTime   \n",
       "0                  Poor Quality Alternate Take    1.400630e+09  05 21, 2014  \\\n",
       "1                     Vanilla Starbucks K-cups    1.398557e+09  04 27, 2014   \n",
       "2                                     Awesome!    1.355875e+09  12 19, 2012   \n",
       "3  Straight ahead Jazz with the Scofield twist    1.402358e+09  06 10, 2014   \n",
       "4                        Greatest Book Ever!!!    1.334102e+09  04 11, 2012   \n",
       "\n",
       "   new_col  \n",
       "0      2.0  \n",
       "1      6.0  \n",
       "2      6.0  \n",
       "3      6.0  \n",
       "4      6.0  "
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# map_partitions also works on a Series: a Series goes in and a Series comes out.\n",
    "# A dedicated name is used instead of re-binding `res`, so re-running earlier cells stays safe.\n",
    "\n",
    "overall_plus_one = user_reviews_ddf.overall.map_partitions(lambda x: x+1)\n",
    "df_new = user_reviews_ddf.assign(new_col=overall_plus_one)\n",
    "df_new.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "dd69376b",
   "metadata": {},
   "source": [
    "## Groupby"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "143ad455",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# The groupby() operation groups data by an attribute and performs operations on these groups\n",
    "# Here we are computing the average rating of each ASIN.\n",
    "\n",
    "# IMPORTANT:\n",
    "\n",
    "# 1) The sort parameter sorts the group keys and is set to True by default. \n",
    "# We can get better performance by turning this off.\n",
    "\n",
    "# 2) Dask assumes that the groupby reduction returns an object that is small enough to fit into memory (DRAM).\n",
    "# By default, groupby methods return an object with only 1 partition. \n",
    "# If the returned object is large (such as the case when there are a large number of groups), it can result in a memory error.\n",
    "\n",
    "product_means = user_reviews_ddf.groupby('asin', sort=False).overall.mean()\n",
    "product_means.npartitions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "2d80b563",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "4"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# If the returned object is large (such as the case when there are a large number of groups), we can increase \n",
    "# the number of output partitions using the split_out argument.\n",
    "\n",
    "product_means = user_reviews_ddf.groupby('asin', sort=False).overall.mean(split_out=4)\n",
    "product_means.npartitions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "771f9f7b",
   "metadata": {},
   "outputs": [],
   "source": [
    "res = product_means.compute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "0292678d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "asin\n",
       "0002115751    4.000000\n",
       "0002226049    3.000000\n",
       "0006492460    5.000000\n",
       "0006514006    4.306818\n",
       "0006540686    4.000000\n",
       "Name: overall, dtype: float64"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "res.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bf265f3e",
   "metadata": {},
   "source": [
    "## Calling compute() On Related Operations Allows for Task Sharing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "9b31352b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 7.51 s, sys: 1.39 s, total: 8.9 s\n",
      "Wall time: 46.1 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "s1 = user_reviews_ddf.groupby(\"asin\").overall.sum().compute()\n",
    "s2 = user_reviews_ddf.groupby(\"asin\").overall.mean().compute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "67d193f4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 6.11 s, sys: 1.34 s, total: 7.45 s\n",
      "Wall time: 36.3 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "s1 = user_reviews_ddf.groupby(\"asin\").overall.sum()\n",
    "s2 = user_reviews_ddf.groupby(\"asin\").overall.mean()\n",
    "\n",
    "out = dd.compute(s1,s2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "a06e7ee8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "asin\n",
      "0000031887      93.0\n",
      "000100039X     239.0\n",
      "0002007770    1828.0\n",
      "0002115751       4.0\n",
      "0002178559       8.0\n",
      "               ...  \n",
      "B00LHZ093U       5.0\n",
      "B00LK80AYM       5.0\n",
      "B00LKJU5XW       2.0\n",
      "B00LMBWAP4       5.0\n",
      "B00LS14LLY       4.0\n",
      "Name: overall, Length: 2306233, dtype: float64 asin\n",
      "0000031887    4.428571\n",
      "000100039X    4.509434\n",
      "0002007770    4.383693\n",
      "0002115751    4.000000\n",
      "0002178559    4.000000\n",
      "                ...   \n",
      "B00LHZ093U    5.000000\n",
      "B00LK80AYM    5.000000\n",
      "B00LKJU5XW    2.000000\n",
      "B00LMBWAP4    5.000000\n",
      "B00LS14LLY    4.000000\n",
      "Name: overall, Length: 2306233, dtype: float64\n"
     ]
    }
   ],
   "source": [
    "# Separate the two results with a blank line; with print()'s default separator the\n",
    "# header of the second Series is glued onto the last line of the first one.\n",
    "print(out[0], out[1], sep='\\n\\n')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fe403ed5",
   "metadata": {},
   "source": [
    "## Do not call compute() multiple times"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "81554a5b",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ubuntu/dask_env/lib/python3.10/site-packages/dask/dataframe/multi.py:1287: UserWarning: Concatenating dataframes with unknown divisions.\n",
      "We're assuming that the indices of each dataframes are \n",
      " aligned. This assumption is not generally safe.\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                 c1            c2\n",
      "count  2.306233e+06  2.306233e+06\n",
      "mean   1.111721e+01  4.152129e+00\n",
      "std    4.247923e+01  1.149007e+00\n",
      "min    1.000000e+00  1.000000e+00\n",
      "25%    4.000000e+00  4.000000e+00\n",
      "50%    5.000000e+00  4.666667e+00\n",
      "75%    9.000000e+00  5.000000e+00\n",
      "max    8.345000e+03  5.000000e+00\n",
      "25.86827325820923\n"
     ]
    }
   ],
   "source": [
    "# Recommended pattern: keep every step lazy and call compute() exactly once at the end.\n",
    "\n",
    "start = time.perf_counter()  # perf_counter() is the clock intended for measuring elapsed time\n",
    "\n",
    "# rename() returns a renamed Series instead of mutating `.name` in place\n",
    "agg_1 = user_reviews_ddf.groupby('asin').overall.sum().rename('c1')\n",
    "agg_2 = user_reviews_ddf.groupby('asin').overall.mean().rename('c2')\n",
    "final = dd.concat([agg_1, agg_2], axis=1)\n",
    "submit = final.describe().compute()  # the only compute() call in this cell\n",
    "\n",
    "end = time.perf_counter()\n",
    "runtime = end-start\n",
    "\n",
    "print(submit)\n",
    "print(runtime)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "37b84038",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                 c1            c2\n",
      "count  2.306233e+06  2.306233e+06\n",
      "mean   1.111721e+01  4.152129e+00\n",
      "std    4.247923e+01  1.149007e+00\n",
      "min    1.000000e+00  1.000000e+00\n",
      "25%    4.000000e+00  4.000000e+00\n",
      "50%    5.000000e+00  4.666667e+00\n",
      "75%    9.000000e+00  5.000000e+00\n",
      "max    8.345000e+03  5.000000e+00\n",
      "57.539384603500366\n"
     ]
    }
   ],
   "source": [
    "# Anti-pattern, shown for comparison only: compute() is called after each aggregation,\n",
    "# so every call triggers its own computation before the final describe().\n",
    "\n",
    "start = time.perf_counter()  # perf_counter() is the clock intended for measuring elapsed time\n",
    "\n",
    "# rename() returns a renamed Series instead of mutating `.name` in place\n",
    "agg_1 = user_reviews_ddf.groupby('asin').overall.sum().compute().rename('c1')\n",
    "agg_2 = user_reviews_ddf.groupby('asin').overall.mean().compute().rename('c2')\n",
    "final = dd.concat([agg_1, agg_2], axis=1)\n",
    "submit = final.describe().compute()\n",
    "\n",
    "end = time.perf_counter()\n",
    "runtime = end-start\n",
    "\n",
    "print(submit)\n",
    "print(runtime)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "93667129",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}