{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "85289f7b", "metadata": {}, "outputs": [], "source": [ "import time\n", "import dask.dataframe as dd\n", "from dask.distributed import Client" ] }, { "cell_type": "markdown", "id": "1615379f", "metadata": {}, "source": [ "## Start Dask Scheduler and Workers" ] }, { "cell_type": "code", "execution_count": 2, "id": "e45daa5b", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2023-04-19 00:38:45,607 - distributed.nanny - WARNING - Restarting worker\n", "2023-04-19 00:38:45,612 - distributed.nanny - WARNING - Restarting worker\n", "2023-04-19 00:38:45,615 - distributed.nanny - WARNING - Restarting worker\n", "2023-04-19 00:38:45,619 - distributed.nanny - WARNING - Restarting worker\n" ] }, { "data": { "text/html": [ "
\n", "
\n", "
\n", "

Client

\n", "

Client-88630168-de4a-11ed-84db-817420155238

\n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", "
Connection method: Cluster objectCluster type: distributed.LocalCluster
\n", " Dashboard: http://127.0.0.1:8787/status\n", "
\n", "\n", " \n", "\n", " \n", "
\n", "

Cluster Info

\n", "
\n", "
\n", "
\n", "
\n", "

LocalCluster

\n", "

dc1fef2a

\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", "\n", " \n", "
\n", " Dashboard: http://127.0.0.1:8787/status\n", " \n", " Workers: 4\n", "
\n", " Total threads: 4\n", " \n", " Total memory: 15.62 GiB\n", "
Status: runningUsing processes: True
\n", "\n", "
\n", " \n", "

Scheduler Info

\n", "
\n", "\n", "
\n", "
\n", "
\n", "
\n", "

Scheduler

\n", "

Scheduler-2867af2c-8818-4ccf-b0a0-655f92b0fd01

\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
\n", " Comm: tcp://127.0.0.1:37113\n", " \n", " Workers: 4\n", "
\n", " Dashboard: http://127.0.0.1:8787/status\n", " \n", " Total threads: 4\n", "
\n", " Started: Just now\n", " \n", " Total memory: 15.62 GiB\n", "
\n", "
\n", "
\n", "\n", "
\n", " \n", "

Workers

\n", "
\n", "\n", " \n", "
\n", "
\n", "
\n", "
\n", " \n", "

Worker: 0

\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", "\n", " \n", "\n", "
\n", " Comm: tcp://127.0.0.1:45535\n", " \n", " Total threads: 1\n", "
\n", " Dashboard: http://127.0.0.1:43185/status\n", " \n", " Memory: 3.91 GiB\n", "
\n", " Nanny: tcp://127.0.0.1:34625\n", "
\n", " Local directory: /tmp/dask-worker-space/worker-zhmjn6se\n", "
\n", "
\n", "
\n", "
\n", " \n", "
\n", "
\n", "
\n", "
\n", " \n", "

Worker: 1

\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", "\n", " \n", "\n", "
\n", " Comm: tcp://127.0.0.1:44207\n", " \n", " Total threads: 1\n", "
\n", " Dashboard: http://127.0.0.1:40325/status\n", " \n", " Memory: 3.91 GiB\n", "
\n", " Nanny: tcp://127.0.0.1:39325\n", "
\n", " Local directory: /tmp/dask-worker-space/worker-tsr0itf1\n", "
\n", "
\n", "
\n", "
\n", " \n", "
\n", "
\n", "
\n", "
\n", " \n", "

Worker: 2

\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", "\n", " \n", "\n", "
\n", " Comm: tcp://127.0.0.1:42029\n", " \n", " Total threads: 1\n", "
\n", " Dashboard: http://127.0.0.1:35501/status\n", " \n", " Memory: 3.91 GiB\n", "
\n", " Nanny: tcp://127.0.0.1:44389\n", "
\n", " Local directory: /tmp/dask-worker-space/worker-r83z_24l\n", "
\n", "
\n", "
\n", "
\n", " \n", "
\n", "
\n", "
\n", "
\n", " \n", "

Worker: 3

\n", "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "\n", " \n", "\n", " \n", "\n", "
\n", " Comm: tcp://127.0.0.1:43807\n", " \n", " Total threads: 1\n", "
\n", " Dashboard: http://127.0.0.1:40331/status\n", " \n", " Memory: 3.91 GiB\n", "
\n", " Nanny: tcp://127.0.0.1:44079\n", "
\n", " Local directory: /tmp/dask-worker-space/worker-7fr9za74\n", "
\n", "
\n", "
\n", "
\n", " \n", "\n", "
\n", "
\n", "\n", "
\n", "
\n", "
\n", "
\n", " \n", "\n", "
\n", "
" ], "text/plain": [ "" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Create a Dask scheduler and a number of Dask workers. \n", "# If no arguments are specified then it will autodetect the number of CPU cores your system has \n", "# and the amount of memory and create workers to appropriately fill that.\n", "# It will also start the Dask Dashboard which is useful to \n", "# visualize the state of your cluster and computations.\n", "\n", "client = Client()\n", "client.restart()" ] }, { "cell_type": "markdown", "id": "ba665269", "metadata": {}, "source": [ "## Read CSV" ] }, { "cell_type": "code", "execution_count": 3, "id": "3245ba87", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
Dask DataFrame Structure:
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
reviewerIDasinreviewerNamehelpfulreviewTextoverallsummaryunixReviewTimereviewTime
npartitions=55
objectobjectobjectobjectobjectfloat64objectfloat64object
...........................
..............................
...........................
...........................
\n", "
\n", "
Dask Name: read-csv, 1 graph layer
" ], "text/plain": [ "Dask DataFrame Structure:\n", " reviewerID asin reviewerName helpful reviewText overall summary unixReviewTime reviewTime\n", "npartitions=55 \n", " object object object object object float64 object float64 object\n", " ... ... ... ... ... ... ... ... ...\n", "... ... ... ... ... ... ... ... ... ...\n", " ... ... ... ... ... ... ... ... ...\n", " ... ... ... ... ... ... ... ... ...\n", "Dask Name: read-csv, 1 graph layer" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Read CSV file: Dask doesn't load the data immediately. Dask has just read a few rows \n", "# at the start of the file, and inferred the column names and dtypes.\n", "\n", "user_reviews_ddf = dd.read_csv('demo_data.csv')\n", "user_reviews_ddf" ] }, { "cell_type": "code", "execution_count": 4, "id": "87d39b2b", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "878057262af64833bce8581c75a93e47", "version_major": 2, "version_minor": 0 }, "text/plain": [ "CytoscapeWidget(cytoscape_layout={'name': 'dagre', 'rankDir': 'BT', 'nodeSep': 10, 'edgeSep': 10, 'spacingFact…" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# visualize the task graph\n", "\n", "user_reviews_ddf.visualize()" ] }, { "cell_type": "markdown", "id": "b106d645", "metadata": {}, "source": [ "## Mean computation" ] }, { "cell_type": "code", "execution_count": 5, "id": "310966a4", "metadata": {}, "outputs": [], "source": [ "# Dask operations are evaluated lazily: Dask constructs the task graph of the \n", "# computation immediately but “evaluates” them only when necessary.\n", "\n", "mean_graph = user_reviews_ddf[\"overall\"].mean()" ] }, { "cell_type": "code", "execution_count": 6, "id": "03d9fb7a", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "f1a03bfa18754f9b9ab3507f84825fc9", "version_major": 2, "version_minor": 0 }, 
"text/plain": [ "CytoscapeWidget(cytoscape_layout={'name': 'dagre', 'rankDir': 'BT', 'nodeSep': 10, 'edgeSep': 10, 'spacingFact…" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# visualize task graph\n", "\n", "mean_graph.visualize()" ] }, { "cell_type": "code", "execution_count": 7, "id": "7c11cb1c", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "mean of 'overall' attribute is 4.163392099728361\n" ] } ], "source": [ "# trigger computation to calculate mean of column\n", "\n", "result = mean_graph.compute()\n", "print(f\"mean of 'overall' attribute is {result}\")" ] }, { "cell_type": "markdown", "id": "ee22f91a", "metadata": {}, "source": [ "## Implicit compute() for len(), head()" ] }, { "cell_type": "code", "execution_count": 8, "id": "da96abfd", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
reviewerIDasinreviewerNamehelpfulreviewTextoverallsummaryunixReviewTimereviewTime
0A2T0RJ91B0PQ03B0016CRVLWGerald DeWitt[0, 0]Beware! This is NOT the original single versi...1.0Poor Quality Alternate Take1.400630e+0905 21, 2014
1A3TYW0XA8HSGWBB00EKR5S0QLinda E. Larson[0, 0]This is my new most favorite k-cup coffee. I c...5.0Vanilla Starbucks K-cups1.398557e+0904 27, 2014
2A2CME0TQU2IVVBB001AUPJVOL5Momma[1, 1]This headset is great! It worked in our 2007 ...5.0Awesome!1.355875e+0912 19, 2012
3A2E5IDLX7R388SB000055Y57Jeff Andersen[0, 0]Scofield is one of my favorite musicians and i...5.0Straight ahead Jazz with the Scofield twist1.402358e+0906 10, 2014
4A3CIEMYUGV6ZMR0545265355Adroit[0, 0]Wonderful book! I cried, well teared up at a f...5.0Greatest Book Ever!!!1.334102e+0904 11, 2012
\n", "
" ], "text/plain": [ " reviewerID asin reviewerName helpful \n", "0 A2T0RJ91B0PQ03 B0016CRVLW Gerald DeWitt [0, 0] \\\n", "1 A3TYW0XA8HSGWB B00EKR5S0Q Linda E. Larson [0, 0] \n", "2 A2CME0TQU2IVVB B001AUPJVO L5Momma [1, 1] \n", "3 A2E5IDLX7R388S B000055Y57 Jeff Andersen [0, 0] \n", "4 A3CIEMYUGV6ZMR 0545265355 Adroit [0, 0] \n", "\n", " reviewText overall \n", "0 Beware! This is NOT the original single versi... 1.0 \\\n", "1 This is my new most favorite k-cup coffee. I c... 5.0 \n", "2 This headset is great! It worked in our 2007 ... 5.0 \n", "3 Scofield is one of my favorite musicians and i... 5.0 \n", "4 Wonderful book! I cried, well teared up at a f... 5.0 \n", "\n", " summary unixReviewTime reviewTime \n", "0 Poor Quality Alternate Take 1.400630e+09 05 21, 2014 \n", "1 Vanilla Starbucks K-cups 1.398557e+09 04 27, 2014 \n", "2 Awesome! 1.355875e+09 12 19, 2012 \n", "3 Straight ahead Jazz with the Scofield twist 1.402358e+09 06 10, 2014 \n", "4 Greatest Book Ever!!! 1.334102e+09 04 11, 2012 " ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Some functions like len and head also trigger a computation implicitly\n", "\n", "user_reviews_ddf.head()" ] }, { "cell_type": "code", "execution_count": 10, "id": "740e808f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "6158168" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Calling len() will: \n", "# - load actual data, (that is, load data into multiple pandas DataFrames) \n", "# - find length of each pandas DataFrame (also known as a partition) \n", "# - combine the subtotals to give you the final grand total\n", "\n", "len(user_reviews_ddf)" ] }, { "cell_type": "markdown", "id": "c9c78248", "metadata": {}, "source": [ "## Inspecting Individual Partitions" ] }, { "cell_type": "code", "execution_count": 11, "id": "9edb3920", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Total 
number of partitions = 55\n" ] } ], "source": [ "# Internally, a Dask DataFrame is split into many partitions, where each partition is one pandas DataFrame. \n", "\n", "num_partitions = user_reviews_ddf.npartitions\n", "print(f\"Total number of partitions = {num_partitions}\")" ] }, { "cell_type": "code", "execution_count": 12, "id": "edd2558d", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
reviewerIDasinreviewerNamehelpfulreviewTextoverallsummaryunixReviewTimereviewTime
0A2T0RJ91B0PQ03B0016CRVLWGerald DeWitt[0, 0]Beware! This is NOT the original single versi...1.0Poor Quality Alternate Take1.400630e+0905 21, 2014
1A3TYW0XA8HSGWBB00EKR5S0QLinda E. Larson[0, 0]This is my new most favorite k-cup coffee. I c...5.0Vanilla Starbucks K-cups1.398557e+0904 27, 2014
2A2CME0TQU2IVVBB001AUPJVOL5Momma[1, 1]This headset is great! It worked in our 2007 ...5.0Awesome!1.355875e+0912 19, 2012
3A2E5IDLX7R388SB000055Y57Jeff Andersen[0, 0]Scofield is one of my favorite musicians and i...5.0Straight ahead Jazz with the Scofield twist1.402358e+0906 10, 2014
4A3CIEMYUGV6ZMR0545265355Adroit[0, 0]Wonderful book! I cried, well teared up at a f...5.0Greatest Book Ever!!!1.334102e+0904 11, 2012
..............................
111952A1N2ZAC86P26BF6303823351David[1, 1]Help! is probably my favorite of the Beatles m...4.0The best of the Beatles films9.982656e+0808 20, 2001
111953AUFN1J7VJZL83B002OHE20GAmanda Banks[0, 0]This heater has worked out very well for a sma...5.0eliable, SAFE Heat1.402099e+0906 7, 2014
111954AGZK126DNQ2FN1401340970Cy B. Hilterman \"Cy. Hilterman\"[2, 2]As a person that has made many trips to Niagar...5.0Romance and adventure in the Niagara Falls area1.250035e+0908 12, 2009
111955A1LA51JOIGGD451400071550E.A. West[0, 0]The battle between good and evil continues in ...5.0Heroic battle between good and evil1.366589e+0904 22, 2013
111956A7THMVS5SV6HFB00B7EQPNEmavman70[1, 2]We ordered this fer our bluray player app and ...5.0GRANDSON LOVES IT!!!!1.366330e+0904 19, 2013
\n", "

111957 rows × 9 columns

\n", "
" ], "text/plain": [ " reviewerID asin reviewerName helpful \n", "0 A2T0RJ91B0PQ03 B0016CRVLW Gerald DeWitt [0, 0] \\\n", "1 A3TYW0XA8HSGWB B00EKR5S0Q Linda E. Larson [0, 0] \n", "2 A2CME0TQU2IVVB B001AUPJVO L5Momma [1, 1] \n", "3 A2E5IDLX7R388S B000055Y57 Jeff Andersen [0, 0] \n", "4 A3CIEMYUGV6ZMR 0545265355 Adroit [0, 0] \n", "... ... ... ... ... \n", "111952 A1N2ZAC86P26BF 6303823351 David [1, 1] \n", "111953 AUFN1J7VJZL83 B002OHE20G Amanda Banks [0, 0] \n", "111954 AGZK126DNQ2FN 1401340970 Cy B. Hilterman \"Cy. Hilterman\" [2, 2] \n", "111955 A1LA51JOIGGD45 1400071550 E.A. West [0, 0] \n", "111956 A7THMVS5SV6HF B00B7EQPNE mavman70 [1, 2] \n", "\n", " reviewText overall \n", "0 Beware! This is NOT the original single versi... 1.0 \\\n", "1 This is my new most favorite k-cup coffee. I c... 5.0 \n", "2 This headset is great! It worked in our 2007 ... 5.0 \n", "3 Scofield is one of my favorite musicians and i... 5.0 \n", "4 Wonderful book! I cried, well teared up at a f... 5.0 \n", "... ... ... \n", "111952 Help! is probably my favorite of the Beatles m... 4.0 \n", "111953 This heater has worked out very well for a sma... 5.0 \n", "111954 As a person that has made many trips to Niagar... 5.0 \n", "111955 The battle between good and evil continues in ... 5.0 \n", "111956 We ordered this fer our bluray player app and ... 5.0 \n", "\n", " summary unixReviewTime \n", "0 Poor Quality Alternate Take 1.400630e+09 \\\n", "1 Vanilla Starbucks K-cups 1.398557e+09 \n", "2 Awesome! 1.355875e+09 \n", "3 Straight ahead Jazz with the Scofield twist 1.402358e+09 \n", "4 Greatest Book Ever!!! 1.334102e+09 \n", "... ... ... \n", "111952 The best of the Beatles films 9.982656e+08 \n", "111953 eliable, SAFE Heat 1.402099e+09 \n", "111954 Romance and adventure in the Niagara Falls area 1.250035e+09 \n", "111955 Heroic battle between good and evil 1.366589e+09 \n", "111956 GRANDSON LOVES IT!!!! 
1.366330e+09 \n", "\n", " reviewTime \n", "0 05 21, 2014 \n", "1 04 27, 2014 \n", "2 12 19, 2012 \n", "3 06 10, 2014 \n", "4 04 11, 2012 \n", "... ... \n", "111952 08 20, 2001 \n", "111953 06 7, 2014 \n", "111954 08 12, 2009 \n", "111955 04 22, 2013 \n", "111956 04 19, 2013 \n", "\n", "[111957 rows x 9 columns]" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "partition1 = user_reviews_ddf.partitions[0].compute()\n", "partition1" ] }, { "cell_type": "code", "execution_count": 13, "id": "4a5ae3fe", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "10" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# The number of partitions is often automatically determined based on available \n", "# physical memory and the number of cores, but can also be manually specified.\n", "\n", "user_reviews_repart_ddf = user_reviews_ddf.repartition(npartitions=10) \n", "user_reviews_repart_ddf.npartitions" ] }, { "cell_type": "markdown", "id": "49f9b27d", "metadata": {}, "source": [ "## Map-Partition" ] }, { "cell_type": "code", "execution_count": 14, "id": "cb1b8c9b", "metadata": {}, "outputs": [], "source": [ "# Apply a Python function to each DataFrame partition.\n", "# Here we apply a function with arguments and keywords to a DataFrame, resulting in a Series.\n", "\n", "def myadd(df, a, b=1):\n", " return df.overall + a + b\n", "\n", "res = user_reviews_ddf.map_partitions(myadd, 1, b=0)" ] }, { "cell_type": "code", "execution_count": 15, "id": "0943bdea", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 2.0\n", "1 6.0\n", "2 6.0\n", "3 6.0\n", "4 6.0\n", " ... 
\n", "112342 6.0\n", "112343 6.0\n", "112344 6.0\n", "112345 6.0\n", "112346 5.0\n", "Name: overall, Length: 6158168, dtype: float64" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "res.compute()" ] }, { "cell_type": "code", "execution_count": 16, "id": "49864501", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
reviewerIDasinreviewerNamehelpfulreviewTextoverallsummaryunixReviewTimereviewTimenew_col
0A2T0RJ91B0PQ03B0016CRVLWGerald DeWitt[0, 0]Beware! This is NOT the original single versi...1.0Poor Quality Alternate Take1.400630e+0905 21, 20142.0
1A3TYW0XA8HSGWBB00EKR5S0QLinda E. Larson[0, 0]This is my new most favorite k-cup coffee. I c...5.0Vanilla Starbucks K-cups1.398557e+0904 27, 20146.0
2A2CME0TQU2IVVBB001AUPJVOL5Momma[1, 1]This headset is great! It worked in our 2007 ...5.0Awesome!1.355875e+0912 19, 20126.0
3A2E5IDLX7R388SB000055Y57Jeff Andersen[0, 0]Scofield is one of my favorite musicians and i...5.0Straight ahead Jazz with the Scofield twist1.402358e+0906 10, 20146.0
4A3CIEMYUGV6ZMR0545265355Adroit[0, 0]Wonderful book! I cried, well teared up at a f...5.0Greatest Book Ever!!!1.334102e+0904 11, 20126.0
\n", "
" ], "text/plain": [ " reviewerID asin reviewerName helpful \n", "0 A2T0RJ91B0PQ03 B0016CRVLW Gerald DeWitt [0, 0] \\\n", "1 A3TYW0XA8HSGWB B00EKR5S0Q Linda E. Larson [0, 0] \n", "2 A2CME0TQU2IVVB B001AUPJVO L5Momma [1, 1] \n", "3 A2E5IDLX7R388S B000055Y57 Jeff Andersen [0, 0] \n", "4 A3CIEMYUGV6ZMR 0545265355 Adroit [0, 0] \n", "\n", " reviewText overall \n", "0 Beware! This is NOT the original single versi... 1.0 \\\n", "1 This is my new most favorite k-cup coffee. I c... 5.0 \n", "2 This headset is great! It worked in our 2007 ... 5.0 \n", "3 Scofield is one of my favorite musicians and i... 5.0 \n", "4 Wonderful book! I cried, well teared up at a f... 5.0 \n", "\n", " summary unixReviewTime reviewTime \n", "0 Poor Quality Alternate Take 1.400630e+09 05 21, 2014 \\\n", "1 Vanilla Starbucks K-cups 1.398557e+09 04 27, 2014 \n", "2 Awesome! 1.355875e+09 12 19, 2012 \n", "3 Straight ahead Jazz with the Scofield twist 1.402358e+09 06 10, 2014 \n", "4 Greatest Book Ever!!! 1.334102e+09 04 11, 2012 \n", "\n", " new_col \n", "0 2.0 \n", "1 6.0 \n", "2 6.0 \n", "3 6.0 \n", "4 6.0 " ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Here we apply a function to a Series resulting in a Series\n", "\n", "res = user_reviews_ddf.overall.map_partitions(lambda x: x+1)\n", "df_new = user_reviews_ddf.assign(new_col=res)\n", "df_new.head()" ] }, { "cell_type": "markdown", "id": "dd69376b", "metadata": {}, "source": [ "## Groupby" ] }, { "cell_type": "code", "execution_count": 17, "id": "143ad455", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# The groupby() operation groups data by an attribute and performs operations on these groups\n", "# Here we are computing the average rating of each ASIN.\n", "\n", "# IMPORTANT:\n", "\n", "# 1) The sort parameter sorts the group keys and is set to True by default, \n", "# We can get better 
performance by turning this off.\n", "\n", "# 2) Dask assumes that the groupby reduction returns an object that is small enough to fit into memory (DRAM).\n", "# By default, groupby methods return an object with only 1 partition. \n", "# If the returned object is large (for example, when there are a large number of groups), it can result in a memory error.\n", "\n", "product_means = user_reviews_ddf.groupby('asin', sort=False).overall.mean()\n", "product_means.npartitions" ] }, { "cell_type": "code", "execution_count": 18, "id": "2d80b563", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "4" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# If the returned object is large (for example, when there are a large number of groups), we can increase \n", "# the number of output partitions using the split_out argument.\n", "\n", "product_means = user_reviews_ddf.groupby('asin', sort=False).overall.mean(split_out=4)\n", "product_means.npartitions" ] }, { "cell_type": "code", "execution_count": 21, "id": "771f9f7b", "metadata": {}, "outputs": [], "source": [ "res = product_means.compute()" ] }, { "cell_type": "code", "execution_count": 22, "id": "0292678d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "asin\n", "0002115751 4.000000\n", "0002226049 3.000000\n", "0006492460 5.000000\n", "0006514006 4.306818\n", "0006540686 4.000000\n", "Name: overall, dtype: float64" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "res.head()" ] }, { "cell_type": "markdown", "id": "bf265f3e", "metadata": {}, "source": [ "## Calling compute() On Related Operations Allows for Task Sharing" ] }, { "cell_type": "code", "execution_count": 20, "id": "9b31352b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 7.51 s, sys: 1.39 s, total: 8.9 s\n", "Wall time: 46.1 s\n" ] } ], "source": [ "%%time\n", "s1 = 
user_reviews_ddf.groupby(\"asin\").overall.sum().compute()\n", "s2 = user_reviews_ddf.groupby(\"asin\").overall.mean().compute()" ] }, { "cell_type": "code", "execution_count": 21, "id": "67d193f4", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 6.11 s, sys: 1.34 s, total: 7.45 s\n", "Wall time: 36.3 s\n" ] } ], "source": [ "%%time\n", "s1 = user_reviews_ddf.groupby(\"asin\").overall.sum()\n", "s2 = user_reviews_ddf.groupby(\"asin\").overall.mean()\n", "\n", "out = dd.compute(s1,s2)" ] }, { "cell_type": "code", "execution_count": 22, "id": "a06e7ee8", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "asin\n", "0000031887 93.0\n", "000100039X 239.0\n", "0002007770 1828.0\n", "0002115751 4.0\n", "0002178559 8.0\n", " ... \n", "B00LHZ093U 5.0\n", "B00LK80AYM 5.0\n", "B00LKJU5XW 2.0\n", "B00LMBWAP4 5.0\n", "B00LS14LLY 4.0\n", "Name: overall, Length: 2306233, dtype: float64 asin\n", "0000031887 4.428571\n", "000100039X 4.509434\n", "0002007770 4.383693\n", "0002115751 4.000000\n", "0002178559 4.000000\n", " ... \n", "B00LHZ093U 5.000000\n", "B00LK80AYM 5.000000\n", "B00LKJU5XW 2.000000\n", "B00LMBWAP4 5.000000\n", "B00LS14LLY 4.000000\n", "Name: overall, Length: 2306233, dtype: float64\n" ] } ], "source": [ "print(out[0], out[1])" ] }, { "cell_type": "markdown", "id": "fe403ed5", "metadata": {}, "source": [ "## Do not call compute() multiple times" ] }, { "cell_type": "code", "execution_count": 20, "id": "81554a5b", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/ubuntu/dask_env/lib/python3.10/site-packages/dask/dataframe/multi.py:1287: UserWarning: Concatenating dataframes with unknown divisions.\n", "We're assuming that the indices of each dataframes are \n", " aligned. 
This assumption is not generally safe.\n", " warnings.warn(\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " c1 c2\n", "count 2.306233e+06 2.306233e+06\n", "mean 1.111721e+01 4.152129e+00\n", "std 4.247923e+01 1.149007e+00\n", "min 1.000000e+00 1.000000e+00\n", "25% 4.000000e+00 4.000000e+00\n", "50% 5.000000e+00 4.666667e+00\n", "75% 9.000000e+00 5.000000e+00\n", "max 8.345000e+03 5.000000e+00\n", "25.86827325820923\n" ] } ], "source": [ "start = time.time()\n", "\n", "agg_1 = user_reviews_ddf.groupby('asin').overall.sum()\n", "agg_1.name = 'c1'\n", "agg_2 = user_reviews_ddf.groupby('asin').overall.mean()\n", "agg_2.name = 'c2'\n", "final = dd.concat([agg_1, agg_2], axis=1)\n", "submit = final.describe().compute()\n", "\n", "end = time.time()\n", "runtime = end-start\n", "\n", "print(submit)\n", "print(runtime)" ] }, { "cell_type": "code", "execution_count": 21, "id": "37b84038", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " c1 c2\n", "count 2.306233e+06 2.306233e+06\n", "mean 1.111721e+01 4.152129e+00\n", "std 4.247923e+01 1.149007e+00\n", "min 1.000000e+00 1.000000e+00\n", "25% 4.000000e+00 4.000000e+00\n", "50% 5.000000e+00 4.666667e+00\n", "75% 9.000000e+00 5.000000e+00\n", "max 8.345000e+03 5.000000e+00\n", "57.539384603500366\n" ] } ], "source": [ "start = time.time()\n", "\n", "agg_1 = user_reviews_ddf.groupby('asin').overall.sum().compute()\n", "agg_1.name = 'c1'\n", "agg_2 = user_reviews_ddf.groupby('asin').overall.mean().compute()\n", "agg_2.name = 'c2'\n", "final = dd.concat([agg_1, agg_2], axis=1)\n", "submit = final.describe().compute()\n", "\n", "end = time.time()\n", "runtime = end-start\n", "\n", "print(submit)\n", "print(runtime)" ] }, { "cell_type": "code", "execution_count": null, "id": "93667129", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { 
"codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.6" } }, "nbformat": 4, "nbformat_minor": 5 }