MGTA 495: Assignment Week 9

Word Count on Amazon EMR


Tasks:

Due: Tuesday 9th March 11:59 PM PST


Remember: when in doubt, read the documentation first. It's always helpful to search for the class that you're trying to work with, e.g. pyspark.sql.DataFrame.

PySpark API Documentation: https://spark.apache.org/docs/latest/api/python/index.html

1. Upload the 1M dataset to S3

To make the datasets available to the EMR cluster, we need to upload the data files to Amazon S3. Follow these steps to do so:

  1. In the Amazon console, open the Services menu on the top left and select S3
  2. Create a bucket if you don't have one yet. Use the default settings, but your bucket name must be unique.
  3. Create a folder in your bucket, e.g. data, using the default settings. (Don't upload the data file to the root of the bucket; we'll also use this bucket for later assignments, so it's good to keep everything organized.)
  4. Enter the folder and upload the txt file. Do NOT upload the zip, as Spark won't know what to do with it.

You can use this dataset now: copy its S3 URI, read it into a dataframe, and explore it however you like.

This exercise is only meant to show you how to create your own S3 buckets and read data from them. The actual task is to read the data (BookReviews_5M.txt) from a different S3 bucket and work on that dataset.
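For example, reading your uploaded file into a dataframe might look like the sketch below. The bucket, folder, and file names are placeholders; substitute the ones you actually used:

    # A minimal sketch, assuming the txt file was uploaded to a folder named `data`
    # in your own bucket; both names are placeholders.
    reviews_1m = spark.read.text("s3://<your-bucket>/data/<your-1M-file>.txt")
    reviews_1m.show(5, truncate=False)   # quick sanity check of the first few lines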

2. Copy the 5M dataset to HDFS

We'll copy only the 5M dataset to HDFS. The copy is done by adding a step while creating the cluster.

On Amazon EMR, this works differently than on the local container. We have already uploaded the 5M reviews data to the S3 bucket s3://rsm-emr01. Follow the steps below to copy the data into the HDFS of your EMR cluster:

  1. In AWS, go to Services -> EMR.
  2. Click on 'Create cluster'.
  3. Click on 'Go to advanced options'.
  4. Select EMR release 6.2.0 and add the required software packages as shown in class.
  5. In the section that says 'Steps (optional)', select 'Custom JAR' and click on 'Add Step'.
  6. Enter Custom JAR as Step type, command-runner.jar as JAR location (remove the "s3://" prefix!), and s3-dist-cp --src=s3://rsm-emr01/data --dest=hdfs:///data as Arguments
  7. Let the 'Action on failure' remain as 'Continue' and click on Add.
  8. Specify the instance count for master and core nodes
  9. Give your cluster a name, select an EC2 keypair that you should have created earlier. If you have not created an EC2 keypair, stop here. Go back and create a keypair first, then come back to this step.
  10. Proceed to create the cluster and wait for it to complete; this typically takes ~5-8 minutes. The required data files will then be under hdfs:///data/, and we will read the 5M dataset from hdfs:///data/BookReviews_5M.txt to compute the word counts.

Note that the Notebook container (which hosts the notebook and is separate from the EMR cluster) does not have an HDFS installation, which means the usual hadoop commands are not available to you. Instead, you must check whether the data was copied correctly by reading it in this notebook. (If this is confusing, don't worry; you won't need the hadoop CLI for this assignment.)
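For example, once the cluster step has finished, a quick read from HDFS in this notebook is enough to confirm the copy worked (a sketch, not a required output):

    # A minimal sketch: if the s3-dist-cp step succeeded, this path exists and the
    # read works; otherwise Spark raises an error about the missing path.
    check_df = spark.read.text("hdfs:///data/BookReviews_5M.txt")
    print(check_df.count())   # should be on the order of 5 million lines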

3. Start Spark Session

Note that you don't need to manually start the Spark session. Amazon does this for you in the background, so the Spark session is started as soon as you import pyspark.

Remember that the kernel for running this Notebook is PySpark and not Python 3.
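In practice, that means a first cell like the sketch below should work without any SparkSession.builder boilerplate; the PySpark kernel defines spark for you:

    # A minimal sketch: no explicit SparkSession setup is needed on EMR with the
    # PySpark kernel; `spark` is already defined when this cell runs.
    import pyspark
    print(spark.version)   # prints the Spark version (3.0.x on EMR 6.2.0)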

4. Examine the data

Your task:

  1. Examine the contents of the dataframe that you've just read from file.

Expected output:

  1. Print the schema of the raw dataframe, as well as its first 25 rows.
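A sketch of one way to do this is shown below; the dataframe name raw_df is chosen here only for illustration:

    # A minimal sketch: read the raw text file from HDFS and inspect it.
    raw_df = spark.read.text("hdfs:///data/BookReviews_5M.txt")
    raw_df.printSchema()              # a single string column named `value`
    raw_df.show(25, truncate=False)   # the first 25 rows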

5. Clean the data

Your task:

  1. Remove all punctuation and convert all characters to lower case.

Expected output:

  1. The first 25 rows of a dataframe, with a column containing the cleaned sentences.
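One possible approach, sketched below, uses regexp_replace and lower from pyspark.sql.functions; the regex and the column name sentence are assumptions, and any equivalent cleaning works (this continues from the raw_df sketch above):

    from pyspark.sql import functions as F

    # A minimal sketch: drop every character that is not a letter, digit, or space,
    # then lower-case what remains. `sentence` is an illustrative column name.
    cleaned_df = raw_df.select(
        F.lower(F.regexp_replace(F.col("value"), r"[^a-zA-Z0-9 ]", "")).alias("sentence")
    )
    cleaned_df.show(25, truncate=False)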

6. Get dataframe containing unique words and their counts

Your task:

  1. Split each sentence into words, using the space character (' ') as the delimiter.
  2. Put each word into its own row and count the occurrences of each unique word. Put your results into a new dataframe (a sketch of one approach follows below).
  3. Print out the first 5 rows of the dataframe.

Expected output:

  1. First 5 rows of the output dataframe.
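A sketch of one possible approach, using split, explode, and a groupBy count (continuing from cleaned_df above; column and dataframe names are illustrative):

    from pyspark.sql import functions as F

    # A minimal sketch: split each cleaned sentence on spaces, put one word per row,
    # then count how many times each unique word appears.
    words_df = cleaned_df.select(F.explode(F.split(F.col("sentence"), " ")).alias("word"))
    word_counts_df = words_df.groupBy("word").count()
    word_counts_df.show(5)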

7. Sort the word count dataframe in descending order

Your task:

  1. Sort the previous dataframe by the counts column in descending order. Put your results into a new dataframe.

Expected output:

  1. First 25 rows of the sorted word count dataframe; the first row should have the maximum count.
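For example, continuing from the word_counts_df sketch above:

    from pyspark.sql import functions as F

    # A minimal sketch: sort by the `count` column produced by groupBy().count(),
    # largest counts first.
    sorted_counts_df = word_counts_df.orderBy(F.desc("count"))
    sorted_counts_df.show(25)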

8. Record the execution time

Your task:

  1. Print the execution time.

Expected output: The execution time. No particular value is expected.
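One common pattern, sketched below as an assumption (your notebook template may already provide its own timing cells), is to record time.time() in the first cell and print the difference at this point:

    import time

    # A minimal sketch: in the real notebook, `start_time` would be set in a cell at
    # the very top, and the print would run after all the work has finished.
    start_time = time.time()   # placed here only so the sketch runs on its own
    # ... the rest of the notebook's work happens in between ...
    print(f"Execution time: {time.time() - start_time:.2f} seconds")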

9. Save the sorted word counts directly to S3 as a CSV file

NOTE: Spark uses a distributed memory system, and stores working data in fragments known as "partitions". This is advantageous when a Spark cluster spans multiple machines, as each machine only needs part of the working data to do its own job. By default, Spark saves each of these data partitions to an individual file to avoid I/O collisions. We want only one output file, so we'll need to fuse all the data into a single partition first.

Your task:

  1. Coalesce the previous dataframe to one partition. This makes sure that all our results will end up in the same CSV file.
  2. Save the 1-partition dataframe to S3 using the DataFrame.write.csv() method (see the sketch after this list). Be sure to store the file in S3 at a location you can remember. The save path should look something like s3://<your-bucket>/<your-folder>/<your-result-file>.csv; change these parameters to point to your own bucket and folder.
  3. Remember to save the CSV file along with the header.
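A sketch of what the write could look like, continuing from the sorted_counts_df sketch above (the S3 path is a placeholder; point it at your own bucket and folder):

    # A minimal sketch: collapse the results to a single partition, then write one
    # CSV file with a header row. Replace the placeholder path with your own.
    sorted_counts_df.coalesce(1).write.csv(
        "s3://<your-bucket>/<your-folder>/<your-result-file>.csv",
        header=True,
    )

Note that Spark treats this path as a directory: the actual data lands in a part-00000-...csv file inside it, which is the file you will download in section 10.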

Note:

You only need to run sections 9 and 10 once, for the 5M dataset.

Section 11 requires you to run this Notebook multiple times; for those runs you can comment out the code in section 9 so that the Notebook is easier to re-run.

10. Download the CSV file from S3 to your local machine and create the expected CSV output file

  1. Navigate to the S3 folder where you stored your output
  2. Note the name of this file; it should look something like part-00000-xx.....xx.csv.
  3. Click on this file; this opens the file properties.
  4. Beside 'Copy S3 URI', click on 'Object actions' and then click on 'Download'.
  5. After downloading the file, you can rename it to anything, say results.csv.
  6. We want you to submit a CSV containing the first 100 rows of the results file. To do this, use the command head -n 100 results.csv > 100_rows.csv in a terminal. You can also do so manually, since CSV files are plain text. Remember that we want the first 100 lines, which include the header - so it is the header plus 99 data rows.

Note on Autograder

The autograder will check whether the results you submit in the 100_rows.csv file exactly match the expected results. The autograder runs on your submitted csv file, not on the notebook, so you are free to change the notebook in any way that you want.

The csv file should look something like this (the top words and their counts are shown only for illustration; the actual words and counts will differ in your output):

    word,count
    the,123
    i,121
    and,99
    ...97 more rows

11. Execution times on different datasets and settings

You need to experiment with running this whole Jupyter Notebook using different numbers of master and core nodes, and report the execution time of the Notebook as noted in the earlier section.

  1. Create a cluster with the required number of master and core nodes.
  2. Then go to the Kernel tab in JupyterLab and do 'Restart and run all cells'.
  3. Note the time printed in the cell just before section 9; this is the time it took for all the code to run.
  4. Then start a new cluster with a different configuration of master and core nodes (and dataset, as required), run the Notebook again, and note the execution times.

Fill in the times in the table below.

Dataset  #Master Nodes  #Core Nodes  Runtime_1  Runtime_2  Runtime_3  Mean    Std
1M       1              1            44.94      44.98      44.87      44.93   0.056
5M       1              1            114.71     114.89     114.76     114.79  0.093
5M       1              3            49.48      49.52      49.58      49.53   0.050

12. Screenshots of terminated EMR clusters

You need to add a screenshot of your Amazon EMR 'Clusters' page which shows that all of your clusters have been terminated after you are done with your assignment.

To submit the screenshot, follow the instructions on Gradescope; there will be a separate submission item for this screenshot.