Remember: when in doubt, read the documentation first. It's always helpful to search for the class that you're trying to work with, e.g. pyspark.sql.DataFrame.
PySpark API Documentation: https://spark.apache.org/docs/latest/api/python/index.html
Spark DataFrame Guide: https://spark.apache.org/docs/latest/sql-programming-guide.html
Spark works best with the Hadoop File System (HDFS). Here are the steps you need to start up HDFS in this container and copy files between the local file system and HDFS:
1. Copy the scripts init-dfs.sh, start-dfs.sh, and stop-dfs.sh from Canvas to your local folder.
2. cd to the directory where you copied the scripts. You can do this either from a terminal in JupyterLab or from the ZSH terminal on the container.
3. Run bash ./init-dfs.sh to initialize the HDFS service. If prompted, answer y to erase HDFS.
4. Run bash ./start-dfs.sh to start the HDFS service. When this is complete, run jps to confirm that NameNode, DataNode, and SecondaryNameNode are running.
5. cd to the directory where you have the downloaded data file. If running on the RSM server, copy the file to the container from the distribution folder.
6. Run hadoop fs -mkdir /W7 to create a directory named W7 in the root directory of your HDFS file system.
7. Run hadoop fs -copyFromLocal <your-data-file> /W7 to copy the data file to the W7 folder you just created. You may run hadoop fs -help to learn more about navigating and using HDFS manually.
8. After you're done, run hadoop fs -ls / to list the files and check that the file was copied.
Expected output: None
Expected Output: None
# initialize Spark
import pyspark
from pyspark.sql import SparkSession, Row
conf = pyspark.SparkConf().setAll([('spark.master', 'local[4]'),
('spark.app.name', 'Word Count')])
spark = SparkSession.builder.config(conf=conf).getOrCreate()
# Verify that the correct versions of spark and pyspark are installed.
# print (spark.version, pyspark.version.__version__)
assert(spark.version == '3.0.1')
assert(pyspark.version.__version__ == '3.0.1')
# record the starting time of execution for timing this notebook
import time
start_time = time.time()
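If you want to confirm how many cores the local master is actually using (useful for the scaling experiment in section 9), one simple probe is the sketch below; for a local[N] master this typically reports N.
# Optional: report the parallelism of the local master (typically N for local[N])
print(spark.sparkContext.defaultParallelism)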
Read data from the BookReviews_1M.txt file. You can find this file on Google Drive (the link is shared on Piazza); you will need to copy it to your HDFS file system using the commands described in section 1.
Expected output: None
# Read data from HDFS
dataFileName = "hdfs:///W7/BookReviews_1M.txt"
# Read data from the file above, convert it to a dataframe.
df = spark.read.text(dataFileName) \
.cache()
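Optionally, you can peek at what spark.read.text produced; it yields a DataFrame with a single string column named value, one row per line of the input file.
# Optional sanity check: one string column named "value", one row per input line
df.printSchema()   # root |-- value: string (nullable = true)
print(df.count())  # should be on the order of 1,000,000 lines for BookReviews_1M.txt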
Your task:
Expected output:
Your output would look like this:
# We provide the following function for building a column expression for Task 1.
# Do not change this cell.
# NOTE: Counterintuitively, column objects do NOT store any data; instead they store column expressions (transformations).
# The below function takes in a column object, and adds more expressions to it to make a more complex transformation.
# Once we have a column object representing the expressions we want, use DataFrame.select(column) to apply the expressions
from pyspark.sql.functions import regexp_replace, trim, col, lower
def removePunctuation(column):
"""Removes punctuation, changes to lower case, and strips leading and trailing spaces."""
return trim(lower(regexp_replace(column, "[^A-Za-z0-9 ]", ""))).alias("sentence")
# Recommended: take a look at the contents of a column object returned from removePunctuation. What's in there?
print(removePunctuation(df.value))
Column<b'trim(lower(regexp_replace(value, [^A-Za-z0-9 ], ))) AS `sentence`'>
# execute the column expressions generated by removePunctuation() to clean the sentences
# After that, use the show() function to print the first 25 rows of the dataframe
# Hint: you'll need the Column object returned by removePunctuation().
df_rmv_punc = df.select(removePunctuation(df.value))
df_rmv_punc.show(25)
+--------------------+
|            sentence|
+--------------------+
|this was the firs...|
|also after going ...|
|as with all of ms...|
|ive not read any ...|
|this romance nove...|
|carolina garcia a...|
|not only can she ...|
|once again garcia...|
|the timing is jus...|
|engaging dark rea...|
|set amid the back...|
|this novel is a d...|
|if readers are ad...|
| reviewed by phyllis|
|      apooo bookclub|
|a guilty pleasure...|
|in the tradition ...|
|beryl unger top e...|
|what follows is a...|
|the book flap say...|
|id never before r...|
|the novels narrat...|
|it is centered on...|
|if you like moder...|
|beryl unger is a ...|
+--------------------+
only showing top 25 rows
Your task:
Expected output:
# We assemble the 'split' and 'explode' column expressions, then apply them to the sentence column
from pyspark.sql.functions import split, explode
# YOUR CODE HERE for printing the first 5 rows of the dataframe after the required operations
df1 = df_rmv_punc.select(split("sentence", " ").alias('csv'))
df2 = df1.select(explode('csv').alias('word'))
df2.show(5)
+-----+
| word|
+-----+
| this|
|  was|
|  the|
|first|
| time|
+-----+
only showing top 5 rows
The output after filtering empty rows would be:
# Let's filter out all empty rows in the dataframe.
from pyspark.sql.functions import length
# Hint: You may use the length() function imported above to select rows where the word length is greater than 0
df3 = df2.filter(length("word") > 0)
df3.show(25)
+--------------+
|          word|
+--------------+
|          this|
|           was|
|           the|
|         first|
|          time|
|             i|
|          read|
|garciaaguilera|
|             i|
|          came|
|          upon|
|           the|
|          name|
|            of|
|          this|
|          book|
|            on|
|          live|
|          with|
|         regis|
|           and|
|         kelly|
|          this|
|          book|
|           was|
+--------------+
only showing top 25 rows
# Group the dataframe by unique words, then count each group
# Hint: how do you group rows in a DataFrame?
# YOUR CODE HERE
df4 = df3.groupBy('word').count()
df4.show(25)
+-----------+-----+
|       word|count|
+-----------+-----+
|      still|52574|
|       hope| 6729|
|       some|74982|
|      those|22067|
|        few|33375|
|    degrade|  343|
|  bookshelf|  900|
| amazonings|    1|
|  recognize| 2008|
|      inner|  819|
|     harder| 1441|
|    lyrical|   14|
|  viewpoint|   37|
|handicapped|   51|
|      spoil|   84|
|   historys|    2|
|   everyday| 2493|
|  meursault|    1|
|        art| 1291|
|  involving|  142|
|  connected| 9172|
|     spared|   43|
|     doubts|  320|
|      1970s|  175|
|     brands| 4228|
+-----------+-----+
only showing top 25 rows
Your task:
Expected output:
# Sort the dataframe by the 'count' column
# Hint: the DataFrame.count() function collides with the counts column we want to use.
# How else can we specify the column to sort by?
# Uncomment the next two lines and fill your code
# wordCountsSortedDF = <Your code>
# wordCountsSortedDF.show(25)
wordCountsSortedDF = df4.sort("count", ascending=False)
wordCountsSortedDF.show(25)
+-----+-------+
| word|  count|
+-----+-------+
|  the|2053274|
|    i|1228198|
|  and|1079515|
|   to|1070092|
|    a|1026310|
|   it| 850403|
|   is| 633026|
|  for| 574222|
|   of| 568148|
| this| 552837|
|   my| 446637|
|   in| 420544|
| with| 398539|
| that| 387286|
|  you| 359814|
|   on| 337447|
| have| 322965|
|  but| 293731|
|  not| 279774|
|  was| 259645|
|   as| 234367|
|  are| 217605|
|great| 195349|
|   so| 188233|
| they| 175861|
+-----+-------+
only showing top 25 rows
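As the hint in the previous cell suggests, there is more than one way to refer to the counts column; an equivalent formulation uses an explicit Column expression, which avoids any confusion with the DataFrame.count() method:
# Equivalent sort using an explicit Column expression instead of the column name string
from pyspark.sql.functions import col
wordCountsSortedDF = df4.sort(col("count").desc())
wordCountsSortedDF.show(25)  # same output as above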
Your task:
Expected output: The execution time. No particular value is expected.
# print the time since execution start - This will be needed in section 10.
print(time.time() - start_time)
34.142656087875366
NOTE: Spark uses a distributed memory system, and stores working data in fragments known as "partitions". This is advantageous when a Spark cluster spans multiple machines, as each machine only requires part of the working data to do its own job. By default, Spark saves each of these data partitions into an individual file to avoid I/O collisions. We want only one output file, so we'll need to fuse all the data into a single partition first.
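To see this partitioning in action, a quick (optional) sketch like the one below inspects the number of partitions before and after coalescing; the count before coalescing depends on your data and shuffle settings.
# Optional: inspect partition counts before and after coalescing to one partition
print(wordCountsSortedDF.rdd.getNumPartitions())              # depends on shuffle settings
print(wordCountsSortedDF.coalesce(1).rdd.getNumPartitions())  # 1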
Your task: Save the sorted word counts to HDFS as a single CSV file, named hdfs:///<your-result-file>.csv, for convenience.
Expected output: None
# Save results to HDFS
wordCountsSortedDF.coalesce(1).write.csv("hdfs:///wordCountsSorted.csv", header=True, mode="overwrite")
# Stop Spark session
spark.stop()
Now that we have our results stored in HDFS, we need to copy them back to the local file system to access them. This process may sound cumbersome, but it is a necessary consequence of Spark and Hadoop's distributed architecture and their ability to scale up to arbitrarily large datasets and computing operations.
Copying the results from HDFS to the local file system is fairly simple. Here are the steps:
1. Run hadoop fs -ls / to list the root directory of HDFS. You should see the CSV file that you saved. Counterintuitively, this CSV "file" is actually a folder, which contains individually saved files from each partition of the saved dataframe (see above for data partitioning).
2. Run hadoop fs -ls /<your-result-file>.csv/ to see what's inside the saved folder. Since we coalesced the dataframe to a single partition, you should find only one saved partition in this folder, itself saved as a CSV. Note the name of this file; it should look something like part-00000-xx.....xx.csv.
3. Run hadoop fs -copyToLocal /<your-result-file>.csv/part-00000-xx.....xx.csv to copy the results CSV from HDFS to the current folder on your local file system. You may rename this file to something more interpretable, say results.csv.
4. Run bash ./stop-dfs.sh from the directory where you have the scripts.
5. Extract the first 100 lines with head -n 100 results.csv > 100_rows.csv (a quick check is sketched after this list). You can also do this manually, since CSV files are plain text. Remember that we want the first 100 lines, which include the header, so that is the header plus 99 data rows.
6. Remember that you will need to submit this CSV file, a PDF of this Jupyter Notebook, and the .py file, which you can generate by converting this Notebook to a .py file.
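A minimal sketch for the check referenced in step 5, assuming you run it from the directory containing 100_rows.csv:
# Optional check that 100_rows.csv contains exactly 100 lines (header + 99 data rows)
with open("100_rows.csv") as f:
    lines = f.read().splitlines()
print(len(lines))  # expect 100
print(lines[0])    # expect the header line: word,count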
Expected Output: None
You need to experiment with running this whole Jupyter Notebook using different numbers of cores.
After writing all of the expected code before this cell, set the configuration at the beginning of this Notebook in the cell that contains this code:
conf = pyspark.SparkConf().setAll([('spark.master', 'local[1]'),
('spark.app.name', 'Word Count')])
Use the following values for the number of cores: 1, 2, and 4.
Then go to the Kernel tab in JupyterLab and select 'Restart and run all cells.' Note the time printed in the cell just before section 8; this is the time it took for all the code to run.
Fill in the times (in seconds) in the table below.
| #Cores | Runtime_1 | Runtime_2 | Runtime_3 | Mean | Std |
|---|---|---|---|---|---|
| 1 | 90.33 | 90.61 | 90.71 | 90.55 | 0.197 |
| 2 | 44.35 | 44.65 | 44.67 | 44.56 | 0.179 |
| 4 | 34.14 | 34.62 | 34.34 | 34.37 | 0.241 |
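To fill in the Mean and Std columns, you can compute the sample mean and standard deviation of your three measured runtimes, for example with Python's statistics module (the values below are the ones from the table; substitute your own measurements):
# Compute the mean and sample standard deviation of the measured runtimes (seconds)
import statistics
runtimes = {
    1: [90.33, 90.61, 90.71],  # replace with your own measurements
    2: [44.35, 44.65, 44.67],
    4: [34.14, 34.62, 34.34],
}
for cores, times in runtimes.items():
    print(cores, round(statistics.mean(times), 2), round(statistics.stdev(times), 3))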
The autograder will check whether the results you submit in the 100_rows.csv file exactly match the expected results. The autograder runs on your submitted CSV file, not on the notebook, so you are free to change the notebook in any way that you want.
The CSV file would look something like this (the counts are shown for illustration; your counts will differ):
word,count
the,123
i,121
and,99
...97 more rows