Remember: when in doubt, read the documentation first. It's always helpful to search for the class that you're trying to work with, e.g. pyspark.sql.DataFrame.
PySpark API Documentation: https://spark.apache.org/docs/latest/api/python/index.html
Spark DataFrame Guide: https://spark.apache.org/docs/latest/sql-programming-guide.html
Spark works best with the Hadoop Distributed File System (HDFS). Here are the steps to start up HDFS in this container and to copy files between the local file system and HDFS:
1. Copy the scripts init-dfs.sh, start-dfs.sh, and stop-dfs.sh from Canvas to your local folder.
2. cd to the directory where you copied the scripts. You can do this either from a terminal in JupyterLab or from the ZSH terminal on the container.
3. Run bash ./init-dfs.sh to initialize the HDFS service. If prompted, answer y to erase HDFS.
4. Run bash ./start-dfs.sh to start the HDFS service. When this completes, run jps to confirm that NameNode, DataNode, and SecondaryNameNode are running.
5. cd to the directory that contains the downloaded data file. If you are running on the RSM server, copy the file to the container from the distribution folder.
6. Run hadoop fs -mkdir /W6 to create a directory named W6 in the root of your HDFS file system.
7. Run hadoop fs -copyFromLocal <your-data-file> /W6 to copy the data file into the W6 directory you just created.
8. Run hadoop fs -ls / to list the files and confirm that the copy succeeded (a notebook-cell version of this check is sketched below). You may run hadoop fs -help to learn more about navigating and using HDFS manually.
Expected output: None
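If you'd rather verify the copy from a notebook cell than from the terminal, here is a minimal sketch; it assumes the hadoop CLI is on the container's PATH and that you used the /W6 path from the steps above.
import subprocess
# List the contents of /W6 on HDFS; the data file should appear here
# after the -copyFromLocal step above.
result = subprocess.run(['hadoop', 'fs', '-ls', '/W6'],
                        capture_output=True, text=True)
print(result.stdout or result.stderr)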
# Initialize Spark
import pyspark
from pyspark.sql import SparkSession
conf = pyspark.SparkConf().setAll([('spark.master', 'local[1]'),
                                   ('spark.app.name', 'Basic Setup')])
spark = SparkSession.builder.config(conf=conf).getOrCreate()
# Verify that the correct versions of spark and pyspark are installed.
# print (spark.version, pyspark.version.__version__)
assert(spark.version == '3.0.1')
assert(pyspark.version.__version__ == '3.0.1')
# Don't remove this print statement. It's going to be used for Autograder.
print('All set.')
All set.
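If the version asserts fail, it helps to check exactly which session and configuration you are connected to. A minimal sketch using the standard SparkContext API:
# Optional sanity check: print the effective configuration of the session.
for key, value in spark.sparkContext.getConf().getAll():
    print(key, '=', value)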
Read the BookReviews_1M.txt file into a dataframe, and print the number of rows in the dataframe. You can find this file on Canvas; you will need to copy it to your HDFS file system using the commands described in section 1.
Expected output:
Number of lines = 1000000
# Read data from HDFS
# You'll need to run the hadoop fs -copyFromLocal command before you can use this path.
dataFileName = "hdfs:///W6/BookReviews_1M.txt"
# Read data from the file above, convert it to a dataframe, and print the number of rows in that dataframe.
df = spark.read.text(dataFileName).cache()
print('Number of lines =', df.count())
Number of lines = 1000000
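The .cache() call matters here: the first action (count()) materializes the dataframe and keeps it in memory, so later actions such as show() reuse it instead of re-reading the file from HDFS. A quick way to confirm this, using standard DataFrame attributes:
# Optional: confirm the dataframe is marked as cached.
print(df.is_cached)      # True once .cache() has been called
print(df.storageLevel)   # the storage level Spark uses for df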
Your task: Examine the dataframe by printing its schema and by showing its first 25 rows.
Expected output: The schema and the 25 rows, as shown below the corresponding cells.
# Use the printSchema function to print the dataframe's schema
df.printSchema()
root
 |-- value: string (nullable = true)
# Use the show() function to show the first 25 rows of the dataframe.
df.show(25)
+--------------------+
|               value|
+--------------------+
|This was the firs...|
|Also after going ...|
|As with all of Ms...|
|I've not read any...|
|This romance nove...|
|Carolina Garcia A...|
|Not only can she ...|
|Once again Garcia...|
|The timing is jus...|
|Engaging. Dark. R...|
|Set amid the back...|
|This novel is a d...|
|If readers are ad...|
| Reviewed by Phyllis|
|      APOOO BookClub|
|A guilty pleasure...|
|In the tradition ...|
|Beryl Unger, top ...|
|What follows is a...|
|The book flap say...|
|I'd never before ...|
|The novel's narra...|
|It is centered on...|
|If you like moder...|
|Beryl Unger is a ...|
+--------------------+
only showing top 25 rows
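Note that show() truncates each string to 20 characters by default, which is why the reviews above are cut off. To inspect the full review text, you can disable truncation:
# Show the full (untruncated) review text for the first 5 rows.
df.show(5, truncate=False)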
# Stop Spark session
spark.stop()
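Once the Spark session is stopped, remember to also shut down HDFS by running bash ./stop-dfs.sh from the directory where you copied the scripts.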