Exercise 05 - Develop word count program

pyspark
spark-shell
apache-spark
scala
python
Exercise 05 - Develop word count program
0.0 0

#21

#Python
5:
#pyspark

# Inspect the input data set before processing.

# List the files under the input directory.
hdfs dfs -ls /public/randomtextwriter

# Show the total size of the input (summary, human-readable).
hdfs dfs -du -s -h /public/randomtextwriter

# Show HDFS block, file, and replica-location details for the input.
hdfs fsck -blocks -files -locations /public/randomtextwriter

# Launch the pyspark shell on YARN with the spark-avro package.
# Fixed from the original paste: the flags had en-dashes ("–conf") instead of
# "--", and the continuation backslashes were missing — either would break
# the command.
pyspark --master yarn \
  --conf spark.ui.port=12546 \
  --num-executors 10 \
  --executor-cores 20 \
  --executor-memory 3G \
  --packages com.databricks:spark-avro_2.10:2.0.1

# Build the word-count RDD. `sc` is the SparkContext provided by the
# pyspark shell started above.
wordFile = sc.textFile("/public/randomtextwriter")

# Split each line on single spaces into individual words.
words = wordFile.flatMap(lambda x: x.split(" "))
for i in words.take(10):
    print(i)

# Pair each word with 1, then sum the counts per word.
# Fixed from the original: the chain was broken across lines with a bare
# trailing "." (a SyntaxError) — wrapping in parentheses makes the
# continuation legal.
wordCount = (words.map(lambda x: (x, 1))
                  .reduceByKey(lambda x, y: x + y))

for i in wordCount.take(10):
    print(i)

# Convert the (word, count) RDD to a DataFrame and save it in Avro format.
# Fixed from the original paste: smart quotes in the string literals and a
# bare trailing "." line break — both SyntaxErrors.
wordCountDF = wordCount.toDF(schema=["word", "count"])
(wordCountDF.coalesce(8)
            .write.format("com.databricks.spark.avro")
            .save("/user/rjshekar90/solutions/solution05/wordcount"))

# Validate the output.
# NOTE: Avro files are binary, so the textFile preview below prints raw
# bytes — the DataFrame read afterwards is the reliable check.
for i in sc.textFile("/user/rjshekar90/solutions/solution05/wordcount").take(10):
    print(i)

# Fixed from the original: the load path pointed at a different directory
# ("/user/rjshekar90/pblm5/soln") than the one written above.
sqlContext.load("/user/rjshekar90/solutions/solution05/wordcount",
                "com.databricks.spark.avro").show()