Exercise 05 - Develop word count program

pyspark
spark-shell
apache-spark
scala
python

#21

#Python
#pyspark

hdfs dfs -ls /public/randomtextwriter

hdfs dfs -du -s -h /public/randomtextwriter

hdfs fsck /public/randomtextwriter -files -blocks -locations

pyspark --master yarn \
  --conf spark.ui.port=12546 \
  --num-executors 10 \
  --executor-cores 20 \
  --executor-memory 3G \
  --packages com.databricks:spark-avro_2.10:2.0.1
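
Once the shell is up, an optional sanity check (my own aside, not part of the original solution) is to compare the RDD's partition count against the block count reported by fsck above, since textFile typically creates one partition per HDFS block:

# Typically one input partition per HDFS block, so this should be
# close to the block count that hdfs fsck reported above
sc.textFile("/public/randomtextwriter").getNumPartitions()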

wordFile = sc.textFile("/public/randomtextwriter")
words = wordFile.flatMap(lambda x: x.split(" "))
for i in words.take(10): print(i)
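
A quick reminder of why flatMap rather than map here (my aside): map would keep one list of tokens per line, while flatMap flattens those lists into individual words:

# map keeps one element per input line; flatMap flattens them
sc.parallelize(["a b", "c"]).map(lambda x: x.split(" ")).collect()      # [['a', 'b'], ['c']]
sc.parallelize(["a b", "c"]).flatMap(lambda x: x.split(" ")).collect()  # ['a', 'b', 'c']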

wordCount = words.map(lambda x: (x, 1)) \
    .reduceByKey(lambda x, y: x + y)

for i in wordCount.take(10): print(i)
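
As an aside (not part of the original solution), countByValue produces the same counts as a plain dict on the driver; it is only viable when the distinct word set fits in driver memory:

# countByValue() collects the counts to the driver as a dict,
# so it only scales to modest vocabularies
localCounts = words.countByValue()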

# Convert to a DataFrame and save in Avro format
wordCountDF = wordCount.toDF(schema=["word", "count"])
wordCountDF.coalesce(8).write.format("com.databricks.spark.avro") \
    .save("/user/rjshekar90/solutions/solution05/wordcount")

# Validate the output

# Note: Avro files are binary, so textFile output will be partly garbled;
# this only confirms the files exist and have content
for i in sc.textFile("/user/rjshekar90/solutions/solution05/wordcount").take(10):
    print(i)

sqlContext.read.format("com.databricks.spark.avro") \
    .load("/user/rjshekar90/solutions/solution05/wordcount").show()


#23

seqFile = sc.sequenceFile("/public/randomtextwriter/")
lines = seqFile.map(lambda t: t[0]+" "+t[1])
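
Before building the pipeline it can help to peek at the first (key, value) pair (a sanity check of my own); randomtextwriter writes Text keys and Text values, which sequenceFile deserializes to plain Python strings:

# Both key and value are Text in the source files,
# so they come back as Python strings
print(seqFile.first())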

counts = lines \
    .flatMap(lambda line: line.split(" ")) \
    .filter(lambda word: word != '') \
    .map(lambda word: (word, 1)) \
    .reduceByKey(lambda a, b: a + b)
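
To eyeball the result (my addition, not in the original), takeOrdered pulls the most frequent words without fully sorting the RDD:

# Ten most frequent words, highest count first
for i in counts.takeOrdered(10, key=lambda kv: -kv[1]):
    print(i)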

from pyspark.sql import Row
countsDF = counts.coalesce(8).map(lambda r: Row(word=r[0], count=r[1])).toDF()
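
One gotcha worth flagging: in these Spark versions, Row built from keyword arguments sorts its fields alphabetically, so the columns come out as (count, word) rather than (word, count); printSchema confirms the order before writing:

# Columns appear as count, word because Row sorts kwargs alphabetically
countsDF.printSchema()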

countsDF.write.format("com.databricks.spark.avro").save("/user/…")