Some exercises for practice

hive
apache-spark

#1

I was trying to come up with some difficult questions to practice in Hive and Spark.

I could think of the ones below. You can try these and, of course, share your own exercises too.

  1. Best-selling product (by quantity, not by cost) for every month in the database (between July 2013 and July 2014)
  2. Who are the top 10 revenue generating customers?
  3. What are the top 10 revenue generating products?
  4. Top 5 revenue-generating departments
  5. Top 5 revenue generating cities (from address of Customers)

Please provide answers in Scala, Python, or Spark SQL.


#2

package com.fs.f2;

import org.apache.spark.sql.DataFrameReader;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class Excercise1 {
public static void main(String args[]) throws Exception {
SparkSession session = SparkSession.builder().appName(“Stack”).master(“local[]").getOrCreate();
DataFrameReader dataFrameReader = session.read();
Dataset responses = dataFrameReader.option(“header”, “true”).csv(“file:///home/shivapriya/sales@.csv”);
responses.toDF();
System.out.println(“");
responses.show(20);
System.out.println("
”);
Dataset revenue = responses.select(“Quantity”, “Product”).orderBy(org.apache.spark.sql.functions.col(“Quantity”).desc());
revenue.show(20);
System.out.println("Most Selling Products
*********”);
Dataset mostSellingProduct=responses.select(“Product”,“Year”,“Quarter”).filter(responses.col(“Year”).startsWith(“2014”));
mostSellingProduct.show();
Dataset revenues = mostSellingProduct.orderBy(“Product”);
System.out.println(“topGeneratingProducts**********”);

    Dataset<Row> topGeneratingProducts=responses.select("Product","Revenue").orderBy(org.apache.spark.sql.functions.col("Revenue").desc());
    topGeneratingProducts.show(5);

    System.out.println("***********topGeneratingDepartmentss*********************");


    Dataset<Row> topGeneratingdepts=responses.select("Product line","Revenue").orderBy(org.apache.spark.sql.functions.col("Revenue").desc());
    topGeneratingdepts.show(5);


    System.out.println("***********topGeneratingCountries*********************");


    Dataset<Row> topGeneratingCities=responses.select("Retailer country","Revenue").orderBy(org.apache.spark.sql.functions.col("Revenue").desc());
    topGeneratingCities.show(5);
}

}