Exercise 02 - Develop and run a MapReduce word count job on the cluster

Tags: eclipse, java, mapreduce, hadoop

#1

Problem Statement

  • Develop a MapReduce job using the MapReduce APIs
  • Bundle it as a JAR and run it on the cluster against /public/randomtextwriter (see the build-and-run sketch after the list below)

Please provide the following:

  • Code to compute word count
  • hadoop jar command
  • hadoop fs -ls command on the output
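
For the bundling step, here is a minimal build-and-run sketch, assuming a standard Maven project (the jar name, class name, and output path are placeholders, not values prescribed by the exercise):

mvn clean package
hadoop jar target/yourproject-0.0.1-SNAPSHOT.jar com.example.WordCount \
  /public/randomtextwriter /user/<username>/wordcount_output

The answers below follow this same pattern with their own package, jar, and output names.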

#2

Code to compute word count

package com.itversity.demott;

import java.io.IOException;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.reduce.LongSumReducer;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class WordCount extends Configured implements Tool {

public static class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

	@Override
	protected void map(LongWritable key, Text value, Context context)
			throws IOException, InterruptedException {
		// Split the line on spaces and emit each word with a count of 1
		String[] words = value.toString().split(" ");
		for (String word : words) {
			context.write(new Text(word), new LongWritable(1));
		}
	}
}

public static class WordCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {

	@Override
	protected void reduce(Text word, Iterable<LongWritable> values, Context context)
			throws IOException, InterruptedException {
		// Sum all the counts emitted for this word
		long sum = 0;
		for (LongWritable val : values) {
			sum += val.get();
		}
		context.write(word, new LongWritable(sum));
	}
}


@Override
public int run(String[] arg0) throws Exception {
	Job job = Job.getInstance(getConf());
	job.setJarByClass(getClass());
	job.setJobName("Sneh's Word Count");
	job.setMapperClass(WordCountMapper.class);
	job.setReducerClass(WordCountReducer.class);

	// Word counts can be safely pre-aggregated on the map side,
	// so LongSumReducer can serve as a combiner
	job.setCombinerClass(LongSumReducer.class);

	job.setOutputKeyClass(Text.class);
	job.setOutputValueClass(LongWritable.class);
	FileInputFormat.setInputPaths(job, new Path(arg0[0]));
	FileOutputFormat.setOutputPath(job, new Path(arg0[1]));

	return job.waitForCompletion(true) ? 0 : 1;
}

public static void main(String[] args) throws Exception {
	
	System.exit(ToolRunner.run(new WordCount(), args));
	
}

}


hadoop jar command

[infosnehasish@gw01 ~]$ hadoop jar demott-0.0.1-SNAPSHOT.jar com.itversity.demott.WordCount /public/randomtextwriter /user/infosnehasish/dec23out00


hadoop fs -ls command on the output

-rw-r--r--   3 infosnehasish hdfs         0 2016-12-23 01:17 dec23out00/_SUCCESS
-rw-r--r--   3 infosnehasish hdfs 363427984 2016-12-23 01:17 dec23out00/part-r-00000
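
To spot-check the result without pulling down the whole 363 MB part file, hadoop fs -cat can be piped through head (a sketch using the output path above):

hadoop fs -cat dec23out00/part-r-00000 | head

Each line should contain a word and its total count separated by a tab, the default TextOutputFormat delimiter.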


#3

package com.tavant.bg;

import java.io.IOException;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.lib.reduce.LongSumReducer;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.fs.Path;

public class Wordcount extends Configured implements Tool {

public static class WordcountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

	@Override
	public void map(LongWritable lineOffset, Text record, Context output)
			throws IOException, InterruptedException {
		// Emit each word in the record with a count of 1
		String[] words = record.toString().split(" ");
		for (String word : words) {
			output.write(new Text(word), new LongWritable(1));
		}
	}
}

@Override
public int run(String[] arg0) throws Exception {
	Job job = Job.getInstance(getConf());

	job.setJarByClass(getClass());

	job.setMapperClass(WordcountMapper.class);

	// No custom reducer is needed: LongSumReducer sums the LongWritable
	// values for each key, which is exactly what word count requires
	job.setReducerClass(LongSumReducer.class);

	job.setOutputKeyClass(Text.class);
	job.setOutputValueClass(LongWritable.class);

	job.setInputFormatClass(TextInputFormat.class);
	job.setOutputFormatClass(TextOutputFormat.class);

	FileInputFormat.setInputPaths(job, new Path(arg0[0]));
	FileOutputFormat.setOutputPath(job, new Path(arg0[1]));
	return job.waitForCompletion(true) ? 0 : 1;
}


public static void main(String args[]) throws Exception {
	System.exit(ToolRunner.run(new Wordcount(), args));
}

}
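
For comparison with the hand-written reducer in answer #2, LongSumReducer<Text> behaves roughly like the following simplified sketch (the class name here is mine, and the real Hadoop implementation reuses a single LongWritable instance rather than allocating a new one per key):

public static class SumValuesReducer extends Reducer<Text, LongWritable, Text, LongWritable> {

	@Override
	protected void reduce(Text key, Iterable<LongWritable> values, Context context)
			throws IOException, InterruptedException {
		// Add up every partial count for this key and emit the total
		long sum = 0;
		for (LongWritable value : values) {
			sum += value.get();
		}
		context.write(key, new LongWritable(sum));
	}
}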

command: hadoop jar traBG.jar com.tavant.bg.Wordcount /user/aruncse11/arun/word1/sample1.txt /user/aruncse11/arun/word2

[aruncse11@gw01 ~]$ hdfs dfs -ls /user/aruncse11/arun/word2
Found 2 items
-rw-r--r--   3 aruncse11 hdfs   0 2016-12-23 01:23 /user/aruncse11/arun/word2/_SUCCESS
-rw-r--r--   3 aruncse11 hdfs 687 2016-12-23 01:23 /user/aruncse11/arun/word2/part-r-00000


#4

Code to compute word count

package mapreduce.java.pgms; // package inferred from the hadoop jar command below

import java.io.IOException;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class MRWordCount extends Configured implements Tool{
	
	public static class MRWordCountMap extends Mapper<LongWritable, Text, Text, LongWritable> {

		@Override
		public void map(LongWritable offset, Text rec, Context context) throws IOException, InterruptedException {
			// Emit each word in the record with a count of 1
			String[] str = rec.toString().split(" ");

			for (String s : str) {
				context.write(new Text(s), new LongWritable(1));
			}
		}
	}
	
	public static class MRWordCountReduce extends Reducer<Text, LongWritable, Text, LongWritable> {

		@Override
		public void reduce(Text rec, Iterable<LongWritable> itr, Context context) throws IOException, InterruptedException {
			// Sum the partial counts for this word
			long cnt = 0;

			for (LongWritable i : itr) {
				cnt = cnt + i.get();
			}
			context.write(rec, new LongWritable(cnt));
		}
	}

	@Override
	public int run(String[] arg0) throws Exception {
		
		Job job = Job.getInstance(getConf());
		
		job.setJarByClass(getClass());
				
		job.setMapperClass(MRWordCountMap.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(LongWritable.class);
		
		job.setReducerClass(MRWordCountReduce.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(LongWritable.class);
		
		job.setNumReduceTasks(3);
		// The reducer doubles as a combiner since summing is associative and commutative
		job.setCombinerClass(MRWordCountReduce.class);
		
		FileInputFormat.setInputPaths(job, new Path(arg0[0]));
		FileOutputFormat.setOutputPath(job, new Path(arg0[1]));

		return job.waitForCompletion(true)?0:1;
	}
	
	public static void main(String args[]) throws Exception{
		System.exit(ToolRunner.run(new MRWordCount() , args));
	}

}

hadoop jar command

hadoop jar \
MapReducePgms-0.0.1-SNAPSHOT.jar \
mapreduce.java.pgms.MRWordCount \
/public/randomtextwriter \
/user/jasonbourne/MRWCountRTW

hadoop fs -ls command on the output

[jasonbourne@gw01 ~]$ hadoop fs -ls -h /user/jasonbourne/MRWCountRTW
Found 4 items
-rw-r--r--   3 jasonbourne hdfs          0 2016-12-23 01:22 /user/jasonbourne/MRWCountRTW/_SUCCESS
-rw-r--r--   3 jasonbourne hdfs    115.5 M 2016-12-23 01:22 /user/jasonbourne/MRWCountRTW/part-r-00000
-rw-r--r--   3 jasonbourne hdfs    115.6 M 2016-12-23 01:22 /user/jasonbourne/MRWCountRTW/part-r-00001
-rw-r--r--   3 jasonbourne hdfs    115.5 M 2016-12-23 01:22 /user/jasonbourne/MRWCountRTW/part-r-00002
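
With setNumReduceTasks(3), each reducer writes its own part-r file, which is why three part files appear above. To combine them into a single local file, hadoop fs -getmerge can be used (a sketch based on the output path above; the local file name is arbitrary):

hadoop fs -getmerge /user/jasonbourne/MRWCountRTW wordcount_merged.txt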

#5

Code to compute word count

package com.itversity;

import java.io.IOException;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.reduce.LongSumReducer;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class WordCount extends Configured implements Tool {

public static class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

	@Override
	public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
		// Emit each word in the line with a count of 1
		String[] words = value.toString().split(" ");
		for (String word : words) {
			context.write(new Text(word), new LongWritable(1));
		}
	}
}

@Override
public int run(String[] arg0) throws Exception {
	Job job = Job.getInstance(getConf());
	job.setJarByClass(getClass());
	job.setMapperClass(WordCountMapper.class);
	job.setReducerClass(LongSumReducer.class);

	job.setOutputKeyClass(Text.class);
	job.setOutputValueClass(LongWritable.class);
	// Four reducers, so the output is split across four part-r files
	job.setNumReduceTasks(4);
	FileInputFormat.setInputPaths(job, new Path(arg0[0]));
	FileOutputFormat.setOutputPath(job, new Path(arg0[1]));

	return job.waitForCompletion(true) ? 0 : 1;
}

public static void main(String[] args) throws Exception {
	System.exit(ToolRunner.run(new WordCount(), args));
}

}


hadoop jar itversity-0.0.1-SNAPSHOT.jar com.itversity.WordCount /public/randomtextwriter /user/parulshine92/WCOut


hdfs dfs -ls /user/parulshine92/WCOut
Found 5 items
-rw-r--r--   3 parulshine92 hdfs        0 2016-12-23 01:35 /user/parulshine92/WCOut/_SUCCESS
-rw-r--r--   3 parulshine92 hdfs 90846422 2016-12-23 01:35 /user/parulshine92/WCOut/part-r-00000
-rw-r--r--   3 parulshine92 hdfs 90828355 2016-12-23 01:34 /user/parulshine92/WCOut/part-r-00001
-rw-r--r--   3 parulshine92 hdfs 90875020 2016-12-23 01:34 /user/parulshine92/WCOut/part-r-00002
-rw-r--r--   3 parulshine92 hdfs 90878187 2016-12-23 01:35 /user/parulshine92/WCOut/part-r-00003


#6

hadoop fs -ls /user/paramesh/wordcountoutput
Found 2 items
-rw-r--r--   3 paramesh hdfs         0 2016-12-23 01:35 /user/paramesh/wordcountoutput/_SUCCESS
-rw-r--r--   3 paramesh hdfs 363427984 2016-12-23 01:35 /user/paramesh/wordcountoutput/part-r-00000

spark-submit --class "hbase.crudoperations.WordCount" \
  --master yarn \
  --executor-memory 512m \
  --total-executor-cores 1 \
  hbase-0.0.1-wordcount.jar \
  /public/randomtextwriter /user/paramesh/wordcountoutput prod
