MapReduce example: Number of words per line


Author: Bharat (Sree Ram)
_______________________________________________________________________

This program finds the number of words in each line of the input file.

Input HDFS file (contents shown below):

    mydir/file1.txt
_______________________________

hadoop execution model is mapreduce
mapreduce is a backend business process logic
advantage of mapreduce is it will not sort raw data
but output of the mapper will be sorted

________________________________________________

Output of this program:

  line1  5
  line2  7
  line3  10
  line4  8

__________________________________________________

 package my.map.red.app;

 import java.io.IOException;
 import java.util.StringTokenizer;

 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.LongWritable;
 import org.apache.hadoop.mapreduce.Mapper;
 import org.apache.hadoop.mapreduce.Reducer;
 import org.apache.hadoop.mapreduce.Job;
 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
 import org.apache.hadoop.util.GenericOptionsParser; // note: lives in org.apache.hadoop.util, not mapreduce.util

 public class LineWordsCount
 {
    public static class MapForLineWordsCount extends Mapper<LongWritable, Text, Text, IntWritable>
    {
           int lineno = 1; // running line number for this mapper instance

           public void map(LongWritable key, Text value, Context con) throws IOException, InterruptedException
           {
               String line = value.toString();

               StringTokenizer token = new StringTokenizer(line);
               while(token.hasMoreTokens())
               {
                  token.nextToken(); // consume the word; only its presence matters
                  String l = "line" + lineno;
                  Text outputKey = new Text(l);
                  IntWritable outputValue = new IntWritable(1); // emit <lineN, 1> for every word
                  con.write(outputKey, outputValue);
               }

               lineno++;

            } // end of map()
    } // end of Mapper class
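 Note: the lineno counter numbers lines correctly only because this small file fits in a single input split; with a larger file split across several mappers, each mapper instance would restart its count at 1.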

  /*
     output of the mapper phase:
   
      <line1, <1,1,1,1,1>>
      <line2, <1,1,1,1,1,1,1>>
      <line3, <1,1,1,1,1,1,1,1,1,1>>
      <line4, <1,1,1,1,1,1,1,1>>

  */
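 Note: since main() below also registers the reducer as a combiner, each mapper may pre-sum these 1s locally, so the reducer can just as well receive already-combined values such as <line1, <5>>; the final totals are identical either way. The shuffle phase then sorts these keys before the reduce phase runs.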

   public static class ReduceForLineWordsCount extends Reducer<Text, IntWritable, Text, IntWritable>
   {
          public void reduce(Text line, Iterable<IntWritable> values, Context con) throws IOException, InterruptedException
          {
               int sum = 0;

               // add up the 1s (or combiner pre-sums) emitted for this line key
               for(IntWritable value : values)
               {
                   sum += value.get();
               }

               con.write(line, new IntWritable(sum));

          } // end of reduce()
   } // end of Reducer class
/*

 output of the reducer

   line1 5
   line2 7
   line3 10
   line4 8

*/
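With a single reducer, this output lands in the job's output directory as a file typically named part-r-00000; it can be inspected with hadoop fs -cat <output dir>/part-r-00000.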

 // job definition

   public static void main(String[] args) throws Exception
   {
           Configuration c = new Configuration();

           // separate generic Hadoop options from the job's own arguments
           String[] files = new GenericOptionsParser(c, args).getRemainingArgs();

           Path input = new Path(files[0]);
           Path output = new Path(files[1]);

           Job j = Job.getInstance(c, "Linewordscount"); // new Job(c, ...) is deprecated

           j.setJarByClass(LineWordsCount.class);
           j.setMapperClass(MapForLineWordsCount.class);

           // reusing the reducer as a combiner is safe here because summing is associative
           j.setCombinerClass(ReduceForLineWordsCount.class);
           j.setReducerClass(ReduceForLineWordsCount.class);

           j.setOutputKeyClass(Text.class);
           j.setOutputValueClass(IntWritable.class);

           FileInputFormat.addInputPath(j, input);
           FileOutputFormat.setOutputPath(j, output);

           System.exit(j.waitForCompletion(true) ? 0 : 1);

   } // end of main()

 } // end of main class
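
To try the job, one possible build-and-run sequence (a sketch: the jar name and the output directory mydir/out are placeholders, assuming the hadoop command is on the PATH):

    $ mkdir classes
    $ javac -classpath `hadoop classpath` -d classes LineWordsCount.java
    $ jar -cvf linewordscount.jar -C classes .
    $ hadoop jar linewordscount.jar my.map.red.app.LineWordsCount mydir/file1.txt mydir/out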

2 comments:

  1. Thanks for the code, but I didn't get the output in sorted order. How can I achieve that?
  2. Here the mapper's output is (LongWritable, Text) and the input for the reducer is (Text, IntWritable), so this will fail with an IOException when the jar command is executed.
