Big data (Introduction to Hadoop MapReduce code and programming model)

Keywords: Programming Hadoop Apache Java

MapReduce programming model

MapReduce divides the whole computation into two stages: a Map stage and a Reduce stage

The Map stage consists of a number of Map tasks
Input data format parsing: InputFormat
Input data processing: Mapper
Data partitioning: Partitioner (see the sketch after this outline)

The Reduce stage consists of a number of Reduce tasks
Remote copying of data (shuffle)
Sorting of data by key
Data processing: Reducer
Data output format: OutputFormat

Map stage
InputFormat (default TextInputFormat)
    Combiner (local Reducer)
Reduce stage
OutputFormat (default TextOutputFormat)
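
The Partitioner decides which Reduce task each <k2,v2> pair is sent to. Below is a minimal sketch of a custom partitioner for the WordCount types used later in this article; the class name WordPartitioner is illustrative only, and Hadoop's built-in HashPartitioner already behaves this way:

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

//Route each key to a reduce task by hashing it (the default HashPartitioner does the same)
public class WordPartitioner extends Partitioner<Text, IntWritable> {
	@Override
	public int getPartition(Text key, IntWritable value, int numPartitions) {
		//Mask the sign bit so the partition number is never negative
		return (key.hashCode() & Integer.MAX_VALUE) % numPartitions;
	}
}

Such a class would be registered on a job with job.setPartitionerClass(WordPartitioner.class).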

Java programming interface

Old API: Java package org.apache.hadoop.mapred
New API: Java package org.apache.hadoop.mapreduce
The new API is more extensible;

The two programming interfaces differ only in the form they expose to users; the internal execution engine is the same;


Java old and new APIs

Starting from Hadoop 1.0.0, all releases contain both the old and the new API;
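
As a rough illustration (not part of the original sample code), the same word-emitting mapper looks like this under each API; the class names OldApiMapper and NewApiMapper are only placeholders:

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;

//Old API: Mapper is an interface in org.apache.hadoop.mapred and results go through an OutputCollector
class OldApiMapper extends org.apache.hadoop.mapred.MapReduceBase
		implements org.apache.hadoop.mapred.Mapper<LongWritable, Text, Text, IntWritable> {
	public void map(LongWritable key, Text value,
			org.apache.hadoop.mapred.OutputCollector<Text, IntWritable> output,
			org.apache.hadoop.mapred.Reporter reporter) throws IOException {
		output.collect(value, new IntWritable(1));
	}
}

//New API: Mapper is a class in org.apache.hadoop.mapreduce and results go through a Context
class NewApiMapper extends org.apache.hadoop.mapreduce.Mapper<LongWritable, Text, Text, IntWritable> {
	@Override
	protected void map(LongWritable key, Text value, Context context)
			throws IOException, InterruptedException {
		context.write(value, new IntWritable(1));
	}
}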

Example 1: WordCount problem

WordCount problem - map stage

WordCount problem - reduce stage

WordCount problem: design and implementation of the mapper

WordCount problem: design and implementation of the reducer

WordCount problem - data flow
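
For example, with the two input lines used in the comments of the sample code below, "hello you" and "hello me", the data flows as follows:

input lines:    hello you / hello me
after map:      <hello,1> <you,1> <hello,1> <me,1>
after shuffle:  <hello,{1,1}> <me,{1}> <you,{1}>
after reduce:   <hello,2> <me,1> <you,1>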

Sample code



import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Word statistics
 * @author huang
 */
public class WordCountTest {

	public static class MyMapper extends Mapper<Object, Text, Text, IntWritable>{
		//First define the two outputs, k2 and v2
		Text k2 = new Text();
		IntWritable v2 = new IntWritable();
		/**
		 * Example input:
		 *   hello you
		 *   hello me
		 * 1. <k1,v1> has the form <0, hello you>, <10, hello me>
		 * and is converted to
		 * 2. <k2,v2> --> <hello,1> <you,1> <hello,1> <me,1>
		 */
		@Override
		protected void map(Object key, Text value, Context context)
				throws IOException, InterruptedException {
			//Process each line of data to get the words
			String[] words = value.toString().split(" ");
			for (String word : words) {
				k2.set(word);			//word is a word from the current line
				v2.set(1);				//each occurrence of a word counts as 1
				context.write(k2, v2);	//output
			}
		}
	}

	//3. All the k2 and v2 output is partitioned
	//4. After the shuffle phase the result is <hello, {1,1}> <me, {1}> <you, {1}>
	//Steps 3 and 4 are done by the Hadoop framework itself
	public static class MyReduce extends Reducer<Text, IntWritable, Text, IntWritable>{
		@Override
		protected void reduce(Text key, Iterable<IntWritable> values,
				Context context) throws IOException, InterruptedException {
			//Define the output value first
			IntWritable v3 = new IntWritable();
			int count = 0;
			//Sum all the 1s collected for this key
			for (IntWritable value : values) {
				count += value.get();
			}
			v3.set(count);
			//Output the result data
			context.write(key, v3);
		}
	}
	//We have finished writing the map and reduce functions; now wire them into a MapReduce job for execution
	public static void main(String[] args) throws Exception {
		//Load configuration information
		Configuration conf = new Configuration();
		//Set up the job
		Job job = Job.getInstance(conf, "word count");
		job.setJarByClass(WordCountTest.class);
		//Specify the mapper/reducer business classes to be used by the job
		job.setMapperClass(MyMapper.class);
		job.setReducerClass(MyReduce.class);
		//Specify the kv types of the final output data
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);
		//Specify the directory of the job's input files
		FileInputFormat.addInputPath(job, new Path(args[0]));
		//Specify the directory where the job's output goes
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}


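The outline above also mentions the Combiner, a local Reducer that runs on the map side. Because MyReduce reads and writes the same key/value types (Text, IntWritable) and word counting is plain addition, it could also be registered as a combiner in the driver above; this optional line is not part of the original sample:

		job.setCombinerClass(MyReduce.class);	//pre-aggregate counts on the map side before the shuffle

The second sample below finds the maximum value in the input; it uses the Tool/ToolRunner interface and emits its result from the mapper's cleanup() method.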

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

//Find the maximum value
public class MapReduceCaseMax extends Configured implements Tool{

	//Writing the map
	public static class MaxMapper extends Mapper<Object, Text, LongWritable, NullWritable>{
		//Start from the smallest possible value
		long max = Long.MIN_VALUE ;
		@Override
		protected void map(Object key, Text value, Context context)
				throws IOException, InterruptedException {
			//Split the string; the default separators are space and tab
			StringTokenizer st = new StringTokenizer(value.toString()) ;
			while (st.hasMoreTokens()) {
				//Get two values (each input line is assumed to hold two numbers)
				String num1 = st.nextToken() ;
				String num2 = st.nextToken() ;
				//Convert the types
				long n1 = Long.parseLong(num1) ;
				long n2 = Long.parseLong(num2) ;
				//Compare against the current maximum
				if(n1 > max){
					max = n1 ;
				}
				if(n2 > max){
					max = n2 ;
				}
			}
		}
		//Called once at the end of the task: emit the maximum seen by this mapper
		@Override
		protected void cleanup(Context context)
				throws IOException, InterruptedException {
			context.write(new LongWritable(max), NullWritable.get());
		}
	}
	@Override
	public int run(String[] args) throws Exception {
		/*Set up the job and the main class*/
		Job job = Job.getInstance(getConf(), "MaxFiles") ;
		job.setJarByClass(MapReduceCaseMax.class);
		/*Set the class providing the map method*/
		job.setMapperClass(MaxMapper.class);
		/*Set the types of the output key and value*/
		job.setOutputKeyClass(LongWritable.class);
		job.setOutputValueClass(NullWritable.class);
		/*Set the input and output paths*/
		FileInputFormat.addInputPath(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		/*Submit the job to the cluster and wait for it to complete*/
		boolean isSuccess = job.waitForCompletion(true);
		return isSuccess ? 0 : 1 ;
	}
	public static void main(String[] args) throws Exception {
		//ToolRunner parses generic Hadoop options and then calls run()
		int res = ToolRunner.run(new MapReduceCaseMax(), args) ;
		System.exit(res);
	}
}
