Writing a MapReduce Program
Writing a WordCount Program
Scenario: there are many files in which words are stored, one word per line.
Task: count the number of occurrences of each word (a small concrete example follows the list below).
Similar application scenarios:
Counting the top-K most popular search terms in a search engine
Computing search-term frequencies to help optimize search suggestions
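To make the task concrete, here is a tiny hypothetical input and the counts the job should produce (the file contents are purely illustrative):

```text
# words.txt (input, one word per line)
hello
world
hello

# expected output
hello   2
world   1
```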
- Edit pom.xml and add the required jar dependencies
Maven dependency coordinates can be looked up at https://mvnrepository.com/
```xml
<!-- Add repository -->
<repositories>
  <repository>
    <id>apache</id>
    <url>http://maven.apache.org</url>
  </repository>
</repositories>

<!-- Add dependencies -->
<dependencies>
  <dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-core</artifactId>
    <version>1.2.1</version>
  </dependency>
  <dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-common</artifactId>
    <version>2.6.0</version>
  </dependency>
</dependencies>
```
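Note that hadoop-core 1.2.1 is an MRv1-era artifact, and mixing it with hadoop-common 2.6.0 can pull conflicting classes onto the classpath. On a Hadoop 2.x cluster, a single hadoop-client dependency matching the cluster version is usually sufficient; a possible alternative (the version shown is an assumption, match it to your cluster):

```xml
<dependencies>
  <dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>2.6.0</version>
  </dependency>
</dependencies>
```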
- Write the Mapper class
```java
package com.bigdata.wordcount;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Read one line of input as a string
        String line = value.toString();
        // Split the line into words
        String[] words = line.split(" ");
        // Emit (word, 1) for each word
        for (String w : words) {
            context.write(new Text(w), new LongWritable(1));
        }
    }
}
```
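Before wiring the Mapper into a job, it can be sanity-checked in isolation. Below is a minimal test sketch using MRUnit with JUnit; neither library is declared in the pom.xml above, so treat the mrunit test dependency and the WordCountMapperTest class as assumptions for illustration:

```java
package com.bigdata.wordcount;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mrunit.mapreduce.MapDriver;
import org.junit.Test;

public class WordCountMapperTest {

    @Test
    public void emitsOnePerWord() throws Exception {
        // Feed one line to the Mapper and assert that the (word, 1)
        // pairs come out in emission order.
        MapDriver.newMapDriver(new WordCountMapper())
                 .withInput(new LongWritable(0), new Text("hello world hello"))
                 .withOutput(new Text("hello"), new LongWritable(1))
                 .withOutput(new Text("world"), new LongWritable(1))
                 .withOutput(new Text("hello"), new LongWritable(1))
                 .runTest();
    }
}
```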
- Write the Reducer class
```java
package com.bigdata.wordcount;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class WordCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {

    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context)
            throws IOException, InterruptedException {
        // Sum the counts emitted for this word
        long count = 0;
        for (LongWritable i : values) {
            count += i.get();
        }
        // Output (word, total count)
        context.write(key, new LongWritable(count));
    }
}
```
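Since summing counts is associative and commutative, this same Reducer can also serve as a combiner, pre-aggregating map output locally before the shuffle and reducing network traffic. If you want that optimization, one extra line in the job setup of the next step is enough:

```java
// Optional: reuse the Reducer as a combiner. Safe for word count
// because addition is associative and commutative.
job.setCombinerClass(WordCountReducer.class);
```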
- Write the main class and submit the MR job
```java
package com.bigdata.wordcount;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCount {

    public static void main(String[] args) throws Exception {
        // Create the job
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(WordCount.class);
        job.setJobName("wordcount");

        // Configure the map stage
        job.setMapperClass(WordCountMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        FileInputFormat.setInputPaths(job, new Path("/words.txt"));

        // Configure the reduce stage
        job.setReducerClass(WordCountReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        FileOutputFormat.setOutputPath(job, new Path("/wordcount"));

        // Submit the job and wait for completion
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
```
- Export the jar package and run it on Hadoop (see the commands below)
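Assuming the exported jar is named wordcount.jar (the name is illustrative), a typical run looks like the following. The input path /words.txt is hard-coded in the main class, and the output directory /wordcount must not already exist, otherwise the job fails at startup:

```sh
# Upload the input file to HDFS
hdfs dfs -put words.txt /words.txt

# Run the job on the cluster
hadoop jar wordcount.jar com.bigdata.wordcount.WordCount

# Inspect the result of the single reducer
hdfs dfs -cat /wordcount/part-r-00000
```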