Level 1: data cleaning
Task description
This task: clean the data according to certain rules.
Programming requirements
According to the prompt, add code in the editor on the right to clean the data according to certain rules.
Data description is as follows: a.txt;
Data segmentation method: one or more spaces;
Data location: /user/test/input/a.txt;
2005 01 01 16 -6 -28 10157 260 31 8 0 -9999
2005 01 01 16 -6 -28 10157 260 31 8 0 -9999
Fields (in order): year, month, day, hour, temperature, humidity, barometric pressure, wind direction, wind speed, weather condition, 1h rainfall, 6h rainfall
sky.txt;
Data segmentation method: comma;
Data location: data/sky.txt or /user/test/input/sky.txt.
1,cumulus
1 cumulus
Fields (in order): weather condition, cloud genus
Cleaning rules:
Convert separator to comma;
Remove illegal records: a record is illegal if it has too few fields, the wind direction is outside [0, 360], the wind speed is negative, the air pressure is negative, the weather condition is outside [0, 10], the humidity is outside [0, 100], or the temperature is outside [-40, 50];
join the data of a.txt and sky.txt according to the weather conditions to change the weather conditions into their corresponding cloud genera;
Sort the data entering the same partition. Sorting rules: (1) records are keyed by the same year, month and day; (2) within a day, sort in ascending order of temperature; (3) if the temperature is the same, sort in ascending order of wind speed; (4) if the wind speed is the same, sort in descending order of air pressure.
Set the data source file path and the cleaned data storage path: the data source path is /user/test/input/a.txt (HDFS); the cleaned data is stored in /user/test/output (HDFS).
After data cleaning, it is as follows:
2005,01,01,16,-6,-28,10157,260,31,cirrus cloud,0,-9999
Test description
The platform will test the code you write:
Start Hadoop: run start-all.sh on the command line before the evaluation;
Weather: encapsulates objects;
WeatherMap: map side operation;
WeatherReduce: reduce side operation;
Auto: user defined partition;
WeatherTest: test result class.
See the test set on the right for the specific expected output of this level.
Because the big data training consumes a lot of resources and the map/reduce operation is time-consuming, the evaluation time is long, about 60 seconds. Please wait patiently.
Start your mission. I wish you success!
code implementation
command line
start-all.sh
Code file
step1/com/Weather.java
package com; import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; import org.apache.hadoop.io.WritableComparable; /**Encapsulated object*/ public class Weather implements WritableComparable<Weather> { //year private String year; //month private String month; //day private String day; //hour private String hour; //temperature private int temperature; //humidity private String dew; //Air pressure / pressure private int pressure; //wind direction private String wind_direction; //wind speed private int wind_speed; //weather condition private String sky_condition; //1 hour rainfall private String rain_1h; //6-hour rainfall private String rain_6h; public String getYear() { return year; } public void setYear(String year) { this.year = year; } public String getMonth() { return month; } public void setMonth(String month) { this.month = month; } public String getDay() { return day; } public void setDay(String day) { this.day = day; } public String getHour() { return hour; } public void setHour(String hour) { this.hour = hour; } public int getTemperature() { return temperature; } public void setTemperature(int temperature) { this.temperature = temperature; } public String getDew() { return dew; } public void setDew(String dew) { this.dew = dew; } public int getPressure() { return pressure; } public void setPressure(int pressure) { this.pressure = pressure; } public String getWind_direction() { return wind_direction; } public void setWind_direction(String wind_direction) { this.wind_direction = wind_direction; } public int getWind_speed() { return wind_speed; } public void setWind_speed(int wind_speed) { this.wind_speed = wind_speed; } public String getSky_condition() { return sky_condition; } public void setSky_condition(String sky_condition) { this.sky_condition = sky_condition; } public String getRain_1h() { return rain_1h; } public void setRain_1h(String rain_1h) { this.rain_1h = rain_1h; } public String getRain_6h() { return rain_6h; } public 
void setRain_6h(String rain_6h) { this.rain_6h = rain_6h; } /********** begin **********/ @Override public String toString() { return year + "," + month + "," + day + "," + hour + "," + temperature + "," + dew + "," + pressure + "," + wind_direction + "," + wind_speed + "," + sky_condition + "," + rain_1h + "," + rain_6h; } /********** end **********/ public Weather() { } public Weather(String year, String month, String day, String hour, int temperature, String dew, int pressure, String wind_direction, int wind_speed, String sky_condition, String rain_1h, String rain_6h) { this.year = year; this.month = month; this.day = day; this.hour = hour; this.temperature = temperature; this.dew = dew; this.pressure = pressure; this.wind_direction = wind_direction; this.wind_speed = wind_speed; this.sky_condition = sky_condition; this.rain_1h = rain_1h; this.rain_6h = rain_6h; } public void readFields(DataInput in) throws IOException { year = in.readUTF(); month = in.readUTF(); day = in.readUTF(); hour = in.readUTF(); temperature = in.readInt(); dew = in.readUTF(); pressure = in.readInt(); wind_direction = in.readUTF(); wind_speed = in.readInt(); sky_condition = in.readUTF(); rain_1h = in.readUTF(); rain_6h = in.readUTF(); } public void write(DataOutput out) throws IOException { out.writeUTF(year); out.writeUTF(month); out.writeUTF(day); out.writeUTF(hour); out.writeInt(temperature); out.writeUTF(dew); out.writeInt(pressure); out.writeUTF(wind_direction); out.writeInt(wind_speed); out.writeUTF(sky_condition); out.writeUTF(rain_1h); out.writeUTF(rain_6h); } public int compareTo(Weather o) { /********** begin **********/ int tmp = this.month.compareTo(o.month); if (tmp == 0) { tmp = this.day.compareTo(o.day); if (tmp == 0) { tmp = this.temperature - o.temperature; if (tmp == 0) { tmp = this.wind_speed - o.wind_speed; if (tmp == 0) { tmp = o.pressure - this.pressure; return tmp; } return tmp; } return tmp; } return tmp; } return tmp; /********** end **********/ } }
step1/com/WeatherMap.java
Insert the code slice here package com; import java.io.*; import java.util.HashMap; import java.util.Map.Entry; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Mapper; import javax.sound.midi.Soundbank; public class WeatherMap extends Mapper<LongWritable, Text, Weather, NullWritable> { /********** begin **********/ Text text = new Text(); HashMap<String, String> map = new HashMap<String, String>(); @Override protected void setup(Context context) throws IOException, InterruptedException { File f=new File("data/sky.txt"); InputStream inputStream = new FileInputStream(f); BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream)); String line = null; while ((line = bufferedReader.readLine()) != null) { System.out.println(line); String[] split = line.split(","); map.put(split[0], split[1]); } bufferedReader.close(); inputStream.close(); } @Override protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { String line = value.toString(); String split[] = line.split("\\s+"); String year = split[0]; String month = split[1]; String day = split[2]; String hour = split[3]; int temperature = Integer.valueOf(split[4]); String dew = split[5]; int pressure = Integer.valueOf(split[6]); String wind_direction = split[7]; int wind_speed = Integer.valueOf(split[8]); String sky_condition = split[9]; String rain_1h = split[10]; String rain_6h = split[11]; if (split.length != 12 || pressure < 0 || Integer.valueOf(wind_direction) < 0 || Integer.valueOf(wind_direction) > 360 || Integer.valueOf(sky_condition) < 0 || Integer.valueOf(sky_condition) > 10 || temperature< -40 || temperature>50 || Integer.valueOf(dew)< 0 || Integer.valueOf(dew)>100 || wind_speed<0 ) { return; } for (Entry<String, String> entry : map.entrySet()) { if (sky_condition.equals(entry.getKey())) { sky_condition = entry.getValue(); } } 
Weather weather = new Weather(year, month, day, hour, temperature, dew, pressure, wind_direction, wind_speed, sky_condition, rain_1h, rain_6h); context.write(weather, NullWritable.get()); } /********** end **********/ }
step1/com/WeatherReduce.java
package com; import java.io.IOException; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.mapreduce.Reducer; public class WeatherReduce extends Reducer<Weather, NullWritable, Weather, NullWritable> { /********** begin **********/ @Override protected void reduce(Weather key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException { for (NullWritable nullWritable : values) { context.write(key, NullWritable.get()); } } /********** end **********/ }
step1/com/Auto.java
package com; import java.util.HashMap; import java.util.Map; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.mapreduce.Partitioner; /*** Custom partition***/ public class Auto extends Partitioner<Weather, NullWritable> { /********** begin **********/ public static Map<String, Integer> provinceDict = new HashMap<String, Integer>(); static { int a = 0; for (int i = 1980; i <= 1981; i++) { provinceDict.put(i + "", a); a++; } } public int getPartition(Weather key, NullWritable nullWritable, int numPartitions) { Integer id = provinceDict.get(key.toString().substring(0, 4)); return id == null ? 2 : id; } /********** end **********/ }
step1/com/WeatherTest.java
package com; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; public class WeatherTest { public static void main(String[] args) throws Exception { /********** begin **********/ Configuration configuration = new Configuration(); Job job = Job.getInstance(configuration); job.setJarByClass(WeatherTest.class); job.setMapperClass(WeatherMap.class); job.setMapOutputKeyClass(Weather.class); job.setMapOutputValueClass(NullWritable.class); job.setReducerClass(WeatherReduce.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(Weather.class); job.setNumReduceTasks(3); job.setPartitionerClass(Auto.class); Path inPath = new Path("/user/test/input/a.txt"); Path out = new Path("/user/test/output"); FileInputFormat.setInputPaths(job, inPath); FileOutputFormat.setOutputPath(job, out); job.waitForCompletion(true); /********** end **********/ } }