使用 Hadoop MapReduce 实现历年最高温度统计 - ToB企服应用市场:ToB评测及商务社交产业平台

0151234567890123456789012345678901234567890123456789012345678901234567890123456789012345
YYYYMMdd[TIME] [TEMPERATURE] ...

复制代码

0029029070999991901010106004+64333+023450FM12+000599999V0202701N015919999999N0000001N9-00781+99999102001ADDGF108991999999999999999999999
0029029070999991901010106004+64333+023450FM12+000599999V0202701N015919999999N0000001N9+01231+99999102001ADDGF108991999999999999999999
0029029070999991910010106004+64333+023450FM12+000599999V0202701N015919999999N0000001N9+01501+99999102001ADDGF108991999999999999999999
0029029070999991910010106004+64333+023450FM12+000599999V0202701N015919999999N0000001N9-00231+99999102001ADDGF108991999999999999999999
0029029070999991920010106004+64333+023450FM12+000599999V0202701N015919999999N0000001N9+02451+99999102001ADDGF108991999999999999999999

复制代码

package com.dxd.dxd;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
 * Mapper that turns one fixed-width weather record into a (year, temperature) pair.
 *
 * <p>Fields are read by character offset (0-based, end-exclusive): the year from
 * [15, 19), the signed temperature from [87, 92) and a one-character quality code at
 * offset 92. A record is skipped when it is too short to contain those fields, when
 * the temperature field is not an integer, when the absolute temperature equals 9999,
 * or when the quality code is not one of 0, 1, 4, 5, 9.
 */
public class TempMap extends Mapper<LongWritable, Text, Text, IntWritable> {
    private static final int YEAR_START = 15;
    private static final int YEAR_END = 19;
    private static final int TEMPERATURE_START = 87;
    private static final int TEMPERATURE_END = 92;
    private static final int QUALITY_INDEX = 92;

    /** Absolute temperature value that marks a record to be discarded. */
    private static final int MISSING_TEMPERATURE = 9999;

    /** Quality codes accepted by the filter; any other character rejects the record. */
    private static final String ACCEPTED_QUALITY_CODES = "01459";

    private static final String COUNTER_GROUP = "TempMap";
    private static final String MALFORMED_COUNTER = "MALFORMED_RECORDS";

    // Output key/value holders, reused across map() calls instead of allocating per record.
    private Text k2;
    private IntWritable v2;

    /**
     * Allocates the reusable output key and value objects.
     *
     * @param context task context (unused here)
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        k2 = new Text();
        v2 = new IntWritable();
    }

    /**
     * Extracts year and temperature from one input line and writes them to the context.
     *
     * @param key     input key supplied by the framework (not used)
     * @param value   one line of the input file
     * @param context sink for the (year, temperature) pair
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if writing to the context is interrupted
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();

        // A truncated or blank line cannot hold the quality code at offset 92;
        // skip it rather than fail the task with StringIndexOutOfBoundsException.
        if (line.length() <= QUALITY_INDEX) {
            context.getCounter(COUNTER_GROUP, MALFORMED_COUNTER).increment(1);
            return;
        }

        int temperature;
        try {
            // Integer.parseInt accepts a leading '+' or '-' sign.
            temperature = Integer.parseInt(line.substring(TEMPERATURE_START, TEMPERATURE_END));
        } catch (NumberFormatException ignored) {
            // Corrupt temperature field: count and skip the record, keep the job running.
            context.getCounter(COUNTER_GROUP, MALFORMED_COUNTER).increment(1);
            return;
        }

        // Single-character membership test. This is equivalent to rejecting when the
        // one-character field matches the regex "[^01459]", without recompiling a
        // pattern for every record. (String.matches must match the WHOLE string, so
        // "0hello87hhh".matches("[01459]") is false.)
        char quality = line.charAt(QUALITY_INDEX);
        boolean qualityAccepted = ACCEPTED_QUALITY_CODES.indexOf(quality) >= 0;
        if (Math.abs(temperature) == MISSING_TEMPERATURE || !qualityAccepted) {
            return;
        }

        k2.set(line.substring(YEAR_START, YEAR_END)); // key: year
        v2.set(temperature);                          // value: temperature
        context.write(k2, v2);
    }
}

复制代码

package com.dxd.dxd;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
 * Reducer that emits, for each key, the largest integer among that key's values.
 */
public class TempReduce extends Reducer<Text, IntWritable, Text, IntWritable> {

    /**
     * Scans every value grouped under {@code key} and writes the maximum.
     *
     * @param key     grouping key, written back unchanged
     * @param values  all integer values grouped under the key
     * @param context sink for the (key, maximum) pair
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if writing to the context is interrupted
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        // Start below every possible int so the first value always replaces it.
        int highest = Integer.MIN_VALUE;
        for (IntWritable reading : values) {
            int current = reading.get();
            if (current > highest) {
                highest = current;
            }
        }

        IntWritable result = new IntWritable(highest);
        context.write(key, result);
    }
}

复制代码

package com.dxd.dxd;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/**
 * Entry point that configures and submits the maximum-temperature MapReduce job.
 *
 * <p>Usage: {@code TempDriver <input path> <output path>}
 */
public class TempDriver {
    /** Exit status used when the required command-line arguments are missing. */
    private static final int EXIT_USAGE = 2;

    /**
     * Builds the job from {@link TempMap} and {@link TempReduce}, runs it and exits
     * with status 0 on success or 1 on failure.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path;
     *             any further arguments are ignored
     * @throws IOException            if the job cannot be created or submitted
     * @throws InterruptedException   if waiting for the job is interrupted
     * @throws ClassNotFoundException if a job class cannot be resolved
     */
    public static void main(String[] args)
            throws IOException, InterruptedException, ClassNotFoundException {
        // Fail with a readable message instead of ArrayIndexOutOfBoundsException
        // when the paths are not supplied.
        if (args == null || args.length < 2) {
            System.err.println("Usage: TempDriver <input path> <output path>");
            System.exit(EXIT_USAGE);
            return;
        }

        // Sets the HADOOP_USER_NAME system property to "root"; per the original
        // author's note this selects the user the job runs as.
        // NOTE(review): hard-coded "root" — confirm this is intended for the target cluster.
        System.setProperty("HADOOP_USER_NAME", "root");

        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);

        // Class used to locate the JAR that contains the job code.
        job.setJarByClass(TempDriver.class);

        job.setMapperClass(TempMap.class);
        job.setReducerClass(TempReduce.class);
        // The reducer is reused as combiner: taking a maximum of partial maxima
        // yields the same result, so pre-aggregation is safe.
        job.setCombinerClass(TempReduce.class);

        // Types of the final output key and value.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Submit, block until the job finishes, and map success/failure to the exit status.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

复制代码

System.exit(job.waitForCompletion(true) ? 0 : 1);

复制代码

System.exit(job.waitForCompletion(true) ? 0 : 1);

复制代码