How to get multiple-file output with fully customized file names in Hadoop 0.20.2 using MultipleOutputFormat

A recent project of mine needed MultipleOutputFormat on Hadoop 0.20.2, but it turns out the class is not available there. The old 0.19.2 class org.apache.hadoop.mapred.lib.MultipleOutputFormat can still be used on 0.20.2, but everything under org.apache.hadoop.mapred is marked as deprecated and may be removed in a future Hadoop release. Hadoop 0.20.2 recommends replacing JobConf with Configuration, while the old org.apache.hadoop.mapred.lib.MultipleOutputFormat is still built on JobConf; in other words, there is no replacement for it in the new API yet.
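For reference, the deprecated old-API pattern I mean looks roughly like the sketch below (using the MultipleTextOutputFormat subclass; the class name OldApiOutput and the routing rule are placeholders for illustration, not code from my project):

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat;

// Old (deprecated) API: subclass MultipleTextOutputFormat and route each record to a file
public class OldApiOutput extends MultipleTextOutputFormat<Text, IntWritable> {
    @Override
    protected String generateFileNameForKeyValue(Text key, IntWritable value, String name) {
        // pick the output file from the key; "name" is the default part-xxxxx name
        return key.toString().charAt(0) + "-" + name;
    }
}

// and in the (deprecated) JobConf-based driver:
// JobConf conf = new JobConf(MyJob.class);
// conf.setOutputFormat(OldApiOutput.class);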

Someone has already posted a workaround online: http://blog.csdn.net/inkfish/archive/2010/01/08/5156651.aspx

There is also a JIRA proposing an API for 0.20.2: https://issues.apache.org/jira/browse/MAPREDUCE-1145

A new API targeting Hadoop 0.21 has been discussed as well: https://issues.apache.org/jira/browse/MAPREDUCE-370

The new class is available in Hadoop 0.21 and later; users on 0.20.2 can either apply a patch or upgrade to a newer release to use the new API:

https://hudson.apache.org/hudson/job/Hadoop-Mapreduce-trunk/ws/trunk/src/java/org/apache/hadoop/mapreduce/lib/output/MultipleOutputs.java
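For completeness, here is a minimal sketch of how that new-API MultipleOutputs class is typically wired into a job; the named output "text" and the key/value classes are placeholders, so check the Javadoc of the version you actually use:

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class SumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    private MultipleOutputs<Text, IntWritable> mos;

    // In the driver, register the named output first:
    // MultipleOutputs.addNamedOutput(job, "text", TextOutputFormat.class,
    //         Text.class, IntWritable.class);

    @Override
    protected void setup(Context context) {
        mos = new MultipleOutputs<Text, IntWritable>(context);
    }

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable v : values) {
            sum += v.get();
        }
        // "text" must match the name registered with addNamedOutput
        mos.write("text", key, new IntWritable(sum));
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        mos.close();    // flush and close all underlying record writers
    }
}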

Hadoop 0.20.2 is only an intermediate release and not every API has been migrated to the new style yet, so whatever is missing you have to write yourself.

To stay compatible with the output produced by the 0.19.2 MultipleOutputFormat, my project needed fully customizable file names and must not write the key to the output; the resulting files look like common-part-00000 and mall-part-00000. So I took the approach from the post above and made a few changes.
LineRecordWriter.java:

/**
 * Writes one record per line: the key, the key/value separator, then the value.
 */
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

public class LineRecordWriter<K, V> extends RecordWriter<K, V> {
    private static final String utf8 = "UTF-8";
    private static final byte[] newline;
    static {
        try {
            newline = "n".getBytes(utf8);
        } catch (UnsupportedEncodingException uee) {
            throw new IllegalArgumentException("can't find " + utf8 + " encoding");
        }
    }

    protected DataOutputStream out;
    private final byte[] keyValueSeparator;

    public LineRecordWriter(DataOutputStream out, String keyValueSeparator) {
        this.out = out;
        try {
            this.keyValueSeparator = keyValueSeparator.getBytes(utf8);
        } catch (UnsupportedEncodingException uee) {
            throw new IllegalArgumentException("can't find " + utf8 + " encoding");
        }
    }

    public LineRecordWriter(DataOutputStream out) {
        this(out, "t");
    }

    private void writeObject(Object o) throws IOException {
        if (o instanceof Text) {
            Text to = (Text) o;
            out.write(to.getBytes(), 0, to.getLength());
        } else {
            out.write(o.toString().getBytes(utf8));
        }
    }

    // Depending on what you need, emit the key, the separator, the value, or any combination
    public synchronized void write(K key, V value) throws IOException {
    /*
      // variant that emits only the value
        boolean nullValue = value == null || value instanceof NullWritable;
        if (nullValue) {
            return;
        } else {
            writeObject(value);
        }
        out.write(newline);
    */

      // emit key, separator and value
        boolean nullKey = key == null || key instanceof NullWritable;
        boolean nullValue = value == null || value instanceof NullWritable;
        if (nullKey && nullValue) {
            return;
        }
        if (!nullKey) {
            writeObject(key);
        }
        if (!(nullKey || nullValue)) {
            out.write(keyValueSeparator);
        }
        if (!nullValue) {
            writeObject(value);
        }
        out.write(newline);

    }

    public synchronized void close(TaskAttemptContext context) throws IOException {
        out.close();
    }
}

MultipleOutputFormat.java:

import java.io.DataOutputStream;
import java.io.IOException;
import java.text.NumberFormat;
import java.util.HashMap;
import java.util.Iterator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskID;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.ReflectionUtils;

public abstract class MultipleOutputFormat<K extends WritableComparable<?>, V extends Writable>
        extends FileOutputFormat<K, V> {
    private static final NumberFormat NUMBER_FORMAT = NumberFormat.getInstance();

    static {
        NUMBER_FORMAT.setMinimumIntegerDigits(5);
        NUMBER_FORMAT.setGroupingUsed(false);
    }

    private MultiRecordWriter writer = null;

    public RecordWriter<K, V> getRecordWriter(TaskAttemptContext job) throws IOException,
            InterruptedException {
        if (writer == null) {
            writer = new MultiRecordWriter(job, getTaskOutputPath(job));
        }
        return writer;
    }

    private Path getTaskOutputPath(TaskAttemptContext conf) throws IOException {
        Path workPath = null;
        OutputCommitter committer = super.getOutputCommitter(conf);
        if (committer instanceof FileOutputCommitter) {
            workPath = ((FileOutputCommitter) committer).getWorkPath();
        } else {
            Path outputPath = super.getOutputPath(conf);
            if (outputPath == null) {
                throw new IOException("Undefined job output-path");
            }
            workPath = outputPath;
        }
        return workPath;
    }

    /** Determine the output file name (including any extension) from the key, the value, and the task number. */
    protected abstract String generateFileNameForKeyValue(K key, V value, String name);

    public class MultiRecordWriter extends RecordWriter<K, V> {
        /** cache of RecordWriters, keyed by output file name */
        private HashMap<String, RecordWriter<K, V>> recordWriters = null;
        private TaskAttemptContext job = null;
        /** output directory */
        private Path workPath = null;
        public MultiRecordWriter(TaskAttemptContext job, Path workPath) {
            super();
            this.job = job;
            this.workPath = workPath;
            recordWriters = new HashMap<String, RecordWriter<K, V>>();
        }
        @Override
        public void close(TaskAttemptContext context) throws IOException, InterruptedException {
            Iterator<RecordWriter<K, V>> values = this.recordWriters.values().iterator();
            while (values.hasNext()) {
                values.next().close(context);
            }
            this.recordWriters.clear();
        }

        @Override
        public void write(K key, V value) throws IOException, InterruptedException {

            // get the partition number
            TaskID taskId = job.getTaskAttemptID().getTaskID();
            int partition = taskId.getId();

            // build the output file name; note that the formatted task id (e.g. 00000) is passed in here
            String baseName = generateFileNameForKeyValue(key, value, NUMBER_FORMAT.format(partition));

            RecordWriter<K, V> rw = this.recordWriters.get(baseName);
            if (rw == null) {
                rw = getBaseRecordWriter(job, baseName);
                this.recordWriters.put(baseName, rw);
            }
            rw.write(key, value);
        }

        private RecordWriter<K, V> getBaseRecordWriter(TaskAttemptContext job, String baseName)
                throws IOException, InterruptedException {
            Configuration conf = job.getConfiguration();
            boolean isCompressed = getCompressOutput(job);
            // hard-coded; the separator should really be read from the Configuration
            String keyValueSeparator = ",";
            RecordWriter<K, V> recordWriter = null;
            if (isCompressed) {
                Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(job,
                        GzipCodec.class);
                CompressionCodec codec = ReflectionUtils.newInstance(codecClass, conf);
                Path file = new Path(workPath, baseName + codec.getDefaultExtension());
                FSDataOutputStream fileOut = file.getFileSystem(conf).create(file, false);
                recordWriter = new LineRecordWriter<K, V>(new DataOutputStream(codec
                        .createOutputStream(fileOut)), keyValueSeparator);
            } else {
                Path file = new Path(workPath, baseName);
                FSDataOutputStream fileOut = file.getFileSystem(conf).create(file, false);
                recordWriter = new LineRecordWriter<K, V>(fileOut, keyValueSeparator);
            }
            return recordWriter;
        }
    }
}
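On the hard-coded separator flagged in getBaseRecordWriter: in 0.20.2 the stock TextOutputFormat reads its separator from the job configuration, so a small change along these lines (keeping "," as the default used above) would make it configurable:

// inside getBaseRecordWriter, instead of the hard-coded ",":
String keyValueSeparator = conf.get("mapred.textoutputformat.separator", ",");

// and in the driver, for example:
// conf.set("mapred.textoutputformat.separator", "|");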

Our own job only needs to extend MultipleOutputFormat and implement generateFileNameForKeyValue. The method takes three parameters: key, value, and name, where name is the partition number assigned after partitioning, such as 00000.

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class WordCount202 {

    public static class TokenizerMapper extends
            Mapper<Object, Text, Text, IntWritable> {

        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        @Override
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, one);
            }
        }
    }

    public static class IntSumReducer extends
            Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();

        @Override
        public void reduce(Text key, Iterable<IntWritable> values,
                Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    static public class MyMultipleTextOutputFormat extends
            MultipleOutputFormat<Text, IntWritable> {
        @Override
        protected String generateFileNameForKeyValue(Text key, IntWritable value, String name) {
            char c = key.toString().charAt(0);
            if (c >= 'a' && c <= 'z') {
                return "common-part-" + name;
            } else if (c >= 'A' && c <= 'Z') {
                return "mall-part-" + name;
            }
            //return "other-part";    // bug: without the task number, different reduce tasks collide on the same file
            return "other-part-" + name;
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args)
                .getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: wordcount <in> <out>");
            System.exit(2);
        }

        Job job = new Job(conf);
        job.setJarByClass(WordCount202.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setNumReduceTasks(5);
        job.setOutputFormatClass(MyMultipleTextOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

The job runs as follows:

[admin@localhost steven]$ hadoop jar WordCount.jar /app/input /app/output
11/01/06 23:19:32 INFO input.FileInputFormat: Total input paths to process : 2
11/01/06 23:19:32 INFO mapred.JobClient: Running job: job_201012211532_0105
11/01/06 23:19:33 INFO mapred.JobClient:  map 0% reduce 0%
11/01/06 23:19:41 INFO mapred.JobClient:  map 100% reduce 0%
11/01/06 23:19:53 INFO mapred.JobClient:  map 100% reduce 60%
11/01/06 23:19:54 INFO mapred.JobClient:  map 100% reduce 100%
11/01/06 23:19:56 INFO mapred.JobClient: Job complete: job_201012211532_0105
11/01/06 23:19:56 INFO mapred.JobClient: Counters: 18
11/01/06 23:19:56 INFO mapred.JobClient:   Job Counters
11/01/06 23:19:56 INFO mapred.JobClient:     Launched reduce tasks=5
11/01/06 23:19:56 INFO mapred.JobClient:     Rack-local map tasks=1
11/01/06 23:19:56 INFO mapred.JobClient:     Launched map tasks=2
11/01/06 23:19:56 INFO mapred.JobClient:     Data-local map tasks=1
11/01/06 23:19:56 INFO mapred.JobClient:   FileSystemCounters
11/01/06 23:19:56 INFO mapred.JobClient:     FILE_BYTES_READ=1134
11/01/06 23:19:56 INFO mapred.JobClient:     HDFS_BYTES_READ=1038
11/01/06 23:19:56 INFO mapred.JobClient:     FILE_BYTES_WRITTEN=2554
11/01/06 23:19:56 INFO mapred.JobClient:     HDFS_BYTES_WRITTEN=100
11/01/06 23:19:56 INFO mapred.JobClient:   Map-Reduce Framework
11/01/06 23:19:56 INFO mapred.JobClient:     Reduce input groups=50
11/01/06 23:19:56 INFO mapred.JobClient:     Combine output records=50
11/01/06 23:19:56 INFO mapred.JobClient:     Map input records=16
11/01/06 23:19:56 INFO mapred.JobClient:     Reduce shuffle bytes=979
11/01/06 23:19:56 INFO mapred.JobClient:     Reduce output records=50
11/01/06 23:19:56 INFO mapred.JobClient:     Spilled Records=100
11/01/06 23:19:56 INFO mapred.JobClient:     Map output bytes=1186
11/01/06 23:19:56 INFO mapred.JobClient:     Combine input records=68
11/01/06 23:19:56 INFO mapred.JobClient:     Map output records=68
11/01/06 23:19:56 INFO mapred.JobClient:     Reduce input records=50
[admin@localhost steven]$ hadoop fs -ls /app/output
Found 10 items
drwxr-xr-x   - admin supergroup          0 2011-01-06 23:19 /app/output/_logs
-rw-r--r--   3 admin supergroup          6 2011-01-06 23:19 /app/output/common-part-00000
-rw-r--r--   3 admin supergroup          4 2011-01-06 23:19 /app/output/common-part-00001
-rw-r--r--   3 admin supergroup          8 2011-01-06 23:19 /app/output/common-part-00002
-rw-r--r--   3 admin supergroup         10 2011-01-06 23:19 /app/output/common-part-00003
-rw-r--r--   3 admin supergroup         10 2011-01-06 23:19 /app/output/common-part-00004
-rw-r--r--   3 admin supergroup          6 2011-01-06 23:19 /app/output/mall-part-00000
-rw-r--r--   3 admin supergroup          2 2011-01-06 23:19 /app/output/mall-part-00002
-rw-r--r--   3 admin supergroup          2 2011-01-06 23:19 /app/output/mall-part-00003
-rw-r--r--   3 admin supergroup          6 2011-01-06 23:19 /app/output/other-part-00000
-rw-r--r--   3 admin supergroup          6 2011-01-06 23:19 /app/output/other-part-00002
