Hadoop: extracting small files from a SequenceFile with a map task

In the previous example, small files in HDFS were packed into a single SequenceFile by a mapper; in this example, those small files are extracted back out.

MapReduce can split a SequenceFile on key (record) boundaries, so each key/value pair is delivered to a mapper intact.
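
The records in that SequenceFile carry the original file path as the key (Text) and the file contents as the value (BytesWritable), which is what the mapper below expects. As a quick sanity check, the file can be dumped with SequenceFile.Reader; the following is a minimal sketch (the class name SequenceFileDump is mine, not part of the original post):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class SequenceFileDump {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path(args[0]);
        //Open the SequenceFile and iterate over its key/value records
        try (SequenceFile.Reader reader =
                     new SequenceFile.Reader(conf, SequenceFile.Reader.file(path))) {
            Text key = new Text();
            BytesWritable value = new BytesWritable();
            while (reader.next(key, value)) {
                System.out.printf("key=%s, value length=%d%n", key, value.getLength());
            }
        }
    }
}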

1. Mapper

public class MultiOutputMapper extends Mapper<Text,BytesWritable,NullWritable,Text> {
    private MultipleOutputs<NullWritable,Text> multipleOutputs;
    private long splitLength;

    /**
     * Called once at the beginning of the task.
     *
     * @param context
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        multipleOutputs = new MultipleOutputs<NullWritable, Text>(context);
        InputSplit split = context.getInputSplit();
        splitLength = split.getLength();
    }

    /**
     * Called once for each key/value pair in the input split. Most applications
     * should override this, but the default is the identity function.
     *
     * @param key
     * @param value
     * @param context
     */
    @Override
    protected void map(Text key, BytesWritable value, Context context) throws IOException, InterruptedException {
        System.out.printf("split length:%s,value length:%s,bytes length:%s%n",splitLength,value.getLength(),value.getBytes().length);
        int length = value.getLength();
        byte[] bytes = new byte[length];
        System.arraycopy(value.getBytes(),0,bytes,0,length);

        Text contents = new Text(bytes);
        //Write the file back to its original path, taken from the SequenceFile key
//        multipleOutputs.write(NullWritable.get(),new Text(bytes),key.toString());

        //Write the file under a subdirectory (here "2019") of the job output directory
        Path path = new Path(key.toString());
        String outputFileName = String.format("%s/%s","2019",path.getName());
        multipleOutputs.write(NullWritable.get(),new Text(bytes),outputFileName);
//
//        multipleOutputs.write(NullWritable.get(),new Text(value.getBytes()),key.toString());//This line is wrong: getBytes() can include padding beyond the real data.
//        Observed in testing: a SequenceFile is split on key boundaries; value.getLength() is the actual data length,
//        while value.getBytes().length is the size of the value's backing buffer, and the two are not necessarily equal, e.g.:
//        split length:88505,value length:4364,bytes length:6546
    }

    /**
     * Called once at the end of the task.
     *
     * @param context
     */
    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        multipleOutputs.close();
        super.cleanup(context);
    }
}
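
The System.arraycopy in map() is needed because BytesWritable.getBytes() returns the backing buffer, which may be longer than the actual data (as the log line above shows). On Hadoop 2.x the same trimming copy can be written in one call; a small equivalent sketch, not what the post uses:

        //Equivalent to the new byte[length] + System.arraycopy lines in map():
        //copyBytes() returns a fresh array trimmed to getLength()
        byte[] bytes = value.copyBytes();
        Text contents = new Text(bytes);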

 

2. Job

public class SequenceFileToSmallFileConverter {

    public static void main(String[] args) throws Exception{

        long startTime = System.currentTimeMillis();

        Configuration conf = new Configuration();


        Path outPath = new Path(args[1]);
        FileSystem fileSystem = outPath.getFileSystem(conf);
        //Delete the output path if it already exists
        if(fileSystem.exists(outPath))
        {
            fileSystem.delete(outPath,true);
        }

        Job job = Job.getInstance(conf,"SequenceFileToSmallFileConverter");
        job.setJarByClass(SequenceFileToSmallFileConverter.class);

        job.setMapperClass(MultiOutputMapper.class);
        job.setNumReduceTasks(0);

        job.setInputFormatClass(SequenceFileInputFormat.class);
        //TextOutputFormat appends a newline after every record; when a whole file is written as a single record,
        //the output would end up one newline longer than expected
//        job.setOutputFormatClass(TextOutputFormat.class);

        //WholeTextOutputFormat differs from TextOutputFormat only in that it does not write a newline after each record
        job.setOutputFormatClass(WholeTextOutputFormat.class);

        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);//the mapper emits Text values via MultipleOutputs



        FileInputFormat.addInputPath(job,new Path(args[0]));
        FileOutputFormat.setOutputPath(job,new Path(args[1]));


        int exitCode = job.waitForCompletion(true) ? 0:1;

        long endTime = System.currentTimeMillis();
        long timeSpan = endTime - startTime;
        System.out.println("Elapsed time: " + timeSpan + " ms.");

        System.exit(exitCode);
    }
}
public class WholeTextOutputFormat<K, V> extends FileOutputFormat<K, V> {
    public static String SEPARATOR = "mapreduce.output.textoutputformat.separator";
    /**
     * @deprecated Use {@link #SEPARATOR}
     */
    @Deprecated
    public static String SEPERATOR = SEPARATOR;
    protected static class LineRecordWriter<K, V>
            extends RecordWriter<K, V> {
        private static final byte[] NEWLINE =
                "\n".getBytes(StandardCharsets.UTF_8);

        protected DataOutputStream out;
        private final byte[] keyValueSeparator;

        public LineRecordWriter(DataOutputStream out, String keyValueSeparator) {
            this.out = out;
            this.keyValueSeparator =
                    keyValueSeparator.getBytes(StandardCharsets.UTF_8);
        }

        public LineRecordWriter(DataOutputStream out) {
            this(out, "\t");
        }

        /**
         * Write the object to the byte stream, handling Text as a special
         * case.
         * @param o the object to print
         * @throws IOException if the write throws, we pass it on
         */
        private void writeObject(Object o) throws IOException {
            if (o instanceof Text) {
                Text to = (Text) o;
                out.write(to.getBytes(), 0, to.getLength());
            } else {
                out.write(o.toString().getBytes(StandardCharsets.UTF_8));
            }
        }

        public synchronized void write(K key, V value)
                throws IOException {

            boolean nullKey = key == null || key instanceof NullWritable;
            boolean nullValue = value == null || value instanceof NullWritable;
            if (nullKey && nullValue) {
                return;
            }
            if (!nullKey) {
                writeObject(key);
            }
            if (!(nullKey || nullValue)) {
                out.write(keyValueSeparator);
            }
            if (!nullValue) {
                writeObject(value);
            }
//            out.write(NEWLINE); deliberately omitted: each file is treated as a whole, no newline is inserted between keys
        }

        public synchronized
        void close(TaskAttemptContext context) throws IOException {
            out.close();
        }
    }

    public RecordWriter<K, V>
    getRecordWriter(TaskAttemptContext job
    ) throws IOException, InterruptedException {
        Configuration conf = job.getConfiguration();
        boolean isCompressed = getCompressOutput(job);
        String keyValueSeparator= conf.get(SEPARATOR, "\t");
        CompressionCodec codec = null;
        String extension = "";
        if (isCompressed) {
            Class<? extends CompressionCodec> codecClass =
                    getOutputCompressorClass(job, GzipCodec.class);
            codec = ReflectionUtils.newInstance(codecClass, conf);
            extension = codec.getDefaultExtension();
        }
        Path file = getDefaultWorkFile(job, extension);
        FileSystem fs = file.getFileSystem(conf);
        FSDataOutputStream fileOut = fs.create(file, false);
        if (isCompressed) {
            return new WholeTextOutputFormat.LineRecordWriter<>(
                    new DataOutputStream(codec.createOutputStream(fileOut)),
                    keyValueSeparator);
        } else {
            return new WholeTextOutputFormat.LineRecordWriter<>(fileOut, keyValueSeparator);
        }
    }
}
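
An optional refinement to the job above (my suggestion, not part of the original post): because every record is written through MultipleOutputs, the default output format still creates empty part-m-xxxxx files in the output directory. They can be suppressed with LazyOutputFormat, keeping everything else unchanged:

import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat;

        //Replace job.setOutputFormatClass(WholeTextOutputFormat.class) with:
        LazyOutputFormat.setOutputFormatClass(job, WholeTextOutputFormat.class);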

3. Verification: the file that was packed into the SequenceFile and the extracted file are identical.

[hadoop@bigdata-senior01 ~]$ hadoop fs -checksum /demo/1.txt-m-00000 /demo3/1.txt
/demo/1.txt-m-00000    MD5-of-0MD5-of-512CRC32C    0000020000000000000000007b6bd9c9f517a6ea12ede79fd43700ca
/demo3/1.txt    MD5-of-0MD5-of-512CRC32C    0000020000000000000000007b6bd9c9f517a6ea12ede79fd43700ca

 
