mapreduce的多种格式文件输出-自定义OutputFormat
/** * @description: mapreduce多种格式的文件输出方式 */ public class MultipleTypeOutputFormat<K, V> extends FileOutputFormat<K, V> { private static final String ORCEXTENSION = ".orc"; private static final String CSVEXTENSION = ".csv"; public static final String SKIP_TEMP_DIRECTORY = "orc.mapreduce.output.skip-temporary-directory"; public MultipleTypeOutputFormat() { } /** * 具体数据写出对象 * * @param job the information about the current task. * @return * @throws IOException * @throws InterruptedException */ @Override public RecordWriter<K, V> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException { // 根据需要,你可以在这里添加逻辑以决定使用SequenceFileOutputFormat还是TextOutputFormat //根据基准路径和输出文件截出标识 String outputNameStr = job.getConfiguration().get(BASE_OUTPUT_NAME); String dirOutputStr = job.getConfiguration().get(OUTDIR); if (outputNameStr.contains(":")) { outputNameStr = outputNameStr.split(":")[1]; } if (dirOutputStr.contains(":")) { dirOutputStr = dirOutputStr.split(":")[1]; } //输出格式标识 String flag = ""; if (outputNameStr.startsWith(dirOutputStr)) { String pathStr = outputNameStr.substring(dirOutputStr.length() + 1, outputNameStr.length()); if (pathStr.contains("/")) { flag = pathStr.split("/")[0]; } else if (pathStr.contains(File.separator)) { flag = pathStr.split(File.separator)[0]; } } //从这个方法里面可以获取一个configuration Configuration configuration = job.getConfiguration(); //根据标识输出相应的数据 switch (flag) { case "cleandata"://清洗明细结果ORC格式 case "basetotaldata"://清洗基准值ORC格式 case "calcdata"://清洗计算出异常记录结果ORC格式 //文件的输出路径 Path file = this.getDefaultWorkFile(job, ORCEXTENSION); TypeDescription schema = TypeDescription.fromString("struct<did:string,dno:bigint,dtm:bigint,kind:int,typ:bigint,val:string>"); OrcFile.WriterOptions writerOptions = OrcFile.writerOptions(new Configuration()); //该类型新版本新增的 CompressionKind zlib = CompressionKind.ZSTD; Writer writer = OrcFile.createWriter(file, writerOptions.setSchema(schema).compress(zlib)); OrcMapreduceRecordWriter orcMapreduceRecordWriter = new 
OrcMapreduceRecordWriter(writer); return orcMapreduceRecordWriter; case "infodata"://清洗场景识别明细数据ORC格式 //文件的输出路径 file = this.getDefaultWorkFile(job, ORCEXTENSION); schema = TypeDescription.fromString("struct<did:string,dno:string,dtm:bigint,kind:int,typ:bigint,val:string>"); writerOptions = OrcFile.writerOptions(new Configuration()); zlib = CompressionKind.ZSTD; writer = OrcFile.createWriter(file, writerOptions.setSchema(schema).compress(zlib)); orcMapreduceRecordWriter = new OrcMapreduceRecordWriter(writer); return orcMapreduceRecordWriter; case "cleancsvdata"://清洗结果CSV格式 file = this.getDefaultWorkFile(job, CSVEXTENSION); Configuration conf = job.getConfiguration(); String keyValueSeparator = conf.get(TextOutputFormat.SEPERATOR, "\t"); FileSystem fs = file.getFileSystem(conf); FSDataOutputStream fileOut = fs.create(file, false); return new LineRecordWriter<K, V>(fileOut, keyValueSeparator); } return null; } /** * 输出job的工作路径 * * @param context the task context * @param extension an extension to add to the filename * @return * @throws IOException */ @Override public Path getDefaultWorkFile(TaskAttemptContext context, String extension) throws IOException { if (context.getConfiguration().getBoolean(SKIP_TEMP_DIRECTORY, false)) { return new Path(getOutputPath(context), getUniqueFile(context, getOutputName(context), extension)); } else { //自定义 map 输出和 reduce 输出 String fileNameprefix = context.getConfiguration().get("fileNameprefix"); if (StringUtils.isNotBlank(fileNameprefix)) { String outputPath = context.getConfiguration().get("outputPath"); String fileName = getMUniqueFile(context, fileNameprefix, extension); return new Path(outputPath, fileName); } else { //默认方式 FileOutputCommitter committer = (FileOutputCommitter) getOutputCommitter(context); return new Path(committer.getWorkPath(), getUniqueFile(context, getOutputName(context), extension)); } } } /** * 自定义 模拟源码对文件名进行编写 * * @param context * @param name * @param extension * @return */ public synchronized static String 
getMUniqueFile(TaskAttemptContext context, String name, String extension) { TaskID taskId = context.getTaskAttemptID().getTaskID(); int partition = taskId.getId(); StringBuilder result = new StringBuilder(); result.append(name); result.append('-'); result.append(NumberFormat.getInstance().format(partition)); result.append(extension); return result.toString(); } /** * 输出job的提交对象 * * @param context the task context * @return * @throws IOException */ @Override public synchronized OutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException { return super.getOutputCommitter(context); } protected static class LineRecordWriter<K, V> extends RecordWriter<K, V> { private static final String utf8 = "UTF-8"; private static final byte[] newline; static { try { newline = "\n".getBytes(utf8); } catch (UnsupportedEncodingException uee) { throw new IllegalArgumentException("can't find " + utf8 + " encoding"); } } protected DataOutputStream out; private final byte[] keyValueSeparator; /** * 按行输出 * @param out * @param keyValueSeparator */ public LineRecordWriter(DataOutputStream out, String keyValueSeparator) { this.out = out; try { this.keyValueSeparator = keyValueSeparator.getBytes(utf8); } catch (UnsupportedEncodingException uee) { throw new IllegalArgumentException("can't find " + utf8 + " encoding"); } } public LineRecordWriter(DataOutputStream out) { this(out, "\t"); } /** * Write the object to the byte stream, handling Text as a special * case. 
* * @param o the object to print * @throws IOException if the write throws, we pass it on */ private void writeObject(Object o) throws IOException { if (o instanceof Text) { Text to = (Text) o; out.write(to.getBytes(), 0, to.getLength()); } else { out.write(o.toString().getBytes(utf8)); } } public synchronized void write(K key, V value) throws IOException { boolean nullKey = key == null || key instanceof NullWritable; boolean nullValue = value == null || value instanceof NullWritable; if (nullKey && nullValue) { return; } if (!nullKey) { writeObject(key); } if (!(nullKey || nullValue)) { out.write(keyValueSeparator); } if (!nullValue) { writeObject(value); } out.write(newline); } public synchronized void close(TaskAttemptContext context) throws IOException { out.close(); } } }
注意:不同输出类型的标识是从路径里截取的,做法比较粗糙,很难通用,下次注意改进;另外 reduce 输出时的 <K,V> 应指定为
//指定reduce输出 job.setOutputKeyClass(NullWritable.class);//reduce输出的key job.setOutputValueClass(Writable.class);//reduce输出的value