从图中我们可以看出,AppServer集群向Hadoop集群提交作业,两者之间的数据交互,只需用到Hadoop的HDFS地址和Java API。在AppServer上的应用不会影响到Hadoop集群的正常运行。
package cn.hadoop.hdfs.main; import java.io.IOException; import java.util.Random; import java.util.StringTokenizer; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.Reducer; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import cn.hadoop.hdfs.util.SystemConfig; /** * @Date Apr 23, 2015 * * @Author dengjie * * @Note Wordcount的例子是一个比较经典的mapreduce例子,可以叫做Hadoop版的hello world。 * 它将文件中的单词分割取出,然后shuffle,sort(map过程),接着进入到汇总统计 * (reduce过程),最后写道hdfs中。基本流程就是这样。 */ public class WordCountV2 { private static Logger logger = LoggerFactory.getLogger(WordCountV2.class); private static Configuration conf; /** * 设置高可用集群连接信息 */ static { String tag = SystemConfig.getProperty("dev.tag"); String[] hosts = SystemConfig.getPropertyArray(tag + ".hdfs.host", ","); conf = new Configuration(); conf.set("fs.defaultFS", "hdfs://cluster1"); conf.set("dfs.nameservices", "cluster1"); conf.set("dfs.ha.namenodes.cluster1", "nna,nns"); conf.set("dfs.namenode.rpc-address.cluster1.nna", hosts[0]); conf.set("dfs.namenode.rpc-address.cluster1.nns", hosts[1]); conf.set("dfs.client.failover.proxy.provider.cluster1", "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider"); } public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> { private final static IntWritable one = new IntWritable(1); private Text word = new Text(); /** * 源文件:a b b * * map之后: * * a 1 * * b 1 * * b 1 */ public void map(Object key, Text value, Context context) throws IOException, InterruptedException { StringTokenizer itr = new StringTokenizer(value.toString());// 整行读取 while (itr.hasMoreTokens()) { word.set(itr.nextToken());// 按空格分割单词 
context.write(word, one);// 每次统计出来的单词+1 } } } /** * reduce之前: * * a 1 * * b 1 * * b 1 * * reduce之后: * * a 1 * * b 2 */ public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> { private IntWritable result = new IntWritable(); public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException { int sum = 0; for (IntWritable val : values) { sum += val.get();// 分组累加 } result.set(sum); context.write(key, result);// 按相同的key输出 } } public static void main(String[] args) { try { if (args.length < 1) { logger.info("args length is 0"); run("hello.txt"); } else { logger.info("args length is not 0"); run(args[0]); } } catch (Exception ex) { ex.printStackTrace(); logger.error(ex.getMessage()); } } private static void run(String name) throws Exception { long randName = new Random().nextLong();// 重定向输出目录 logger.info("output name is [" + randName + "]"); Job job = Job.getInstance(conf); job.setJarByClass(WordCountV2.class); job.setMapperClass(TokenizerMapper.class);// 指定Map计算的类 job.setCombinerClass(IntSumReducer.class);// 合并的类 job.setReducerClass(IntSumReducer.class);// Reduce的类 job.setOutputKeyClass(Text.class);// 输出Key类型 job.setOutputValueClass(IntWritable.class);// 输出值类型 String sysInPath = SystemConfig.getProperty("hdfs.input.path.v2"); String realInPath = String.format(sysInPath, name); String syOutPath = SystemConfig.getProperty("hdfs.output.path.v2"); String realOutPath = String.format(syOutPath, randName); FileInputFormat.addInputPath(job, new Path(realInPath));// 指定输入路径 FileOutputFormat.setOutputPath(job, new Path(realOutPath));// 指定输出路径 System.exit(job.waitForCompletion(true) ? 0 : 1);// 执行完MR任务后退出应用 } }
mvn assembly:assembly
scp hadoop-ubas-1.0.0-jar-with-dependencies.jar hadoop@apps:~/
java -jar hadoop-ubas-1.0.0-jar-with-dependencies.jar
java.io.IOException: No FileSystem for scheme: hdfs at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2584) at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2591) at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:91) at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2630) at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2612) at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:370) at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:169) at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:354) at org.apache.hadoop.fs.Path.getFileSystem(Path.java:296) at org.apache.hadoop.mapreduce.lib.input.FileInputFormat.addInputPath(FileInputFormat.java:518) at cn.hadoop.hdfs.main.WordCountV2.run(WordCountV2.java:134) at cn.hadoop.hdfs.main.WordCountV2.main(WordCountV2.java:108) 2015-05-10 23:31:21 ERROR [WordCountV2.main] - No FileSystem for scheme: hdfs
// Explicitly register the FileSystem implementation classes for the "hdfs"
// and "file" schemes on the client Configuration. This addresses the
// "No FileSystem for scheme: hdfs" error shown above when running the
// assembled jar.
// NOTE(review): the usual cause is that the assembly step overwrites the
// FileSystem service registration files of the merged Hadoop jars - confirm
// against the built jar.
conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
[hadoop@apps example]$ java -jar hadoop-ubas-1.0.0-jar-with-dependencies.jar 2015-05-11 00:08:15 INFO [SystemConfig.main] - Successfully loaded default properties. 2015-05-11 00:08:15 INFO [WordCountV2.main] - args length is 0 2015-05-11 00:08:15 INFO [WordCountV2.main] - output name is [6876390710620561863] 2015-05-11 00:08:16 WARN [NativeCodeLoader.main] - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable 2015-05-11 00:08:17 INFO [deprecation.main] - session.id is deprecated. Instead, use dfs.metrics.session-id 2015-05-11 00:08:17 INFO [JvmMetrics.main] - Initializing JVM Metrics with processName=JobTracker, sessionId= 2015-05-11 00:08:17 WARN [JobSubmitter.main] - Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this. 2015-05-11 00:08:17 INFO [FileInputFormat.main] - Total input paths to process : 1 2015-05-11 00:08:18 INFO [JobSubmitter.main] - number of splits:1 2015-05-11 00:08:18 INFO [JobSubmitter.main] - Submitting tokens for job: job_local519626586_0001 2015-05-11 00:08:18 INFO [Job.main] - The url to track the job: http://localhost:8080/ 2015-05-11 00:08:18 INFO [Job.main] - Running job: job_local519626586_0001 2015-05-11 00:08:18 INFO [LocalJobRunner.Thread-14] - OutputCommitter set in config null 2015-05-11 00:08:18 INFO [LocalJobRunner.Thread-14] - OutputCommitter is org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter 2015-05-11 00:08:18 INFO [LocalJobRunner.Thread-14] - Waiting for map tasks 2015-05-11 00:08:18 INFO [LocalJobRunner.LocalJobRunner Map Task Executor #0] - Starting task: attempt_local519626586_0001_m_000000_0 2015-05-11 00:08:18 INFO [Task.LocalJobRunner Map Task Executor #0] - Using ResourceCalculatorProcessTree : [ ] 2015-05-11 00:08:18 INFO [MapTask.LocalJobRunner Map Task Executor #0] - Processing split: hdfs://cluster1/home/hdfs/test/in/hello.txt:0+24 2015-05-11 00:08:19 INFO 
[MapTask.LocalJobRunner Map Task Executor #0] - (EQUATOR) 0 kvi 26214396(104857584) 2015-05-11 00:08:19 INFO [MapTask.LocalJobRunner Map Task Executor #0] - mapreduce.task.io.sort.mb: 100 2015-05-11 00:08:19 INFO [MapTask.LocalJobRunner Map Task Executor #0] - soft limit at 83886080 2015-05-11 00:08:19 INFO [MapTask.LocalJobRunner Map Task Executor #0] - bufstart = 0; bufvoid = 104857600 2015-05-11 00:08:19 INFO [MapTask.LocalJobRunner Map Task Executor #0] - kvstart = 26214396; length = 6553600 2015-05-11 00:08:19 INFO [MapTask.LocalJobRunner Map Task Executor #0] - Map output collector class = org.apache.hadoop.mapred.MapTask$MapOutputBuffer 2015-05-11 00:08:19 INFO [LocalJobRunner.LocalJobRunner Map Task Executor #0] - 2015-05-11 00:08:19 INFO [MapTask.LocalJobRunner Map Task Executor #0] - Starting flush of map output 2015-05-11 00:08:19 INFO [MapTask.LocalJobRunner Map Task Executor #0] - Spilling map output 2015-05-11 00:08:19 INFO [MapTask.LocalJobRunner Map Task Executor #0] - bufstart = 0; bufend = 72; bufvoid = 104857600 2015-05-11 00:08:19 INFO [MapTask.LocalJobRunner Map Task Executor #0] - kvstart = 26214396(104857584); kvend = 26214352(104857408); length = 45/6553600 2015-05-11 00:08:19 INFO [MapTask.LocalJobRunner Map Task Executor #0] - Finished spill 0 2015-05-11 00:08:19 INFO [Task.LocalJobRunner Map Task Executor #0] - Task:attempt_local519626586_0001_m_000000_0 is done. And is in the process of committing 2015-05-11 00:08:19 INFO [LocalJobRunner.LocalJobRunner Map Task Executor #0] - map 2015-05-11 00:08:19 INFO [Task.LocalJobRunner Map Task Executor #0] - Task 'attempt_local519626586_0001_m_000000_0' done. 2015-05-11 00:08:19 INFO [LocalJobRunner.LocalJobRunner Map Task Executor #0] - Finishing task: attempt_local519626586_0001_m_000000_0 2015-05-11 00:08:19 INFO [LocalJobRunner.Thread-14] - map task executor complete. 
2015-05-11 00:08:19 INFO [LocalJobRunner.Thread-14] - Waiting for reduce tasks 2015-05-11 00:08:19 INFO [LocalJobRunner.pool-6-thread-1] - Starting task: attempt_local519626586_0001_r_000000_0 2015-05-11 00:08:19 INFO [Task.pool-6-thread-1] - Using ResourceCalculatorProcessTree : [ ] 2015-05-11 00:08:19 INFO [ReduceTask.pool-6-thread-1] - Using ShuffleConsumerPlugin: org.apache.hadoop.mapreduce.task.reduce.Shuffle@16769723 2015-05-11 00:08:19 INFO [MergeManagerImpl.pool-6-thread-1] - MergerManager: memoryLimit=177399392, maxSingleShuffleLimit=44349848, mergeThreshold=117083600, ioSortFactor=10, memToMemMergeOutputsThreshold=10 2015-05-11 00:08:19 INFO [EventFetcher.EventFetcher for fetching Map Completion Events] - attempt_local519626586_0001_r_000000_0 Thread started: EventFetcher for fetching Map Completion Events 2015-05-11 00:08:19 INFO [LocalFetcher.localfetcher#1] - localfetcher#1 about to shuffle output of map attempt_local519626586_0001_m_000000_0 decomp: 50 len: 54 to MEMORY 2015-05-11 00:08:19 INFO [InMemoryMapOutput.localfetcher#1] - Read 50 bytes from map-output for attempt_local519626586_0001_m_000000_0 2015-05-11 00:08:19 INFO [MergeManagerImpl.localfetcher#1] - closeInMemoryFile -> map-output of size: 50, inMemoryMapOutputs.size() -> 1, commitMemory -> 0, usedMemory ->50 2015-05-11 00:08:19 INFO [EventFetcher.EventFetcher for fetching Map Completion Events] - EventFetcher is interrupted.. Returning 2015-05-11 00:08:19 INFO [LocalJobRunner.pool-6-thread-1] - 1 / 1 copied. 
2015-05-11 00:08:19 INFO [MergeManagerImpl.pool-6-thread-1] - finalMerge called with 1 in-memory map-outputs and 0 on-disk map-outputs 2015-05-11 00:08:19 INFO [Merger.pool-6-thread-1] - Merging 1 sorted segments 2015-05-11 00:08:19 INFO [Merger.pool-6-thread-1] - Down to the last merge-pass, with 1 segments left of total size: 46 bytes 2015-05-11 00:08:19 INFO [MergeManagerImpl.pool-6-thread-1] - Merged 1 segments, 50 bytes to disk to satisfy reduce memory limit 2015-05-11 00:08:19 INFO [MergeManagerImpl.pool-6-thread-1] - Merging 1 files, 54 bytes from disk 2015-05-11 00:08:19 INFO [MergeManagerImpl.pool-6-thread-1] - Merging 0 segments, 0 bytes from memory into reduce 2015-05-11 00:08:19 INFO [Merger.pool-6-thread-1] - Merging 1 sorted segments 2015-05-11 00:08:19 INFO [Merger.pool-6-thread-1] - Down to the last merge-pass, with 1 segments left of total size: 46 bytes 2015-05-11 00:08:19 INFO [LocalJobRunner.pool-6-thread-1] - 1 / 1 copied. 2015-05-11 00:08:19 INFO [deprecation.pool-6-thread-1] - mapred.skip.on is deprecated. Instead, use mapreduce.job.skiprecords 2015-05-11 00:08:19 INFO [Job.main] - Job job_local519626586_0001 running in uber mode : false 2015-05-11 00:08:19 INFO [Job.main] - map 100% reduce 0% 2015-05-11 00:08:19 INFO [Task.pool-6-thread-1] - Task:attempt_local519626586_0001_r_000000_0 is done. And is in the process of committing 2015-05-11 00:08:19 INFO [LocalJobRunner.pool-6-thread-1] - 1 / 1 copied. 2015-05-11 00:08:19 INFO [Task.pool-6-thread-1] - Task attempt_local519626586_0001_r_000000_0 is allowed to commit now 2015-05-11 00:08:19 INFO [FileOutputCommitter.pool-6-thread-1] - Saved output of task 'attempt_local519626586_0001_r_000000_0' to hdfs://cluster1/home/hdfs/test/out/6876390710620561863/_temporary/0/task_local519626586_0001_r_000000 2015-05-11 00:08:19 INFO [LocalJobRunner.pool-6-thread-1] - reduce > reduce 2015-05-11 00:08:19 INFO [Task.pool-6-thread-1] - Task 'attempt_local519626586_0001_r_000000_0' done. 
2015-05-11 00:08:19 INFO [LocalJobRunner.pool-6-thread-1] - Finishing task: attempt_local519626586_0001_r_000000_0 2015-05-11 00:08:19 INFO [LocalJobRunner.Thread-14] - reduce task executor complete. 2015-05-11 00:08:20 INFO [Job.main] - map 100% reduce 100% 2015-05-11 00:08:20 INFO [Job.main] - Job job_local519626586_0001 completed successfully 2015-05-11 00:08:20 INFO [Job.main] - Counters: 38 File System Counters FILE: Number of bytes read=77813788 FILE: Number of bytes written=78928898 FILE: Number of read operations=0 FILE: Number of large read operations=0 FILE: Number of write operations=0 HDFS: Number of bytes read=48 HDFS: Number of bytes written=24 HDFS: Number of read operations=13 HDFS: Number of large read operations=0 HDFS: Number of write operations=4 Map-Reduce Framework Map input records=2 Map output records=12 Map output bytes=72 Map output materialized bytes=54 Input split bytes=108 Combine input records=12 Combine output records=6 Reduce input groups=6 Reduce shuffle bytes=54 Reduce input records=6 Reduce output records=6 Spilled Records=12 Shuffled Maps =1 Failed Shuffles=0 Merged Map outputs=1 GC time elapsed (ms)=53 CPU time spent (ms)=0 Physical memory (bytes) snapshot=0 Virtual memory (bytes) snapshot=0 Total committed heap usage (bytes)=241442816 Shuffle Errors BAD_ID=0 CONNECTION=0 IO_ERROR=0 WRONG_LENGTH=0 WRONG_MAP=0 WRONG_REDUCE=0 File Input Format Counters Bytes Read=24 File Output Format Counters Bytes Written=24
