Hadoop: A Second Program Working with HDFS -> [Get DataNode Names] [Write a File] [WordCount]
This program does the following: it retrieves the names of the DataNodes in the cluster and writes that list to a file on HDFS, copies the local file /test.c into HDFS as hdfs:///copyOftest.c, and then runs a WordCount over hdfs:///copyOftest.c.
Unlike the WordCount in Hadoop's examples, which is usually fed files brought in from the local file system, the input counted here is a file that lives in HDFS.
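The only piece of this program that is not in the standard WordCount example is the DataNode listing. For reference, the sketch below shows that idiom on its own; it is an illustrative stand-alone class (the name ListDatanodes is mine, not part of the program below) and it assumes the configured default file system is HDFS, since the cast to DistributedFileSystem fails otherwise.

package com.fora;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;

/* Illustrative helper: prints the host name of every DataNode in the cluster. */
public class ListDatanodes {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        /* this cast only works when the default file system is an HDFS cluster */
        DistributedFileSystem dfs = (DistributedFileSystem) fs;
        for (DatanodeInfo node : dfs.getDataNodeStats()) {
            System.out.println(node.getHostName());
        }
    }
}

The full program follows; it combines this listing step with the file copy and the WordCount job.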
package com.fora;
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class FileOperate {

    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

        init(); /* prepare the input file on HDFS and write out the DataNode list */

        Configuration conf = new Configuration();
        Job job = new Job(conf, "word count");
        job.setJarByClass(FileOperate.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        /* set the input and output paths */
        FileInputFormat.addInputPath(job, new Path("hdfs:///copyOftest.c"));
        FileOutputFormat.setOutputPath(job, new Path("hdfs:///wordcount"));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
    public static class TokenizerMapper
            extends Mapper<Object, Text, Text, IntWritable> {

        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        @Override
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, one);
            }
        }
    }
    public static class IntSumReducer
            extends Reducer<Text, IntWritable, Text, IntWritable> {

        private IntWritable result = new IntWritable();

        @Override
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }
    public static void init() throws IOException {

        Configuration config = new Configuration();

        /* copy a local file into HDFS */
        String srcFile = "/test.c";
        String dstFile = "hdfs:///copyOftest.c";
        FileSystem hdfs = FileSystem.get(config);
        hdfs.copyFromLocalFile(new Path(srcFile), new Path(dstFile));
        System.out.print("copy success!\n");

        /* print the block size of the copied file */
        FileStatus fileStatus = hdfs.getFileStatus(new Path(dstFile));
        System.out.println(fileStatus.getBlockSize());

        /* get the list of DataNodes from the DistributedFileSystem */
        DistributedFileSystem dfs = (DistributedFileSystem) hdfs;
        DatanodeInfo[] dataNodeStats = dfs.getDataNodeStats();

        /* create a file on HDFS and write one DataNode host name per line */
        Path outputPath = new Path("hdfs:///output/listOfDatanode");
        FSDataOutputStream outputStream = hdfs.create(outputPath);
        for (DatanodeInfo dataNode : dataNodeStats) {
            String name = dataNode.getHostName();
            System.out.println(name);
            byte[] line = (name + "\n").getBytes();
            outputStream.write(line, 0, line.length);
        }
        /* close the stream so the list is actually flushed to HDFS */
        outputStream.close();
    }
}
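The run output below verifies the results with hadoop dfs -ls and -cat from the shell; the same check can be done from Java through the FileSystem API. The following is a minimal, illustrative sketch rather than part of the program above (the class name CatHdfsFile and the default path are mine); it assumes the file it is pointed at already exists on HDFS.

package com.fora;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

/* Illustrative helper: streams an HDFS file to stdout, e.g. the DataNode list or the WordCount result. */
public class CatHdfsFile {
    public static void main(String[] args) throws IOException {
        String file = (args.length > 0) ? args[0] : "hdfs:///output/listOfDatanode";
        Configuration conf = new Configuration();
        FileSystem hdfs = FileSystem.get(conf);
        FSDataInputStream in = hdfs.open(new Path(file));
        try {
            /* copy the stream to stdout; false = do not close System.out */
            IOUtils.copyBytes(in, System.out, 4096, false);
        } finally {
            in.close();
        }
    }
}

Packaged into the same jar, it could be invoked like the main program, e.g. hadoop jar HDFS.jar com.fora.CatHdfsFile /wordcount/part-r-00000, which should match the hadoop dfs -cat output shown below.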
Run output:
[root@master bin]# hadoop jar HDFS.jar com.fora.FileOperate
copy success!
67108864
master
slave1
11/07/21 15:45:23 WARN mapred.JobClient: Use GenericOptionsParser for parsing the arguments. Applications should implement Tool for the same.
11/07/21 15:45:23 INFO input.FileInputFormat: Total input paths to process : 1
11/07/21 15:45:23 INFO mapred.JobClient: Running job: job_201107210917_0003
11/07/21 15:45:24 INFO mapred.JobClient:  map 0% reduce 0%
11/07/21 15:45:31 INFO mapred.JobClient:  map 100% reduce 0%
11/07/21 15:45:43 INFO mapred.JobClient:  map 100% reduce 100%
11/07/21 15:45:45 INFO mapred.JobClient: Job complete: job_201107210917_0003
11/07/21 15:45:45 INFO mapred.JobClient: Counters: 17
11/07/21 15:45:45 INFO mapred.JobClient:   Job Counters
11/07/21 15:45:45 INFO mapred.JobClient:     Launched reduce tasks=1
11/07/21 15:45:45 INFO mapred.JobClient:     Rack-local map tasks=1
11/07/21 15:45:45 INFO mapred.JobClient:     Launched map tasks=1
11/07/21 15:45:45 INFO mapred.JobClient:   FileSystemCounters
11/07/21 15:45:45 INFO mapred.JobClient:     FILE_BYTES_READ=228
11/07/21 15:45:45 INFO mapred.JobClient:     HDFS_BYTES_READ=126
11/07/21 15:45:45 INFO mapred.JobClient:     FILE_BYTES_WRITTEN=488
11/07/21 15:45:45 INFO mapred.JobClient:     HDFS_BYTES_WRITTEN=146
11/07/21 15:45:45 INFO mapred.JobClient:   Map-Reduce Framework
11/07/21 15:45:45 INFO mapred.JobClient:     Reduce input groups=19
11/07/21 15:45:45 INFO mapred.JobClient:     Combine output records=19
11/07/21 15:45:45 INFO mapred.JobClient:     Map input records=8
11/07/21 15:45:45 INFO mapred.JobClient:     Reduce shuffle bytes=228
11/07/21 15:45:45 INFO mapred.JobClient:     Reduce output records=19
11/07/21 15:45:45 INFO mapred.JobClient:     Spilled Records=38
11/07/21 15:45:45 INFO mapred.JobClient:     Map output bytes=211
11/07/21 15:45:45 INFO mapred.JobClient:     Combine input records=22
11/07/21 15:45:45 INFO mapred.JobClient:     Map output records=22
11/07/21 15:45:45 INFO mapred.JobClient:     Reduce input records=19

[root@master bin]# hadoop dfs -ls /
Found 6 items
-rw-r--r--   1 root supergroup        126 2011-07-21 15:45 /copyOftest.c
-rw-r--r--   1 root supergroup         26 2011-07-21 15:16 /listOfDatanode
drwxr-xr-x   - root supergroup          0 2011-07-21 15:45 /output
-rw-r--r--   1 root supergroup      10400 2011-07-20 16:51 /test.txt
drwxr-xr-x   - root supergroup          0 2011-07-20 16:09 /tmp
drwxr-xr-x   - root supergroup          0 2011-07-21 15:45 /wordcount

[root@master bin]# hadoop dfs -ls /wordcount
Found 2 items
drwxr-xr-x   - root supergroup          0 2011-07-21 15:45 /wordcount/_logs
-rw-r--r--   1 root supergroup        146 2011-07-21 15:45 /wordcount/part-r-00000

[root@master bin]# hadoop dfs -cat /wordcount/part-r-00000
2011-07-21      1
File    1
Hadoop  1
System! 1
a       1
aimed   1
at      1
coping  1
file    3
from    1
from:fora       1
is      1
local   1
system  1
thank   1
the     1
this    2
to      1
you!    1
[root@master bin]#