Hadoop: A Second Program Working with HDFS -> [Get DataNode Names] [Write a File] [WordCount]

This program does the following: it copies a local file to hdfs:///copyOftest.c, retrieves the names of the DataNodes in the cluster and writes them to a file on HDFS (hdfs:///output/listOfDatanode), and then runs a WordCount over hdfs:///copyOftest.c.
Unlike the examples shipped with Hadoop, which read a file from the local file system, this time the input file is read from HDFS.
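One note before the listing: FileSystem.get(conf) returns the file system named by fs.default.name in the core-site.xml found on the classpath, and that is what makes the hdfs:/// paths below reach the NameNode. If that configuration file is not visible to the program, the NameNode URI can also be set in code. A minimal sketch, reusing the Configuration and FileSystem imports from the listing; the address master:9000 is a placeholder assumption, not taken from this cluster:

Configuration conf = new Configuration();
/* assumption: replace master:9000 with the real NameNode address;
   normally fs.default.name comes from core-site.xml and need not be set here */
conf.set("fs.default.name", "hdfs://master:9000");
FileSystem fs = FileSystem.get(conf);
System.out.println(fs.getUri()); /* prints the hdfs:// URI the program will use */

The full program: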

package com.fora;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;


public class FileOperate {

    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

        init(); /* copy the input file to HDFS and write the DataNode list */

        Configuration conf = new Configuration();

        Job job = new Job(conf, "word count");
        job.setJarByClass(FileOperate.class);

        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        /* set the input and output paths; both are on HDFS */
        FileInputFormat.addInputPath(job, new Path("hdfs:///copyOftest.c"));
        FileOutputFormat.setOutputPath(job, new Path("hdfs:///wordcount"));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {

        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            /* split the line into whitespace-separated tokens and emit (token, 1) */
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, one);
            }
        }
    }

    public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        private IntWritable result = new IntWritable();

        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            /* sum up the counts emitted for each word */
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    public static void init() throws IOException {

        Configuration config = new Configuration();
        FileSystem hdfs = FileSystem.get(config);

        /* copy a local file to HDFS */
        String srcFile = "/test.c";
        String dstFile = "hdfs:///copyOftest.c";
        Path srcPath = new Path(srcFile);
        Path dstPath = new Path(dstFile);
        hdfs.copyFromLocalFile(srcPath, dstPath);
        System.out.print("copy success!\n");

        /* print the block size of the copied file */
        FileStatus fileStatus = hdfs.getFileStatus(dstPath);
        System.out.println(fileStatus.getBlockSize());

        /* get the list of DataNodes in the cluster */
        DistributedFileSystem dfs = (DistributedFileSystem) hdfs;
        DatanodeInfo[] dataNodeStats = dfs.getDataNodeStats();

        /* create a file on HDFS and write the DataNode host names to it */
        Path outputPath = new Path("hdfs:///output/listOfDatanode");
        FSDataOutputStream outputStream = hdfs.create(outputPath);

        String[] names = new String[dataNodeStats.length];
        for (int i = 0; i < dataNodeStats.length; i++) {
            names[i] = dataNodeStats[i].getHostName();
            System.out.println(names[i]);
            /* one host name per line */
            byte[] line = (names[i] + "\n").getBytes();
            outputStream.write(line, 0, line.length);
        }
        outputStream.close(); /* close to flush the data to HDFS */
    }

}
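To check the result of init() from code rather than from the shell, a small companion class can read the DataNode list back out of HDFS. This is only an illustrative sketch and not part of the original program: the class name ReadDatanodeList is made up, and it assumes init() has already created hdfs:///output/listOfDatanode.

package com.fora;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

public class ReadDatanodeList {

    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem hdfs = FileSystem.get(conf);

        /* open the file written by FileOperate.init() and copy its contents to stdout */
        FSDataInputStream in = hdfs.open(new Path("hdfs:///output/listOfDatanode"));
        try {
            IOUtils.copyBytes(in, System.out, 4096, false);
        } finally {
            IOUtils.closeStream(in);
        }
    }
}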

Run output:
[root@master bin]# hadoop jar HDFS.jar com.fora.FileOperate
copy success!
67108864
master
slave1
11/07/21 15:45:23 WARN mapred.JobClient: Use GenericOptionsParser for parsing the arguments. Applications should implement Tool for the same.
11/07/21 15:45:23 INFO input.FileInputFormat: Total input paths to process : 1
11/07/21 15:45:23 INFO mapred.JobClient: Running job: job_201107210917_0003
11/07/21 15:45:24 INFO mapred.JobClient:  map 0% reduce 0%
11/07/21 15:45:31 INFO mapred.JobClient:  map 100% reduce 0%
11/07/21 15:45:43 INFO mapred.JobClient:  map 100% reduce 100%
11/07/21 15:45:45 INFO mapred.JobClient: Job complete: job_201107210917_0003
11/07/21 15:45:45 INFO mapred.JobClient: Counters: 17
11/07/21 15:45:45 INFO mapred.JobClient:   Job Counters 
11/07/21 15:45:45 INFO mapred.JobClient:     Launched reduce tasks=1
11/07/21 15:45:45 INFO mapred.JobClient:     Rack-local map tasks=1
11/07/21 15:45:45 INFO mapred.JobClient:     Launched map tasks=1
11/07/21 15:45:45 INFO mapred.JobClient:   FileSystemCounters
11/07/21 15:45:45 INFO mapred.JobClient:     FILE_BYTES_READ=228
11/07/21 15:45:45 INFO mapred.JobClient:     HDFS_BYTES_READ=126
11/07/21 15:45:45 INFO mapred.JobClient:     FILE_BYTES_WRITTEN=488
11/07/21 15:45:45 INFO mapred.JobClient:     HDFS_BYTES_WRITTEN=146
11/07/21 15:45:45 INFO mapred.JobClient:   Map-Reduce Framework
11/07/21 15:45:45 INFO mapred.JobClient:     Reduce input groups=19
11/07/21 15:45:45 INFO mapred.JobClient:     Combine output records=19
11/07/21 15:45:45 INFO mapred.JobClient:     Map input records=8
11/07/21 15:45:45 INFO mapred.JobClient:     Reduce shuffle bytes=228
11/07/21 15:45:45 INFO mapred.JobClient:     Reduce output records=19
11/07/21 15:45:45 INFO mapred.JobClient:     Spilled Records=38
11/07/21 15:45:45 INFO mapred.JobClient:     Map output bytes=211
11/07/21 15:45:45 INFO mapred.JobClient:     Combine input records=22
11/07/21 15:45:45 INFO mapred.JobClient:     Map output records=22
11/07/21 15:45:45 INFO mapred.JobClient:     Reduce input records=19
[root@master bin]# hadoop dfs  -ls /
Found 6 items
-rw-r--r--   1 root supergroup        126 2011-07-21 15:45 /copyOftest.c
-rw-r--r--   1 root supergroup         26 2011-07-21 15:16 /listOfDatanode
drwxr-xr-x   - root supergroup          0 2011-07-21 15:45 /output
-rw-r--r--   1 root supergroup      10400 2011-07-20 16:51 /test.txt
drwxr-xr-x   - root supergroup          0 2011-07-20 16:09 /tmp
drwxr-xr-x   - root supergroup          0 2011-07-21 15:45 /wordcount
[root@master bin]# hadoop dfs -ls /wordcount
Found 2 items
drwxr-xr-x   - root supergroup          0 2011-07-21 15:45 /wordcount/_logs
-rw-r--r--   1 root supergroup        146 2011-07-21 15:45 /wordcount/part-r-00000
[root@master bin]# hadoop dfs -cat /wordcount/part-r-00000
2011-07-21      1
File    1
Hadoop  1
System! 1
a       1
aimed   1
at      1
coping  1
file    3
from    1
from:fora       1
is      1
local   1
system  1
thank   1
the     1
this    2
to      1
you!    1
[root@master bin]# 
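The DataNode list written by init() can be inspected the same way (its contents are not reproduced here):

[root@master bin]# hadoop dfs -cat /output/listOfDatanode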



  
