java 数据流相关
直接上代码吧,很清楚:
package mahout; import java.io.BufferedInputStream; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.net.MalformedURLException; import java.net.URL; import org.apache.hadoop.io.IOUtils; public class GetLibData { //将数据集进行分割,单个文件太大了。 public static void main(String[] args) throws Exception { File src = new File("D:\\hadoop相关\\数据集\\links-simple-sorted\\links-simple-sorted.txt"); FileInputStream fis = new FileInputStream(src); BufferedReader reader = new BufferedReader(new InputStreamReader(fis)); BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(new File("data/wike-part100000")))); String line = null; int count = 0; while((line = reader.readLine()) != null && count < 100001){ writer.write(line); writer.write("\n"); count++; } writer.close(); fis.close(); reader.close(); } //下载数据集从网上 private static void test1() throws MalformedURLException, IOException, FileNotFoundException { URL libUrl = new URL("http://www.occamslab.com/petricek/data/ratings.dat"); InputStream in = libUrl.openStream(); FileOutputStream fos = new FileOutputStream(new File("data/test.dat")); IOUtils.copyBytes(in, fos, 4096);; in.close(); fos.close(); } }
当然数据集可以自己制造。