使用hash拆分文件

package readImgUrl;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;

/**
 * Utilities for splitting a large tab-separated URL list into hash buckets
 * keyed by URL host, sorting a bucket file, and guessing a file's encoding.
 *
 * <p>Every data line is expected to have the URL as its second tab-separated
 * field. Bucket files live in {@code file_dir} and are named
 * {@code <bucket>.txt}.
 */
public class ClassifyUrl {

	/** Number of hash buckets; {@link #hash(char[])} returns values in [0, HASHLEN). */
	private static final int HASHLEN = 100;

	/** Directory that holds the per-bucket files. */
	private static final String file_dir = "D:\\学习\\实验室项目\\ImageNet图片爬取\\classify_url\\";

	/** Default input file used by {@link #main(String[])}. */
	private static final String src_file = "D:\\学习\\实验室项目\\ImageNet图片爬取\\fall11_urls.txt";

	/**
	 * Controls whether {@link #classify_url(String)} appends each line to its
	 * bucket file. It is off because the original code had the write call
	 * disabled; flip it to {@code true} to actually produce the bucket files.
	 */
	private static final boolean WRITE_BUCKETS = false;

	/**
	 * Entry point: runs {@link #classify_url(String)} on the default source
	 * file, or on the path given as the first command-line argument.
	 *
	 * @param args optional; {@code args[0]} overrides the default input path
	 * @throws Exception declared for compatibility; nothing is thrown on purpose
	 */
	public static void main(String[] args) throws Exception {
		String path = (args != null && args.length > 0) ? args[0] : src_file;
		classify_url(path);
	}

	/**
	 * Sorts the lines of bucket file {@code <filename>.txt} by their second
	 * tab-separated field and writes the result to {@code <filename>_.txt}
	 * in the same directory. Prints the number of lines read.
	 *
	 * <p>Lines without a second field sort as if that field were empty. If the
	 * input cannot be read, nothing is written.
	 *
	 * @param filename bucket file name without directory or extension
	 */
	public static void rank_filedata(String filename){
		String path1 = file_dir + filename + ".txt";
		String path2 = file_dir + filename + "_" + ".txt";
		List<String> list = reader_list(path1);
		if (list == null) {
			// reader_list has already reported the cause.
			System.err.println("rank_filedata: could not read " + path1);
			return;
		}
		System.out.println(list.size());
		Collections.sort(list, new Comparator<String>() {
			@Override
			public int compare(String s1, String s2) {
				return secondField(s1).compareTo(secondField(s2));
			}
		});
		writer_list(list, path2);
	}

	/** Returns the second tab-separated field of {@code line}, or "" when there is none. */
	private static String secondField(String line) {
		String[] fields = line.split("\t");
		return fields.length > 1 ? fields[1] : "";
	}

	/**
	 * Reads a whole text file into memory using the platform default charset.
	 *
	 * @param path file to read
	 * @return the lines of the file in order, or {@code null} if reading failed
	 *         (the exception is printed to standard error)
	 */
	public static List<String> reader_list(String path){
		List<String> lineList = new ArrayList<String>();
		try (BufferedReader reader = new BufferedReader(new FileReader(path))) {
			String line;
			while ((line = reader.readLine()) != null) {
				lineList.add(line);
			}
			return lineList;
		} catch (Exception e) {
			e.printStackTrace();
			return null;
		}
	}

	/**
	 * Writes every element of {@code list} to {@code path}, one per line,
	 * terminated by CRLF. An existing file is overwritten. Failures are
	 * printed to standard error.
	 *
	 * @param list lines to write; every element must be a {@code String}
	 * @param path destination file
	 */
	public static void writer_list(List<?> list, String path){
		try (BufferedWriter writer = new BufferedWriter(new FileWriter(path))) {
			for (Object item : list) {
				writer.write((String) item + "\r\n");
			}
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Reads {@code path} line by line and computes the hash bucket of each
	 * line's URL host. When {@code WRITE_BUCKETS} is enabled the line is
	 * appended to its bucket file; otherwise the pass only validates the URLs.
	 *
	 * <p>After every 100th line read, the line that follows is printed to
	 * standard output if it contains a character accepted by
	 * {@link #isCnorEn(char)}.
	 *
	 * <p>Lines without a URL field or with a malformed URL are reported on
	 * standard error and skipped.
	 *
	 * @param path input file; its encoding is guessed with {@link #judgeFileCode(String)}
	 */
	public static void classify_url(String path){
		String filecode = judgeFileCode(path);
		if (filecode == null) {
			// Without a charset name the reader cannot be created.
			System.err.println("classify_url: cannot determine encoding of " + path);
			return;
		}
		try (FileInputStream in = new FileInputStream(path);
				BufferedReader reader = new BufferedReader(new InputStreamReader(in, filecode))) {
			int line_num = 0;
			String line = reader.readLine();
			while (line != null) {
				String[] fields = line.split("\t");
				if (fields.length < 2) {
					System.err.println("classify_url: skipping line without URL field: " + line);
				} else {
					try {
						String host = new URL(fields[1]).getHost();
						int type = hash(host.toCharArray());
						if (WRITE_BUCKETS) {
							writer(String.valueOf(type), line);
						}
					} catch (java.net.MalformedURLException e) {
						// One bad URL must not abort the whole pass.
						e.printStackTrace();
					}
				}
				line = reader.readLine();
				line_num++;
				// line is null here when the file ends exactly on a multiple of 100.
				if (line != null && line_num % 100 == 0 && containsCnOrEn(line)) {
					System.out.println(line);
				}
			}
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/** Returns true if any character of {@code text} satisfies {@link #isCnorEn(char)}. */
	private static boolean containsCnOrEn(String text) {
		for (int i = 0; i < text.length(); i++) {
			if (isCnorEn(text.charAt(i))) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Tests whether a character lies in U+0000..U+00FF or U+0391..U+FFE5.
	 * The original comments label the first range "English" and the second
	 * "Chinese", but both ranges are far wider than those scripts.
	 *
	 * @param c character to test
	 * @return {@code true} if {@code c} falls in either range
	 */
	static boolean isCnorEn(char c) {
		return (c >= 0x0391 && c <= 0xFFE5) || c <= 0x00FF;
	}

	/**
	 * Maps a character sequence to a bucket id in [0, HASHLEN).
	 *
	 * @param word characters to hash, e.g. a host name
	 * @return the bucket id; 0 for an empty array
	 */
	public static int hash(char[] word) {
		int index = 0;
		for (char ch : word) {
			// NOTE(review): "+=" makes each step index * 32 + ch, not the usual
			// index * 31 + ch. Left as is because the bucket ids of files that
			// were already written depend on this exact formula.
			index += index * 31 + ch;
		}
		// The remainder lies in (-HASHLEN, HASHLEN), so Math.abs cannot overflow.
		return Math.abs(index % HASHLEN);
	}

	/**
	 * Appends {@code line} plus CRLF to bucket file {@code <filename>.txt},
	 * creating the file if needed. Output is always encoded as GBK. A
	 * {@code null} line only ensures that the file exists. Failures are
	 * printed to standard error.
	 *
	 * @param filename bucket file name without directory or extension
	 * @param line     text to append, may be {@code null}
	 */
	public static void writer(String filename, String line){
		String path = file_dir + filename + ".txt";
		// Opening in append mode creates the file when it does not exist yet.
		try (FileOutputStream out = new FileOutputStream(path, true);
				OutputStreamWriter writer = new OutputStreamWriter(out, "GBK")) {
			if (line != null) {
				writer.write(line + "\r\n");
			}
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Guesses a file's encoding from its first three bytes.
	 *
	 * @param path file to inspect
	 * @return "UTF-8" if the file starts with the UTF-8 byte order mark
	 *         (EF BB BF), "GBK" otherwise, or {@code null} if the file could
	 *         not be read (the exception is printed to standard error)
	 */
	public static String judgeFileCode(String path){
		byte[] head = new byte[3];
		int filled = 0;
		try (InputStream in = new FileInputStream(path)) {
			// A single read() may return fewer bytes than requested.
			while (filled < head.length) {
				int n = in.read(head, filled, head.length - filled);
				if (n < 0) {
					break;
				}
				filled += n;
			}
		} catch (Exception e) {
			e.printStackTrace();
			return null;
		}
		boolean hasUtf8Bom = filled == head.length
				&& (head[0] & 0xFF) == 0xEF
				&& (head[1] & 0xFF) == 0xBB
				&& (head[2] & 0xFF) == 0xBF;
		return hasUtf8Bom ? "UTF-8" : "GBK";
	}

	/**
	 * Guesses a file's encoding from its first two bytes.
	 *
	 * <p>Returned labels: "UTF-8" for EF BB, "Unicode" for FF FE, "UTF-16BE"
	 * for FE FF, "ANSI|ASCII" for 5C 75 (the characters {@code \} and
	 * {@code u}), and "GBK" for anything else, including files shorter than
	 * two bytes. "ANSI|ASCII" is a descriptive label and should not be passed
	 * on as a charset name.
	 *
	 * @param fileName file to inspect
	 * @return the encoding label
	 * @throws Exception if the file cannot be opened or read
	 */
	public static String codeString(String fileName) throws Exception{
		int first;
		int second;
		try (BufferedInputStream bin = new BufferedInputStream(new FileInputStream(fileName))) {
			first = bin.read();
			second = bin.read();
		}
		if (first < 0 || second < 0) {
			// Fewer than two bytes: no signature to look at.
			return "GBK";
		}
		switch ((first << 8) + second) {
			case 0xefbb:
				return "UTF-8";
			case 0xfffe:
				return "Unicode";
			case 0xfeff:
				return "UTF-16BE";
			case 0x5c75:
				return "ANSI|ASCII";
			default:
				return "GBK";
		}
	}

}

posted on 2015-01-12 15:46  长456风  阅读(549)  评论(0)  编辑  收藏  举报

导航