类HashMap，但内存减少到原1/6的实现

时间:2009-10-28 11:08:33来源:网络作者:未知点击:429次

java中的Map在提供方便实用的同时，也存在内存浪费巨大的问题。当Map中的Entry数量达到1000万条以上的时候，需要数G的内存空间。这里提到的Map使用形式为HashMap<String,Byte>，平均每个key在20个字符左右，最多不超过200字符。

java中的Map在提供方便实用的同时，也存在内存浪费巨大的问题。当Map中的Entry数量达到1000万条以上的时候，需要数G的内存空间 .这里提到的Map使用形式为HashMap<String,Byte>,平均每个key在20个字符左右，最多不超过200字符.

在实际情况下，有差不多5/6的内存浪费在与实际数据无关的地方。在一些一次写入、多次读取的场景中，完全没有必要浪费这么多的资源，下面就通过一个简单的实现来说明。

算法说明:

按照key的长度将所有的entry放进不同的队列中。为了方便，这里假定entry在加入队列之前已经按key排好序；当然也可以全部加载进内存后再排序。

这样就得到了很多个队列。查询时先根据查询词的长度选择对应的队列，再在该队列中用二分法查找。

算法适用性:

适用于key值集中在一定范围，value为简单类型(byte、int、long、float、double等)，数据量在百万条以上，内存匮乏的情况.

view plaincopy to clipboardprint?
·········10········20········30········40········50········60········70········80········90········100·······110·······120·······130·······140·······150
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
/**
 * A compact, write-once/read-many lookup table from fixed-length byte keys to a
 * single {@code byte} value.
 *
 * <p>All entries live in one flat {@code byte[]}: each entry is {@code keyLength}
 * key bytes followed by one value byte. Entries must be added in ascending key
 * order (bytes compared as signed values, left to right) because {@link #get}
 * uses binary search.
 */
public class BSortMap{

    /** One map per key length, indexed by the key length in bytes; filled by {@link #main}. */
    private static Map<Integer,BSortMap> bulk = new HashMap<Integer, BSortMap>();

    final private byte[] entrys; // packed entries: key bytes followed by one value byte
    private int count = 0; // number of entries added so far
    final private int keyLength; // size of every key in bytes
    final private int entryLength; // size of one entry in bytes (key + value)

    /**
     * Creates an empty map with room for {@code capacity} entries.
     *
     * @param capacity maximum number of entries that will be added
     * @param keyLen length in bytes of every key
     * @throws IllegalArgumentException if an argument is negative or the backing
     *         array would exceed the maximum array size
     */
    public BSortMap(int capacity, int keyLen){
        if (capacity < 0 || keyLen < 0) {
            throw new IllegalArgumentException("capacity=" + capacity + ", keyLen=" + keyLen);
        }
        // Computed in long so an oversized request is rejected instead of wrapping around.
        final long totalBytes = (long) capacity * (keyLen + 1L);
        if (totalBytes > Integer.MAX_VALUE) {
            throw new IllegalArgumentException(
                    "table too large: capacity=" + capacity + ", keyLen=" + keyLen);
        }
        this.keyLength = keyLen;
        entryLength = keyLen + 1;
        entrys = new byte[(int) totalBytes];
    }

    /**
     * @return the number of entries added so far
     */
    public int size(){
        return count;
    }

    /**
     * Appends one entry. Entries must arrive already sorted by key.
     *
     * @param src the entry as {@code <key,value>}: {@code keyLength} key bytes
     *        followed by one value byte; only the first {@code keyLength + 1}
     *        bytes are copied
     * @throws IndexOutOfBoundsException if {@code src} is too short or the map is full
     */
    final public void add(byte src[]){
        System.arraycopy(src, 0, entrys, count * entryLength, entryLength);
        count++;
    }

    /**
     * Compares the stored key starting at offset {@code begin} with {@code b}.
     * The caller guarantees {@code b.length == keyLength}.
     *
     * @return negative, zero or positive when the stored key is less than,
     *         equal to or greater than {@code b} (signed byte order)
     */
    final private int compare(final int begin,final byte[] b){
        for (int i = 0; i < keyLength; i++) {
            final int diff = entrys[begin + i] - b[i];
            if (diff != 0) {
                return diff;
            }
        }
        return 0;
    }

    /**
     * Looks up the value associated with {@code key}.
     *
     * @param key the key bytes
     * @return the stored value, or -1 if the key is absent (a key whose length
     *         differs from this map's key length is always absent); a stored
     *         value of -1 cannot be told apart from "absent"
     */
    final public byte get(final byte[] key){
        // A key of another length cannot be stored here; comparing it anyway would
        // match on a prefix or read past the key.
        if (key.length != keyLength) {
            return -1;
        }
        int low = 0;
        int high = count - 1;
        while (low <= high) {
            final int mid = (low + high) >>> 1; // unsigned shift: safe even if low + high overflows
            final int ret = compare(mid * entryLength, key);
            if (ret == 0) {
                return entrys[mid * entryLength + keyLength]; // value byte follows the key
            } else if (ret < 0) {
                low = mid + 1;
            } else {
                high = mid - 1;
            }
        }
        return -1;
    }

    /**
     * Fills {@code buffer} completely from {@code in}.
     *
     * @return {@code true} if the buffer was filled, {@code false} if the stream
     *         ended first (a partially read entry is reported on stderr)
     */
    private static boolean readEntry(DataInputStream in, byte[] buffer) throws IOException {
        int filled = 0;
        while (filled < buffer.length) {
            final int n = in.read(buffer, filled, buffer.length - filled);
            if (n < 0) {
                if (filled > 0) {
                    System.err.println("not equal."+filled);
                }
                return false;
            }
            filled += n;
        }
        return true;
    }

    /**
     * Loads one pre-sorted data file into {@link #bulk}.
     *
     * <p>File layout: an int with the total number of entries, an int with the
     * key length in bytes, then the {@code <key,value>} entries sorted by key.
     */
    private static void loadFile(File f) throws IOException {
        DataInputStream in = new DataInputStream(new BufferedInputStream(new FileInputStream(f)));
        try {
            final int entrysCount = in.readInt();
            final int keyLength = in.readInt();
            if (entrysCount < 0 || keyLength < 0) {
                throw new IOException(f.getName() + ": corrupt header " + entrysCount + "," + keyLength);
            }
            byte[] buffer = new byte[keyLength + 1];
            BSortMap bst = new BSortMap(entrysCount, keyLength);
            int i = 0;
            // Read at most the declared number of entries so a bad file cannot overrun the table.
            while (i < entrysCount && readEntry(in, buffer)) {
                bst.add(buffer);
                i++;
            }
            bulk.put(keyLength, bst);
            if (entrysCount != i) {
                System.err.println(f.getName()+":"+entrysCount+","+i);
            } else if (in.read() != -1) {
                System.err.println(f.getName() + ": trailing data after " + entrysCount + " entries");
            }
        } finally {
            in.close();
        }
    }

    /**
     * Benchmark driver: loads every data file from the sort directory, then looks
     * up each line of the keyword file and prints the elapsed milliseconds, the
     * number of hits and the total number of loaded entries.
     */
    public static void main(String args[]) throws IOException{
        File dir = new File("D:/workspace/partion_keyword/sort");
        File[] files = dir.listFiles();
        if (files == null) {
            throw new IOException("cannot list directory: " + dir);
        }

        for (File f : files) {
            if (f.isFile()) {
                loadFile(f);
            }
        }

        BufferedReader read = new BufferedReader(new FileReader("D:/eclipse/workspace/conf/wiki_kws.data"));
        try {
            String line;
            int count = 0;
            long start = System.currentTimeMillis();
            while ((line = read.readLine()) != null) {
                // NOTE(review): uses the platform default charset; it must match the
                // encoding the data files were built with.
                byte key[] = line.trim().getBytes();
                BSortMap bt = bulk.get(key.length);
                if (bt != null && bt.get(key) != -1) {
                    count++;
                }
            }
            long end = System.currentTimeMillis();
            System.out.println(end - start);
            System.out.println("count:"+count);

            int totalcount = 0;
            for (BSortMap s : bulk.values()) {
                totalcount += s.size();
            }
            System.out.println("total count:"+totalcount);
        } finally {
            read.close();
        }
    }
}
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
/**
 * A compact, write-once/read-many lookup table from fixed-length byte keys to a
 * single {@code byte} value.
 *
 * <p>All entries live in one flat {@code byte[]}: each entry is {@code keyLength}
 * key bytes followed by one value byte. Entries must be added in ascending key
 * order (bytes compared as signed values, left to right) because {@link #get}
 * uses binary search.
 */
public class BSortMap{

    /** One map per key length, indexed by the key length in bytes; filled by {@link #main}. */
    private static Map<Integer,BSortMap> bulk = new HashMap<Integer, BSortMap>();

    final private byte[] entrys; // packed entries: key bytes followed by one value byte
    private int count = 0; // number of entries added so far
    final private int keyLength; // size of every key in bytes
    final private int entryLength; // size of one entry in bytes (key + value)

    /**
     * Creates an empty map with room for {@code capacity} entries.
     *
     * @param capacity maximum number of entries that will be added
     * @param keyLen length in bytes of every key
     * @throws IllegalArgumentException if an argument is negative or the backing
     *         array would exceed the maximum array size
     */
    public BSortMap(int capacity, int keyLen){
        if (capacity < 0 || keyLen < 0) {
            throw new IllegalArgumentException("capacity=" + capacity + ", keyLen=" + keyLen);
        }
        // Computed in long so an oversized request is rejected instead of wrapping around.
        final long totalBytes = (long) capacity * (keyLen + 1L);
        if (totalBytes > Integer.MAX_VALUE) {
            throw new IllegalArgumentException(
                    "table too large: capacity=" + capacity + ", keyLen=" + keyLen);
        }
        this.keyLength = keyLen;
        entryLength = keyLen + 1;
        entrys = new byte[(int) totalBytes];
    }

    /**
     * @return the number of entries added so far
     */
    public int size(){
        return count;
    }

    /**
     * Appends one entry. Entries must arrive already sorted by key.
     *
     * @param src the entry as {@code <key,value>}: {@code keyLength} key bytes
     *        followed by one value byte; only the first {@code keyLength + 1}
     *        bytes are copied
     * @throws IndexOutOfBoundsException if {@code src} is too short or the map is full
     */
    final public void add(byte src[]){
        System.arraycopy(src, 0, entrys, count * entryLength, entryLength);
        count++;
    }

    /**
     * Compares the stored key starting at offset {@code begin} with {@code b}.
     * The caller guarantees {@code b.length == keyLength}.
     *
     * @return negative, zero or positive when the stored key is less than,
     *         equal to or greater than {@code b} (signed byte order)
     */
    final private int compare(final int begin,final byte[] b){
        for (int i = 0; i < keyLength; i++) {
            final int diff = entrys[begin + i] - b[i];
            if (diff != 0) {
                return diff;
            }
        }
        return 0;
    }

    /**
     * Looks up the value associated with {@code key}.
     *
     * @param key the key bytes
     * @return the stored value, or -1 if the key is absent (a key whose length
     *         differs from this map's key length is always absent); a stored
     *         value of -1 cannot be told apart from "absent"
     */
    final public byte get(final byte[] key){
        // A key of another length cannot be stored here; comparing it anyway would
        // match on a prefix or read past the key.
        if (key.length != keyLength) {
            return -1;
        }
        int low = 0;
        int high = count - 1;
        while (low <= high) {
            final int mid = (low + high) >>> 1; // unsigned shift: safe even if low + high overflows
            final int ret = compare(mid * entryLength, key);
            if (ret == 0) {
                return entrys[mid * entryLength + keyLength]; // value byte follows the key
            } else if (ret < 0) {
                low = mid + 1;
            } else {
                high = mid - 1;
            }
        }
        return -1;
    }

    /**
     * Fills {@code buffer} completely from {@code in}.
     *
     * @return {@code true} if the buffer was filled, {@code false} if the stream
     *         ended first (a partially read entry is reported on stderr)
     */
    private static boolean readEntry(DataInputStream in, byte[] buffer) throws IOException {
        int filled = 0;
        while (filled < buffer.length) {
            final int n = in.read(buffer, filled, buffer.length - filled);
            if (n < 0) {
                if (filled > 0) {
                    System.err.println("not equal."+filled);
                }
                return false;
            }
            filled += n;
        }
        return true;
    }

    /**
     * Loads one pre-sorted data file into {@link #bulk}.
     *
     * <p>File layout: an int with the total number of entries, an int with the
     * key length in bytes, then the {@code <key,value>} entries sorted by key.
     */
    private static void loadFile(File f) throws IOException {
        DataInputStream in = new DataInputStream(new BufferedInputStream(new FileInputStream(f)));
        try {
            final int entrysCount = in.readInt();
            final int keyLength = in.readInt();
            if (entrysCount < 0 || keyLength < 0) {
                throw new IOException(f.getName() + ": corrupt header " + entrysCount + "," + keyLength);
            }
            byte[] buffer = new byte[keyLength + 1];
            BSortMap bst = new BSortMap(entrysCount, keyLength);
            int i = 0;
            // Read at most the declared number of entries so a bad file cannot overrun the table.
            while (i < entrysCount && readEntry(in, buffer)) {
                bst.add(buffer);
                i++;
            }
            bulk.put(keyLength, bst);
            if (entrysCount != i) {
                System.err.println(f.getName()+":"+entrysCount+","+i);
            } else if (in.read() != -1) {
                System.err.println(f.getName() + ": trailing data after " + entrysCount + " entries");
            }
        } finally {
            in.close();
        }
    }

    /**
     * Benchmark driver: loads every data file from the sort directory, then looks
     * up each line of the keyword file and prints the elapsed milliseconds, the
     * number of hits and the total number of loaded entries.
     */
    public static void main(String args[]) throws IOException{
        File dir = new File("D:/workspace/partion_keyword/sort");
        File[] files = dir.listFiles();
        if (files == null) {
            throw new IOException("cannot list directory: " + dir);
        }

        for (File f : files) {
            if (f.isFile()) {
                loadFile(f);
            }
        }

        BufferedReader read = new BufferedReader(new FileReader("D:/eclipse/workspace/conf/wiki_kws.data"));
        try {
            String line;
            int count = 0;
            long start = System.currentTimeMillis();
            while ((line = read.readLine()) != null) {
                // NOTE(review): uses the platform default charset; it must match the
                // encoding the data files were built with.
                byte key[] = line.trim().getBytes();
                BSortMap bt = bulk.get(key.length);
                if (bt != null && bt.get(key) != -1) {
                    count++;
                }
            }
            long end = System.currentTimeMillis();
            System.out.println(end - start);
            System.out.println("count:"+count);

            int totalcount = 0;
            for (BSortMap s : bulk.values()) {
                totalcount += s.size();
            }
            System.out.println("total count:"+totalcount);
        } finally {
            read.close();
        }
    }
}
测试结果:

类HashMap，但内存减少到原1/6的实现时间:2009-10-28 11:08:33来源:网络作者:未知点击:430次 java中的Map在提供方便实用的同时，也存在内存浪费巨大的问题。当Map中的Entry数量达到1000万条以上的时候，需要数G的内存空间 .这里提到的Map使用形式为HashMap<String,Byte>,平均每个key在20个字符左右，最多不超过200
测试数据为7138595条Entry，分布在250个队列中.

HashMap<String,Byte>的结果：

时间: 3780、3814

内存: 892M

此方法的结果：
时间:5792、5758

内存: 158M

本篇文章来源于：开发学院 http://edu.codepub.com 原文链接：http://edu.codepub.com/2009/1028/16973_2.php

posted @ 2011-05-22 19:52 babykick 阅读(1284) 评论(0) 编辑收藏举报

刷新页面返回顶部

类HashMap，但内存减少到原1/6的实现

类HashMap，但内存减少到原1/6的实现

公告