针对mongodb亿级别或者十亿级别的模糊查询,效率不高,解决方式是使用Es查询,这样就需要把数据导入的ES中

完整的代码实现如下所示:(仅供参考)

import java.io.IOException;
import java.net.UnknownHostException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.codec.binary.Base64;
import org.apache.http.HttpHost;
import org.bson.types.ObjectId;
import org.elasticsearch.action.bulk.BulkItemResponse;
import org.elasticsearch.action.bulk.BulkRequest;
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.client.RestClient;
import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.common.xcontent.XContentType;

import com.mongodb.BasicDBObject;
import com.mongodb.DB;
import com.mongodb.DBCollection;
import com.mongodb.DBCursor;
import com.mongodb.DBObject;
import com.mongodb.MongoClient;
import com.mongodb.MongoException;


public class Test {

    public static void main(String[] args) throws IOException {
        int pageSize=10000;

        try {
            MongoClient mongo = new MongoClient("localhost", 27017);

            /**** Get database ****/
            // if database doesn't exists, MongoDB will create it for you
            DB db = mongo.getDB("www");

            /**** Get collection / table from 'testdb' ****/
            // if collection doesn't exists, MongoDB will create it for you
            DBCollection table = db.getCollection("person");
            
             RestHighLevelClient client = new RestHighLevelClient(
                        RestClient.builder(
                                new HttpHost("localhost", 9200, "http")));
            DBCursor dbObjects;            
            Long cnt=table.count();
            System.out.println(table.getStats().toString());
            Long page=getPageSize(cnt,pageSize);
            ObjectId lastIdObject=null;    
            Long start=System.currentTimeMillis();
            long ss=start;
            for(Long i=0L;i<page;i++) {
                start=System.currentTimeMillis();
                dbObjects=getCursorForCollection(table, lastIdObject, pageSize);
                System.out.println("第"+(i+1)+"次查询,耗时:"+(System.currentTimeMillis()-start)+" 毫秒");
                List<DBObject> objs=dbObjects.toArray();
                start=System.currentTimeMillis();
                batchInsertToEsSync(client,objs,"person","doc");
                lastIdObject=(ObjectId) objs.get(objs.size()-1).get("_id");
                System.out.println("第"+(i+1)+"次插入,耗时:"+(System.currentTimeMillis()-start)+" 毫秒");                
            }            
            System.out.println("耗时:"+(System.currentTimeMillis()-ss)/1000+"秒");    
        } catch (UnknownHostException e) {
            e.printStackTrace();
        } catch (MongoException e) {
            e.printStackTrace();
        }

    
    }
    
    public static void batchInsertToEsSync(RestHighLevelClient client,List<DBObject> objs,String tableName,String type) throws IOException {
        BulkRequest bulkRequest=new BulkRequest();
        for(DBObject obj:objs) {
            IndexRequest req = new IndexRequest(tableName, type);            
            Map<String,Object> map=new HashMap<>();
            for(String key:obj.keySet()) {
                if("_id".equalsIgnoreCase(key)) {
                    map.put("id", obj.get(key));
                }else {
                    String valStr="";
                    Object val=obj.get(key);
                    if(val!=null) {
                        valStr=Base64.encodeBase64String(val.toString().getBytes());
                    }
                    map.put(key, valStr);
                }
            }
            req.id(map.get("id").toString());
            req.source(map, XContentType.JSON);
            bulkRequest.add(req);
        }   
        BulkResponse bulkResponse=client.bulk(bulkRequest);
        for (BulkItemResponse bulkItemResponse : bulkResponse) {
            if (bulkItemResponse.isFailed()) { 
                System.out.println(bulkItemResponse.getId()+","+bulkItemResponse.getFailureMessage());
            }
        }
    }
    
    public static DBCursor getCursorForCollection(DBCollection collection,ObjectId lastIdObject,int pageSize) {
        DBCursor dbObjects=null;
        if(lastIdObject==null) {
            lastIdObject=(ObjectId) collection.findOne().get("_id");
        }
        BasicDBObject query=new BasicDBObject();
        query.append("_id",new BasicDBObject("$gt",lastIdObject));
        BasicDBObject sort=new BasicDBObject();
        sort.append("_id",1);
        dbObjects=collection.find(query).limit(pageSize).sort(sort);
        return dbObjects;
    }
    
    public static Long getPageSize(Long cnt,int pageSize) {
        return cnt%pageSize==0?cnt/pageSize:cnt/pageSize+1;
    }

 

posted on 2018-11-05 15:34  一天不进步,就是退步  阅读(536)  评论(1编辑  收藏  举报