mongodb复杂查询示例

以oracle的测试schema为例,emp和dept表,原始数据如下

SQL> select * from emp;

     EMPNO ENAME      JOB              MGR HIREDATE              SAL       COMM     DEPTNO
---------- ---------- --------- ---------- -------------- ---------- ---------- ----------
      7369 SMITH      CLERK           7902 17-12月-80            800                    20
      7499 ALLEN      SALESMAN        7698 20-2月 -81           1600        300         30
      7521 WARD       SALESMAN        7698 22-2月 -81           1250        500         30
      7566 JONES      MANAGER         7839 02-4月 -81           2975                    20
      7654 MARTIN     SALESMAN        7698 28-9月 -81           1250       1400         30
      7698 BLAKE      MANAGER         7839 01-5月 -81           2850                    30
      7782 CLARK      MANAGER         7839 09-6月 -81           2450                    10
      7839 KING       PRESIDENT            17-11月-81           5000                    10
      7844 TURNER     SALESMAN        7698 08-9月 -81           1500          0         30
      7900 JAMES      CLERK           7698 03-12月-81            950                    30
      7902 FORD       ANALYST         7566 03-12月-81           3000                    20
      7934 MILLER     CLERK           7782 23-1月 -82           1300                    10

已选择12行。

SQL> select * from dept;

    DEPTNO DNAME          LOC
---------- -------------- -------------
        10 ACCOUNTING     NEW YORK
        20 RESEARCH       DALLAS
        30 SALES          CHICAGO
        40 OPERATIONS     BOSTON

完成如下查询,

a 列出每个部门的名称及其人数 
b 列出工资最高的头三名员工姓名及其工资 
c 列出工资比上司高的员工姓名 
d 列出平均工资高于公司总平均工资的部门名称

首先,需要建立mongodb集合,一开始,我采用完全模拟emp和dept表的方式,发现很多情况下都需要join,而mongodb并不支持join,做起来比较迂回。于是决定,把所有数据都放在同一个集合(emp_dept)内,新加元素TABLE用来表示是来源于哪个table的数据。以下是初始化emp_dept集合的ruby代码

#!/usr/bin/env ruby
# 20140307, init_emp_dept.rb
###
###


require "rubygems"
require "mongo"

mongoconn = Mongo::MongoClient.new("localhost", 27017)
db = mongoconn.db("test")

# 7369 SMITH      CLERK           7902 17-12月-80            800                    20
# 7499 ALLEN      SALESMAN        7698 20-2月 -81           1600        300         30
# 7521 WARD       SALESMAN        7698 22-2月 -81           1250        500         30
# 7566 JONES      MANAGER         7839 02-4月 -81           2975                    20
# 7654 MARTIN     SALESMAN        7698 28-9月 -81           1250       1400         30
# 7698 BLAKE      MANAGER         7839 01-5月 -81           2850                    30
# 7782 CLARK      MANAGER         7839 09-6月 -81           2450                    10
# 7839 KING       PRESIDENT            17-11月-81           5000                    10
# 7844 TURNER     SALESMAN        7698 08-9月 -81           1500          0         30
# 7900 JAMES      CLERK           7698 03-12月-81            950                    30
# 7902 FORD       ANALYST         7566 03-12月-81           3000                    20
# 7934 MILLER     CLERK           7782 23-1月 -82           1300                    10

coll = db.collection("emp_dept")
coll.insert({ TABLE:"EMP", EMPNO:7369, ENAME:"SMITH", JOB:"CLERK", MGR:7902, HIREDATE:"19801217", SAL:800, DEPTNO:20 })
coll.insert({ TABLE:"EMP", EMPNO:7499, ENAME:"ALLEN", JOB:"SALESMAN", MGR:7698, HIREDATE:"19810220", SAL:1600, COMM:300, DEPTNO:30 })
coll.insert({ TABLE:"EMP", EMPNO:7521, ENAME:"WARD", JOB:"SALESMAN", MGR:7698, HIREDATE:"19810222", SAL:1250, COMM:500, DEPTNO:30 })
coll.insert({ TABLE:"EMP", EMPNO:7566, ENAME:"JONES", JOB:"MANAGER", MGR:7839, HIREDATE:"19810402", SAL:2975, DEPTNO:20 })
coll.insert({ TABLE:"EMP", EMPNO:7654, ENAME:"MARTIN", JOB:"SALESMAN", MGR:7698, HIREDATE:"19810928", SAL:1250, COMM:1400, DEPTNO:30 })
coll.insert({ TABLE:"EMP", EMPNO:7698, ENAME:"BLAKE", JOB:"MANAGER", MGR:7839, HIREDATE:"19810501", SAL:2850, DEPTNO:30 })
coll.insert({ TABLE:"EMP", EMPNO:7782, ENAME:"CLARK", JOB:"MANAGER", MGR:7839, HIREDATE:"19810609", SAL:2450, DEPTNO:10 })
coll.insert({ TABLE:"EMP", EMPNO:7839, ENAME:"KING", JOB:"PRESIDENT", HIREDATE:"19811117", SAL:5000, DEPTNO:10 })
coll.insert({ TABLE:"EMP", EMPNO:7844, ENAME:"TURNER", JOB:"SALESMAN", MGR:7698, HIREDATE:"19810908", SAL:1500, COMM:0, DEPTNO:30 })
coll.insert({ TABLE:"EMP", EMPNO:7900, ENAME:"JAMES", JOB:"CLERK", MGR:7698, HIREDATE:"19811203", SAL:950, DEPTNO:30 })
coll.insert({ TABLE:"EMP", EMPNO:7902, ENAME:"FORD", JOB:"ANALYST", MGR:7566, HIREDATE:"19811203", SAL:3000, DEPTNO:20 })
coll.insert({ TABLE:"EMP", EMPNO:7934, ENAME:"MILLER", JOB:"CLERK", MGR:7782, HIREDATE:"19820123", SAL:1300, DEPTNO:10 })

# coll.find.each { |doc| puts doc.inspect }

# 10 ACCOUNTING     NEW YORK
# 20 RESEARCH       DALLAS
# 30 SALES          CHICAGO
# 40 OPERATIONS     BOSTON

coll.insert({ TABLE:"DEPT", DEPTNO:10, DNAME:"ACCOUNTING", LOC:"NEW YORK"})
coll.insert({ TABLE:"DEPT", DEPTNO:20, DNAME:"RESEARCH", LOC:"DALLAS"})
coll.insert({ TABLE:"DEPT", DEPTNO:30, DNAME:"SALES", LOC:"CHICAGO"})
coll.insert({ TABLE:"DEPT", DEPTNO:40, DNAME:"OPERATIONS", LOC:"BOSTON"})


coll.find.each { |doc| puts doc.inspect }

# db.drop_collection("emp_dept")

贴一下输出,便于看清楚集合的结构

{"_id"=>BSON::ObjectId('531c2a59cf7c7c45af000001'), "TABLE"=>"EMP", "EMPNO"=>7369, "ENAME"=>"SMITH", "JOB"=>"CLERK", "MGR"=>7902, "HIREDATE"=>"19801217", "SAL"=>800, "DEPTNO"=>20}
{"_id"=>BSON::ObjectId('531c2a59cf7c7c45af000002'), "TABLE"=>"EMP", "EMPNO"=>7499, "ENAME"=>"ALLEN", "JOB"=>"SALESMAN", "MGR"=>7698, "HIREDATE"=>"19810220", "SAL"=>1600, "COMM"=>300, "DEPTNO"=>30}
{"_id"=>BSON::ObjectId('531c2a59cf7c7c45af000003'), "TABLE"=>"EMP", "EMPNO"=>7521, "ENAME"=>"WARD", "JOB"=>"SALESMAN", "MGR"=>7698, "HIREDATE"=>"19810222", "SAL"=>1250, "COMM"=>500, "DEPTNO"=>30}
{"_id"=>BSON::ObjectId('531c2a59cf7c7c45af000004'), "TABLE"=>"EMP", "EMPNO"=>7566, "ENAME"=>"JONES", "JOB"=>"MANAGER", "MGR"=>7839, "HIREDATE"=>"19810402", "SAL"=>2975, "DEPTNO"=>20}
{"_id"=>BSON::ObjectId('531c2a59cf7c7c45af000005'), "TABLE"=>"EMP", "EMPNO"=>7654, "ENAME"=>"MARTIN", "JOB"=>"SALESMAN", "MGR"=>7698, "HIREDATE"=>"19810928", "SAL"=>1250, "COMM"=>1400, "DEPTNO"=>30}
{"_id"=>BSON::ObjectId('531c2a59cf7c7c45af000006'), "TABLE"=>"EMP", "EMPNO"=>7698, "ENAME"=>"BLAKE", "JOB"=>"MANAGER", "MGR"=>7839, "HIREDATE"=>"19810501", "SAL"=>2850, "DEPTNO"=>30}
{"_id"=>BSON::ObjectId('531c2a59cf7c7c45af000007'), "TABLE"=>"EMP", "EMPNO"=>7782, "ENAME"=>"CLARK", "JOB"=>"MANAGER", "MGR"=>7839, "HIREDATE"=>"19810609", "SAL"=>2450, "DEPTNO"=>10}
{"_id"=>BSON::ObjectId('531c2a59cf7c7c45af000008'), "TABLE"=>"EMP", "EMPNO"=>7839, "ENAME"=>"KING", "JOB"=>"PRESIDENT", "HIREDATE"=>"19811117", "SAL"=>5000, "DEPTNO"=>10}
{"_id"=>BSON::ObjectId('531c2a59cf7c7c45af000009'), "TABLE"=>"EMP", "EMPNO"=>7844, "ENAME"=>"TURNER", "JOB"=>"SALESMAN", "MGR"=>7698, "HIREDATE"=>"19810908", "SAL"=>1500, "COMM"=>0, "DEPTNO"=>30}
{"_id"=>BSON::ObjectId('531c2a59cf7c7c45af00000a'), "TABLE"=>"EMP", "EMPNO"=>7900, "ENAME"=>"JAMES", "JOB"=>"CLERK", "MGR"=>7698, "HIREDATE"=>"19811203", "SAL"=>950, "DEPTNO"=>30}
{"_id"=>BSON::ObjectId('531c2a59cf7c7c45af00000b'), "TABLE"=>"EMP", "EMPNO"=>7902, "ENAME"=>"FORD", "JOB"=>"ANALYST", "MGR"=>7566, "HIREDATE"=>"19811203", "SAL"=>3000, "DEPTNO"=>20}
{"_id"=>BSON::ObjectId('531c2a59cf7c7c45af00000c'), "TABLE"=>"EMP", "EMPNO"=>7934, "ENAME"=>"MILLER", "JOB"=>"CLERK", "MGR"=>7782, "HIREDATE"=>"19820123", "SAL"=>1300, "DEPTNO"=>10}
{"_id"=>BSON::ObjectId('531c2a59cf7c7c45af00000d'), "TABLE"=>"DEPT", "DEPTNO"=>10, "DNAME"=>"ACCOUNTING", "LOC"=>"NEW YORK"}
{"_id"=>BSON::ObjectId('531c2a59cf7c7c45af00000e'), "TABLE"=>"DEPT", "DEPTNO"=>20, "DNAME"=>"RESEARCH", "LOC"=>"DALLAS"}
{"_id"=>BSON::ObjectId('531c2a59cf7c7c45af00000f'), "TABLE"=>"DEPT", "DEPTNO"=>30, "DNAME"=>"SALES", "LOC"=>"CHICAGO"}
{"_id"=>BSON::ObjectId('531c2a59cf7c7c45af000010'), "TABLE"=>"DEPT", "DEPTNO"=>40, "DNAME"=>"OPERATIONS", "LOC"=>"BOSTON"}

对于问题a,mongodb的查询如下

/* 20140309
 * a 列出每个部门的名称及其人数
 */
db.emp_dept.group({key: {"DEPTNO":true},
  initial: {"DNAME":"", "csum":0},
  reduce: function(obj, prev){
      if (obj.TABLE == "DEPT"){
          prev.DNAME = prev.DNAME + obj.DNAME
      } else if (obj.TABLE == "EMP"){
          prev.csum ++;
      }
}})

输出如下

[
    {
        "DEPTNO" : 20,
        "DNAME" : "RESEARCH",
        "csum" : 3
    },
    {
        "DEPTNO" : 30,
        "DNAME" : "SALES",
        "csum" : 6
    },
    {
        "DEPTNO" : 10,
        "DNAME" : "ACCOUNTING",
        "csum" : 3
    },
    {
        "DEPTNO" : 40,
        "DNAME" : "OPERATIONS",
        "csum" : 0
    }
]

问题b,比较简单,代码如下

/* 20140309
 * b 列出工资最高的头三名员工姓名及其工资
 */
db.emp_dept.find({"TABLE":"EMP"},{"ENAME":1,"SAL":1,"_id":0}).sort({"SAL":-1}).limit(3)

输出如下

{ "ENAME" : "KING", "SAL" : 5000 }
{ "ENAME" : "FORD", "SAL" : 3000 }
{ "ENAME" : "JONES", "SAL" : 2975 }

问题c,简单一点的解法可以这么做

/* 20140309
 * c 列出工资比上司高的员工姓名
 */
// solution 1
db.emp_dept.find({"$where": function(){
    if (this["TABLE"] == "EMP") {
        if (this["MGR"] != null){
            var mgr = db.emp_dept.findOne({"TABLE":"EMP", "EMPNO":this["MGR"]})
            if (this["SAL"] > mgr["SAL"]){
                return true;
            }else {
                return false;
            }
        }
    }
}}, {"EMPNO":1, "ENAME":1, "_id":0})

输出如下

{ "EMPNO" : 7902, "ENAME" : "FORD" }

如果希望能够在查出这些员工的同时,也能列出其上司和其上司的工资,可能就需要用map-reduce,如果用map-reduce的话,写法就有很多种了,我是这样写的

/* 20140309
 * c 列出工资比上司高的员工姓名
 */
// solution 2
get_mgr_sal = function() { 
    if (this["TABLE"] == "EMP") {
        if (this["MGR"] != null){
            var mgr = db.emp_dept.findOne({"TABLE":"EMP", "EMPNO":this["MGR"]});
            if (this["SAL"] > mgr["SAL"]){
                emit({"EMPNO":this["EMPNO"], "ENAME":this["ENAME"]}, {"SAL" : this["SAL"], 
                    "MGR_EMPNO":mgr["EMPNO"], "MGR_NAME":mgr["ENAME"], "MGR_SAL":mgr["SAL"]}); 
            }
        }
    }
}; 

get_mgr_sal = function() { 
    if (this["TABLE"] == "EMP") {
        if (this["MGR"] != null){
            var mgr = db.emp_dept.findOne({"TABLE":"EMP", "EMPNO":this["MGR"]})
            emit({"EMPNO":this["EMPNO"], "ENAME":this["ENAME"]}, {"SAL" : this["SAL"], 
                "MGR_EMPNO":mgr["EMPNO"], "MGR_NAME":mgr["ENAME"], "MGR_SAL":mgr["SAL"]}); 
        }
    }
}; 

higher_than_mgr = function(key, emits) { 
    return emits;
};

filter_result = function(key, reduced){
    if (reduced["SAL"] > reduced["MGR_SAL"]){
        return reduced;
    }
}

db.runCommand({"mapreduce" : "emp_dept", "map" : get_mgr_sal, "reduce" : higher_than_mgr,
    "out":"result", "finalize":filter_result})

db.result.find({"value":{"$ne":null}})

输出如下

{ "_id" : { "EMPNO" : 7902, "ENAME" : "FORD" }, "value" : { "SAL" : 3000, "MGR_EMPNO" : 7566, "MGR_NAME" : "JONES", "MGR_SAL" : 2975 } }

虽然已经有了结果,但是我感觉map-reduce这么用,感觉还不是很舒服,但也没有想出更好的写法。

问题d,我只想出了用map-reduce的解法

/* 20140309
 * d 列出平均工资高于公司总平均工资的部门名称
 */
map_avg_sal_dept = function() {
    if (this["TABLE"] == "EMP"){
        var dept = db.emp_dept.findOne({"TABLE":"DEPT", "DEPTNO":this["DEPTNO"]});
        var key = {"DEPTNO":dept["DEPTNO"], "DNAME":dept["DNAME"]};
        var value = {"SAL_DEPT":this["SAL"], "DEPT_COUNT":1};
        emit(key, value)
    }
}

red_avg_sal_dept = function(key, value) {
    var all_sal_dept = 0.0;
    var all_dept_count = 0;
    for (var i in value){
        all_sal_dept += value[i]["SAL_DEPT"];
        all_dept_count += value[i]["DEPT_COUNT"];
    }
    return {"AVG_SAL_DEPT":all_sal_dept/all_dept_count};
}

get_avg_sal = function(){
    var all_sal = 0.0;
    var all_count = 0;
    db.emp_dept.find({"TABLE":"EMP"}).forEach(
        function (e) {
            all_sal += e["SAL"];
            all_count ++;
        });
    return all_sal/all_count;
}

filter_avg_sal_dept = function(key, reduced) {
    if (reduced["AVG_SAL_DEPT"] > avg_sal){
        return reduced;
    }
}

db.runCommand({"mapreduce" : "emp_dept", "map" : map_avg_sal_dept, "reduce" : red_avg_sal_dept,
    "out":"result", "scope":{avg_sal:get_avg_sal()}, "finalize":filter_avg_sal_dept})

db.result.find({"value":{"$ne":null}})

输出如下

{ "_id" : { "DEPTNO" : 10, "DNAME" : "ACCOUNTING" }, "value" : { "AVG_SAL_DEPT" : 2916.6666666666665 } }
{ "_id" : { "DEPTNO" : 20, "DNAME" : "RESEARCH" }, "value" : { "AVG_SAL_DEPT" : 2258.3333333333335 } }

总体看起来,用mongodb处理结构化数据,还是显得不那么直观,解法也显得很迂回。当然,如果适应了的话,写这些代码可能会变快,但是比起SQL对数据的抽象程度和友好性来说,mongodb还是差了不少,如果能有mongodb的类sql接口,那就会更好了。

posted @ 2014-03-09 17:14  valleylord  阅读(1554)  评论(0编辑  收藏  举报