MongoDB聚合操作

MongoDB的聚合框架实现sum()、avg()、group by等聚合操作。通过聚合框架，还可对返回的结果进行处理，实现一些特殊需求，例如数据过滤、别名显示、增加字段、提取子字段等。

1 聚合框架组件

聚合框架包含两大组件：管道和表达式。

1.1管道

管道与unix管道类似，实质就是把扫描的数据输入聚合进程，进行一些过滤、分组、求和等操作，这些操作是通过管道操作符完成的。

例如，实现类似SQL：select b as ok from test111,

聚合语句如下:

db.test111.aggregate([{$project:{_id:0,ok:"$b"}}]);

{

"result" : [

{

"ok" : 2

},

……………………省略………………………………

{

"ok" : 10000

}

],

"ok" : 1

}

$project管道操作符就是用来控制字段输出的，本例$project不让_id字段显示，只显示字段b，并且用别名”ok”显示。与unix类似，多个管道操作符是可以一起使用的。例如，上面的结果，只要求显示前2行，可以用$limit管道操作符，语句如下：

mongos> db.test111.aggregate([{$project:{_id:0,ok:"$b"}},{ $limit : 2 },{$match:{ok:{$ne:2}}}])

{ "result" : [ { "ok" : 3 } ], "ok" : 1 }

还有其他的管道操作符：$group,$skip,$sort等

1.2 表达式

表达式实际就是对管道的结果进行一些计算，例如求平均值，最大值，记录条数等。Sql语句：select * from (select avg(b),max(b),count(a) from test111 group by a) where rownum<=3，对应的mongo查询语句如下：

db.test111.aggregate([

{ $group: {

_id: "$a",

avg_b: { $avg: "$b" },

max_b: { $max: "$b" },

nb_b: { $sum: 1 }

} },{ $limit : 3 }

]);

2 聚合运算性能优化

db.test.aggregate({$group:{_id:{EN_T:"$EN_T",EN_N:"$EN_N"}, SE:{$sum:"$S_PT"}}},{$limit:20},{$sort:{TIME:-1}})

聚合操作会把集合内容输入管道，然后计算输出。优化的目标就是要尽量让输入管道的内容更少一些。

2.1下面的管道操作符出现在管道开头时，都可以良好地使用索引：

$match,$sort,$limit,$skip

2.2 尽早过滤

如果聚合操作只需要部分文档，那么就应该尽早用$match操作户进行过滤，随后用$sort操作符进行排序，这样才能有效使用索引

2.3 管道顺序优化

3 聚合框架实例

考虑atricles集合:

3.1 $unwind:

Articles是文章集合，文章中有tags包含各种标签，以及用户评论。现在按照标签分组，列出每个标签下的文章作者名称。

mongos> db.articles.find(0 ).pretty()

{

"_id" : ObjectId("521d64e482f40a1927af1d4c"),

"title" : "this is my title",

"author" : "bob",

"posted" : ISODate("2013-08-28T02:48:04.561Z"),

"pageViews" : 5,

"tags" : [

"fun",

"good",

"fun"

],

"comments" : [

{

"author" : "joe",

"text" : "this is cool"

},

{

"author" : "sam",

"text" : "this is bad"

}

],

"other" : {

"foo" : 5

}

}

mongos> a

{

"title" : "this is my title",

"author" : "huangxing",

"posted" : ISODate("2013-08-28T02:50:39.639Z"),

"pageViews" : 5,

"tags" : [

"fun",

"good",

"fun"

],

"comments" : [

{

"author" : "lihao",

"text" : "this is cool"

},

{

"author" : "wangm",

"text" : "this is bad"

}

],

"other" : {

"foo" : 5

}

}

mongos> db.articles.insert(a)

mongos> db.articles.aggregate( { $project : { author : 1, tags : 1, } }, { $unwind : "$tags" }, { $group : { _id : { tags : "$tags" }, authors : { $addToSet : "$author" } } } );

{

"result" : [

{

"_id" : {

"tags" : "good"

},

"authors" : [

"huangxing",

"bob"

]

},

{

"_id" : {

"tags" : "fun"

},

"authors" : [

"huangxing",

"bob"

]

}

],

"ok" : 1

}

构建新集合，导入数据：

[root@55 ~]# mongoimport -h 192.168.69.55 --port 30000 -d test -c zipcodes -u test -p test --file ./zips.json

> db.zipcodes.findOne()

{

"city" : "ACMAR",

"loc" : [

-86.51557,

33.584132

],

"pop" : 6055,

"state" : "AL",

"_id" : "35004"

}

3.2 查询总人口超过1300万的州

> db.zipcodes.aggregate( { $group :

{ _id : "$state",

totalPop : { $sum : "$pop" } } },

{ $match : {totalPop : { $gte : 13 * 1000 * 1000 } } } )

{

"result" : [

{

"_id" : "NY",

"totalPop" : 17990455

},

{

"_id" : "TX",

"totalPop" : 16986510

},

{

"_id" : "CA",

"totalPop" : 29760021

}

],

"ok" : 1

}

3.3 查询每个州的城市的平均人口

mongos> db.zipcodes.aggregate( { $group :

{ _id : { state : "$state", city : "$city" },

pop : { $sum : "$pop" } } },

{ $group :

{ _id : "$_id.state",

{

"result" : [

{

"_id" : "RI",

"avgCityPop" : 18933.283018867925

},

],

"ok" : 1

}

3.4 查询每个州中人口最多的城市和人口少的城市

db.zipcodes.aggregate( { $group:

{ _id: { state: "$state", city: "$city" },

pop: { $sum: "$pop" } } },

{ $sort: { pop: 1 } },

{ $group:

{ _id : "$_id.state",

biggestCity: { $last: "$_id.city" },

biggestPop: { $last: "$pop" },

smallestCity: { $first: "$_id.city" },

smallestPop: { $first: "$pop" } } },

{ $project:

{ _id: 0,

state: "$_id",

biggestCity: { name: "$biggestCity", pop: "$biggestPop" },

smallestCity: { name: "$smallestCity", pop: "$smallestPop" } } } ,{$limit:3})

{

"result" : [

{

"biggestCity" : {

"name" : "CRANSTON",

"pop" : 176404

},

"smallestCity" : {

"name" : "CLAYVILLE",

"pop" : 45

},

"state" : "RI"

},

{

"biggestCity" : {

"name" : "CLEVELAND",

"pop" : 536759

},

"smallestCity" : {

"name" : "ISLE SAINT GEORG",

"pop" : 38

},

"state" : "OH"

},

{

"biggestCity" : {

"name" : "BALTIMORE",

"pop" : 733081

},

"smallestCity" : {

"name" : "ANNAPOLIS JUNCTI",

"pop" : 32

},

"state" : "MD"

}

],

"ok" : 1

}

3.5 查询排序，映射字段

db.zipcodes.aggregate( { $group :

{ _id : "$state",

totalPop : { $sum : "$pop" } } },

{ $match : {totalPop : { $gte : 13 * 1000 * 1000 } } },{$project:{name:{$toUpper:"$_id"},_id:0,pop:"$totalPop"}},{$sort : { name : 1 } })

3.6 Sql与聚合映射:

3.6.1 Select count(*) as count from zipcides

mongos> db.zipcodes.aggregate(

{ $group: { _id: null,

count: { $sum: 1 } } }

)

{ "result" : [ { "_id" : null, "count" : 29467 } ], "ok" : 1 }

3.6.2 Select sum(pop) as totalpop from zipcodes

db.zipcodes.aggregate( [

{ $group: { _id: null,

totalpop: { $sum: "$pop" } } },{$project:{_id:0,totalpop:1}}

])

{ "result" : [ { "totalpop" : 248706415 } ], "ok" : 1 }

3.6.3 Select state,sum(pop) as state_pop from zipcodes goup by state_pop having total_pop>=13000 order by state_pop

db.zipcodes.aggregate([

{$group:{_id:"$state",state_pop:{ $sum:"$pop"}}},

{$match:{state_pop:{$gt:13000000}}},

{$sort:{state_pop:1}}

])