count array occurrences across all documents with mongo

Personally I am not a big fan of transforming “data” as the names of keys in a result. The aggregation framework principles tend to aggree as this sort of operation is not supported either.

So the personal preference is to maintain “data” as “data” and accept that the processed output is actually better and more logical to a consistent object design:

db.people.aggregate([
    { "$group": {
        "_id": "$sex",
        "hobbies": { "$push": "$hobbies" },
        "total": { "$sum": 1 }
    }},
    { "$unwind": "$hobbies" },
    { "$unwind": "$hobbies" },
    { "$group": {
        "_id": {
            "sex": "$_id",
            "hobby": "$hobbies"
        },
        "total": { "$first": "$total" },
        "hobbyCount": { "$sum": 1 }
    }},
    { "$group": {
        "_id": "$_id.sex",
        "total": { "$first": "$total" },
        "hobbies": {
            "$push": { "name": "$_id.hobby", "count": "$hobbyCount" }
        }
    }}
])

Which produces a result like this:

[
    {
            "_id" : "female",
            "total" : 1,
            "hobbies" : [
                {
                    "name" : "tennis",
                    "count" : 1
                },
                {
                    "name" : "football",
                    "count" : 1
                }
            ]
    },
    {
        "_id" : "male",
        "total" : 2,
        "hobbies" : [
            {
                "name" : "swimming",
                "count" : 1
            },
            {
                "name" : "tennis",
                "count" : 2
            },
            {
                "name" : "football",
                "count" : 2
            }
        ]
    }
]

So the initial $group does the count per “sex” and stacks up the hobbies into an array of arrays. Then to de-normalize you $unwind twice to get singular items, $group to get the totals per hobby under each sex and finally regroup an array for each sex alone.

It’s the same data, it has a consistent and organic structure that is easy to process, and MongoDB and the aggregation framework was quite happy in producing this output.

If you really must convert your data to names of keys ( and I still recommend you do not as it is not a good pattern to follow in design ), then doing such a tranformation from the final state is fairly trivial for client code processing. As a basic JavaScript example suitable for the shell:

var out = db.people.aggregate([
    { "$group": {
        "_id": "$sex",
        "hobbies": { "$push": "$hobbies" },
        "total": { "$sum": 1 }
    }},
    { "$unwind": "$hobbies" },
    { "$unwind": "$hobbies" },
    { "$group": {
        "_id": {
            "sex": "$_id",
            "hobby": "$hobbies"
        },
        "total": { "$first": "$total" },
        "hobbyCount": { "$sum": 1 }
    }},
    { "$group": {
        "_id": "$_id.sex",
        "total": { "$first": "$total" },
        "hobbies": {
            "$push": { "name": "$_id.hobby", "count": "$hobbyCount" }
        }
    }}
]).toArray();

out.forEach(function(doc) {
    var obj = {};
    doc.hobbies.sort(function(a,b) { return a.count < b.count });
    doc.hobbies.forEach(function(hobby) {
        obj[hobby.name] = hobby.count;
    });
    doc.hobbies = obj;
    printjson(doc);
});

And then you are basically processing each cursor result into the desired output form, which really isn’t an aggregation function that is really required on the server anyway:

{
    "_id" : "female",
    "total" : 1,
    "hobbies" : {
        "tennis" : 1,
        "football" : 1
    }
}
{
    "_id" : "male",
    "total" : 2,
    "hobbies" : {
        "tennis" : 2,
        "football" : 2,
        "swimming" : 1
    }
}

Where that should also be fairly trival to implement that sort of manipulation into stream processing of the cursor result to tranform as required, as it is basically just the same logic.

On the other hand, you can always implement all the manipulation on the server using mapReduce instead:

db.people.mapReduce(
    function() {
        emit(
            this.sex,
            { 
                "total": 1,
                "hobbies": this.hobbies.map(function(key) {
                    return { "name": key, "count": 1 };
                })
            }
        );
    },
    function(key,values) {
        var obj  = {},
            reduced = {
                "total": 0,
                "hobbies": []
            };

        values.forEach(function(value) {
            reduced.total += value.total;
            value.hobbies.forEach(function(hobby) {
                if ( !obj.hasOwnProperty(hobby.name) )
                    obj[hobby.name] = 0;
                obj[hobby.name] += hobby.count;
            });
        });

        reduced.hobbies = Object.keys(obj).map(function(key) {
            return { "name": key, "count": obj[key] };
        }).sort(function(a,b) {
            return a.count < b.count;
        });

        return reduced;
    },
    { 
        "out": { "inline": 1 },
        "finalize": function(key,value) {
            var obj = {};
            value.hobbies.forEach(function(hobby) {
                obj[hobby.name] = hobby.count;
            });
            value.hobbies = obj;
            return value;
        }
    }
)

Where mapReduce has it’s own distinct style of output, but the same principles are used in accumulation and manipulation, if not likely as efficient as the aggregation framework can do:

   "results" : [
        {
            "_id" : "female",
            "value" : {
                "total" : 1,
                "hobbies" : {
                    "football" : 1,
                    "tennis" : 1
                }
            }
        },
        {
            "_id" : "male",
            "value" : {
                "total" : 2,
                "hobbies" : {
                    "football" : 2,
                    "tennis" : 2,
                    "swimming" : 1
                }
            }
        }
    ]

At the end of the day, I still say that the first form of processing is the most efficient and provides to my mind the most natural and consistent working of the data output, without even attempting to convert the data points into the names of keys. It’s probably best to consider following that pattern, but if you really must, then there are ways to manipulate results into a desired form in various approaches to processing.

Leave a Comment