Why compound wildcard index still do fetch?

Anderson_Lin · October 4, 2024, 7:42am

Hi,

I am trying to use compound wildcard index to prevent creating too many indexes.
My doc looks like below:

{
  "year": 2024,
  "month": 8,
  "day": 1,
  "hour": 0,
  "created": {
    "$date": "2024-08-01T00:24:37.029Z"
  },
  "appId": 9,
  "os": 9,
  "attributes": {
    "lang": "zh",
    "appVersion": "1.0",
    "country": "CN"
  }
}

The compound wildcard index script is:

db.my_coll.createIndex(
  {
    year: 1,
    month: 1,
    day: 1,
    hour: 1,
    appId: 1,
    os: 1,
    "attributes.$**": 1
  },
  {
    name: "compound_wildcard_test"
  }
)

And the aggregation pipeline script is:

[
  {
    $match:
      {
        year: 2024,
        month: 8,
        day: 1,
      },
  },
  {
    $group:
      {
        _id: {
          appId: "$appId",
          os: "$os",
        },
        value: {
          $sum: 1,
        },
      },
  },
]

The explain plan is:

{
  "explainVersion": "2",
  "queryPlanner": {
    "namespace": "my_db.my_coll",
    "indexFilterSet": false,
    "parsedQuery": {
      "$and": [
        { "day": { "$eq": 1 } },
        { "month": { "$eq": 8 } },
        { "year": { "$eq": 2024 } }
      ]
    },
    "queryHash": "F4876F52",
    "planCacheKey": "E072E021",
    "optimizedPipeline": true,
    "maxIndexedOrSolutionsReached": false,
    "maxIndexedAndSolutionsReached": false,
    "maxScansToExplodeReached": false,
    "winningPlan": {
      "queryPlan": {
        "stage": "GROUP",
        "planNodeId": 4,
        "inputStage": {
          "stage": "FETCH",
          "planNodeId": 2,
          "inputStage": {
            "stage": "IXSCAN",
            "planNodeId": 1,
            "keyPattern": {
              "year": 1,
              "month": 1,
              "day": 1,
              "hour": 1,
              "appId": 1,
              "os": 1,
              "$_path": 1
            },
            "indexName": "compound_wildcard_test",
            "isMultiKey": false,
            "multiKeyPaths": {
              "year": [],
              "month": [],
              "day": [],
              "hour": [],
              "appId": [],
              "os": [],
              "$_path": []
            },
            "isUnique": false,
            "isSparse": true,
            "isPartial": false,
            "indexVersion": 2,
            "direction": "forward",
            "indexBounds": {
              "year": ["[2024, 2024]"],
              "month": ["[8, 8]"],
              "day": ["[1, 1]"],
              "hour": ["[MinKey, MaxKey]"],
              "appId": ["[MinKey, MaxKey]"],
              "os": ["[MinKey, MaxKey]"],
              "$_path": ["[MinKey, MaxKey]"]
            }
          }
        }
      },
      "slotBasedPlan": {
        "slots": "$$RESULT=s33 env: { s4 = 1728027185275 (NOW), s3 = Timestamp(1728027184, 265) (CLUSTER_TIME), s2 = Nothing (SEARCH_META), s14 = Nothing, s10 = {\"year\" : 1, \"month\" : 1, \"day\" : 1, \"hour\" : 1, \"appId\" : 1, \"os\" : 1, \"$_path\" : 1, \"$_path\" : 1}, s25 = true, s1 = TimeZoneDatabase(America/Indiana/Tell_City...Africa/Blantyre) (timeZoneDB), s6 = IndexBounds(\"field #0['year']: [2024, 2024], field #1['month']: [8, 8], field #2['day']: [1, 1], field #3['hour']: [MinKey, MaxKey], field #4['appId']: [MinKey, MaxKey], fie\"...) }",
        "stages": "[4] project [s33 = newBsonObj(\"_id\", s32, \"value\", s30)] \n[4] project [s32 = newObj(\"appId\", s28, \"os\", s29)] \n[4] group [s28, s29] [s30 = sum(1)] spillSlots[s31] mergingExprs[sum(s31)] \n[2] nlj inner [] [s20, s21, s22, s23, s24] \n    left \n        [1] unique [s20] \n        [1] branch {s25} [s20, s21, s22, s23, s24] \n        [s5, s7, s8, s9, s10] [1] ixscan_generic s6 s9 s5 s7 s8 lowPriority [] @\"91308ead-aebb-40f8-9b39-fb5dcc5bb80a\" @\"compound_wildcard_test\" true \n        [s11, s17, s18, s19, s10] [1] nlj inner [] [s12, s13] \n            left \n                [1] project [s12 = getField(s15, \"l\"), s13 = getField(s15, \"h\")] \n                [1] unwind s15 s16 s14 false \n                [1] limit 1ll \n                [1] coscan \n            right \n                [1] ixseek s12 s13 s19 s11 s17 s18 [] @\"91308ead-aebb-40f8-9b39-fb5dcc5bb80a\" @\"compound_wildcard_test\" true \n    right \n        [2] limit 1ll \n        [2] seek s20 s26 s27 s21 s22 s23 s24 [s28 = appId, s29 = os] @\"91308ead-aebb-40f8-9b39-fb5dcc5bb80a\" true false \n"
      }
    },
    "rejectedPlans": []
  },
  "executionStats": {
    "executionSuccess": true,
    "nReturned": 125,
    "executionTimeMillis": 1730,
    "totalKeysExamined": 695205,
    "totalDocsExamined": 180895,
    "executionStages": {
      "stage": "project",
      "planNodeId": 4,
      "nReturned": 125,
      "executionTimeMillisEstimate": 1726,
      "opens": 1,
      "closes": 1,
      "saveState": 697,
      "restoreState": 697,
      "isEOF": 1,
      "projections": {
        "33": "newBsonObj(\"_id\", s32, \"value\", s30) "
      },
      "inputStage": {
        "stage": "project",
        "planNodeId": 4,
        "nReturned": 125,
        "executionTimeMillisEstimate": 1726,
        "opens": 1,
        "closes": 1,
        "saveState": 697,
        "restoreState": 697,
        "isEOF": 1,
        "projections": {
          "32": "newObj(\"appId\", s28, \"os\", s29) "
        },
        "inputStage": {
          "stage": "group",
          "planNodeId": 4,
          "nReturned": 125,
          "executionTimeMillisEstimate": 1726,
          "opens": 1,
          "closes": 1,
          "saveState": 697,
          "restoreState": 697,
          "isEOF": 1,
          "groupBySlots": [28, 29],
          "expressions": {
            "30": "sum(1) ",
            "initExprs": { "30": null }
          },
          "mergingExprs": { "31": "sum(s31) " },
          "usedDisk": false,
          "spills": 0,
          "spilledRecords": 0,
          "spilledDataStorageSize": 0,
          "inputStage": {
            "stage": "nlj",
            "planNodeId": 2,
            "nReturned": 180895,
            "executionTimeMillisEstimate": 1690,
            "opens": 1,
            "closes": 1,
            "saveState": 697,
            "restoreState": 697,
            "isEOF": 1,
            "totalDocsExamined": 180895,
            "totalKeysExamined": 695205,
            "collectionScans": 0,
            "collectionSeeks": 180895,
            "indexScans": 0,
            "indexSeeks": 1,
            "indexesUsed": [
              "compound_wildcard_test",
              "compound_wildcard_test"
            ],
            "innerOpens": 180895,
            "innerCloses": 1,
            "outerProjects": [],
            "outerCorrelated": [
              20, 21, 22, 23, 24
            ],
            "outerStage": {
              "stage": "unique",
              "planNodeId": 1,
              "nReturned": 180895,
              "executionTimeMillisEstimate": 1483,
              "opens": 1,
              "closes": 1,
              "saveState": 697,
              "restoreState": 697,
              "isEOF": 1,
              "dupsTested": 695204,
              "dupsDropped": 514309,
              "keySlots": [20],
              "inputStage": {
                "stage": "branch",
                "planNodeId": 1,
                "nReturned": 695204,
                "executionTimeMillisEstimate": 1295,
                "opens": 1,
                "closes": 1,
                "saveState": 697,
                "restoreState": 697,
                "isEOF": 1,
                "numTested": 1,
                "thenBranchOpens": 1,
                "thenBranchCloses": 1,
                "elseBranchOpens": 0,
                "elseBranchCloses": 0,
                "filter": "s25 ",
                "thenSlots": [5, 7, 8, 9, 10],
                "elseSlots": [11, 17, 18, 19, 10],
                "outputSlots": [
                  20, 21, 22, 23, 24
                ],
                "thenStage": {
                  "stage": "ixscan_generic",
                  "planNodeId": 1,
                  "nReturned": 695204,
                  "executionTimeMillisEstimate": 1295,
                  "opens": 1,
                  "closes": 1,
                  "saveState": 697,
                  "restoreState": 697,
                  "isEOF": 1,
                  "indexName": "compound_wildcard_test",
                  "keysExamined": 695205,
                  "seeks": 1,
                  "numReads": 695205,
                  "indexKeySlot": 9,
                  "recordIdSlot": 5,
                  "snapshotIdSlot": 7,
                  "indexIdentSlot": 8,
                  "outputSlots": [],
                  "indexKeysToInclude": "00000000000000000000000000000000"
                },
                "elseStage": {
                  "stage": "nlj",
                  "planNodeId": 1,
                  "nReturned": 0,
                  "executionTimeMillisEstimate": 0,
                  "opens": 0,
                  "closes": 0,
                  "saveState": 697,
                  "restoreState": 697,
                  "isEOF": 0,
                  "totalDocsExamined": 0,
                  "totalKeysExamined": 0,
                  "collectionScans": 0,
                  "collectionSeeks": 0,
                  "indexScans": 0,
                  "indexSeeks": 0,
                  "indexesUsed": [
                    "compound_wildcard_test"
                  ],
                  "innerOpens": 0,
                  "innerCloses": 0,
                  "outerProjects": [],
                  "outerCorrelated": [12, 13],
                  "outerStage": {
                    "stage": "project",
                    "planNodeId": 1,
                    "nReturned": 0,
                    "executionTimeMillisEstimate": 0,
                    "opens": 0,
                    "closes": 0,
                    "saveState": 697,
                    "restoreState": 697,
                    "isEOF": 0,
                    "projections": {
                      "12": "getField(s15, \"l\") ",
                      "13": "getField(s15, \"h\") "
                    },
                    "inputStage": {
                      "stage": "unwind",
                      "planNodeId": 1,
                      "nReturned": 0,
                      "executionTimeMillisEstimate": 0,
                      "opens": 0,
                      "closes": 0,
                      "saveState": 697,
                      "restoreState": 697,
                      "isEOF": 0,
                      "inputSlot": 14,
                      "outSlot": 15,
                      "outIndexSlot": 16,
                      "preserveNullAndEmptyArrays": 0,
                      "inputStage": {
                        "stage": "limit",
                        "planNodeId": 1,
                        "nReturned": 0,
                        "executionTimeMillisEstimate": 0,
                        "opens": 0,
                        "closes": 0,
                        "saveState": 697,
                        "restoreState": 697,
                        "isEOF": 0,
                        "inputStage": {
                          "stage": "coscan",
                          "planNodeId": 1,
                          "nReturned": 0,
                          "executionTimeMillisEstimate": 0,
                          "opens": 0,
                          "closes": 0,
                          "saveState": 697,
                          "restoreState": 697,
                          "isEOF": 0
                        }
                      }
                    }
                  },
                  "innerStage": {
                    "stage": "ixseek",
                    "planNodeId": 1,
                    "nReturned": 0,
                    "executionTimeMillisEstimate": 0,
                    "opens": 0,
                    "closes": 0,
                    "saveState": 697,
                    "restoreState": 697,
                    "isEOF": 0,
                    "indexName": "compound_wildcard_test",
                    "keysExamined": 0,
                    "seeks": 0,
                    "numReads": 0,
                    "indexKeySlot": 19,
                    "recordIdSlot": 11,
                    "snapshotIdSlot": 17,
                    "indexIdentSlot": 18,
                    "outputSlots": [],
                    "indexKeysToInclude": "00000000000000000000000000000000",
                    "seekKeyLow": "s12 ",
                    "seekKeyHigh": "s13 "
                  }
                }
              }
            },
            "innerStage": {
              "stage": "limit",
              "planNodeId": 2,
              "nReturned": 180895,
              "executionTimeMillisEstimate": 206,
              "opens": 180895,
              "closes": 1,
              "saveState": 697,
              "restoreState": 697,
              "isEOF": 1,
              "limit": 1,
              "inputStage": {
                "stage": "seek",
                "planNodeId": 2,
                "nReturned": 180895,
                "executionTimeMillisEstimate": 196,
                "opens": 180895,
                "closes": 1,
                "saveState": 697,
                "restoreState": 697,
                "isEOF": 0,
                "numReads": 180895,
                "recordSlot": 26,
                "recordIdSlot": 27,
                "seekKeySlot": 20,
                "snapshotIdSlot": 21,
                "indexIdentSlot": 22,
                "indexKeySlot": 23,
                "indexKeyPatternSlot": 24,
                "fields": ["appId", "os"],
                "outputSlots": [28, 29]
              }
            }
          }
        }
      }
    }
  },
  "command": {
    "aggregate": "my_coll",
    "pipeline": [
      {
        "$match": {
          "year": 2024,
          "month": 8,
          "day": 1
        }
      },
      {
        "$group": {
          "_id": {
            "appId": "$appId",
            "os": "$os"
          },
          "value": { "$sum": 1 }
        }
      }
    ],
    "cursor": {},
    "maxTimeMS": 300000,
    "$db": "my_db"
  },
  "serverInfo": {
    "host": "atlas-9modrn-shard-00-01.aw6o0.mongodb.net",
    "port": 27017,
    "version": "7.0.12",
    "gitVersion": "b6513ce0781db6818e24619e8a461eae90bc94fc"
  },
  "serverParameters": {
    "internalQueryFacetBufferSizeBytes": 104857600,
    "internalQueryFacetMaxOutputDocSizeBytes": 104857600,
    "internalLookupStageIntermediateDocumentMaxSizeBytes": 104857600,
    "internalDocumentSourceGroupMaxMemoryBytes": 104857600,
    "internalQueryMaxBlockingSortMemoryUsageBytes": 104857600,
    "internalQueryProhibitBlockingMergeOnMongoS": 0,
    "internalQueryMaxAddToSetBytes": 104857600,
    "internalDocumentSourceSetWindowFieldsMaxMemoryBytes": 104857600,
    "internalQueryFrameworkControl": "trySbeRestricted"
  },
  "ok": 1,
  "$clusterTime": {
    "clusterTime": {
      "$timestamp": "7421820250468909061"
    },
    "signature": {
      "hash": "/UK0CMpYqyJCh8d2WJxz676xcRc=",
      "keyId": {
        "low": 2,
        "high": 1713453437,
        "unsigned": false
      }
    }
  },
  "operationTime": {
    "$timestamp": "7421820250468909061"
  }
}

All query fields are all defined in compound_wildcard_test index.
Why it still use fetch stage which makes query slow?
And how can I prevent fetch stage?
Any ideas are welcome!

Thank you.

iamabs2001 · October 5, 2024, 9:24pm

Anderson_Lin:

db.my_coll.createIndex(
  {
    year: 1,
    month: 1,
    day: 1,
    hour: 1,
    appId: 1,
    os: 1,
    "attributes.$**": 1
  },
  {
    name: "compound_wildcard_test"
  }
)

It might be because mongodb is trying to group certain fields but it’s missing some context, you can try adding a full projection of the usefull fields before grouping in the aggregate pipeline.

Anderson_Lin · October 7, 2024, 2:29am

Hi,

I tried $project but no luck, it still uses $fetch.
Then I add attribute in $match, and it won’t do $fetch, like below:

[
  {
    $match: {
      year: 2024,
      month: 8,
      day: 1,
      "attributes.country": 'JP'
    }
  },
  {
    $group: {
      _id: {
        appId: "$appId",
        os: "$os",
        country: "$attributes.country"
      },
      value: {
        $sum: 1
      }
    }
  }
]

BUT this is not what I want,
I don’t need to match country but group by country.

Can’t compound wildcard index be used in $group only?

Any comments are welcome!
Thank you very much.

steevej · October 7, 2024, 2:54am

May be you can try

attributes.country : { $ne : null }

in your $match.

It is a hack that should match all countries and might still avoid the FETCH.

Anderson_Lin · October 7, 2024, 4:14am

Hi @steevej ,

Thanks for your advice.

I modified the aggregation script:

[
  {
    $match: {
      year: 2024,
      month: 8,
      day: 1,
      "attributes.country": {$ne: null},
    }
  },
  {
    $group: {
      _id: {
        appId: "$appId",
        os: "$os",
        country: "$attributes.country",
      },
      value: {
        $sum: 1
      }
    }
  }
]

But it still uses fetch.

{
 "stage": "FETCH",
 "planNodeId": 2,
 "nReturned": 180895,
 "executionTimeMillisEstimate": 1762,
 "opens": 1,
 "closes": 1,
 "saveState": 696,
 "restoreState": 696,
 "isEOF": 1,
 "numTested": 180895,
 "filter": {
  "attributes.country": {
   "$not": {
    "$eq": null
   }
  }
 },
 "docsExamined": 180895,
 "keysExamined": 0
}

Very strange.

steevej · October 7, 2024, 12:47pm

Yep. As mentioned

Not all hacks works all the time. I thought that hitting attributes.country would do the same as your equality on JP. May be you could try with some other tests that would hit the same field but with more specific values compared to null. I would try with

"attributes.country" : { "$gte" : "AA" , "$lte" : "ZZ" }

Assuming your data follows the ISO-3166 2 letters country codes standard, all countries should match. Hopefully, this range query will select the same index as the equality query of JP.

Anderson_Lin · October 7, 2024, 2:58pm

Hi @steevej ,

After changed to "attributes.country" : { "$gte" : "AA" , "$lte" : "ZZ" }, no fetch anymore.
Awesome!

But the $group will contain more fields, for example:

[
  {
    $match: {
      year: 2024,
      month: 8,
      day: 1,
      "attributes.country": {$gte: 'AA', $lte: 'ZZ'},
      "attributes.lang": {$gte: 'aa', $lte: 'zz'}
    }
  },
  {
    $group: {
      _id: {
        appId: "$appId",
        os: "$os",
        country: "$attributes.country",
        lang: "$attributes.lang"
      },
      value: {
        $sum: 1
      }
    }
  }
]

This still uses fetch:

{
 "stage": "FETCH",
 "planNodeId": 2,
 "nReturned": 178774,
 "executionTimeMillisEstimate": 1783,
 "opens": 1,
 "closes": 1,
 "saveState": 696,
 "restoreState": 696,
 "isEOF": 1,
 "numTested": 180895,
 "filter": {
  "$and": [
   {
    "attributes.country": {
     "$lte": "ZZ"
    }
   },
   {
    "attributes.country": {
     "$gte": "AA"
    }
   },
   {
    "attributes.lang": {
     "$lte": "zz"
    }
   },
   {
    "attributes.lang": {
     "$gte": "aa"
    }
   }
  ]
 },
 "docsExamined": 180895,
 "keysExamined": 0
}

I tried "attributes.lang": 'en', but it’s not working.
Is this caused by

The query predicate specifies exactly one field covered by the wildcard index.

from https://www.mongodb.com/docs/manual/core/indexes/index-types/index-wildcard/#covered-queries, or other reasons ?

I also tried below, but still no luck:

[
  {
    $match: {
      year: 2024,
      month: 8,
      day: 1,
      "attributes.country": {$gte: 'AA', $lte: 'ZZ'},
    }
  },
  {
    $group: {
      _id: {
        appId: "$appId",
        os: "$os",
        country: "$attributes.country",
        lang: "$attributes.lang"
      },
      value: {
        $sum: 1
      }
    }
  }
]

{
 "stage": "FETCH",
 "planNodeId": 2,
 "nReturned": 180858,
 "executionTimeMillisEstimate": 716,
 "opens": 1,
 "closes": 1,
 "saveState": 181,
 "restoreState": 181,
 "isEOF": 1,
 "totalDocsExamined": 180858,
 "totalKeysExamined": 182916,
 "collectionScans": 0,
 "collectionSeeks": 180858,
 "indexScans": 0,
 "indexSeeks": 1,
 "indexesUsed": [
  "compound_wildcard_test",
  "compound_wildcard_test"
 ],
 "innerOpens": 180858,
 "innerCloses": 1,
 "outerProjects": [],
 "outerCorrelated": [
  20,
  21,
  22,
  23,
  24
 ],
 "docsExamined": 180858,
 "keysExamined": 0
}

Please let me know if any hints.
Thank you very much!

steevej · October 7, 2024, 10:44pm

Method 1 - Attribute Pattern

This is not an hack but it is a major update to your schema.

Take a look at the attribute pattern.

Essentially you go from the object attributes:

to the array attributes:

You replace the wildcard index with a multi-key index on attributes.k and attributes.v.

Hopefully, the optimizer will do this differently.

Method 2 - Group Lookup

This is a hack that worked in some occasions to either boost performance or go over the 16MB limit but it is not clear to me the exact reasons. I have a good idea of why but no hard facts. Actually, I know exactly why it works to overcome the 16MB limit.

Here you split the $group into 2 parts, the first one only builds the _id and the second performs a lookup to do the counting.

It would look something like:

const main_match = { $match: {
      year: 2024,
      month: 8,
      day: 1,
      "attributes.country": {$gte: 'AA', $lte: 'ZZ'},
} }

const main_group = {  $group: {
      _id: {
        appId: "$appId",
        os: "$os",
        country: "$attributes.country",
    }
} }

const lang_match = { $match : {
      year: 2024,
      month: 8,
      day: 1,
      $expr : { $and : [ 
          { $eq : [ "$appId" , "$$_id_appId" ] } ,
          { $eq : [ "$os" , "$$_id_os" ] } ,
          { $eq : [ "$attributes.country" , "$$_id_country" ] }
      ] },
      "attributes.lang": {$gte: "aa" , $lte: "zz"},
} }

const lang_group = { $group : {
  _id : {
        appId: "$$_id_appId",
        os: "$$_id_os",
        country: "$$_id_country",
        lang: "$attributes.lang"
      },
  value : { $sum : 1 }
} }

const lang_pipeline = [ lang_match , lang_group ]

const main_lookup = { $group : {
    from : the_same_collection ,
    let : {
        _id_appId: "$_id.appId",
        _id_os: "$_id.os",
        _id_country: "$_id.country",
    } ,
    as : "result" ,
    pipeline : lang_pipeline ,
} }

const main_pipeline = [
    main_match ,
    main_group  ,
    main_lookup ,
]

The main_pipeline should not be FETCHing since we know it already does not FETCH for a single attribute match. The lang_pipeline, still $match on 2 attributes but the country is now becomes an equality query rather than range query.

So it might work.

The format will be a little bit different but I leave that as an exercise to the reader.

Anderson_Lin · October 8, 2024, 7:24am

Hi @steevej ,

For attribute pattern,
the document is changed:

{
  "appId": 999,
  "createdDttm": {
    "$date": "2024-09-01T00:55:35.048Z"
  },
  "day": 1,
  "hour": 0,
  "month": 9,
  "os": 1,
  "year": 2024,
  "attributes_array": [
    {
      "k": "lang",
      "v": "en"
    },
    {
      "k": "appVersion",
      "v": "1.3.2"
    },
    {
      "k": "country",
      "v": "MM"
    },
    {
      "k": "continent",
      "v": "AS"
    },
    {
      "k": "province",
      "v": "06"
    },
    }
  ]
}

The index:

db.collection.createIndex(
  {
    year: 1,
    month: 1,
    day: 1,
    hour: 1,
    appId: 1,
    os: 1,
    "attributes_array.k": 1,
    "attributes_array.v": 1
  },
  {
    name: "attribute_pattern_test"
  }
)

The aggregation pipeline:

[
  {
    $match:
      {
        year: 2024,
        month: 8,
        day: 1,
        "attributes_array.k": "country",
      }
  },
  {
    $project: {
      _id: 0,
      appId: 1,
      os: 1,
      result: {
        $reduce: {
          input: "$attributes_array",
          initialValue: {
            country: null
          },
          in: {
            country: {
              $cond: [
                {
                  $eq: ["$$this.k", "country"]
                },
                "$$this.v",
                "$$value.country"
              ]
            }
          }
        }
      }
    }
  },
  {
    $group:
      {
        _id: {
          appId: "$appId",
          os: "$os",
          country: "$result.country"
        },
        value: {
          $sum: 1
        }
      }
  }
]

The performance is not good.
I added below in $match, but still use fetch:

         "attributes_array.v": 'US'

         "attributes_array.v": {
           $gte: 'AA', $lte: 'ZZ'
        }

Do you know how to avoid fetch?
Thank you in advance.

steevej · October 11, 2024, 3:45pm

Have you tried the version with the $lookup?

I will be out of idea after the following.

The attributes country and lang looks generic enough. Just like appId, os and year/month/day. In addition, they are part of a major use-case.

So I would move them out from attributes and make them top level field like the appId,os,year,month and day. And add them as normal field of the index.

Anderson_Lin · October 12, 2024, 4:21am

Hi @steevej ,

I tried method 2 group lookup, but it took about 2 minutes to get all data:

db.device.aggregate(
[
  {
    $match: {
      year: 2024,
      month: 8,
      day: 1,
      "attributes.country": {
        $gte: "AA",
        $lte: "ZZ"
      }
    }
  },
  {
    $group: {
      _id: {
        appId: "$appId",
        os: "$os",
        country: "$attributes.country"
      }
    }
  },
  {
    $lookup: {
      from: "device",
      let: {
        id_appId: "$_id.appId",
        id_os: "$_id.os",
        id_country: "$_id.country"
      },
      as: "result",
      pipeline: [
        {
          $match: {
            year: 2024,
            month: 8,
            day: 1,
            $expr: {
              $and: [
                {
                  $eq: ["$appId", "$id_appId"]
                },
                { $eq: ["$os", "$id_os"] },
                {
                  $eq: [
                    "$attributes.country",
                    "$id_country"
                  ]
                }
              ]
            },
            "attributes.lang": {$gte: "aa", $lte: "zz"}
          },
        },
                  {
            $group: {
              _id: {
                appId: "$id_appId",
                os: "$id_os",
                country: "$id_country",
                lang: "$attributes.lang"
              },
              value: {
                $sum: 1
              }
            }
          }
      ]
    }
  },
  {$project: {
    _id: 0,
    result: "$result"
  }},
  {$unwind: {
    path: "$result",
    preserveNullAndEmptyArrays: true
  }},
  {$group: {
    _id: {
      appId: "$result._id.appId",
      os: "$result._id.os",
      country: "$result._id.country",
      lang: "$result._id.lang",
    },
    value: {
      $sum: "$result.value"
    }
  }}
]
)

I think the method 2 could not be a good solution.
I added appId in first $match, but it still needs 7 seconds:

  {
    $match: {
      year: 2024,
      month: 8,
      day: 1,
      appId: 37,
      "attributes.country": {
        $gte: "AA",
        $lte: "ZZ"
      }
    }
  },

Besides, if I need more attributes to group, I guess more $lookup are needed.

The reason why I put country, lang… to a attributes field is because I have many reports that will group by different fields, for example:

report by country
report by lang
report by appVersion
report by contient, country, province
report by new_field_1, new_field2

In the future, new fields will be added to attributes field to support new reports, and I won’t need to recreate my compound wildcard index.
So I guess I should put country, lang, … to a attributes field.
But if there is a new report, like new_field_report by country, I need to add this new_field to my compound wildcard index.
This is the disadvantage.

Please let me know if any better ideas!
Thank you very much.

steevej · October 12, 2024, 1:44pm

The $group within the $lookup pipeline is wrong. You only need lang: in the _id because the $match of the $lookup provides unique values for appId, os and country.

Luke_Thompson · October 13, 2024, 11:58pm

I should preface this with saying I have no inside knowledge of how all this works, so I’m making a few guesses here.

I think one problem is that a compound index is a b+ tree which includes all the fields, in order and you are omitting the hour, appid and os from the $match.

Another could be the size of the compound index.

The query planner never decides to add for you a match on {hour: {$exists: true}} in order to walk the tree down every branch of hour because:
a) this may be more expensive than a document fetch and applying the filters there
b) in a large compound index with many hundreds of thousands of documents, and potentially millions of index entries this may take up a huge amount of memory

In my experience, indexes will be utilized only when every field in the index (or a partial start set of fields in index order) is included in the match.

I don’t know the full internal workings of the query planner and search code, but I believe that in the case of a very large compound index like the one you are building, MongoDB is trying to manage its memory and figuring out if all or part of the index can be loaded into RAM. It’s also looking at the query and whether the index will serve the query well.

A couple of questions I would ask are:
a) how big is this index?
b) how much memory does the mongod server have?
c) does the match statement require all parts of the index, in the order of the compound fields?

The answers to these might cause different behaviour from the query planner.

When it comes to (c), there is perhaps a benefit to duplicating the country field outside the attributes array to:

simplify the query
reduce the memory required for an index on country
simplify the job the query planner has

So if you duplicated the country field outside the attributes array so the document had:
{
year: 2024,
month: 10,
day: 1,
hour: 0,
country: ‘AU’,
attributes: [
{k: ‘country’, v: ‘AU’},
…
]
}

And added the index:
{
year: 1, month: 1, day: 1, country: 1
}

Then I think you’d get a nice covered query with no fetch, as well as a greater chance the index would be fully loaded into RAM and stay hot in RAM ongoing.

I noticed in the past that the queries involving nested object array queries were:

complex
often resulted in document fetch stages where I thought it would have scanned the index
e.g. if you try to match on two or more attributes, the query planner will only ever look at one

Of course if you have many different queries with different combinations of these fields this might be problematic. In that case I’d consider switching to Atlas Search. It can perform multiple index intersection over simple field indexes - so you could have an index over year, another over month etc.

Anderson_Lin · October 14, 2024, 2:33am

Hi @steevej ,

Thank you to point out my mistake.
I changed “group lookup” to

[
  {
    $match: {
      year: 2024,
      month: 8,
      day: 1,
      "attributes.country": {
        $gte: "AA",
        $lte: "ZZ"
      }
    }
  },
  {
    $group: {
      _id: {
        appId: "$appId",
        os: "$os",
        country: "$attributes.country"
      }
    }
  },
  {
    $lookup: {
      from: "device",
      let: {
        id_appId: "$_id.appId",
        id_os: "$_id.os",
        id_country: "$_id.country"
      },
      as: "result",
      pipeline: [
        {
          $match: {
            year: 2024,
            month: 8,
            day: 1,
            $expr: {
              $and: [
                {
                  $eq: ["$appId", "$$id_appId"]
                },
                {
                  $eq: ["$os", "$$id_os"]
                },
                {
                  $eq: [
                    "$attributes.country",
                    "$$id_country"
                  ]
                }
              ]
            },
            "attributes.lang": {
              $gte: "aa",
              $lte: "zz"
            }
          }
        },
        {
          $group: {
            _id: {
              lang: "$attributes.lang"
            },
            value: {
              $sum: 1
            }
          }
        }
      ]
    }
  },
  {
    $unwind: {
      path: "$result",
      preserveNullAndEmptyArrays: true
    }
  },
  {
    $group: {
      _id: {
        appId: "$_id.appId",
        os: "$_id.os",
        country: "$_id.country",
        lang: "$result._id.lang"
      },
      value: {
        $sum: "$result.value"
      }
    }
  }
]

It still takes around 2 minutes.

steevej · October 14, 2024, 1:55pm

I think this is a very important point and detail that I have completely missed from the first post. The presence of hour in the index might indeed be detrimental to your use-case. Thanks Luke for pointing that out.

A few things to try

1- Remove hour from the index.

or

2- Move it after attributes

or

3- Keep hour where it is but add

 hour : { $gte : 0 , $lte : 24 }

to your $match if using $exists:true as proposed by Luke does not force the index

Anderson_Lin · October 15, 2024, 2:36am

Hi @Luke_Thompson ,

Thank you for comments.
I tried atlas search before.

Test : dynamic mapping + facet query

Search index:

{
  "mappings": {
    "dynamic": true,
    "fields": {
      "appId": {
        "indexDoubles": false,
        "representation": "int64",
        "type": "numberFacet"
      },
      "os": {
        "indexDoubles": false,
        "representation": "int64",
        "type": "numberFacet"
      }
    }
  }
}

The aggregate pipeline:

[
  {
    $searchMeta: {
      index: "default",
      count: {type: "total"},
      facet: {
        operator: {
          compound: {
            filter: [
              {
                equals: {
                  path: "year",
                  value: 2023
                }
              },
              {
                equals: {
                  path: "month",
                  value: 12
                }
              },
              {
                equals: {
                  path: "day",
                  value: 1
                }
              },
            ]
          }
        },
        facets: {
          "appIdFacet": {
            default: "other",
            type: "number",
            path: "appId",
            boundaries: [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89]
          }
        }
      }
    }
  }
]

The disadvantages:

I have to define field facet type, eg. numberFacet. Atlas search won’t set facet type automatically. If new field is added to document, I have to rebuild search index, and my purpose is not to rebuild any indexes when new fields.
The facet can’t support multiple “group by”. For example, group by appId and os, no separate. It isn’t “group by appId”, and another “group by os”.

Let me know if any,
thank you!

system · October 22, 2024, 9:36am

This topic was automatically closed 5 days after the last reply. New replies are no longer allowed.