Hello, Community!
We’re using a sharded MongoDB V5.0.14 setup in our production environment, and we have been getting an error when reads are made against two of the biggest collections in the database. Below is a snippet from the mongod slow query logs. The full log of a sample query is pasted at the end of this post.
"errMsg":"version mismatch detected for company.collection1",
"errName":"StaleConfig",
"errCode":13388
This collection holds 13 TB of data, compressed on disk to ~1.9 TB with the zstd compression option. It receives about 80% of the overall traffic across all collections.
I’ve gone through similar posts like shard-version-not-ok-version-mismatch-detected-for, staleconfig-error-in-sharded-data-cluster-an-error-from-cluster-data-placement-c, MongoDB Jira SERVER-45119, staleconfig-how-to-stop-this
It looks like the only option to try out is to execute the flushRouterConfig command. We executed it on all the mongos nodes, config server nodes, and all data shard nodes (PSS). The errors seem to stop for about an hour or two, and then they resurface.
Also, we disabled balancing on this particular collection to see if chunk autosplit and movement are causing any stale data. The errors don’t seem to stop with that either.
We would like to know:
- Why does this happen in the first place? Is this issue/error internal to MongoDB, or is it triggered by a bad query from the client or a bad setup from the infra/DevOps perspective?
- Is this error propagated to the client? We guess it is, which would make it a failure from the user’s point of view, though we never got a chance to reproduce this behaviour ourselves. We re-executed all the queries that encountered this error, and they seem to return their results properly.
Please help, and ask for any further information that would help you.
I couldn’t upload a non-image attachment, so I’m pasting the slow query log below.
{
"attr" : {
"command" : {
"$audit" : {
"$impersonatedRoles" : [
{
"db" : "admin",
"role" : "root"
}
],
"$impersonatedUsers" : [
{
"db" : "admin",
"user" : "user1"
}
]
},
"$client" : {
"driver" : {
"name" : "mongo-java-driver|legacy",
"version" : "3.12.2"
},
"mongos" : {
"client" : "10.0.x.x:yyyyy",
"host" : "mongos-01.company.net:27017",
"version" : "5.0.14"
},
"os" : {
"architecture" : "amd64",
"name" : "Linux",
"type" : "Linux",
"version" : "5.10.118-111.515.amzn2.x86_64"
},
"platform" : "Java/Oracle Corporation/1.8.0_171-b10"
},
"$clusterTime" : {
"clusterTime" : {
"$timestamp" : {
"i" : 32,
"t" : 1677742474
}
},
"signature" : {
"hash" : {
"$binary" : {
"base64" : "WoHMZCgb2wPkODlfrH17ft8p4kU=",
"subType" : "0"
}
},
"keyId" : 7192513166306181143
}
},
"$configServerState" : {
"opTime" : {
"t" : -1,
"ts" : {
"$timestamp" : {
"i" : 14,
"t" : 1677742474
}
}
}
},
"$configTime" : {
"$timestamp" : {
"i" : 14,
"t" : 1677742474
}
},
"$db" : "company",
"$readPreference" : {
"mode" : "secondaryPreferred"
},
"$topologyTime" : {
"$timestamp" : {
"i" : 2,
"t" : 1674647828
}
},
"clientOperationKey" : {
"$uuid" : "c8d4a888-3501-422b-9edb-382ef7abcd01"
},
"filter" : {
"$comment" : "a0fa53d5-60d3-43c7-a0fb-a68720403790",
"field1" : "value1"
},
"find" : "collection1",
"limit" : 24,
"lsid" : {
"id" : {
"$uuid" : "af276bbb-73e1-46d8-b04b-4ae6980f89b8"
},
"uid" : {
"$binary" : {
"base64" : "JEiqi/uQbAC38XTQIncz/YXMD580biZRCf0ibuMFRVg=",
"subType" : "0"
}
}
},
"maxTimeMS" : 18000,
"maxTimeMSOpOnly" : 18009,
"projection" : {
"field2" : 1,
"field3" : 1
},
"readConcern" : {
"level" : "local",
"provenance" : "implicitDefault"
},
"shardVersion" : [
{
"$timestamp" : {
"i" : 5,
"t" : 29078
}
},
{
"$oid" : "63d12aed200597703ab44399"
},
{
"$timestamp" : {
"i" : 4427,
"t" : 1674652397
}
}
],
"sort" : {
"field3" : -1
}
},
"durationMillis" : 214,
"errCode" : 13388,
"errMsg" : "version mismatch detected for company.collection1",
"errName" : "StaleConfig",
"locks" : {
"FeatureCompatibilityVersion" : {
"acquireCount" : {
"r" : 1
}
},
"Global" : {
"acquireCount" : {
"r" : 1
}
},
"Mutex" : {
"acquireCount" : {
"r" : 2
}
}
},
"ns" : "company.collection1",
"numYields" : 0,
"ok" : 0,
"protocol" : "op_msg",
"readConcern" : {
"level" : "local",
"provenance" : "implicitDefault"
},
"remote" : "10.0.x.x:yyyyy",
"reslen" : 685,
"storage" : {},
"type" : "command"
},
"c" : "COMMAND",
"ctx" : "conn47321",
"id" : 51803,
"msg" : "Slow query",
"s" : "I",
"t" : {
"$date" : "2023-03-02T07:34:35.472+00:00"
}
}