[{"data":1,"prerenderedAt":179},["ShallowReactive",2],{"content:\u002F2026\u002Fsolve-waitgroup-panic":3,"surround:\u002F2026\u002Fsolve-waitgroup-panic":173},{"id":4,"title":5,"body":6,"categories":148,"date":150,"description":151,"draft":152,"extension":153,"image":154,"meta":155,"navigation":157,"path":158,"permalink":159,"published":159,"readingTime":160,"recommend":165,"references":159,"seo":166,"sitemap":167,"stem":168,"tags":169,"type":171,"updated":159,"__hash__":172},"content\u002Fposts\u002F2026\u002Fsolve-waitgroup-panic.md","修复 WaitGroup 送出的 Panic 大礼包",{"type":7,"value":8,"toc":135},"minimark",[9,13,17,23,26,30,33,62,69,73,83,86,92,95,101,108,111,117,120,123,126,129,132],[10,11,12],"h2",{"id":12},"背景",[14,15,16],"p",{},"分布式游戏服务器，架构上分为 Gate 网关、Node 游戏节点，Consul 服务发现，Redis 玩家定位器。 在一次 Game Node 重启后，新启动的节点进程，触发了",[14,18,19],{},[20,21,22],"code",{"code":22},"panic: sync: negative WaitGroup counter",[14,24,25],{},"导致节点崩溃。",[10,27,29],{"id":28},"为什么使用-waitgroup","为什么使用 WaitGroup",[14,31,32],{},"我在框架层面定义了四个 State 状态常量：",[34,35,36,44,50,56],"ul",{},[37,38,39,43],"li",{},[40,41,42],"strong",{},"Shut 关闭","：进程未启动或已销毁，不接受任何请求",[37,45,46,49],{},[40,47,48],{},"Work 工作","：正常运行，正常分配新玩家",[37,51,52,55],{},[40,53,54],{},"Busy 繁忙","：不要分配新玩家",[37,57,58,61],{},[40,59,60],{},"Hang 挂起","：正在优雅关闭，等待任务完成后销毁",[14,63,64,65,68],{},"State 的目的是“能不能接活”，并不知道“有多少活没干完”。当状态变为 Hang，怎么知道已接收的请求都响应了？因此引入 WaitGroup 。以玩家",[20,66,67],{"code":67},"进入游戏 → 领取任务 →下线","为例，梳理目前 WaitGroup 的完整链路，是我们定位 Panic 的关键。",[70,71,72],"h3",{"id":72},"玩家上线",[74,75,80],"pre",{"className":76,"code":78,"language":79},[77],"language-text","客户端连接 Gate → Gate 转发登录请求到 Node → Node 处理登录\n    │\n    └── ctx.BindNode() → 🟢 AddWait (+1) 写入 Redis 绑定关系\n","text",[20,81,78],{"__ignoreMap":82},"",[70,84,85],{"id":85},"玩家领取任务",[74,87,90],{"className":88,"code":89,"language":79},[77],"客户端发包 → Gate 查 Redis 定位 Node 并通知 Node → Node 串行处理\n\n全程没有 
AddWait\u002FDoneWait。\n",[20,91,89],{"__ignoreMap":82},[70,93,94],{"id":94},"玩家下线",[74,96,99],{"className":97,"code":98,"language":79},[77],"Gate 检测到断线 → 通知 Node → Node 执行断线处理\n    │\n    └── proxy.UnbindNode() → 🔴 DoneWait (-1) 对应 BindNode 的 Add\n",[20,100,98],{"__ignoreMap":82},[14,102,103,104,107],{},"正常流程下，上线 +1，下线 -1，WaitGroup 归零， ",[20,105,106],{"code":106},"wg.Wait()","  返回。",[14,109,110],{},"但 Game Node 重启，打破了节点的配对规则！场景如下：",[74,112,115],{"className":113,"code":114,"language":79},[77],"时间线      Gate                    Node A (旧)              Node B (新)\n────────────────────────────────────────────────────────────────────────\nT1        玩家连接中               BindNode: wg.Add(1)\n          \n                                                            Redis: 玩家 → \"game\" 节点\n\nT2        ---                     进程退出                   ---\n\nT3        ---                     ---                       启动，wg = 0\n\nT4        检测到玩家断线\n          查 Redis: 玩家 → \"game\"\n          查 Consul: \"game\" → B\n          → 发给了 B!\n\nT5        ---                     ---                      收到断线事件\n                                                           → UnbindNode()\n                                                           → doneWait()\n                                                           → wg.Done()\n          \n                                                           → 0 → -1\n                                                           → 💥 PANIC\n",[20,116,114],{"__ignoreMap":82},[14,118,119],{},"我对 Add\u002FDone 配对隐含了“同一进程”的假设。Node A 做的 Add +1 是在 A 进程里，但 Done -1 却是在 B 进程里。B 进程从未 Add，直接 Done，WaitGroup 自然变为负数了。",[14,121,122],{},"至此，问题一目了然了，Node A 异常退出后，Redis 玩家定位器中仍残留玩家与 Node A 的绑定，当 Node B 以相同名称注册到 Consul 后，Gate 就会把本应发给 A 的事件，错误地发给 B。",[10,124,125],{"id":125},"解决方案",[14,127,128],{},"重启 Game Node 节点时，使用不同的实例 ID。",[10,130,131],{"id":131},"总结",[14,133,134],{},"分布式系统中进程重启，可能会打破 WaitGroup 
同一生命周期内配对。",{"title":82,"searchDepth":136,"depth":136,"links":137},4,[138,140,146,147],{"id":12,"depth":139,"text":12},2,{"id":28,"depth":139,"text":29,"children":141},[142,144,145],{"id":72,"depth":143,"text":72},3,{"id":85,"depth":143,"text":85},{"id":94,"depth":143,"text":94},{"id":125,"depth":139,"text":125},{"id":131,"depth":139,"text":131},[149],"技术","2026-03-30 15:59:07","Node A 的玩家数据残留在 Redis，而 Node B 以相同 ID 顶替上线，跨进程的 sync.WaitGroup 计数污染就此上演。",false,"md","https:\u002F\u002Fimg2.tofaka.com\u002Fautoupload\u002FZ3wg1auvHGH_fxQcOFgj2SfNcKcqEnRmcljopnyJoMs\u002F20260618\u002Fyuof\u002F1001X623\u002F%E4%BC%81%E4%B8%9A%E5%BE%AE%E4%BF%A1%E6%88%AA%E5%9B%BE_17817742138899.png\u002Fwebp",{"slots":156},{},true,"\u002F2026\u002Fsolve-waitgroup-panic",null,{"text":161,"minutes":162,"time":163,"words":164},"5 min read",4.565,273900,913,1,{"title":5,"description":151},{"loc":158},"posts\u002F2026\u002Fsolve-waitgroup-panic",[170],"游戏","story","c_gEelmzx1IvaOqUXQ5Xne4ek0cShrBpGWWmNm2Z114",[174,159],{"title":175,"path":176,"stem":177,"date":178,"type":171,"children":-1},"读薄《代码大全2》","\u002F2025\u002Fcode-complete-2-notes","posts\u002F2025\u002Fcode-complete-2-notes","2025-08-05 19:53:37",1781779103066]