[{"data":1,"prerenderedAt":733},["ShallowReactive",2],{"article-\u002Fai-drug-week07-scaffold-gnn":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":5,"title":7,"description":8,"date":9,"tags":10,"listed":14,"body":15,"_type":727,"_id":728,"_source":729,"_file":730,"_stem":731,"_extension":732},"\u002Farticles\u002F\u002Fai-drug-week07-scaffold-gnn","",false,"第 7 周：Scaffold Split 和 GNN 入门","记录 random split 与 scaffold split 的差异，以及从 Morgan Fingerprint 过渡到分子图和简单 GCN baseline 的过程。","2026-06-03",[11,12,13],"AI制药","深度学习","人工智能",true,{"type":16,"children":17,"toc":717},"root",[18,26,32,192,197,202,207,212,217,222,227,266,273,278,283,288,294,299,304,309,314,328,333,388,393,399,404,471,476,489,519,524,530,535,549,554,617,622,627,632,637,642,647,678,683,706,711],{"type":19,"tag":20,"props":21,"children":23},"element","h2",{"id":22},"前情提要",[24],{"type":25,"value":22},"text",{"type":19,"tag":27,"props":28,"children":29},"p",{},[30],{"type":25,"value":31},"正在做一个AI制药平台项目，前端使用vue3、后端采用双后端(主后端Java、SpringBoot做业务，次后端Python、FastAPI做AI服务)",{"type":19,"tag":33,"props":34,"children":38},"pre",{"className":35,"code":36,"language":37,"meta":5,"style":5},"language-plain shiki shiki-themes github-dark","Vue3 前端\n  |\n  | REST API\n  v\nSpringBoot 主后端\n  |\n  | 任务管理 \u002F 数据管理\n  v\nPostgreSQL + Redis\n  |\n  | 调用\n  v\nPython AI Service\n  |\n  | RDKit \u002F PyTorch \u002F PyG \u002F scikit-learn\n  v\n模型推理与分子计算\n","plain",[39],{"type":19,"tag":40,"props":41,"children":42},"code",{"__ignoreMap":5},[43,54,63,72,81,90,98,107,115,124,132,141,149,158,166,175,183],{"type":19,"tag":44,"props":45,"children":48},"span",{"class":46,"line":47},"line",1,[49],{"type":19,"tag":44,"props":50,"children":51},{},[52],{"type":25,"value":53},"Vue3 前端\n",{"type":19,"tag":44,"props":55,"children":57},{"class":46,"line":56},2,[58],{"type":19,"tag":44,"props":59,"children":60},{},[61],{"type":25,"value":62},"  |\n",{"type":19,"tag":44,"props":64,"children":66},{"class":46,"line":65},3,[67],{"type":19,"tag":44,"props":68,"children":69},{},[70],{"type":25,"value":71},"  | REST API\n",{"type":19,"tag":44,"props":73,"children":75},{"class":46,"line":74},4,[76],{"type":19,"tag":44,"props":77,"children":78},{},[79],{"type":25,"value":80},"  v\n",{"type":19,"tag":44,"props":82,"children":84},{"class":46,"line":83},5,[85],{"type":19,"tag":44,"props":86,"children":87},{},[88],{"type":25,"value":89},"SpringBoot 主后端\n",{"type":19,"tag":44,"props":91,"children":93},{"class":46,"line":92},6,[94],{"type":19,"tag":44,"props":95,"children":96},{},[97],{"type":25,"value":62},{"type":19,"tag":44,"props":99,"children":101},{"class":46,"line":100},7,[102],{"type":19,"tag":44,"props":103,"children":104},{},[105],{"type":25,"value":106},"  | 任务管理 \u002F 数据管理\n",{"type":19,"tag":44,"props":108,"children":110},{"class":46,"line":109},8,[111],{"type":19,"tag":44,"props":112,"children":113},{},[114],{"type":25,"value":80},{"type":19,"tag":44,"props":116,"children":118},{"class":46,"line":117},9,[119],{"type":19,"tag":44,"props":120,"children":121},{},[122],{"type":25,"value":123},"PostgreSQL + Redis\n",{"type":19,"tag":44,"props":125,"children":127},{"class":46,"line":126},10,[128],{"type":19,"tag":44,"props":129,"children":130},{},[131],{"type":25,"value":62},{"type":19,"tag":44,"props":133,"children":135},{"class":46,"line":134},11,[136],{"type":19,"tag":44,"props":137,"children":138},{},[139],{"type":25,"value":140},"  | 调用\n",{"type":19,"tag":44,"props":142,"children":144},{"class":46,"line":143},12,[145],{"type":19,"tag":44,"props":146,"children":147},{},[148],{"type":25,"value":80},{"type":19,"tag":44,"props":150,"children":152},{"class":46,"line":151},13,[153],{"type":19,"tag":44,"props":154,"children":155},{},[156],{"type":25,"value":157},"Python AI Service\n",{"type":19,"tag":44,"props":159,"children":161},{"class":46,"line":160},14,[162],{"type":19,"tag":44,"props":163,"children":164},{},[165],{"type":25,"value":62},{"type":19,"tag":44,"props":167,"children":169},{"class":46,"line":168},15,[170],{"type":19,"tag":44,"props":171,"children":172},{},[173],{"type":25,"value":174},"  | RDKit \u002F PyTorch \u002F PyG \u002F scikit-learn\n",{"type":19,"tag":44,"props":176,"children":178},{"class":46,"line":177},16,[179],{"type":19,"tag":44,"props":180,"children":181},{},[182],{"type":25,"value":80},{"type":19,"tag":44,"props":184,"children":186},{"class":46,"line":185},17,[187],{"type":19,"tag":44,"props":188,"children":189},{},[190],{"type":25,"value":191},"模型推理与分子计算\n",{"type":19,"tag":27,"props":193,"children":194},{},[195],{"type":25,"value":196},"会把自己认为比较重要的部分，单独写成文章以便记录。写这个项目的主要目的是在干中学，因为不想从头到尾啃生物信息学、生物化学原理这些AI制药理论。通过项目能快速理解核心概念，剩下零零散散的后续慢慢补。",{"type":19,"tag":27,"props":198,"children":199},{},[200],{"type":25,"value":201},"此篇就是第 7 周的记录。",{"type":19,"tag":20,"props":203,"children":205},{"id":204},"第-7-周scaffold-split-和-gnn-入门",[206],{"type":25,"value":7},{"type":19,"tag":27,"props":208,"children":209},{},[210],{"type":25,"value":211},"第 7 周主要解决两个问题。",{"type":19,"tag":27,"props":213,"children":214},{},[215],{"type":25,"value":216},"第一，前面几周用的 random split 可能太乐观。结构相似的分子可能同时出现在训练集和测试集里，模型看起来泛化得不错，但实际只是记住了相近化学骨架。",{"type":19,"tag":27,"props":218,"children":219},{},[220],{"type":25,"value":221},"第二，Morgan Fingerprint 虽然稳定，但它已经把分子结构压成固定长度 bit vector。进入 GNN 之前，需要把分子重新看成图：原子是节点，化学键是边。",{"type":19,"tag":27,"props":223,"children":224},{},[225],{"type":25,"value":226},"这一周的路线是：",{"type":19,"tag":33,"props":228,"children":230},{"className":35,"code":229,"language":37,"meta":5,"style":5},"random split vs scaffold split\n-> Bemis-Murcko scaffold\n-> mol_to_graph\n-> simple GCN baseline\n",[231],{"type":19,"tag":40,"props":232,"children":233},{"__ignoreMap":5},[234,242,250,258],{"type":19,"tag":44,"props":235,"children":236},{"class":46,"line":47},[237],{"type":19,"tag":44,"props":238,"children":239},{},[240],{"type":25,"value":241},"random split vs scaffold split\n",{"type":19,"tag":44,"props":243,"children":244},{"class":46,"line":56},[245],{"type":19,"tag":44,"props":246,"children":247},{},[248],{"type":25,"value":249},"-> Bemis-Murcko scaffold\n",{"type":19,"tag":44,"props":251,"children":252},{"class":46,"line":65},[253],{"type":19,"tag":44,"props":254,"children":255},{},[256],{"type":25,"value":257},"-> mol_to_graph\n",{"type":19,"tag":44,"props":259,"children":260},{"class":46,"line":74},[261],{"type":19,"tag":44,"props":262,"children":263},{},[264],{"type":25,"value":265},"-> simple GCN baseline\n",{"type":19,"tag":267,"props":268,"children":270},"h3",{"id":269},"random-split-的问题",[271],{"type":25,"value":272},"random split 的问题",{"type":19,"tag":27,"props":274,"children":275},{},[276],{"type":25,"value":277},"random split 按样本随机划分 train \u002F validation \u002F test。它适合很多机器学习任务，但在分子任务里有一个明显风险：结构非常相似的分子可能被分到不同 split。",{"type":19,"tag":27,"props":279,"children":280},{},[281],{"type":25,"value":282},"这会让测试集变得“不够陌生”。模型在训练集中见过相近骨架后，测试集表现可能很好，但这种表现不一定代表它能泛化到新的化学系列。",{"type":19,"tag":27,"props":284,"children":285},{},[286],{"type":25,"value":287},"在药物研发里，更关心的问题往往不是“同一批相似分子里还能不能预测准”，而是“遇到新骨架时还剩多少预测能力”。这就是 scaffold split 的价值。",{"type":19,"tag":267,"props":289,"children":291},{"id":290},"scaffold-split-更严格",[292],{"type":25,"value":293},"scaffold split 更严格",{"type":19,"tag":27,"props":295,"children":296},{},[297],{"type":25,"value":298},"Scaffold split 基于 Bemis-Murcko scaffold 分组。简单说，它按分子的核心骨架划分，而不是按单个分子随机划分。",{"type":19,"tag":27,"props":300,"children":301},{},[302],{"type":25,"value":303},"同一个 scaffold 下的分子会被放到同一个 split 里，避免相似骨架同时出现在训练集和测试集。",{"type":19,"tag":27,"props":305,"children":306},{},[307],{"type":25,"value":308},"这会让测试更难，但也更接近真实泛化。模型如果在 scaffold split 下仍然表现稳定，才更有说服力。",{"type":19,"tag":27,"props":310,"children":311},{},[312],{"type":25,"value":313},"这一周的对比脚本是：",{"type":19,"tag":33,"props":315,"children":317},{"className":35,"code":316,"language":37,"meta":5,"style":5},"ml-experiments\u002Fscaffold_split.py\n",[318],{"type":19,"tag":40,"props":319,"children":320},{"__ignoreMap":5},[321],{"type":19,"tag":44,"props":322,"children":323},{"class":46,"line":47},[324],{"type":19,"tag":44,"props":325,"children":326},{},[327],{"type":25,"value":316},{"type":19,"tag":27,"props":329,"children":330},{},[331],{"type":25,"value":332},"它输出：",{"type":19,"tag":33,"props":334,"children":336},{"className":35,"code":335,"language":37,"meta":5,"style":5},"ml-experiments\u002Foutputs\u002Fscaffold_split_metrics.json\nml-experiments\u002Foutputs\u002Fscaffold_split_assignments.csv\nml-experiments\u002Foutputs\u002Ffigures\u002Fsplit_comparison_r2.png\nml-experiments\u002Freports\u002Fsplit_comparison.md\nmodels\u002Fesol_rf_random_split.pkl\nmodels\u002Fesol_rf_scaffold_split.pkl\n",[337],{"type":19,"tag":40,"props":338,"children":339},{"__ignoreMap":5},[340,348,356,364,372,380],{"type":19,"tag":44,"props":341,"children":342},{"class":46,"line":47},[343],{"type":19,"tag":44,"props":344,"children":345},{},[346],{"type":25,"value":347},"ml-experiments\u002Foutputs\u002Fscaffold_split_metrics.json\n",{"type":19,"tag":44,"props":349,"children":350},{"class":46,"line":56},[351],{"type":19,"tag":44,"props":352,"children":353},{},[354],{"type":25,"value":355},"ml-experiments\u002Foutputs\u002Fscaffold_split_assignments.csv\n",{"type":19,"tag":44,"props":357,"children":358},{"class":46,"line":65},[359],{"type":19,"tag":44,"props":360,"children":361},{},[362],{"type":25,"value":363},"ml-experiments\u002Foutputs\u002Ffigures\u002Fsplit_comparison_r2.png\n",{"type":19,"tag":44,"props":365,"children":366},{"class":46,"line":74},[367],{"type":19,"tag":44,"props":368,"children":369},{},[370],{"type":25,"value":371},"ml-experiments\u002Freports\u002Fsplit_comparison.md\n",{"type":19,"tag":44,"props":373,"children":374},{"class":46,"line":83},[375],{"type":19,"tag":44,"props":376,"children":377},{},[378],{"type":25,"value":379},"models\u002Fesol_rf_random_split.pkl\n",{"type":19,"tag":44,"props":381,"children":382},{"class":46,"line":92},[383],{"type":19,"tag":44,"props":384,"children":385},{},[386],{"type":25,"value":387},"models\u002Fesol_rf_scaffold_split.pkl\n",{"type":19,"tag":27,"props":389,"children":390},{},[391],{"type":25,"value":392},"这组文件的作用是把“为什么 random split 可能过于乐观”从一句判断变成可复现实验。",{"type":19,"tag":267,"props":394,"children":396},{"id":395},"从-fingerprint-到-molecular-graph",[397],{"type":25,"value":398},"从 fingerprint 到 molecular graph",{"type":19,"tag":27,"props":400,"children":401},{},[402],{"type":25,"value":403},"Morgan Fingerprint 和 GNN 的差异，本质上是分子表示方式不同。",{"type":19,"tag":405,"props":406,"children":407},"table",{},[408,427],{"type":19,"tag":409,"props":410,"children":411},"thead",{},[412],{"type":19,"tag":413,"props":414,"children":415},"tr",{},[416,422],{"type":19,"tag":417,"props":418,"children":419},"th",{},[420],{"type":25,"value":421},"表示",{"type":19,"tag":417,"props":423,"children":424},{},[425],{"type":25,"value":426},"特点",{"type":19,"tag":428,"props":429,"children":430},"tbody",{},[431,445,458],{"type":19,"tag":413,"props":432,"children":433},{},[434,440],{"type":19,"tag":435,"props":436,"children":437},"td",{},[438],{"type":25,"value":439},"RDKit descriptors",{"type":19,"tag":435,"props":441,"children":442},{},[443],{"type":25,"value":444},"低维、人工定义、含义清楚",{"type":19,"tag":413,"props":446,"children":447},{},[448,453],{"type":19,"tag":435,"props":449,"children":450},{},[451],{"type":25,"value":452},"Morgan Fingerprint",{"type":19,"tag":435,"props":454,"children":455},{},[456],{"type":25,"value":457},"固定长度 bit vector，稳定、高效",{"type":19,"tag":413,"props":459,"children":460},{},[461,466],{"type":19,"tag":435,"props":462,"children":463},{},[464],{"type":25,"value":465},"分子图",{"type":19,"tag":435,"props":467,"children":468},{},[469],{"type":25,"value":470},"保留原子和键的连接关系",{"type":19,"tag":27,"props":472,"children":473},{},[474],{"type":25,"value":475},"Fingerprint 已经把结构编码成固定长度向量。GNN 则直接在图上做消息传递，让原子节点从邻居节点和边关系中更新表示。",{"type":19,"tag":27,"props":477,"children":478},{},[479,481,487],{"type":25,"value":480},"这一周实现的最小版本 ",{"type":19,"tag":40,"props":482,"children":484},{"className":483},[],[485],{"type":25,"value":486},"mol_to_graph",{"type":25,"value":488}," 主要包含：",{"type":19,"tag":490,"props":491,"children":492},"ul",{},[493,499,504,509,514],{"type":19,"tag":494,"props":495,"children":496},"li",{},[497],{"type":25,"value":498},"原子作为节点",{"type":19,"tag":494,"props":500,"children":501},{},[502],{"type":25,"value":503},"化学键作为边",{"type":19,"tag":494,"props":505,"children":506},{},[507],{"type":25,"value":508},"节点特征",{"type":19,"tag":494,"props":510,"children":511},{},[512],{"type":25,"value":513},"边索引",{"type":19,"tag":494,"props":515,"children":516},{},[517],{"type":25,"value":518},"图级标签",{"type":19,"tag":27,"props":520,"children":521},{},[522],{"type":25,"value":523},"这一步的价值不是立刻拿到高分，而是把输入从表格特征切换成图数据。",{"type":19,"tag":267,"props":525,"children":527},{"id":526},"simple-gcn-baseline",[528],{"type":25,"value":529},"simple GCN baseline",{"type":19,"tag":27,"props":531,"children":532},{},[533],{"type":25,"value":534},"GNN 部分先跑一个简单 GCN baseline：",{"type":19,"tag":33,"props":536,"children":538},{"className":35,"code":537,"language":37,"meta":5,"style":5},"ml-experiments\u002Fsimple_gnn_baseline.py\n",[539],{"type":19,"tag":40,"props":540,"children":541},{"__ignoreMap":5},[542],{"type":19,"tag":44,"props":543,"children":544},{"class":46,"line":47},[545],{"type":19,"tag":44,"props":546,"children":547},{},[548],{"type":25,"value":537},{"type":19,"tag":27,"props":550,"children":551},{},[552],{"type":25,"value":553},"输出文件包括：",{"type":19,"tag":33,"props":555,"children":557},{"className":35,"code":556,"language":37,"meta":5,"style":5},"models\u002Fesol_simple_gcn.pt\nml-experiments\u002Foutputs\u002Fsimple_gnn_metrics.json\nml-experiments\u002Foutputs\u002Fsimple_gnn_predictions.csv\nml-experiments\u002Foutputs\u002Fsimple_gnn_training_log.csv\nml-experiments\u002Foutputs\u002Ffigures\u002Fsimple_gnn_loss_curve.png\nml-experiments\u002Foutputs\u002Ffigures\u002Fsimple_gnn_pred_vs_actual.png\nml-experiments\u002Freports\u002Fsimple_gnn_report.md\n",[558],{"type":19,"tag":40,"props":559,"children":560},{"__ignoreMap":5},[561,569,577,585,593,601,609],{"type":19,"tag":44,"props":562,"children":563},{"class":46,"line":47},[564],{"type":19,"tag":44,"props":565,"children":566},{},[567],{"type":25,"value":568},"models\u002Fesol_simple_gcn.pt\n",{"type":19,"tag":44,"props":570,"children":571},{"class":46,"line":56},[572],{"type":19,"tag":44,"props":573,"children":574},{},[575],{"type":25,"value":576},"ml-experiments\u002Foutputs\u002Fsimple_gnn_metrics.json\n",{"type":19,"tag":44,"props":578,"children":579},{"class":46,"line":65},[580],{"type":19,"tag":44,"props":581,"children":582},{},[583],{"type":25,"value":584},"ml-experiments\u002Foutputs\u002Fsimple_gnn_predictions.csv\n",{"type":19,"tag":44,"props":586,"children":587},{"class":46,"line":74},[588],{"type":19,"tag":44,"props":589,"children":590},{},[591],{"type":25,"value":592},"ml-experiments\u002Foutputs\u002Fsimple_gnn_training_log.csv\n",{"type":19,"tag":44,"props":594,"children":595},{"class":46,"line":83},[596],{"type":19,"tag":44,"props":597,"children":598},{},[599],{"type":25,"value":600},"ml-experiments\u002Foutputs\u002Ffigures\u002Fsimple_gnn_loss_curve.png\n",{"type":19,"tag":44,"props":602,"children":603},{"class":46,"line":92},[604],{"type":19,"tag":44,"props":605,"children":606},{},[607],{"type":25,"value":608},"ml-experiments\u002Foutputs\u002Ffigures\u002Fsimple_gnn_pred_vs_actual.png\n",{"type":19,"tag":44,"props":610,"children":611},{"class":46,"line":100},[612],{"type":19,"tag":44,"props":613,"children":614},{},[615],{"type":25,"value":616},"ml-experiments\u002Freports\u002Fsimple_gnn_report.md\n",{"type":19,"tag":27,"props":618,"children":619},{},[620],{"type":25,"value":621},"这版 GCN 只负责跑通图模型训练、评估、保存和报告流程。它不承担“超过所有 baseline”的压力。对项目来说，先把分子图数据结构、PyTorch Geometric 数据流、图级回归流程接起来，比追求一个漂亮分数更重要。",{"type":19,"tag":267,"props":623,"children":625},{"id":624},"这一周改变了评估视角",[626],{"type":25,"value":624},{"type":19,"tag":27,"props":628,"children":629},{},[630],{"type":25,"value":631},"第 4 周和第 5 周更多是在比较模型：RandomForest、MLP、不同特征。第 7 周开始，评估方式本身也变成了实验对象。",{"type":19,"tag":27,"props":633,"children":634},{},[635],{"type":25,"value":636},"如果 random split 分数高，而 scaffold split 分数明显下降，这不是坏事。它说明模型在新骨架上的泛化能力有限，也说明之前的 random split 可能给了过于乐观的判断。",{"type":19,"tag":27,"props":638,"children":639},{},[640],{"type":25,"value":641},"这类结论比单纯展示一个模型更接近 AI 制药工程里真实的问题：数据划分、化学空间、泛化边界，都会影响模型是否可信。",{"type":19,"tag":27,"props":643,"children":644},{},[645],{"type":25,"value":646},"第 7 周结束后，项目里已经有三类输入表示：",{"type":19,"tag":33,"props":648,"children":650},{"className":35,"code":649,"language":37,"meta":5,"style":5},"RDKit descriptors\nMorgan Fingerprint\nmolecular graph\n",[651],{"type":19,"tag":40,"props":652,"children":653},{"__ignoreMap":5},[654,662,670],{"type":19,"tag":44,"props":655,"children":656},{"class":46,"line":47},[657],{"type":19,"tag":44,"props":658,"children":659},{},[660],{"type":25,"value":661},"RDKit descriptors\n",{"type":19,"tag":44,"props":663,"children":664},{"class":46,"line":56},[665],{"type":19,"tag":44,"props":666,"children":667},{},[668],{"type":25,"value":669},"Morgan Fingerprint\n",{"type":19,"tag":44,"props":671,"children":672},{"class":46,"line":65},[673],{"type":19,"tag":44,"props":674,"children":675},{},[676],{"type":25,"value":677},"molecular graph\n",{"type":19,"tag":27,"props":679,"children":680},{},[681],{"type":25,"value":682},"也有两类评估划分：",{"type":19,"tag":33,"props":684,"children":686},{"className":35,"code":685,"language":37,"meta":5,"style":5},"random split\nscaffold split\n",[687],{"type":19,"tag":40,"props":688,"children":689},{"__ignoreMap":5},[690,698],{"type":19,"tag":44,"props":691,"children":692},{"class":46,"line":47},[693],{"type":19,"tag":44,"props":694,"children":695},{},[696],{"type":25,"value":697},"random split\n",{"type":19,"tag":44,"props":699,"children":700},{"class":46,"line":56},[701],{"type":19,"tag":44,"props":702,"children":703},{},[704],{"type":25,"value":705},"scaffold split\n",{"type":19,"tag":27,"props":707,"children":708},{},[709],{"type":25,"value":710},"后续无论接 DTI、ADMET 扩展，还是把模型封装成推理 API，都可以基于这套更严肃的评估框架继续推进。",{"type":19,"tag":712,"props":713,"children":714},"style",{},[715],{"type":25,"value":716},"html .default .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}",{"title":5,"searchDepth":56,"depth":56,"links":718},[719,720],{"id":22,"depth":56,"text":22},{"id":204,"depth":56,"text":7,"children":721},[722,723,724,725,726],{"id":269,"depth":65,"text":272},{"id":290,"depth":65,"text":293},{"id":395,"depth":65,"text":398},{"id":526,"depth":65,"text":529},{"id":624,"depth":65,"text":624},"markdown","content:articles:生信基础:ai-drug-week07-scaffold-gnn.md","content","articles\u002F生信基础\u002Fai-drug-week07-scaffold-gnn.md","articles\u002F生信基础\u002Fai-drug-week07-scaffold-gnn","md",1780895158452]