[{"data":1,"prerenderedAt":731},["ShallowReactive",2],{"article-ai\u002Fai-drug-week07-scaffold-gnn":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"tags":11,"listed":15,"body":16,"_type":725,"_id":726,"_source":727,"_file":728,"_stem":729,"_extension":730},"\u002Farticles\u002Fai\u002Fai-drug-week07-scaffold-gnn","ai",false,"","第 7 周：Scaffold Split 和 GNN 入门","记录 random split 与 scaffold split 的差异，以及从 Morgan Fingerprint 过渡到分子图和简单 GCN baseline 的过程。","2026-06-03",[12,13,14],"AI制药","深度学习","人工智能",true,{"type":17,"children":18,"toc":718},"root",[19,27,33,193,198,203,208,213,218,223,228,267,274,279,284,289,295,300,305,310,315,329,334,389,394,400,405,472,477,490,520,525,531,536,550,555,618,623,628,633,638,643,648,679,684,707,712],{"type":20,"tag":21,"props":22,"children":24},"element","h1",{"id":23},"前情提要",[25],{"type":26,"value":23},"text",{"type":20,"tag":28,"props":29,"children":30},"p",{},[31],{"type":26,"value":32},"正在做一个AI制药平台项目，前端使用vue3、后端采用双后端(主后端Java、SpringBoot做业务，次后端Python、FastAPI做AI服务)",{"type":20,"tag":34,"props":35,"children":39},"pre",{"className":36,"code":37,"language":38,"meta":7,"style":7},"language-plain shiki shiki-themes github-dark","Vue3 前端\n  |\n  | REST API\n  v\nSpringBoot 主后端\n  |\n  | 任务管理 \u002F 数据管理\n  v\nPostgreSQL + Redis\n  |\n  | 调用\n  v\nPython AI Service\n  |\n  | RDKit \u002F PyTorch \u002F PyG \u002F scikit-learn\n  v\n模型推理与分子计算\n","plain",[40],{"type":20,"tag":41,"props":42,"children":43},"code",{"__ignoreMap":7},[44,55,64,73,82,91,99,108,116,125,133,142,150,159,167,176,184],{"type":20,"tag":45,"props":46,"children":49},"span",{"class":47,"line":48},"line",1,[50],{"type":20,"tag":45,"props":51,"children":52},{},[53],{"type":26,"value":54},"Vue3 前端\n",{"type":20,"tag":45,"props":56,"children":58},{"class":47,"line":57},2,[59],{"type":20,"tag":45,"props":60,"children":61},{},[62],{"type":26,"value":63},"  |\n",{"type":20,"tag":45,"props":65,"children":67},{"class":47,"line":66},3,[68],{"type":20,"tag":45,"props":69,"children":70},{},[71],{"type":26,"value":72},"  | REST API\n",{"type":20,"tag":45,"props":74,"children":76},{"class":47,"line":75},4,[77],{"type":20,"tag":45,"props":78,"children":79},{},[80],{"type":26,"value":81},"  v\n",{"type":20,"tag":45,"props":83,"children":85},{"class":47,"line":84},5,[86],{"type":20,"tag":45,"props":87,"children":88},{},[89],{"type":26,"value":90},"SpringBoot 主后端\n",{"type":20,"tag":45,"props":92,"children":94},{"class":47,"line":93},6,[95],{"type":20,"tag":45,"props":96,"children":97},{},[98],{"type":26,"value":63},{"type":20,"tag":45,"props":100,"children":102},{"class":47,"line":101},7,[103],{"type":20,"tag":45,"props":104,"children":105},{},[106],{"type":26,"value":107},"  | 任务管理 \u002F 数据管理\n",{"type":20,"tag":45,"props":109,"children":111},{"class":47,"line":110},8,[112],{"type":20,"tag":45,"props":113,"children":114},{},[115],{"type":26,"value":81},{"type":20,"tag":45,"props":117,"children":119},{"class":47,"line":118},9,[120],{"type":20,"tag":45,"props":121,"children":122},{},[123],{"type":26,"value":124},"PostgreSQL + Redis\n",{"type":20,"tag":45,"props":126,"children":128},{"class":47,"line":127},10,[129],{"type":20,"tag":45,"props":130,"children":131},{},[132],{"type":26,"value":63},{"type":20,"tag":45,"props":134,"children":136},{"class":47,"line":135},11,[137],{"type":20,"tag":45,"props":138,"children":139},{},[140],{"type":26,"value":141},"  | 调用\n",{"type":20,"tag":45,"props":143,"children":145},{"class":47,"line":144},12,[146],{"type":20,"tag":45,"props":147,"children":148},{},[149],{"type":26,"value":81},{"type":20,"tag":45,"props":151,"children":153},{"class":47,"line":152},13,[154],{"type":20,"tag":45,"props":155,"children":156},{},[157],{"type":26,"value":158},"Python AI Service\n",{"type":20,"tag":45,"props":160,"children":162},{"class":47,"line":161},14,[163],{"type":20,"tag":45,"props":164,"children":165},{},[166],{"type":26,"value":63},{"type":20,"tag":45,"props":168,"children":170},{"class":47,"line":169},15,[171],{"type":20,"tag":45,"props":172,"children":173},{},[174],{"type":26,"value":175},"  | RDKit \u002F PyTorch \u002F PyG \u002F scikit-learn\n",{"type":20,"tag":45,"props":177,"children":179},{"class":47,"line":178},16,[180],{"type":20,"tag":45,"props":181,"children":182},{},[183],{"type":26,"value":81},{"type":20,"tag":45,"props":185,"children":187},{"class":47,"line":186},17,[188],{"type":20,"tag":45,"props":189,"children":190},{},[191],{"type":26,"value":192},"模型推理与分子计算\n",{"type":20,"tag":28,"props":194,"children":195},{},[196],{"type":26,"value":197},"会把自己认为比较重要的部分，单独写成文章以便记录。写这个项目的主要目的是以赛代练，因为不想从头到尾啃生物信息学、生物化学原理这些AI制药理论。通过项目能快速理解核心概念，剩下零零散散的后续慢慢补。",{"type":20,"tag":28,"props":199,"children":200},{},[201],{"type":26,"value":202},"此篇就是第 7 周的记录。",{"type":20,"tag":21,"props":204,"children":206},{"id":205},"第-7-周scaffold-split-和-gnn-入门",[207],{"type":26,"value":8},{"type":20,"tag":28,"props":209,"children":210},{},[211],{"type":26,"value":212},"第 7 周主要解决两个问题。",{"type":20,"tag":28,"props":214,"children":215},{},[216],{"type":26,"value":217},"第一，前面几周用的 random split 可能太乐观。结构相似的分子可能同时出现在训练集和测试集里，模型看起来泛化得不错，但实际只是记住了相近化学骨架。",{"type":20,"tag":28,"props":219,"children":220},{},[221],{"type":26,"value":222},"第二，Morgan Fingerprint 虽然稳定，但它已经把分子结构压成固定长度 bit vector。进入 GNN 之前，需要把分子重新看成图：原子是节点，化学键是边。",{"type":20,"tag":28,"props":224,"children":225},{},[226],{"type":26,"value":227},"这一周的路线是：",{"type":20,"tag":34,"props":229,"children":231},{"className":36,"code":230,"language":38,"meta":7,"style":7},"random split vs scaffold split\n-> Bemis-Murcko scaffold\n-> mol_to_graph\n-> simple GCN baseline\n",[232],{"type":20,"tag":41,"props":233,"children":234},{"__ignoreMap":7},[235,243,251,259],{"type":20,"tag":45,"props":236,"children":237},{"class":47,"line":48},[238],{"type":20,"tag":45,"props":239,"children":240},{},[241],{"type":26,"value":242},"random split vs scaffold split\n",{"type":20,"tag":45,"props":244,"children":245},{"class":47,"line":57},[246],{"type":20,"tag":45,"props":247,"children":248},{},[249],{"type":26,"value":250},"-> Bemis-Murcko scaffold\n",{"type":20,"tag":45,"props":252,"children":253},{"class":47,"line":66},[254],{"type":20,"tag":45,"props":255,"children":256},{},[257],{"type":26,"value":258},"-> mol_to_graph\n",{"type":20,"tag":45,"props":260,"children":261},{"class":47,"line":75},[262],{"type":20,"tag":45,"props":263,"children":264},{},[265],{"type":26,"value":266},"-> simple GCN baseline\n",{"type":20,"tag":268,"props":269,"children":271},"h2",{"id":270},"random-split-的问题",[272],{"type":26,"value":273},"random split 的问题",{"type":20,"tag":28,"props":275,"children":276},{},[277],{"type":26,"value":278},"random split 按样本随机划分 train \u002F validation \u002F test。它适合很多机器学习任务，但在分子任务里有一个明显风险：结构非常相似的分子可能被分到不同 split。",{"type":20,"tag":28,"props":280,"children":281},{},[282],{"type":26,"value":283},"这会让测试集变得“不够陌生”。模型在训练集中见过相近骨架后，测试集表现可能很好，但这种表现不一定代表它能泛化到新的化学系列。",{"type":20,"tag":28,"props":285,"children":286},{},[287],{"type":26,"value":288},"在药物研发里，更关心的问题往往不是“同一批相似分子里还能不能预测准”，而是“遇到新骨架时还剩多少预测能力”。这就是 scaffold split 的价值。",{"type":20,"tag":268,"props":290,"children":292},{"id":291},"scaffold-split-更严格",[293],{"type":26,"value":294},"scaffold split 更严格",{"type":20,"tag":28,"props":296,"children":297},{},[298],{"type":26,"value":299},"Scaffold split 基于 Bemis-Murcko scaffold 分组。简单说，它按分子的核心骨架划分，而不是按单个分子随机划分。",{"type":20,"tag":28,"props":301,"children":302},{},[303],{"type":26,"value":304},"同一个 scaffold 下的分子会被放到同一个 split 里，避免相似骨架同时出现在训练集和测试集。",{"type":20,"tag":28,"props":306,"children":307},{},[308],{"type":26,"value":309},"这会让测试更难，但也更接近真实泛化。模型如果在 scaffold split 下仍然表现稳定，才更有说服力。",{"type":20,"tag":28,"props":311,"children":312},{},[313],{"type":26,"value":314},"这一周的对比脚本是：",{"type":20,"tag":34,"props":316,"children":318},{"className":36,"code":317,"language":38,"meta":7,"style":7},"ml-experiments\u002Fscaffold_split.py\n",[319],{"type":20,"tag":41,"props":320,"children":321},{"__ignoreMap":7},[322],{"type":20,"tag":45,"props":323,"children":324},{"class":47,"line":48},[325],{"type":20,"tag":45,"props":326,"children":327},{},[328],{"type":26,"value":317},{"type":20,"tag":28,"props":330,"children":331},{},[332],{"type":26,"value":333},"它输出：",{"type":20,"tag":34,"props":335,"children":337},{"className":36,"code":336,"language":38,"meta":7,"style":7},"ml-experiments\u002Foutputs\u002Fscaffold_split_metrics.json\nml-experiments\u002Foutputs\u002Fscaffold_split_assignments.csv\nml-experiments\u002Foutputs\u002Ffigures\u002Fsplit_comparison_r2.png\nml-experiments\u002Freports\u002Fsplit_comparison.md\nmodels\u002Fesol_rf_random_split.pkl\nmodels\u002Fesol_rf_scaffold_split.pkl\n",[338],{"type":20,"tag":41,"props":339,"children":340},{"__ignoreMap":7},[341,349,357,365,373,381],{"type":20,"tag":45,"props":342,"children":343},{"class":47,"line":48},[344],{"type":20,"tag":45,"props":345,"children":346},{},[347],{"type":26,"value":348},"ml-experiments\u002Foutputs\u002Fscaffold_split_metrics.json\n",{"type":20,"tag":45,"props":350,"children":351},{"class":47,"line":57},[352],{"type":20,"tag":45,"props":353,"children":354},{},[355],{"type":26,"value":356},"ml-experiments\u002Foutputs\u002Fscaffold_split_assignments.csv\n",{"type":20,"tag":45,"props":358,"children":359},{"class":47,"line":66},[360],{"type":20,"tag":45,"props":361,"children":362},{},[363],{"type":26,"value":364},"ml-experiments\u002Foutputs\u002Ffigures\u002Fsplit_comparison_r2.png\n",{"type":20,"tag":45,"props":366,"children":367},{"class":47,"line":75},[368],{"type":20,"tag":45,"props":369,"children":370},{},[371],{"type":26,"value":372},"ml-experiments\u002Freports\u002Fsplit_comparison.md\n",{"type":20,"tag":45,"props":374,"children":375},{"class":47,"line":84},[376],{"type":20,"tag":45,"props":377,"children":378},{},[379],{"type":26,"value":380},"models\u002Fesol_rf_random_split.pkl\n",{"type":20,"tag":45,"props":382,"children":383},{"class":47,"line":93},[384],{"type":20,"tag":45,"props":385,"children":386},{},[387],{"type":26,"value":388},"models\u002Fesol_rf_scaffold_split.pkl\n",{"type":20,"tag":28,"props":390,"children":391},{},[392],{"type":26,"value":393},"这组文件的作用是把“为什么 random split 可能过于乐观”从一句判断变成可复现实验。",{"type":20,"tag":268,"props":395,"children":397},{"id":396},"从-fingerprint-到-molecular-graph",[398],{"type":26,"value":399},"从 fingerprint 到 molecular graph",{"type":20,"tag":28,"props":401,"children":402},{},[403],{"type":26,"value":404},"Morgan Fingerprint 和 GNN 的差异，本质上是分子表示方式不同。",{"type":20,"tag":406,"props":407,"children":408},"table",{},[409,428],{"type":20,"tag":410,"props":411,"children":412},"thead",{},[413],{"type":20,"tag":414,"props":415,"children":416},"tr",{},[417,423],{"type":20,"tag":418,"props":419,"children":420},"th",{},[421],{"type":26,"value":422},"表示",{"type":20,"tag":418,"props":424,"children":425},{},[426],{"type":26,"value":427},"特点",{"type":20,"tag":429,"props":430,"children":431},"tbody",{},[432,446,459],{"type":20,"tag":414,"props":433,"children":434},{},[435,441],{"type":20,"tag":436,"props":437,"children":438},"td",{},[439],{"type":26,"value":440},"RDKit descriptors",{"type":20,"tag":436,"props":442,"children":443},{},[444],{"type":26,"value":445},"低维、人工定义、含义清楚",{"type":20,"tag":414,"props":447,"children":448},{},[449,454],{"type":20,"tag":436,"props":450,"children":451},{},[452],{"type":26,"value":453},"Morgan Fingerprint",{"type":20,"tag":436,"props":455,"children":456},{},[457],{"type":26,"value":458},"固定长度 bit vector，稳定、高效",{"type":20,"tag":414,"props":460,"children":461},{},[462,467],{"type":20,"tag":436,"props":463,"children":464},{},[465],{"type":26,"value":466},"分子图",{"type":20,"tag":436,"props":468,"children":469},{},[470],{"type":26,"value":471},"保留原子和键的连接关系",{"type":20,"tag":28,"props":473,"children":474},{},[475],{"type":26,"value":476},"Fingerprint 已经把结构编码成固定长度向量。GNN 则直接在图上做消息传递，让原子节点从邻居节点和边关系中更新表示。",{"type":20,"tag":28,"props":478,"children":479},{},[480,482,488],{"type":26,"value":481},"这一周实现的最小版本 ",{"type":20,"tag":41,"props":483,"children":485},{"className":484},[],[486],{"type":26,"value":487},"mol_to_graph",{"type":26,"value":489}," 主要包含：",{"type":20,"tag":491,"props":492,"children":493},"ul",{},[494,500,505,510,515],{"type":20,"tag":495,"props":496,"children":497},"li",{},[498],{"type":26,"value":499},"原子作为节点",{"type":20,"tag":495,"props":501,"children":502},{},[503],{"type":26,"value":504},"化学键作为边",{"type":20,"tag":495,"props":506,"children":507},{},[508],{"type":26,"value":509},"节点特征",{"type":20,"tag":495,"props":511,"children":512},{},[513],{"type":26,"value":514},"边索引",{"type":20,"tag":495,"props":516,"children":517},{},[518],{"type":26,"value":519},"图级标签",{"type":20,"tag":28,"props":521,"children":522},{},[523],{"type":26,"value":524},"这一步的价值不是立刻拿到高分，而是把输入从表格特征切换成图数据。",{"type":20,"tag":268,"props":526,"children":528},{"id":527},"simple-gcn-baseline",[529],{"type":26,"value":530},"simple GCN baseline",{"type":20,"tag":28,"props":532,"children":533},{},[534],{"type":26,"value":535},"GNN 部分先跑一个简单 GCN baseline：",{"type":20,"tag":34,"props":537,"children":539},{"className":36,"code":538,"language":38,"meta":7,"style":7},"ml-experiments\u002Fsimple_gnn_baseline.py\n",[540],{"type":20,"tag":41,"props":541,"children":542},{"__ignoreMap":7},[543],{"type":20,"tag":45,"props":544,"children":545},{"class":47,"line":48},[546],{"type":20,"tag":45,"props":547,"children":548},{},[549],{"type":26,"value":538},{"type":20,"tag":28,"props":551,"children":552},{},[553],{"type":26,"value":554},"输出文件包括：",{"type":20,"tag":34,"props":556,"children":558},{"className":36,"code":557,"language":38,"meta":7,"style":7},"models\u002Fesol_simple_gcn.pt\nml-experiments\u002Foutputs\u002Fsimple_gnn_metrics.json\nml-experiments\u002Foutputs\u002Fsimple_gnn_predictions.csv\nml-experiments\u002Foutputs\u002Fsimple_gnn_training_log.csv\nml-experiments\u002Foutputs\u002Ffigures\u002Fsimple_gnn_loss_curve.png\nml-experiments\u002Foutputs\u002Ffigures\u002Fsimple_gnn_pred_vs_actual.png\nml-experiments\u002Freports\u002Fsimple_gnn_report.md\n",[559],{"type":20,"tag":41,"props":560,"children":561},{"__ignoreMap":7},[562,570,578,586,594,602,610],{"type":20,"tag":45,"props":563,"children":564},{"class":47,"line":48},[565],{"type":20,"tag":45,"props":566,"children":567},{},[568],{"type":26,"value":569},"models\u002Fesol_simple_gcn.pt\n",{"type":20,"tag":45,"props":571,"children":572},{"class":47,"line":57},[573],{"type":20,"tag":45,"props":574,"children":575},{},[576],{"type":26,"value":577},"ml-experiments\u002Foutputs\u002Fsimple_gnn_metrics.json\n",{"type":20,"tag":45,"props":579,"children":580},{"class":47,"line":66},[581],{"type":20,"tag":45,"props":582,"children":583},{},[584],{"type":26,"value":585},"ml-experiments\u002Foutputs\u002Fsimple_gnn_predictions.csv\n",{"type":20,"tag":45,"props":587,"children":588},{"class":47,"line":75},[589],{"type":20,"tag":45,"props":590,"children":591},{},[592],{"type":26,"value":593},"ml-experiments\u002Foutputs\u002Fsimple_gnn_training_log.csv\n",{"type":20,"tag":45,"props":595,"children":596},{"class":47,"line":84},[597],{"type":20,"tag":45,"props":598,"children":599},{},[600],{"type":26,"value":601},"ml-experiments\u002Foutputs\u002Ffigures\u002Fsimple_gnn_loss_curve.png\n",{"type":20,"tag":45,"props":603,"children":604},{"class":47,"line":93},[605],{"type":20,"tag":45,"props":606,"children":607},{},[608],{"type":26,"value":609},"ml-experiments\u002Foutputs\u002Ffigures\u002Fsimple_gnn_pred_vs_actual.png\n",{"type":20,"tag":45,"props":611,"children":612},{"class":47,"line":101},[613],{"type":20,"tag":45,"props":614,"children":615},{},[616],{"type":26,"value":617},"ml-experiments\u002Freports\u002Fsimple_gnn_report.md\n",{"type":20,"tag":28,"props":619,"children":620},{},[621],{"type":26,"value":622},"这版 GCN 只负责跑通图模型训练、评估、保存和报告流程。它不承担“超过所有 baseline”的压力。对项目来说，先把分子图数据结构、PyTorch Geometric 数据流、图级回归流程接起来，比追求一个漂亮分数更重要。",{"type":20,"tag":268,"props":624,"children":626},{"id":625},"这一周改变了评估视角",[627],{"type":26,"value":625},{"type":20,"tag":28,"props":629,"children":630},{},[631],{"type":26,"value":632},"第 4 周和第 5 周更多是在比较模型：RandomForest、MLP、不同特征。第 7 周开始，评估方式本身也变成了实验对象。",{"type":20,"tag":28,"props":634,"children":635},{},[636],{"type":26,"value":637},"如果 random split 分数高，而 scaffold split 分数明显下降，这不是坏事。它说明模型在新骨架上的泛化能力有限，也说明之前的 random split 可能给了过于乐观的判断。",{"type":20,"tag":28,"props":639,"children":640},{},[641],{"type":26,"value":642},"这类结论比单纯展示一个模型更接近 AI 制药工程里真实的问题：数据划分、化学空间、泛化边界，都会影响模型是否可信。",{"type":20,"tag":28,"props":644,"children":645},{},[646],{"type":26,"value":647},"第 7 周结束后，项目里已经有三类输入表示：",{"type":20,"tag":34,"props":649,"children":651},{"className":36,"code":650,"language":38,"meta":7,"style":7},"RDKit descriptors\nMorgan Fingerprint\nmolecular graph\n",[652],{"type":20,"tag":41,"props":653,"children":654},{"__ignoreMap":7},[655,663,671],{"type":20,"tag":45,"props":656,"children":657},{"class":47,"line":48},[658],{"type":20,"tag":45,"props":659,"children":660},{},[661],{"type":26,"value":662},"RDKit descriptors\n",{"type":20,"tag":45,"props":664,"children":665},{"class":47,"line":57},[666],{"type":20,"tag":45,"props":667,"children":668},{},[669],{"type":26,"value":670},"Morgan Fingerprint\n",{"type":20,"tag":45,"props":672,"children":673},{"class":47,"line":66},[674],{"type":20,"tag":45,"props":675,"children":676},{},[677],{"type":26,"value":678},"molecular graph\n",{"type":20,"tag":28,"props":680,"children":681},{},[682],{"type":26,"value":683},"也有两类评估划分：",{"type":20,"tag":34,"props":685,"children":687},{"className":36,"code":686,"language":38,"meta":7,"style":7},"random split\nscaffold split\n",[688],{"type":20,"tag":41,"props":689,"children":690},{"__ignoreMap":7},[691,699],{"type":20,"tag":45,"props":692,"children":693},{"class":47,"line":48},[694],{"type":20,"tag":45,"props":695,"children":696},{},[697],{"type":26,"value":698},"random split\n",{"type":20,"tag":45,"props":700,"children":701},{"class":47,"line":57},[702],{"type":20,"tag":45,"props":703,"children":704},{},[705],{"type":26,"value":706},"scaffold split\n",{"type":20,"tag":28,"props":708,"children":709},{},[710],{"type":26,"value":711},"后续无论接 DTI、ADMET 扩展，还是把模型封装成推理 API，都可以基于这套更严肃的评估框架继续推进。",{"type":20,"tag":713,"props":714,"children":715},"style",{},[716],{"type":26,"value":717},"html .default .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}",{"title":7,"searchDepth":57,"depth":57,"links":719},[720,721,722,723,724],{"id":270,"depth":57,"text":273},{"id":291,"depth":57,"text":294},{"id":396,"depth":57,"text":399},{"id":527,"depth":57,"text":530},{"id":625,"depth":57,"text":625},"markdown","content:articles:ai:ai-drug-week07-scaffold-gnn.md","content","articles\u002Fai\u002Fai-drug-week07-scaffold-gnn.md","articles\u002Fai\u002Fai-drug-week07-scaffold-gnn","md",1780481290974]