[{"data":1,"prerenderedAt":786},["ShallowReactive",2],{"article-ai\u002Fai-drug-week04-esol-rf-baseline":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"tags":11,"listed":15,"body":16,"_type":780,"_id":781,"_source":782,"_file":783,"_stem":784,"_extension":785},"\u002Farticles\u002Fai\u002Fai-drug-week04-esol-rf-baseline","ai",false,"","第 4 周：用 ESOL 和 RandomForest 建一个分子性质预测 baseline","记录从 ESOL 数据、RDKit descriptors 到 RandomForest 回归模型的第一版分子水溶解度预测实验。","2026-06-03",[12,13,14],"AI制药","深度学习","人工智能",true,{"type":17,"children":18,"toc":773},"root",[19,27,33,193,198,203,208,221,226,273,278,285,304,334,339,406,412,425,488,493,498,503,517,522,569,582,630,635,682,692,698,703,708,713,719,724,729,734,762,767],{"type":20,"tag":21,"props":22,"children":24},"element","h1",{"id":23},"前情提要",[25],{"type":26,"value":23},"text",{"type":20,"tag":28,"props":29,"children":30},"p",{},[31],{"type":26,"value":32},"正在做一个AI制药平台项目，前端使用vue3、后端采用双后端(主后端Java、SpringBoot做业务，次后端Python、FastAPI做AI服务)",{"type":20,"tag":34,"props":35,"children":39},"pre",{"className":36,"code":37,"language":38,"meta":7,"style":7},"language-plain shiki shiki-themes github-dark","Vue3 前端\n  |\n  | REST API\n  v\nSpringBoot 主后端\n  |\n  | 任务管理 \u002F 数据管理\n  v\nPostgreSQL + Redis\n  |\n  | 调用\n  v\nPython AI Service\n  |\n  | RDKit \u002F PyTorch \u002F PyG \u002F scikit-learn\n  v\n模型推理与分子计算\n","plain",[40],{"type":20,"tag":41,"props":42,"children":43},"code",{"__ignoreMap":7},[44,55,64,73,82,91,99,108,116,125,133,142,150,159,167,176,184],{"type":20,"tag":45,"props":46,"children":49},"span",{"class":47,"line":48},"line",1,[50],{"type":20,"tag":45,"props":51,"children":52},{},[53],{"type":26,"value":54},"Vue3 前端\n",{"type":20,"tag":45,"props":56,"children":58},{"class":47,"line":57},2,[59],{"type":20,"tag":45,"props":60,"children":61},{},[62],{"type":26,"value":63},"  |\n",{"type":20,"tag":45,"props":65,"children":67},{"class":47,"line":66},3,[68],{"type":20,"tag":45,"props":69,"children":70},{},[71],{"type":26,"value":72},"  | REST API\n",{"type":20,"tag":45,"props":74,"children":76},{"class":47,"line":75},4,[77],{"type":20,"tag":45,"props":78,"children":79},{},[80],{"type":26,"value":81},"  v\n",{"type":20,"tag":45,"props":83,"children":85},{"class":47,"line":84},5,[86],{"type":20,"tag":45,"props":87,"children":88},{},[89],{"type":26,"value":90},"SpringBoot 主后端\n",{"type":20,"tag":45,"props":92,"children":94},{"class":47,"line":93},6,[95],{"type":20,"tag":45,"props":96,"children":97},{},[98],{"type":26,"value":63},{"type":20,"tag":45,"props":100,"children":102},{"class":47,"line":101},7,[103],{"type":20,"tag":45,"props":104,"children":105},{},[106],{"type":26,"value":107},"  | 任务管理 \u002F 数据管理\n",{"type":20,"tag":45,"props":109,"children":111},{"class":47,"line":110},8,[112],{"type":20,"tag":45,"props":113,"children":114},{},[115],{"type":26,"value":81},{"type":20,"tag":45,"props":117,"children":119},{"class":47,"line":118},9,[120],{"type":20,"tag":45,"props":121,"children":122},{},[123],{"type":26,"value":124},"PostgreSQL + Redis\n",{"type":20,"tag":45,"props":126,"children":128},{"class":47,"line":127},10,[129],{"type":20,"tag":45,"props":130,"children":131},{},[132],{"type":26,"value":63},{"type":20,"tag":45,"props":134,"children":136},{"class":47,"line":135},11,[137],{"type":20,"tag":45,"props":138,"children":139},{},[140],{"type":26,"value":141},"  | 调用\n",{"type":20,"tag":45,"props":143,"children":145},{"class":47,"line":144},12,[146],{"type":20,"tag":45,"props":147,"children":148},{},[149],{"type":26,"value":81},{"type":20,"tag":45,"props":151,"children":153},{"class":47,"line":152},13,[154],{"type":20,"tag":45,"props":155,"children":156},{},[157],{"type":26,"value":158},"Python AI Service\n",{"type":20,"tag":45,"props":160,"children":162},{"class":47,"line":161},14,[163],{"type":20,"tag":45,"props":164,"children":165},{},[166],{"type":26,"value":63},{"type":20,"tag":45,"props":168,"children":170},{"class":47,"line":169},15,[171],{"type":20,"tag":45,"props":172,"children":173},{},[174],{"type":26,"value":175},"  | RDKit \u002F PyTorch \u002F PyG \u002F scikit-learn\n",{"type":20,"tag":45,"props":177,"children":179},{"class":47,"line":178},16,[180],{"type":20,"tag":45,"props":181,"children":182},{},[183],{"type":26,"value":81},{"type":20,"tag":45,"props":185,"children":187},{"class":47,"line":186},17,[188],{"type":20,"tag":45,"props":189,"children":190},{},[191],{"type":26,"value":192},"模型推理与分子计算\n",{"type":20,"tag":28,"props":194,"children":195},{},[196],{"type":26,"value":197},"会把自己认为比较重要的部分，单独写成文章以便记录。写这个项目的主要目的是以赛代练，因为不想从头到尾啃生物信息学、生物化学原理这些AI制药理论。通过项目能快速理解核心概念，剩下零零散散的后续慢慢补。",{"type":20,"tag":28,"props":199,"children":200},{},[201],{"type":26,"value":202},"此篇就是第 4 周的记录。",{"type":20,"tag":21,"props":204,"children":206},{"id":205},"第-4-周用-esol-和-randomforest-建一个分子性质预测-baseline",[207],{"type":26,"value":8},{"type":20,"tag":28,"props":209,"children":210},{},[211,213,219],{"type":26,"value":212},"第 4 周开始进入分子性质预测。前 3 周主要是在处理 SMILES、描述符、指纹和相似性搜索，这些东西本身还不算“预测”。这一周我把它们接到一个真正的监督学习任务上：用 ESOL 数据集预测小分子的水溶解度 ",{"type":20,"tag":41,"props":214,"children":216},{"className":215},[],[217],{"type":26,"value":218},"logS",{"type":26,"value":220},"。",{"type":20,"tag":28,"props":222,"children":223},{},[224],{"type":26,"value":225},"我没有一上来写 GNN。第一版选择了更朴素的组合：",{"type":20,"tag":34,"props":227,"children":229},{"className":36,"code":228,"language":38,"meta":7,"style":7},"SMILES\n-> RDKit descriptors\n-> RandomForestRegressor\n-> MAE \u002F RMSE \u002F R2\n-> model.pkl + report\n",[230],{"type":20,"tag":41,"props":231,"children":232},{"__ignoreMap":7},[233,241,249,257,265],{"type":20,"tag":45,"props":234,"children":235},{"class":47,"line":48},[236],{"type":20,"tag":45,"props":237,"children":238},{},[239],{"type":26,"value":240},"SMILES\n",{"type":20,"tag":45,"props":242,"children":243},{"class":47,"line":57},[244],{"type":20,"tag":45,"props":245,"children":246},{},[247],{"type":26,"value":248},"-> RDKit descriptors\n",{"type":20,"tag":45,"props":250,"children":251},{"class":47,"line":66},[252],{"type":20,"tag":45,"props":253,"children":254},{},[255],{"type":26,"value":256},"-> RandomForestRegressor\n",{"type":20,"tag":45,"props":258,"children":259},{"class":47,"line":75},[260],{"type":20,"tag":45,"props":261,"children":262},{},[263],{"type":26,"value":264},"-> MAE \u002F RMSE \u002F R2\n",{"type":20,"tag":45,"props":266,"children":267},{"class":47,"line":84},[268],{"type":20,"tag":45,"props":269,"children":270},{},[271],{"type":26,"value":272},"-> model.pkl + report\n",{"type":20,"tag":28,"props":274,"children":275},{},[276],{"type":26,"value":277},"这个 baseline 的意义不在于模型复杂，而在于它给后面的 MLP、GNN、scaffold split 留下一个可比较的参照。",{"type":20,"tag":279,"props":280,"children":282},"h2",{"id":281},"esol-预测的是-logs",[283],{"type":26,"value":284},"ESOL 预测的是 logS",{"type":20,"tag":28,"props":286,"children":287},{},[288,290,295,297,303],{"type":26,"value":289},"这里最容易混淆的是 ",{"type":20,"tag":41,"props":291,"children":293},{"className":292},[],[294],{"type":26,"value":218},{"type":26,"value":296}," 和 ",{"type":20,"tag":41,"props":298,"children":300},{"className":299},[],[301],{"type":26,"value":302},"logP",{"type":26,"value":220},{"type":20,"tag":28,"props":305,"children":306},{},[307,312,314,319,321,326,328,333],{"type":20,"tag":41,"props":308,"children":310},{"className":309},[],[311],{"type":26,"value":218},{"type":26,"value":313}," 表示水溶解度，描述一个分子能在水里溶解多少。",{"type":20,"tag":41,"props":315,"children":317},{"className":316},[],[318],{"type":26,"value":302},{"type":26,"value":320}," 表示脂水分配系数，描述分子更偏亲脂还是亲水。二者相关，但不是同一个目标。ESOL 数据集做的是水溶解度预测，所以目标列是 ",{"type":20,"tag":41,"props":322,"children":324},{"className":323},[],[325],{"type":26,"value":218},{"type":26,"value":327},"，不是 ",{"type":20,"tag":41,"props":329,"children":331},{"className":330},[],[332],{"type":26,"value":302},{"type":26,"value":220},{"type":20,"tag":28,"props":335,"children":336},{},[337],{"type":26,"value":338},"我在这一周把 ESOL 当成一个回归任务处理。输入是分子结构，输出是连续数值，因此评估指标选了：",{"type":20,"tag":340,"props":341,"children":342},"table",{},[343,362],{"type":20,"tag":344,"props":345,"children":346},"thead",{},[347],{"type":20,"tag":348,"props":349,"children":350},"tr",{},[351,357],{"type":20,"tag":352,"props":353,"children":354},"th",{},[355],{"type":26,"value":356},"指标",{"type":20,"tag":352,"props":358,"children":359},{},[360],{"type":26,"value":361},"作用",{"type":20,"tag":363,"props":364,"children":365},"tbody",{},[366,380,393],{"type":20,"tag":348,"props":367,"children":368},{},[369,375],{"type":20,"tag":370,"props":371,"children":372},"td",{},[373],{"type":26,"value":374},"MAE",{"type":20,"tag":370,"props":376,"children":377},{},[378],{"type":26,"value":379},"平均绝对误差，直观看平均偏差",{"type":20,"tag":348,"props":381,"children":382},{},[383,388],{"type":20,"tag":370,"props":384,"children":385},{},[386],{"type":26,"value":387},"RMSE",{"type":20,"tag":370,"props":389,"children":390},{},[391],{"type":26,"value":392},"对大误差更敏感",{"type":20,"tag":348,"props":394,"children":395},{},[396,401],{"type":20,"tag":370,"props":397,"children":398},{},[399],{"type":26,"value":400},"R2",{"type":20,"tag":370,"props":402,"children":403},{},[404],{"type":26,"value":405},"衡量模型解释目标变量变化的能力",{"type":20,"tag":279,"props":407,"children":409},{"id":408},"特征先从-rdkit-descriptors-开始",[410],{"type":26,"value":411},"特征先从 RDKit descriptors 开始",{"type":20,"tag":28,"props":413,"children":414},{},[415,417,423],{"type":26,"value":416},"这一版没有使用深度学习特征，而是复用前面封装好的 ",{"type":20,"tag":41,"props":418,"children":420},{"className":419},[],[421],{"type":26,"value":422},"molkit.calculate_descriptors()",{"type":26,"value":424},"。最终进入 RandomForest 的特征包括：",{"type":20,"tag":34,"props":426,"children":428},{"className":36,"code":427,"language":38,"meta":7,"style":7},"molecular_weight\nlogp\ntpsa\nh_bond_donors\nh_bond_acceptors\nrotatable_bonds\nring_count\n",[429],{"type":20,"tag":41,"props":430,"children":431},{"__ignoreMap":7},[432,440,448,456,464,472,480],{"type":20,"tag":45,"props":433,"children":434},{"class":47,"line":48},[435],{"type":20,"tag":45,"props":436,"children":437},{},[438],{"type":26,"value":439},"molecular_weight\n",{"type":20,"tag":45,"props":441,"children":442},{"class":47,"line":57},[443],{"type":20,"tag":45,"props":444,"children":445},{},[446],{"type":26,"value":447},"logp\n",{"type":20,"tag":45,"props":449,"children":450},{"class":47,"line":66},[451],{"type":20,"tag":45,"props":452,"children":453},{},[454],{"type":26,"value":455},"tpsa\n",{"type":20,"tag":45,"props":457,"children":458},{"class":47,"line":75},[459],{"type":20,"tag":45,"props":460,"children":461},{},[462],{"type":26,"value":463},"h_bond_donors\n",{"type":20,"tag":45,"props":465,"children":466},{"class":47,"line":84},[467],{"type":20,"tag":45,"props":468,"children":469},{},[470],{"type":26,"value":471},"h_bond_acceptors\n",{"type":20,"tag":45,"props":473,"children":474},{"class":47,"line":93},[475],{"type":20,"tag":45,"props":476,"children":477},{},[478],{"type":26,"value":479},"rotatable_bonds\n",{"type":20,"tag":45,"props":481,"children":482},{"class":47,"line":101},[483],{"type":20,"tag":45,"props":484,"children":485},{},[486],{"type":26,"value":487},"ring_count\n",{"type":20,"tag":28,"props":489,"children":490},{},[491],{"type":26,"value":492},"这些描述符很基础，但好处是含义清楚。比如分子量、极性表面积、氢键供体\u002F受体、可旋转键数量，都能和溶解度建立比较直接的关系。对于一个工程作品集来说，这种 baseline 比直接堆一个黑盒模型更稳。",{"type":20,"tag":279,"props":494,"children":496},{"id":495},"实验脚本",[497],{"type":26,"value":495},{"type":20,"tag":28,"props":499,"children":500},{},[501],{"type":26,"value":502},"本周核心脚本是：",{"type":20,"tag":34,"props":504,"children":506},{"className":36,"code":505,"language":38,"meta":7,"style":7},"ml-experiments\u002Fesol_rf_baseline.py\n",[507],{"type":20,"tag":41,"props":508,"children":509},{"__ignoreMap":7},[510],{"type":20,"tag":45,"props":511,"children":512},{"class":47,"line":48},[513],{"type":20,"tag":45,"props":514,"children":515},{},[516],{"type":26,"value":505},{"type":20,"tag":28,"props":518,"children":519},{},[520],{"type":26,"value":521},"它完成了几件事：",{"type":20,"tag":523,"props":524,"children":525},"ul",{},[526,532,537,542,553,558],{"type":20,"tag":527,"props":528,"children":529},"li",{},[530],{"type":26,"value":531},"下载或读取 ESOL 数据",{"type":20,"tag":527,"props":533,"children":534},{},[535],{"type":26,"value":536},"校验 SMILES 并计算 RDKit descriptors",{"type":20,"tag":527,"props":538,"children":539},{},[540],{"type":26,"value":541},"按 train \u002F validation \u002F test 划分数据",{"type":20,"tag":527,"props":543,"children":544},{},[545,547],{"type":26,"value":546},"训练 ",{"type":20,"tag":41,"props":548,"children":550},{"className":549},[],[551],{"type":26,"value":552},"RandomForestRegressor",{"type":20,"tag":527,"props":554,"children":555},{},[556],{"type":26,"value":557},"输出预测结果、指标和散点图",{"type":20,"tag":527,"props":559,"children":560},{},[561,563],{"type":26,"value":562},"保存模型到 ",{"type":20,"tag":41,"props":564,"children":566},{"className":565},[],[567],{"type":26,"value":568},"models\u002Fesol_rf.pkl",{"type":20,"tag":28,"props":570,"children":571},{},[572,574,580],{"type":26,"value":573},"运行入口保留在 ",{"type":20,"tag":41,"props":575,"children":577},{"className":576},[],[578],{"type":26,"value":579},"ai-service",{"type":26,"value":581}," 环境里：",{"type":20,"tag":34,"props":583,"children":587},{"className":584,"code":585,"language":586,"meta":7,"style":7},"language-bash shiki shiki-themes github-dark","cd ai-service\nuv run python ..\u002Fml-experiments\u002Fesol_rf_baseline.py\n","bash",[588],{"type":20,"tag":41,"props":589,"children":590},{"__ignoreMap":7},[591,606],{"type":20,"tag":45,"props":592,"children":593},{"class":47,"line":48},[594,600],{"type":20,"tag":45,"props":595,"children":597},{"style":596},"--shiki-default:#79B8FF",[598],{"type":26,"value":599},"cd",{"type":20,"tag":45,"props":601,"children":603},{"style":602},"--shiki-default:#9ECBFF",[604],{"type":26,"value":605}," ai-service\n",{"type":20,"tag":45,"props":607,"children":608},{"class":47,"line":57},[609,615,620,625],{"type":20,"tag":45,"props":610,"children":612},{"style":611},"--shiki-default:#B392F0",[613],{"type":26,"value":614},"uv",{"type":20,"tag":45,"props":616,"children":617},{"style":602},[618],{"type":26,"value":619}," run",{"type":20,"tag":45,"props":621,"children":622},{"style":602},[623],{"type":26,"value":624}," python",{"type":20,"tag":45,"props":626,"children":627},{"style":602},[628],{"type":26,"value":629}," ..\u002Fml-experiments\u002Fesol_rf_baseline.py\n",{"type":20,"tag":28,"props":631,"children":632},{},[633],{"type":26,"value":634},"输出文件集中放在：",{"type":20,"tag":34,"props":636,"children":638},{"className":36,"code":637,"language":38,"meta":7,"style":7},"models\u002Fesol_rf.pkl\nml-experiments\u002Foutputs\u002Fesol_rf_metrics.json\nml-experiments\u002Foutputs\u002Fesol_rf_predictions.csv\nml-experiments\u002Foutputs\u002Ffigures\u002Fesol_rf_pred_vs_actual.png\nml-experiments\u002Freports\u002Fesol_rf_report.md\n",[639],{"type":20,"tag":41,"props":640,"children":641},{"__ignoreMap":7},[642,650,658,666,674],{"type":20,"tag":45,"props":643,"children":644},{"class":47,"line":48},[645],{"type":20,"tag":45,"props":646,"children":647},{},[648],{"type":26,"value":649},"models\u002Fesol_rf.pkl\n",{"type":20,"tag":45,"props":651,"children":652},{"class":47,"line":57},[653],{"type":20,"tag":45,"props":654,"children":655},{},[656],{"type":26,"value":657},"ml-experiments\u002Foutputs\u002Fesol_rf_metrics.json\n",{"type":20,"tag":45,"props":659,"children":660},{"class":47,"line":66},[661],{"type":20,"tag":45,"props":662,"children":663},{},[664],{"type":26,"value":665},"ml-experiments\u002Foutputs\u002Fesol_rf_predictions.csv\n",{"type":20,"tag":45,"props":667,"children":668},{"class":47,"line":75},[669],{"type":20,"tag":45,"props":670,"children":671},{},[672],{"type":26,"value":673},"ml-experiments\u002Foutputs\u002Ffigures\u002Fesol_rf_pred_vs_actual.png\n",{"type":20,"tag":45,"props":675,"children":676},{"class":47,"line":84},[677],{"type":20,"tag":45,"props":678,"children":679},{},[680],{"type":26,"value":681},"ml-experiments\u002Freports\u002Fesol_rf_report.md\n",{"type":20,"tag":28,"props":683,"children":684},{},[685,690],{"type":20,"tag":41,"props":686,"children":688},{"className":687},[],[689],{"type":26,"value":568},{"type":26,"value":691}," 里不只保存模型本身，也保存了特征列、目标列、指标和数据来源信息。后面做推理服务时，不能只加载一个裸模型，否则很容易出现训练和预测阶段特征顺序不一致的问题。",{"type":20,"tag":279,"props":693,"children":695},{"id":694},"predicted-vs-actual-散点图",[696],{"type":26,"value":697},"predicted vs actual 散点图",{"type":20,"tag":28,"props":699,"children":700},{},[701],{"type":26,"value":702},"这一周最有用的图是 predicted vs actual。",{"type":20,"tag":28,"props":704,"children":705},{},[706],{"type":26,"value":707},"散点越接近中间的对角线，预测越准。点在对角线上方，说明模型把这个分子预测得更易溶于水；点在下方，说明模型预测得更难溶于水。",{"type":20,"tag":28,"props":709,"children":710},{},[711],{"type":26,"value":712},"这张图比单独看 MAE 更有信息量。MAE 只能告诉我平均偏差，散点图能看出误差是否集中在某类分子上，也能暴露极端样本。",{"type":20,"tag":279,"props":714,"children":716},{"id":715},"baseline-的价值",[717],{"type":26,"value":718},"baseline 的价值",{"type":20,"tag":28,"props":720,"children":721},{},[722],{"type":26,"value":723},"RandomForest 不新，但在这个任务里很适合作为第一版。它不需要复杂训练流程，对小数据集比较友好，也能快速确认数据处理链路有没有问题。",{"type":20,"tag":28,"props":725,"children":726},{},[727],{"type":26,"value":728},"如果后面的 MLP 或 GNN 在同样数据划分下跑不过这个 baseline，那复杂模型本身就没有带来收益。这个结论比“我跑了一个深度学习模型”更有价值。",{"type":20,"tag":28,"props":730,"children":731},{},[732],{"type":26,"value":733},"第 4 周最终留下的核心产物是：",{"type":20,"tag":34,"props":735,"children":737},{"className":36,"code":736,"language":38,"meta":7,"style":7},"ml-experiments\u002Fesol_rf_baseline.py\nmodels\u002Fesol_rf.pkl\nml-experiments\u002Freports\u002Fesol_rf_report.md\n",[738],{"type":20,"tag":41,"props":739,"children":740},{"__ignoreMap":7},[741,748,755],{"type":20,"tag":45,"props":742,"children":743},{"class":47,"line":48},[744],{"type":20,"tag":45,"props":745,"children":746},{},[747],{"type":26,"value":505},{"type":20,"tag":45,"props":749,"children":750},{"class":47,"line":57},[751],{"type":20,"tag":45,"props":752,"children":753},{},[754],{"type":26,"value":649},{"type":20,"tag":45,"props":756,"children":757},{"class":47,"line":66},[758],{"type":20,"tag":45,"props":759,"children":760},{},[761],{"type":26,"value":681},{"type":20,"tag":28,"props":763,"children":764},{},[765],{"type":26,"value":766},"从这一周开始，项目从“分子数据处理工具”进入了“可评估的分子性质预测模型”。",{"type":20,"tag":768,"props":769,"children":770},"style",{},[771],{"type":26,"value":772},"html .default .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}",{"title":7,"searchDepth":57,"depth":57,"links":774},[775,776,777,778,779],{"id":281,"depth":57,"text":284},{"id":408,"depth":57,"text":411},{"id":495,"depth":57,"text":495},{"id":694,"depth":57,"text":697},{"id":715,"depth":57,"text":718},"markdown","content:articles:ai:ai-drug-week04-esol-rf-baseline.md","content","articles\u002Fai\u002Fai-drug-week04-esol-rf-baseline.md","articles\u002Fai\u002Fai-drug-week04-esol-rf-baseline","md",1780481290972]