[{"data":1,"prerenderedAt":788},["ShallowReactive",2],{"article-\u002Fai-drug-week04-esol-rf-baseline":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":5,"title":7,"description":8,"date":9,"tags":10,"listed":14,"body":15,"_type":782,"_id":783,"_source":784,"_file":785,"_stem":786,"_extension":787},"\u002Farticles\u002F\u002Fai-drug-week04-esol-rf-baseline","",false,"第 4 周：用 ESOL 和 RandomForest 建一个分子性质预测 baseline","记录从 ESOL 数据、RDKit descriptors 到 RandomForest 回归模型的第一版分子水溶解度预测实验。","2026-06-03",[11,12,13],"AI制药","深度学习","人工智能",true,{"type":16,"children":17,"toc":772},"root",[18,26,32,192,197,202,207,220,225,272,277,284,303,333,338,405,411,424,487,492,497,502,516,521,568,581,629,634,681,691,697,702,707,712,718,723,728,733,761,766],{"type":19,"tag":20,"props":21,"children":23},"element","h2",{"id":22},"前情提要",[24],{"type":25,"value":22},"text",{"type":19,"tag":27,"props":28,"children":29},"p",{},[30],{"type":25,"value":31},"正在做一个AI制药平台项目，前端使用vue3、后端采用双后端(主后端Java、SpringBoot做业务，次后端Python、FastAPI做AI服务)",{"type":19,"tag":33,"props":34,"children":38},"pre",{"className":35,"code":36,"language":37,"meta":5,"style":5},"language-plain shiki shiki-themes github-dark","Vue3 前端\n  |\n  | REST API\n  v\nSpringBoot 主后端\n  |\n  | 任务管理 \u002F 数据管理\n  v\nPostgreSQL + Redis\n  |\n  | 调用\n  v\nPython AI Service\n  |\n  | RDKit \u002F PyTorch \u002F PyG \u002F scikit-learn\n  v\n模型推理与分子计算\n","plain",[39],{"type":19,"tag":40,"props":41,"children":42},"code",{"__ignoreMap":5},[43,54,63,72,81,90,98,107,115,124,132,141,149,158,166,175,183],{"type":19,"tag":44,"props":45,"children":48},"span",{"class":46,"line":47},"line",1,[49],{"type":19,"tag":44,"props":50,"children":51},{},[52],{"type":25,"value":53},"Vue3 前端\n",{"type":19,"tag":44,"props":55,"children":57},{"class":46,"line":56},2,[58],{"type":19,"tag":44,"props":59,"children":60},{},[61],{"type":25,"value":62},"  
|\n",{"type":19,"tag":44,"props":64,"children":66},{"class":46,"line":65},3,[67],{"type":19,"tag":44,"props":68,"children":69},{},[70],{"type":25,"value":71},"  | REST API\n",{"type":19,"tag":44,"props":73,"children":75},{"class":46,"line":74},4,[76],{"type":19,"tag":44,"props":77,"children":78},{},[79],{"type":25,"value":80},"  v\n",{"type":19,"tag":44,"props":82,"children":84},{"class":46,"line":83},5,[85],{"type":19,"tag":44,"props":86,"children":87},{},[88],{"type":25,"value":89},"SpringBoot 主后端\n",{"type":19,"tag":44,"props":91,"children":93},{"class":46,"line":92},6,[94],{"type":19,"tag":44,"props":95,"children":96},{},[97],{"type":25,"value":62},{"type":19,"tag":44,"props":99,"children":101},{"class":46,"line":100},7,[102],{"type":19,"tag":44,"props":103,"children":104},{},[105],{"type":25,"value":106},"  | 任务管理 \u002F 数据管理\n",{"type":19,"tag":44,"props":108,"children":110},{"class":46,"line":109},8,[111],{"type":19,"tag":44,"props":112,"children":113},{},[114],{"type":25,"value":80},{"type":19,"tag":44,"props":116,"children":118},{"class":46,"line":117},9,[119],{"type":19,"tag":44,"props":120,"children":121},{},[122],{"type":25,"value":123},"PostgreSQL + Redis\n",{"type":19,"tag":44,"props":125,"children":127},{"class":46,"line":126},10,[128],{"type":19,"tag":44,"props":129,"children":130},{},[131],{"type":25,"value":62},{"type":19,"tag":44,"props":133,"children":135},{"class":46,"line":134},11,[136],{"type":19,"tag":44,"props":137,"children":138},{},[139],{"type":25,"value":140},"  | 调用\n",{"type":19,"tag":44,"props":142,"children":144},{"class":46,"line":143},12,[145],{"type":19,"tag":44,"props":146,"children":147},{},[148],{"type":25,"value":80},{"type":19,"tag":44,"props":150,"children":152},{"class":46,"line":151},13,[153],{"type":19,"tag":44,"props":154,"children":155},{},[156],{"type":25,"value":157},"Python AI 
Service\n",{"type":19,"tag":44,"props":159,"children":161},{"class":46,"line":160},14,[162],{"type":19,"tag":44,"props":163,"children":164},{},[165],{"type":25,"value":62},{"type":19,"tag":44,"props":167,"children":169},{"class":46,"line":168},15,[170],{"type":19,"tag":44,"props":171,"children":172},{},[173],{"type":25,"value":174},"  | RDKit \u002F PyTorch \u002F PyG \u002F scikit-learn\n",{"type":19,"tag":44,"props":176,"children":178},{"class":46,"line":177},16,[179],{"type":19,"tag":44,"props":180,"children":181},{},[182],{"type":25,"value":80},{"type":19,"tag":44,"props":184,"children":186},{"class":46,"line":185},17,[187],{"type":19,"tag":44,"props":188,"children":189},{},[190],{"type":25,"value":191},"模型推理与分子计算\n",{"type":19,"tag":27,"props":193,"children":194},{},[195],{"type":25,"value":196},"会把自己认为比较重要的部分，单独写成文章以便记录。写这个项目的主要目的是在干中学，因为不想从头到尾啃生物信息学、生物化学原理这些AI制药理论。通过项目能快速理解核心概念，剩下零零散散的后续慢慢补。",{"type":19,"tag":27,"props":198,"children":199},{},[200],{"type":25,"value":201},"此篇就是第 4 周的记录。",{"type":19,"tag":20,"props":203,"children":205},{"id":204},"第-4-周用-esol-和-randomforest-建一个分子性质预测-baseline",[206],{"type":25,"value":7},{"type":19,"tag":27,"props":208,"children":209},{},[210,212,218],{"type":25,"value":211},"第 4 周开始进入分子性质预测。前 3 周主要是在处理 SMILES、描述符、指纹和相似性搜索，这些东西本身还不算“预测”。这一周我把它们接到一个真正的监督学习任务上：用 ESOL 数据集预测小分子的水溶解度 ",{"type":19,"tag":40,"props":213,"children":215},{"className":214},[],[216],{"type":25,"value":217},"logS",{"type":25,"value":219},"。",{"type":19,"tag":27,"props":221,"children":222},{},[223],{"type":25,"value":224},"我没有一上来写 GNN。第一版选择了更朴素的组合：",{"type":19,"tag":33,"props":226,"children":228},{"className":35,"code":227,"language":37,"meta":5,"style":5},"SMILES\n-> RDKit descriptors\n-> RandomForestRegressor\n-> MAE \u002F RMSE \u002F R2\n-> model.pkl + 
report\n",[229],{"type":19,"tag":40,"props":230,"children":231},{"__ignoreMap":5},[232,240,248,256,264],{"type":19,"tag":44,"props":233,"children":234},{"class":46,"line":47},[235],{"type":19,"tag":44,"props":236,"children":237},{},[238],{"type":25,"value":239},"SMILES\n",{"type":19,"tag":44,"props":241,"children":242},{"class":46,"line":56},[243],{"type":19,"tag":44,"props":244,"children":245},{},[246],{"type":25,"value":247},"-> RDKit descriptors\n",{"type":19,"tag":44,"props":249,"children":250},{"class":46,"line":65},[251],{"type":19,"tag":44,"props":252,"children":253},{},[254],{"type":25,"value":255},"-> RandomForestRegressor\n",{"type":19,"tag":44,"props":257,"children":258},{"class":46,"line":74},[259],{"type":19,"tag":44,"props":260,"children":261},{},[262],{"type":25,"value":263},"-> MAE \u002F RMSE \u002F R2\n",{"type":19,"tag":44,"props":265,"children":266},{"class":46,"line":83},[267],{"type":19,"tag":44,"props":268,"children":269},{},[270],{"type":25,"value":271},"-> model.pkl + report\n",{"type":19,"tag":27,"props":273,"children":274},{},[275],{"type":25,"value":276},"这个 baseline 的意义不在于模型复杂，而在于它给后面的 MLP、GNN、scaffold split 留下一个可比较的参照。",{"type":19,"tag":278,"props":279,"children":281},"h3",{"id":280},"esol-预测的是-logs",[282],{"type":25,"value":283},"ESOL 预测的是 logS",{"type":19,"tag":27,"props":285,"children":286},{},[287,289,294,296,302],{"type":25,"value":288},"这里最容易混淆的是 ",{"type":19,"tag":40,"props":290,"children":292},{"className":291},[],[293],{"type":25,"value":217},{"type":25,"value":295}," 和 ",{"type":19,"tag":40,"props":297,"children":299},{"className":298},[],[300],{"type":25,"value":301},"logP",{"type":25,"value":219},{"type":19,"tag":27,"props":304,"children":305},{},[306,311,313,318,320,325,327,332],{"type":19,"tag":40,"props":307,"children":309},{"className":308},[],[310],{"type":25,"value":217},{"type":25,"value":312}," 
表示水溶解度的常用对数（溶解度以 mol\u002FL 计），描述一个分子能在水里溶解多少，数值越大越易溶。",{"type":19,"tag":40,"props":314,"children":316},{"className":315},[],[317],{"type":25,"value":301},{"type":25,"value":319}," 表示脂水分配系数，描述分子更偏亲脂还是亲水。二者相关，但不是同一个目标。ESOL 数据集做的是水溶解度预测，所以目标列是 ",{"type":19,"tag":40,"props":321,"children":323},{"className":322},[],[324],{"type":25,"value":217},{"type":25,"value":326},"，不是 ",{"type":19,"tag":40,"props":328,"children":330},{"className":329},[],[331],{"type":25,"value":301},{"type":25,"value":219},{"type":19,"tag":27,"props":334,"children":335},{},[336],{"type":25,"value":337},"我在这一周把 ESOL 当成一个回归任务处理。输入是分子结构，输出是连续数值，因此评估指标选了：",{"type":19,"tag":339,"props":340,"children":341},"table",{},[342,361],{"type":19,"tag":343,"props":344,"children":345},"thead",{},[346],{"type":19,"tag":347,"props":348,"children":349},"tr",{},[350,356],{"type":19,"tag":351,"props":352,"children":353},"th",{},[354],{"type":25,"value":355},"指标",{"type":19,"tag":351,"props":357,"children":358},{},[359],{"type":25,"value":360},"作用",{"type":19,"tag":362,"props":363,"children":364},"tbody",{},[365,379,392],{"type":19,"tag":347,"props":366,"children":367},{},[368,374],{"type":19,"tag":369,"props":370,"children":371},"td",{},[372],{"type":25,"value":373},"MAE",{"type":19,"tag":369,"props":375,"children":376},{},[377],{"type":25,"value":378},"平均绝对误差，直观看平均偏差",{"type":19,"tag":347,"props":380,"children":381},{},[382,387],{"type":19,"tag":369,"props":383,"children":384},{},[385],{"type":25,"value":386},"RMSE",{"type":19,"tag":369,"props":388,"children":389},{},[390],{"type":25,"value":391},"对大误差更敏感",{"type":19,"tag":347,"props":393,"children":394},{},[395,400],{"type":19,"tag":369,"props":396,"children":397},{},[398],{"type":25,"value":399},"R2",{"type":19,"tag":369,"props":401,"children":402},{},[403],{"type":25,"value":404},"衡量模型解释目标变量变化的能力",{"type":19,"tag":278,"props":406,"children":408},{"id":407},"特征先从-rdkit-descriptors-开始",[409],{"type":25,"value":410},"特征先从 RDKit descriptors 
开始",{"type":19,"tag":27,"props":412,"children":413},{},[414,416,422],{"type":25,"value":415},"这一版没有使用深度学习特征，而是复用前面封装好的 ",{"type":19,"tag":40,"props":417,"children":419},{"className":418},[],[420],{"type":25,"value":421},"molkit.calculate_descriptors()",{"type":25,"value":423},"。最终进入 RandomForest 的特征包括：",{"type":19,"tag":33,"props":425,"children":427},{"className":35,"code":426,"language":37,"meta":5,"style":5},"molecular_weight\nlogp\ntpsa\nh_bond_donors\nh_bond_acceptors\nrotatable_bonds\nring_count\n",[428],{"type":19,"tag":40,"props":429,"children":430},{"__ignoreMap":5},[431,439,447,455,463,471,479],{"type":19,"tag":44,"props":432,"children":433},{"class":46,"line":47},[434],{"type":19,"tag":44,"props":435,"children":436},{},[437],{"type":25,"value":438},"molecular_weight\n",{"type":19,"tag":44,"props":440,"children":441},{"class":46,"line":56},[442],{"type":19,"tag":44,"props":443,"children":444},{},[445],{"type":25,"value":446},"logp\n",{"type":19,"tag":44,"props":448,"children":449},{"class":46,"line":65},[450],{"type":19,"tag":44,"props":451,"children":452},{},[453],{"type":25,"value":454},"tpsa\n",{"type":19,"tag":44,"props":456,"children":457},{"class":46,"line":74},[458],{"type":19,"tag":44,"props":459,"children":460},{},[461],{"type":25,"value":462},"h_bond_donors\n",{"type":19,"tag":44,"props":464,"children":465},{"class":46,"line":83},[466],{"type":19,"tag":44,"props":467,"children":468},{},[469],{"type":25,"value":470},"h_bond_acceptors\n",{"type":19,"tag":44,"props":472,"children":473},{"class":46,"line":92},[474],{"type":19,"tag":44,"props":475,"children":476},{},[477],{"type":25,"value":478},"rotatable_bonds\n",{"type":19,"tag":44,"props":480,"children":481},{"class":46,"line":100},[482],{"type":19,"tag":44,"props":483,"children":484},{},[485],{"type":25,"value":486},"ring_count\n",{"type":19,"tag":27,"props":488,"children":489},{},[490],{"type":25,"value":491},"这些描述符很基础，但好处是含义清楚。比如分子量、极性表面积、氢键供体\u002F受体、可旋转键数量，都能和溶解度建立比较直接的关系。对于一个工程作品集来说，这种 
baseline 比直接堆一个黑盒模型更稳。",{"type":19,"tag":278,"props":493,"children":495},{"id":494},"实验脚本",[496],{"type":25,"value":494},{"type":19,"tag":27,"props":498,"children":499},{},[500],{"type":25,"value":501},"本周核心脚本是：",{"type":19,"tag":33,"props":503,"children":505},{"className":35,"code":504,"language":37,"meta":5,"style":5},"ml-experiments\u002Fesol_rf_baseline.py\n",[506],{"type":19,"tag":40,"props":507,"children":508},{"__ignoreMap":5},[509],{"type":19,"tag":44,"props":510,"children":511},{"class":46,"line":47},[512],{"type":19,"tag":44,"props":513,"children":514},{},[515],{"type":25,"value":504},{"type":19,"tag":27,"props":517,"children":518},{},[519],{"type":25,"value":520},"它完成了几件事：",{"type":19,"tag":522,"props":523,"children":524},"ul",{},[525,531,536,541,552,557],{"type":19,"tag":526,"props":527,"children":528},"li",{},[529],{"type":25,"value":530},"下载或读取 ESOL 数据",{"type":19,"tag":526,"props":532,"children":533},{},[534],{"type":25,"value":535},"校验 SMILES 并计算 RDKit descriptors",{"type":19,"tag":526,"props":537,"children":538},{},[539],{"type":25,"value":540},"按 train \u002F validation \u002F test 划分数据",{"type":19,"tag":526,"props":542,"children":543},{},[544,546],{"type":25,"value":545},"训练 ",{"type":19,"tag":40,"props":547,"children":549},{"className":548},[],[550],{"type":25,"value":551},"RandomForestRegressor",{"type":19,"tag":526,"props":553,"children":554},{},[555],{"type":25,"value":556},"输出预测结果、指标和散点图",{"type":19,"tag":526,"props":558,"children":559},{},[560,562],{"type":25,"value":561},"保存模型到 ",{"type":19,"tag":40,"props":563,"children":565},{"className":564},[],[566],{"type":25,"value":567},"models\u002Fesol_rf.pkl",{"type":19,"tag":27,"props":569,"children":570},{},[571,573,579],{"type":25,"value":572},"运行入口保留在 ",{"type":19,"tag":40,"props":574,"children":576},{"className":575},[],[577],{"type":25,"value":578},"ai-service",{"type":25,"value":580}," 
环境里：",{"type":19,"tag":33,"props":582,"children":586},{"className":583,"code":584,"language":585,"meta":5,"style":5},"language-bash shiki shiki-themes github-dark","cd ai-service\nuv run python ..\u002Fml-experiments\u002Fesol_rf_baseline.py\n","bash",[587],{"type":19,"tag":40,"props":588,"children":589},{"__ignoreMap":5},[590,605],{"type":19,"tag":44,"props":591,"children":592},{"class":46,"line":47},[593,599],{"type":19,"tag":44,"props":594,"children":596},{"style":595},"--shiki-default:#79B8FF",[597],{"type":25,"value":598},"cd",{"type":19,"tag":44,"props":600,"children":602},{"style":601},"--shiki-default:#9ECBFF",[603],{"type":25,"value":604}," ai-service\n",{"type":19,"tag":44,"props":606,"children":607},{"class":46,"line":56},[608,614,619,624],{"type":19,"tag":44,"props":609,"children":611},{"style":610},"--shiki-default:#B392F0",[612],{"type":25,"value":613},"uv",{"type":19,"tag":44,"props":615,"children":616},{"style":601},[617],{"type":25,"value":618}," run",{"type":19,"tag":44,"props":620,"children":621},{"style":601},[622],{"type":25,"value":623}," python",{"type":19,"tag":44,"props":625,"children":626},{"style":601},[627],{"type":25,"value":628}," 
..\u002Fml-experiments\u002Fesol_rf_baseline.py\n",{"type":19,"tag":27,"props":630,"children":631},{},[632],{"type":25,"value":633},"输出文件集中放在：",{"type":19,"tag":33,"props":635,"children":637},{"className":35,"code":636,"language":37,"meta":5,"style":5},"models\u002Fesol_rf.pkl\nml-experiments\u002Foutputs\u002Fesol_rf_metrics.json\nml-experiments\u002Foutputs\u002Fesol_rf_predictions.csv\nml-experiments\u002Foutputs\u002Ffigures\u002Fesol_rf_pred_vs_actual.png\nml-experiments\u002Freports\u002Fesol_rf_report.md\n",[638],{"type":19,"tag":40,"props":639,"children":640},{"__ignoreMap":5},[641,649,657,665,673],{"type":19,"tag":44,"props":642,"children":643},{"class":46,"line":47},[644],{"type":19,"tag":44,"props":645,"children":646},{},[647],{"type":25,"value":648},"models\u002Fesol_rf.pkl\n",{"type":19,"tag":44,"props":650,"children":651},{"class":46,"line":56},[652],{"type":19,"tag":44,"props":653,"children":654},{},[655],{"type":25,"value":656},"ml-experiments\u002Foutputs\u002Fesol_rf_metrics.json\n",{"type":19,"tag":44,"props":658,"children":659},{"class":46,"line":65},[660],{"type":19,"tag":44,"props":661,"children":662},{},[663],{"type":25,"value":664},"ml-experiments\u002Foutputs\u002Fesol_rf_predictions.csv\n",{"type":19,"tag":44,"props":666,"children":667},{"class":46,"line":74},[668],{"type":19,"tag":44,"props":669,"children":670},{},[671],{"type":25,"value":672},"ml-experiments\u002Foutputs\u002Ffigures\u002Fesol_rf_pred_vs_actual.png\n",{"type":19,"tag":44,"props":674,"children":675},{"class":46,"line":83},[676],{"type":19,"tag":44,"props":677,"children":678},{},[679],{"type":25,"value":680},"ml-experiments\u002Freports\u002Fesol_rf_report.md\n",{"type":19,"tag":27,"props":682,"children":683},{},[684,689],{"type":19,"tag":40,"props":685,"children":687},{"className":686},[],[688],{"type":25,"value":567},{"type":25,"value":690}," 
里不只保存模型本身，也保存了特征列、目标列、指标和数据来源信息。后面做推理服务时，不能只加载一个裸模型，否则很容易出现训练和预测阶段特征顺序不一致的问题。",{"type":19,"tag":278,"props":692,"children":694},{"id":693},"predicted-vs-actual-散点图",[695],{"type":25,"value":696},"predicted vs actual 散点图",{"type":19,"tag":27,"props":698,"children":699},{},[700],{"type":25,"value":701},"这一周最有用的图是 predicted vs actual。",{"type":19,"tag":27,"props":703,"children":704},{},[705],{"type":25,"value":706},"散点越接近中间的对角线，预测越准。以横轴为真实值、纵轴为预测值来看，点在对角线上方，说明模型预测的 logS 高于真实值，也就是把这个分子预测得比实际更易溶于水；点在下方，说明预测值低于真实值，也就是预测得比实际更难溶于水。",{"type":19,"tag":27,"props":708,"children":709},{},[710],{"type":25,"value":711},"这张图比单独看 MAE 更有信息量。MAE 只能告诉我平均偏差，散点图能看出误差是否集中在某类分子上，也能暴露极端样本。",{"type":19,"tag":278,"props":713,"children":715},{"id":714},"baseline-的价值",[716],{"type":25,"value":717},"baseline 的价值",{"type":19,"tag":27,"props":719,"children":720},{},[721],{"type":25,"value":722},"RandomForest 不新，但在这个任务里很适合作为第一版。它不需要复杂训练流程，对小数据集比较友好，也能快速确认数据处理链路有没有问题。",{"type":19,"tag":27,"props":724,"children":725},{},[726],{"type":25,"value":727},"如果后面的 MLP 或 GNN 在同样数据划分下跑不过这个 baseline，那复杂模型本身就没有带来收益。这个结论比“我跑了一个深度学习模型”更有价值。",{"type":19,"tag":27,"props":729,"children":730},{},[731],{"type":25,"value":732},"第 4 
周最终留下的核心产物是：",{"type":19,"tag":33,"props":734,"children":736},{"className":35,"code":735,"language":37,"meta":5,"style":5},"ml-experiments\u002Fesol_rf_baseline.py\nmodels\u002Fesol_rf.pkl\nml-experiments\u002Freports\u002Fesol_rf_report.md\n",[737],{"type":19,"tag":40,"props":738,"children":739},{"__ignoreMap":5},[740,747,754],{"type":19,"tag":44,"props":741,"children":742},{"class":46,"line":47},[743],{"type":19,"tag":44,"props":744,"children":745},{},[746],{"type":25,"value":504},{"type":19,"tag":44,"props":748,"children":749},{"class":46,"line":56},[750],{"type":19,"tag":44,"props":751,"children":752},{},[753],{"type":25,"value":648},{"type":19,"tag":44,"props":755,"children":756},{"class":46,"line":65},[757],{"type":19,"tag":44,"props":758,"children":759},{},[760],{"type":25,"value":680},{"type":19,"tag":27,"props":762,"children":763},{},[764],{"type":25,"value":765},"从这一周开始，项目从“分子数据处理工具”进入了“可评估的分子性质预测模型”。",{"type":19,"tag":767,"props":768,"children":769},"style",{},[770],{"type":25,"value":771},"html .default .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: 
var(--shiki-default-text-decoration);}",{"title":5,"searchDepth":56,"depth":56,"links":773},[774,775],{"id":22,"depth":56,"text":22},{"id":204,"depth":56,"text":7,"children":776},[777,778,779,780,781],{"id":280,"depth":65,"text":283},{"id":407,"depth":65,"text":410},{"id":494,"depth":65,"text":494},{"id":693,"depth":65,"text":696},{"id":714,"depth":65,"text":717},"markdown","content:articles:生信基础:ai-drug-week04-esol-rf-baseline.md","content","articles\u002F生信基础\u002Fai-drug-week04-esol-rf-baseline.md","articles\u002F生信基础\u002Fai-drug-week04-esol-rf-baseline","md",1780895158451]