[{"data":1,"prerenderedAt":890},["ShallowReactive",2],{"article-ai\u002Fmpl-hydrophobicity":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"tags":11,"body":14,"_type":884,"_id":885,"_source":886,"_file":887,"_stem":888,"_extension":889},"\u002Farticles\u002Fai\u002Fmpl-hydrophobicity","ai",false,"","用MLP预测氨基酸亲疏水性：一个完整的机器学习小实验-PyTorch版","使用PyTorch构建MLP模型，结合RDKit Morgan指纹特征工程，通过留一交叉验证预测20种标准氨基酸的亲疏水性，覆盖数据处理、模型训练、正则化与评估的完整流程。","2026-05-11",[12,13],"人工智能","生物信息学",{"type":15,"children":16,"toc":865},"root",[17,25,30,60,65,75,81,86,91,99,104,145,150,156,161,169,197,202,207,215,220,225,232,237,245,250,255,260,268,273,281,286,291,296,329,334,342,353,363,368,373,378,383,388,413,418,426,431,436,449,457,462,470,474,521,526,531,539,544,552,557,562,567,572,579,584,589,596,601,609,614,620,625,630,636,641,660,665,671,676,681,690,695,713,718,723,728,733,761,766,771,776,794,799,804,812,817,845,850],{"type":18,"tag":19,"props":20,"children":21},"element","p",{},[22],{"type":23,"value":24},"text","最近我做了一个很小的机器学习项目：用 MLP 预测 20 种标准氨基酸的亲疏水性。",{"type":18,"tag":19,"props":26,"children":27},{},[28],{"type":23,"value":29},"这个项目的数据量很小，模型也不复杂，但它刚好覆盖了一个机器学习实验最重要的几个环节：",{"type":18,"tag":31,"props":32,"children":33},"ul",{},[34,40,45,50,55],{"type":18,"tag":35,"props":36,"children":37},"li",{},[38],{"type":23,"value":39},"原始数据如何变成模型能理解的数值特征",{"type":18,"tag":35,"props":41,"children":42},{},[43],{"type":23,"value":44},"神经网络如何通过损失函数和优化器学习",{"type":18,"tag":35,"props":46,"children":47},{},[48],{"type":23,"value":49},"小数据集为什么容易过拟合",{"type":18,"tag":35,"props":51,"children":52},{},[53],{"type":23,"value":54},"为什么评估方式比单次准确率更重要",{"type":18,"tag":35,"props":56,"children":57},{},[58],{"type":23,"value":59},"为什么要做 baseline 和 正则化",{"type":18,"tag":19,"props":61,"children":62},{},[63],{"type":23,"value":64},"项目地址中的核心流程是：",{"type":18,"tag":66,"props":67,"children":69},"pre",{"code":68},"SMILES 字符串 -> RDKit 解析 -> Morgan 指纹 -> MLP 分类 -> 留一交叉验证\n",[70],{"type":18,"tag":71,"props":72,"children":73},"code",{"__ignoreMap":7},[74],{"type":23,"value":68},{"type":18,"tag":76,"props":77,"children":79},"h2",{"id":78},"项目目标",[80],{"type":23,"value":78},{"type":18,"tag":19,"props":82,"children":83},{},[84],{"type":23,"value":85},"给定一个氨基酸的分子结构，用机器学习模型预测它是疏水还是亲水。",{"type":18,"tag":19,"props":87,"children":88},{},[89],{"type":23,"value":90},"数据文件中每一行是一种氨基酸，例如：",{"type":18,"tag":66,"props":92,"children":94},{"code":93},"name,abbreviation,three_letter,smiles,hydrophobic\n丙氨酸,Ala,Ala,CC(N)C(=O)O,1\n丝氨酸,Ser,Ser,NC(CO)C(=O)O,0\n",[95],{"type":18,"tag":71,"props":96,"children":97},{"__ignoreMap":7},[98],{"type":23,"value":93},{"type":18,"tag":19,"props":100,"children":101},{},[102],{"type":23,"value":103},"其中：",{"type":18,"tag":31,"props":105,"children":106},{},[107,118],{"type":18,"tag":35,"props":108,"children":109},{},[110,116],{"type":18,"tag":71,"props":111,"children":113},{"className":112},[],[114],{"type":23,"value":115},"smiles",{"type":23,"value":117}," 是氨基酸的分子结构表示",{"type":18,"tag":35,"props":119,"children":120},{},[121,127,129,135,137,143],{"type":18,"tag":71,"props":122,"children":124},{"className":123},[],[125],{"type":23,"value":126},"hydrophobic",{"type":23,"value":128}," 是标签，",{"type":18,"tag":71,"props":130,"children":132},{"className":131},[],[133],{"type":23,"value":134},"1",{"type":23,"value":136}," 表示疏水，",{"type":18,"tag":71,"props":138,"children":140},{"className":139},[],[141],{"type":23,"value":142},"0",{"type":23,"value":144}," 表示亲水",{"type":18,"tag":19,"props":146,"children":147},{},[148],{"type":23,"value":149},"这是一个二分类问题。",{"type":18,"tag":76,"props":151,"children":153},{"id":152},"为什么不能直接把-smiles-喂给神经网络",[154],{"type":23,"value":155},"为什么不能直接把 SMILES 喂给神经网络",{"type":18,"tag":19,"props":157,"children":158},{},[159],{"type":23,"value":160},"SMILES 是一种文本形式的分子表示，例如丙氨酸的 SMILES 是：",{"type":18,"tag":66,"props":162,"children":164},{"code":163},"CC(N)C(=O)O\n",[165],{"type":18,"tag":71,"props":166,"children":167},{"__ignoreMap":7},[168],{"type":23,"value":163},{"type":18,"tag":19,"props":170,"children":171},{},[172,174,180,182,188,189,195],{"type":23,"value":173},"但是神经网络本质上处理的是数值张量。它并不直接理解 ",{"type":18,"tag":71,"props":175,"children":177},{"className":176},[],[178],{"type":23,"value":179},"C",{"type":23,"value":181},"、",{"type":18,"tag":71,"props":183,"children":185},{"className":184},[],[186],{"type":23,"value":187},"N",{"type":23,"value":181},{"type":18,"tag":71,"props":190,"children":192},{"className":191},[],[193],{"type":23,"value":194},"O",{"type":23,"value":196}," 这些字符代表什么化学意义。",{"type":18,"tag":19,"props":198,"children":199},{},[200],{"type":23,"value":201},"所以第一步要做特征工程：把 SMILES 转换成模型可以学习的数值向量。",{"type":18,"tag":19,"props":203,"children":204},{},[205],{"type":23,"value":206},"在这个项目中，我使用 RDKit 生成 Morgan 指纹：",{"type":18,"tag":66,"props":208,"children":210},{"code":209},"def smiles_to_morgan_bits(\n    smiles: str,\n    radius: int = 2,\n    fp_size: int = 2048,\n) -> torch.Tensor:\n    mol = Chem.MolFromSmiles(smiles)\n    if mol is None:\n        raise ValueError(f\"Invalid SMILES: {smiles}\")\n​\n    generator = AllChem.GetMorganGenerator(radius=radius, fpSize=fp_size)\n    fingerprint = generator.GetFingerprint(mol)\n    return torch.tensor(\n        list(map(int, fingerprint.ToBitString())),\n        dtype=torch.float32,\n    )\n",[211],{"type":18,"tag":71,"props":212,"children":213},{"__ignoreMap":7},[214],{"type":23,"value":209},{"type":18,"tag":19,"props":216,"children":217},{},[218],{"type":23,"value":219},"Morgan 指纹可以理解为：把分子中的局部结构模式编码成一个固定长度的 0\u002F1 向量。",{"type":18,"tag":19,"props":221,"children":222},{},[223],{"type":23,"value":224},"比如原始输入是：",{"type":18,"tag":66,"props":226,"children":227},{"code":163},[228],{"type":18,"tag":71,"props":229,"children":230},{"__ignoreMap":7},[231],{"type":23,"value":163},{"type":18,"tag":19,"props":233,"children":234},{},[235],{"type":23,"value":236},"经过特征工程后变成类似这样的向量：",{"type":18,"tag":66,"props":238,"children":240},{"code":239},"[0, 1, 0, 0, 1, ..., 0]\n",[241],{"type":18,"tag":71,"props":242,"children":243},{"__ignoreMap":7},[244],{"type":23,"value":239},{"type":18,"tag":19,"props":246,"children":247},{},[248],{"type":23,"value":249},"在这个项目中，每个氨基酸最终都会变成一个 2048 维的向量。",{"type":18,"tag":76,"props":251,"children":253},{"id":252},"模型结构",[254],{"type":23,"value":252},{"type":18,"tag":19,"props":256,"children":257},{},[258],{"type":23,"value":259},"模型使用的是一个很小的 MLP：",{"type":18,"tag":66,"props":261,"children":263},{"code":262},"Input (2048) -> Linear -> ReLU -> Dropout -> Linear -> Output (1)\n",[264],{"type":18,"tag":71,"props":265,"children":266},{"__ignoreMap":7},[267],{"type":23,"value":262},{"type":18,"tag":19,"props":269,"children":270},{},[271],{"type":23,"value":272},"对应代码：",{"type":18,"tag":66,"props":274,"children":276},{"code":275},"model = HydroMLP(in_dim=2048, hidden_layer_sizes=(32,), dropout=0.1)\n",[277],{"type":18,"tag":71,"props":278,"children":279},{"__ignoreMap":7},[280],{"type":23,"value":275},{"type":18,"tag":19,"props":282,"children":283},{},[284],{"type":23,"value":285},"这里有一个重要选择：模型没有设计得很大。",{"type":18,"tag":19,"props":287,"children":288},{},[289],{"type":23,"value":290},"原因是数据只有 20 条，而输入特征却有 2048 维。如果模型太大，它很容易把训练集记住，而不是真的学到亲疏水性的规律。这就是过拟合。",{"type":18,"tag":19,"props":292,"children":293},{},[294],{"type":23,"value":295},"所以我做了两个约束：",{"type":18,"tag":31,"props":297,"children":298},{},[299,310],{"type":18,"tag":35,"props":300,"children":301},{},[302,304],{"type":23,"value":303},"使用较小的隐藏层：",{"type":18,"tag":71,"props":305,"children":307},{"className":306},[],[308],{"type":23,"value":309},"2048 -> 32 -> 1",{"type":18,"tag":35,"props":311,"children":312},{},[313,315,321,323],{"type":23,"value":314},"加入正则化：",{"type":18,"tag":71,"props":316,"children":318},{"className":317},[],[319],{"type":23,"value":320},"Dropout",{"type":23,"value":322}," 和 ",{"type":18,"tag":71,"props":324,"children":326},{"className":325},[],[327],{"type":23,"value":328},"weight_decay",{"type":18,"tag":19,"props":330,"children":331},{},[332],{"type":23,"value":333},"训练时使用：",{"type":18,"tag":66,"props":335,"children":337},{"code":336},"criterion = nn.BCEWithLogitsLoss()\noptimizer = torch.optim.Adam(model.parameters(), lr=0.003, weight_decay=1e-3)\n",[338],{"type":18,"tag":71,"props":339,"children":340},{"__ignoreMap":7},[341],{"type":23,"value":336},{"type":18,"tag":19,"props":343,"children":344},{},[345,351],{"type":18,"tag":71,"props":346,"children":348},{"className":347},[],[349],{"type":23,"value":350},"BCEWithLogitsLoss",{"type":23,"value":352}," 适合二分类任务。它内部会把模型输出的 logit 转换成概率，再计算二分类交叉熵。",{"type":18,"tag":19,"props":354,"children":355},{},[356,361],{"type":18,"tag":71,"props":357,"children":359},{"className":358},[],[360],{"type":23,"value":328},{"type":23,"value":362}," 是 L2 正则化，可以限制模型参数不要变得过大，从而降低过拟合风险。",{"type":18,"tag":76,"props":364,"children":366},{"id":365},"为什么使用留一交叉验证",[367],{"type":23,"value":365},{"type":18,"tag":19,"props":369,"children":370},{},[371],{"type":23,"value":372},"一开始我用的是普通的训练集\u002F测试集划分，例如 15 条训练、5 条测试。",{"type":18,"tag":19,"props":374,"children":375},{},[376],{"type":23,"value":377},"但这个项目只有 20 条数据。测试集只有 5 条时，结果非常容易受随机划分影响。某一次准确率高，不一定说明模型真的好；某一次准确率低，也不一定说明模型完全没学到东西。",{"type":18,"tag":19,"props":379,"children":380},{},[381],{"type":23,"value":382},"因此我改成了留一交叉验证。",{"type":18,"tag":19,"props":384,"children":385},{},[386],{"type":23,"value":387},"留一交叉验证的做法是：",{"type":18,"tag":389,"props":390,"children":392},"ol",{"start":391},0,[393,398,403,408],{"type":18,"tag":35,"props":394,"children":395},{},[396],{"type":23,"value":397},"每次拿 1 个氨基酸作为测试样本",{"type":18,"tag":35,"props":399,"children":400},{},[401],{"type":23,"value":402},"剩下 19 个氨基酸作为训练样本",{"type":18,"tag":35,"props":404,"children":405},{},[406],{"type":23,"value":407},"重复 20 次，让每个氨基酸都当一次测试样本",{"type":18,"tag":35,"props":409,"children":410},{},[411],{"type":23,"value":412},"汇总 20 次预测结果，计算总体准确率",{"type":18,"tag":19,"props":414,"children":415},{},[416],{"type":23,"value":417},"代码中的核心逻辑是：",{"type":18,"tag":66,"props":419,"children":421},{"code":420},"for test_idx in range(len(X)):\n    train_idx = [i for i in range(len(X)) if i != test_idx]\n​\n    model = train_model(X[train_idx], y[train_idx])\n    model.eval()\n​\n    with torch.no_grad():\n        logit = model(X[test_idx].unsqueeze(0))\n        prob = torch.sigmoid(logit).item()\n        pred = int(prob > 0.5)\n",[422],{"type":18,"tag":71,"props":423,"children":424},{"__ignoreMap":7},[425],{"type":23,"value":420},{"type":18,"tag":19,"props":427,"children":428},{},[429],{"type":23,"value":430},"对于小数据集来说，留一交叉验证比单次随机划分更适合用来观察模型表现。",{"type":18,"tag":76,"props":432,"children":434},{"id":433},"如何复现",[435],{"type":23,"value":433},{"type":18,"tag":19,"props":437,"children":438},{},[439,441,447],{"type":23,"value":440},"项目使用 ",{"type":18,"tag":71,"props":442,"children":444},{"className":443},[],[445],{"type":23,"value":446},"uv",{"type":23,"value":448}," 管理依赖。安装依赖后，直接运行主脚本：",{"type":18,"tag":66,"props":450,"children":452},{"code":451},"uv sync\nuv run python main.py\n",[453],{"type":18,"tag":71,"props":454,"children":455},{"__ignoreMap":7},[456],{"type":23,"value":451},{"type":18,"tag":19,"props":458,"children":459},{},[460],{"type":23,"value":461},"项目结构如下：",{"type":18,"tag":66,"props":463,"children":465},{"code":464},"├── data\u002F\n│   └── amino_acids.csv\n├── src\u002F\n│   ├── features.py\n│   └── model.py\n├── main.py\n└── pyproject.toml\n",[466],{"type":18,"tag":71,"props":467,"children":468},{"__ignoreMap":7},[469],{"type":23,"value":464},{"type":18,"tag":19,"props":471,"children":472},{},[473],{"type":23,"value":103},{"type":18,"tag":31,"props":475,"children":476},{},[477,488,499,510],{"type":18,"tag":35,"props":478,"children":479},{},[480,486],{"type":18,"tag":71,"props":481,"children":483},{"className":482},[],[484],{"type":23,"value":485},"data\u002Famino_acids.csv",{"type":23,"value":487}," 保存 20 种氨基酸的数据和标签",{"type":18,"tag":35,"props":489,"children":490},{},[491,497],{"type":18,"tag":71,"props":492,"children":494},{"className":493},[],[495],{"type":23,"value":496},"src\u002Ffeatures.py",{"type":23,"value":498}," 负责把 SMILES 转换成 Morgan 指纹",{"type":18,"tag":35,"props":500,"children":501},{},[502,508],{"type":18,"tag":71,"props":503,"children":505},{"className":504},[],[506],{"type":23,"value":507},"src\u002Fmodel.py",{"type":23,"value":509}," 定义 MLP 模型",{"type":18,"tag":35,"props":511,"children":512},{},[513,519],{"type":18,"tag":71,"props":514,"children":516},{"className":515},[],[517],{"type":23,"value":518},"main.py",{"type":23,"value":520}," 负责训练、留一交叉验证和结果输出",{"type":18,"tag":76,"props":522,"children":524},{"id":523},"实验结果",[525],{"type":23,"value":523},{"type":18,"tag":19,"props":527,"children":528},{},[529],{"type":23,"value":530},"当前运行结果是：",{"type":18,"tag":66,"props":532,"children":534},{"code":533},"留一交叉验证准确率: 65.0% (13\u002F20)\n",[535],{"type":18,"tag":71,"props":536,"children":537},{"__ignoreMap":7},[538],{"type":23,"value":533},{"type":18,"tag":19,"props":540,"children":541},{},[542],{"type":23,"value":543},"部分预测结果如下：",{"type":18,"tag":66,"props":545,"children":547},{"code":546},"✓ 丙氨酸: 预测=疏水 (疏水概率 0.70), 真实=疏水\n✓ 缬氨酸: 预测=疏水 (疏水概率 0.61), 真实=疏水\n✗ 亮氨酸: 预测=亲水 (疏水概率 0.31), 真实=疏水\n✓ 丝氨酸: 预测=亲水 (疏水概率 0.29), 真实=亲水\n✗ 酪氨酸: 预测=疏水 (疏水概率 0.97), 真实=亲水\n✓ 精氨酸: 预测=亲水 (疏水概率 0.02), 真实=亲水\n",[548],{"type":18,"tag":71,"props":549,"children":550},{"__ignoreMap":7},[551],{"type":23,"value":546},{"type":18,"tag":19,"props":553,"children":554},{},[555],{"type":23,"value":556},"这个结果说明模型确实学到了一部分规律，但还不稳定。",{"type":18,"tag":19,"props":558,"children":559},{},[560],{"type":23,"value":561},"比如它能识别一些明显的亲水氨基酸，也能识别一部分疏水氨基酸。但对于边界比较模糊，或者结构上有特殊基团的氨基酸，仍然容易出错。",{"type":18,"tag":19,"props":563,"children":564},{},[565],{"type":23,"value":566},"这也提醒我：不能只看训练集损失。如果训练损失很低，但留一交叉验证表现一般，那模型很可能只是记住了训练样本。",{"type":18,"tag":76,"props":568,"children":570},{"id":569},"我从这个项目学到了什么",[571],{"type":23,"value":569},{"type":18,"tag":573,"props":574,"children":576},"h3",{"id":575},"_1-特征工程是机器学习的入口",[577],{"type":23,"value":578},"1. 特征工程是机器学习的入口",{"type":18,"tag":19,"props":580,"children":581},{},[582],{"type":23,"value":583},"模型并不是直接学习 SMILES 字符串，而是学习 Morgan 指纹。",{"type":18,"tag":19,"props":585,"children":586},{},[587],{"type":23,"value":588},"所以这个项目真正的输入不是：",{"type":18,"tag":66,"props":590,"children":591},{"code":163},[592],{"type":18,"tag":71,"props":593,"children":594},{"__ignoreMap":7},[595],{"type":23,"value":163},{"type":18,"tag":19,"props":597,"children":598},{},[599],{"type":23,"value":600},"而是：",{"type":18,"tag":66,"props":602,"children":604},{"code":603},"2048 维 Morgan 指纹向量\n",[605],{"type":18,"tag":71,"props":606,"children":607},{"__ignoreMap":7},[608],{"type":23,"value":603},{"type":18,"tag":19,"props":610,"children":611},{},[612],{"type":23,"value":613},"特征工程决定了模型能看到什么信息。",{"type":18,"tag":573,"props":615,"children":617},{"id":616},"_2-小数据项目里评估比训练更重要",[618],{"type":23,"value":619},"2. 小数据项目里，评估比训练更重要",{"type":18,"tag":19,"props":621,"children":622},{},[623],{"type":23,"value":624},"只有 20 条数据时，模型很容易把训练集背下来。",{"type":18,"tag":19,"props":626,"children":627},{},[628],{"type":23,"value":629},"如果只看训练损失，很容易得到错误信心。留一交叉验证虽然不能让数据变多，但能让评估更完整。",{"type":18,"tag":573,"props":631,"children":633},{"id":632},"_3-正则化是在限制模型死记硬背",[634],{"type":23,"value":635},"3. 正则化是在限制模型死记硬背",{"type":18,"tag":19,"props":637,"children":638},{},[639],{"type":23,"value":640},"这个项目里使用了两种正则化方式：",{"type":18,"tag":31,"props":642,"children":643},{},[644,652],{"type":18,"tag":35,"props":645,"children":646},{},[647],{"type":18,"tag":71,"props":648,"children":650},{"className":649},[],[651],{"type":23,"value":320},{"type":18,"tag":35,"props":653,"children":654},{},[655],{"type":18,"tag":71,"props":656,"children":658},{"className":657},[],[659],{"type":23,"value":328},{"type":18,"tag":19,"props":661,"children":662},{},[663],{"type":23,"value":664},"它们的目的不是让模型更复杂，而是让模型更克制。",{"type":18,"tag":573,"props":666,"children":668},{"id":667},"_4-baseline-很重要",[669],{"type":23,"value":670},"4. Baseline 很重要",{"type":18,"tag":19,"props":672,"children":673},{},[674],{"type":23,"value":675},"这个项目目前已经有了 MLP，但下一步应该做 baseline。",{"type":18,"tag":19,"props":677,"children":678},{},[679],{"type":23,"value":680},"Baseline 就是一个简单参照模型，用来回答一个问题：",{"type":18,"tag":682,"props":683,"children":684},"blockquote",{},[685],{"type":18,"tag":19,"props":686,"children":687},{},[688],{"type":23,"value":689},"我的复杂模型真的比简单方法更好吗？",{"type":18,"tag":19,"props":691,"children":692},{},[693],{"type":23,"value":694},"可以尝试的 baseline 包括：",{"type":18,"tag":31,"props":696,"children":697},{},[698,703,708],{"type":18,"tag":35,"props":699,"children":700},{},[701],{"type":23,"value":702},"多数类 baseline：永远预测数据中数量更多的类别",{"type":18,"tag":35,"props":704,"children":705},{},[706],{"type":23,"value":707},"Logistic Regression：使用同样的 Morgan 指纹，但只训练线性分类器",{"type":18,"tag":35,"props":709,"children":710},{},[711],{"type":23,"value":712},"RDKit 描述符模型：使用 LogP、分子量、TPSA、氢键供体\u002F受体数量等少量特征",{"type":18,"tag":19,"props":714,"children":715},{},[716],{"type":23,"value":717},"如果一个简单 baseline 就能达到和 MLP 接近的准确率，那说明 MLP 可能并没有带来明显收益。",{"type":18,"tag":76,"props":719,"children":721},{"id":720},"项目局限",[722],{"type":23,"value":720},{"type":18,"tag":19,"props":724,"children":725},{},[726],{"type":23,"value":727},"这个项目是一个学习项目，不适合直接当作严肃的化学预测模型。",{"type":18,"tag":19,"props":729,"children":730},{},[731],{"type":23,"value":732},"主要局限有：",{"type":18,"tag":31,"props":734,"children":735},{},[736,741,746,751,756],{"type":18,"tag":35,"props":737,"children":738},{},[739],{"type":23,"value":740},"数据只有 20 条，远远不够训练稳定模型",{"type":18,"tag":35,"props":742,"children":743},{},[744],{"type":23,"value":745},"亲疏水性本身不是绝对二分类，不同教材或标度可能会有不同划分",{"type":18,"tag":35,"props":747,"children":748},{},[749],{"type":23,"value":750},"Morgan 指纹只是一种特征表示，可能没有捕捉到所有与亲疏水性相关的信息",{"type":18,"tag":35,"props":752,"children":753},{},[754],{"type":23,"value":755},"没有和 baseline 模型做系统比较",{"type":18,"tag":35,"props":757,"children":758},{},[759],{"type":23,"value":760},"没有调参实验，也没有更多评价指标",{"type":18,"tag":19,"props":762,"children":763},{},[764],{"type":23,"value":765},"这些局限并不代表项目没有价值。相反，它们正好说明了机器学习实验中最重要的一点：模型结果必须结合数据、特征、评估方式一起解释。",{"type":18,"tag":76,"props":767,"children":769},{"id":768},"后续可以继续做什么",[770],{"type":23,"value":768},{"type":18,"tag":19,"props":772,"children":773},{},[774],{"type":23,"value":775},"后续我想从三个方向继续优化：",{"type":18,"tag":389,"props":777,"children":778},{"start":391},[779,784,789],{"type":18,"tag":35,"props":780,"children":781},{},[782],{"type":23,"value":783},"增加 baseline 用 Logistic Regression、Random Forest 或多数类预测作为对照。",{"type":18,"tag":35,"props":785,"children":786},{},[787],{"type":23,"value":788},"尝试 RDKit 分子描述符 不只使用 Morgan 指纹，还可以加入 LogP、TPSA、分子量、氢键供体和受体数量等更直观的化学特征。",{"type":18,"tag":35,"props":790,"children":791},{},[792],{"type":23,"value":793},"输出更多评价指标 除了 accuracy，还可以看 confusion matrix、precision、recall，观察模型到底更容易把哪一类预测错。",{"type":18,"tag":76,"props":795,"children":797},{"id":796},"总结",[798],{"type":23,"value":796},{"type":18,"tag":19,"props":800,"children":801},{},[802],{"type":23,"value":803},"这个项目很小，但它让我完整走了一遍机器学习实验流程：",{"type":18,"tag":66,"props":805,"children":807},{"code":806},"数据 -> 特征工程 -> 模型 -> 训练 -> 正则化 -> 交叉验证 -> 结果解释\n",[808],{"type":18,"tag":71,"props":809,"children":810},{"__ignoreMap":7},[811],{"type":23,"value":806},{"type":18,"tag":19,"props":813,"children":814},{},[815],{"type":23,"value":816},"对我来说，这个项目最重要的收获不是准确率有多高，而是开始理解：",{"type":18,"tag":31,"props":818,"children":819},{},[820,825,830,835,840],{"type":18,"tag":35,"props":821,"children":822},{},[823],{"type":23,"value":824},"模型只能学习它看到的特征",{"type":18,"tag":35,"props":826,"children":827},{},[828],{"type":23,"value":829},"训练集表现好不代表泛化能力强",{"type":18,"tag":35,"props":831,"children":832},{},[833],{"type":23,"value":834},"小数据集更需要谨慎评估",{"type":18,"tag":35,"props":836,"children":837},{},[838],{"type":23,"value":839},"baseline 是判断模型价值的参照物",{"type":18,"tag":35,"props":841,"children":842},{},[843],{"type":23,"value":844},"机器学习结果需要被解释，而不是只被展示",{"type":18,"tag":19,"props":846,"children":847},{},[848],{"type":23,"value":849},"这也是我觉得这个项目适合作为机器学习入门练习的原因：它不大，但关键概念都在里面。",{"type":18,"tag":851,"props":852,"children":854},"h4",{"id":853},"项目地址httpsgiteecomo_insistmpl-hydrophobicity",[855,857],{"type":23,"value":856},"项目地址：",{"type":18,"tag":858,"props":859,"children":863},"a",{"href":860,"rel":861},"https:\u002F\u002Fgitee.com\u002Fo_insist\u002Fmpl-hydrophobicity",[862],"nofollow",[864],{"type":23,"value":860},{"title":7,"searchDepth":866,"depth":866,"links":867},2,[868,869,870,871,872,873,874,881,882,883],{"id":78,"depth":866,"text":78},{"id":152,"depth":866,"text":155},{"id":252,"depth":866,"text":252},{"id":365,"depth":866,"text":365},{"id":433,"depth":866,"text":433},{"id":523,"depth":866,"text":523},{"id":569,"depth":866,"text":569,"children":875},[876,878,879,880],{"id":575,"depth":877,"text":578},3,{"id":616,"depth":877,"text":619},{"id":632,"depth":877,"text":635},{"id":667,"depth":877,"text":670},{"id":720,"depth":866,"text":720},{"id":768,"depth":866,"text":768},{"id":796,"depth":866,"text":796},"markdown","content:articles:ai:mpl-hydrophobicity.md","content","articles\u002Fai\u002Fmpl-hydrophobicity.md","articles\u002Fai\u002Fmpl-hydrophobicity","md",1779811687795]