[{"data":1,"prerenderedAt":2108},["ShallowReactive",2],{"article-ai\u002Fsingle-cell":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"tags":11,"body":14,"_type":2102,"_id":2103,"_source":2104,"_file":2105,"_stem":2106,"_extension":2107},"\u002Farticles\u002Fai\u002Fsingle-cell","ai",false,"","Scanpy scRNA-seq 单细胞转录组标准分析实战","以 PBMC 3k 为例，用 Scanpy 跑通 scRNA-seq 全流程——QC 过滤、归一化、高变基因筛选、PCA 降维、UMAP 可视化、Leiden 聚类与细胞类型注释，并理解每条咒语背后的「为什么」。","2026-07-01",[12,13],"生物信息学","AI制药",{"type":15,"children":16,"toc":2087},"root",[17,26,40,50,62,124,133,180,187,197,213,238,248,291,306,363,369,381,391,441,463,502,508,515,527,579,594,599,682,688,740,746,823,836,875,880,927,933,943,952,991,1007,1030,1061,1067,1072,1087,1110,1148,1154,1204,1235,1274,1280,1325,1334,1356,1387,1393,1411,1434,1474,1497,1520,1526,1537,1546,1561,1584,1590,1602,1728,1738,1761,1773,1809,1818,1849,1859,1932,1971,2002,2008,2026,2067,2081],{"type":18,"tag":19,"props":20,"children":22},"element","h1",{"id":21},"scrna-seq-标准管道总览",[23],{"type":24,"value":25},"text","📊 scRNA-seq 标准管道总览",{"type":18,"tag":27,"props":28,"children":29},"p",{},[30,32,38],{"type":24,"value":31},"把整个分析当成一条 ",{"type":18,"tag":33,"props":34,"children":35},"strong",{},[36],{"type":24,"value":37},"ETL 数据管道",{"type":24,"value":39},"：输入一个又大又稀疏的矩阵（细胞 × 基因，整数计数，90%+ 是 0），经过一串变换，输出「每个细胞是什么类型」。",{"type":18,"tag":41,"props":42,"children":44},"pre",{"code":43},"QC过滤 → 归一化 → 选高变基因 → 降维(PCA→UMAP) → 聚类(Leiden) → 注释 → 差异表达\n",[45],{"type":18,"tag":46,"props":47,"children":48},"code",{"__ignoreMap":7},[49],{"type":24,"value":43},{"type":18,"tag":27,"props":51,"children":52},{},[53,55,60],{"type":24,"value":54},"数据装在 ",{"type":18,"tag":33,"props":56,"children":57},{},[58],{"type":24,"value":59},"AnnData",{"type":24,"value":61}," 
对象里，整条管道就是不断往它身上加东西：",{"type":18,"tag":63,"props":64,"children":65},"ul",{},[66,78,96,113],{"type":18,"tag":67,"props":68,"children":69},"li",{},[70,76],{"type":18,"tag":46,"props":71,"children":73},{"className":72},[],[74],{"type":24,"value":75},".X",{"type":24,"value":77}," —— 表达矩阵（行=细胞，列=基因）",{"type":18,"tag":67,"props":79,"children":80},{},[81,87,89,94],{"type":18,"tag":46,"props":82,"children":84},{"className":83},[],[85],{"type":24,"value":86},".obs",{"type":24,"value":88}," —— 每个",{"type":18,"tag":33,"props":90,"children":91},{},[92],{"type":24,"value":93},"细胞",{"type":24,"value":95},"的元数据（行注释：QC 指标、聚类标签、细胞类型…）",{"type":18,"tag":67,"props":97,"children":98},{},[99,105,106,111],{"type":18,"tag":46,"props":100,"children":102},{"className":101},[],[103],{"type":24,"value":104},".var",{"type":24,"value":88},{"type":18,"tag":33,"props":107,"children":108},{},[109],{"type":24,"value":110},"基因",{"type":24,"value":112},"的元数据（列注释：是否线粒体、是否高变…）",{"type":18,"tag":67,"props":114,"children":115},{},[116,122],{"type":18,"tag":46,"props":117,"children":119},{"className":118},[],[120],{"type":24,"value":121},".uns",{"type":24,"value":123}," —— 非结构化的全局结果（邻居图参数、配色…）",{"type":18,"tag":125,"props":126,"children":127},"blockquote",{},[128],{"type":18,"tag":27,"props":129,"children":130},{},[131],{"type":24,"value":132},"记住这个结构：后面每一步在改谁（加 obs 列？裁 var？换 X？），就一目了然。",{"type":18,"tag":41,"props":134,"children":138},{"code":135,"language":136,"meta":7,"className":137,"style":7},"import scanpy as sc\nimport numpy as np\nsc.settings.verbosity = 1\nsc.settings.set_figure_params(dpi=80)\n","python","language-python shiki shiki-themes github-dark",[139],{"type":18,"tag":46,"props":140,"children":141},{"__ignoreMap":7},[142,153,162,171],{"type":18,"tag":143,"props":144,"children":147},"span",{"class":145,"line":146},"line",1,[148],{"type":18,"tag":143,"props":149,"children":150},{},[151],{"type":24,"value":152},"import scanpy as 
sc\n",{"type":18,"tag":143,"props":154,"children":156},{"class":145,"line":155},2,[157],{"type":18,"tag":143,"props":158,"children":159},{},[160],{"type":24,"value":161},"import numpy as np\n",{"type":18,"tag":143,"props":163,"children":165},{"class":145,"line":164},3,[166],{"type":18,"tag":143,"props":167,"children":168},{},[169],{"type":24,"value":170},"sc.settings.verbosity = 1\n",{"type":18,"tag":143,"props":172,"children":174},{"class":145,"line":173},4,[175],{"type":18,"tag":143,"props":176,"children":177},{},[178],{"type":24,"value":179},"sc.settings.set_figure_params(dpi=80)\n",{"type":18,"tag":181,"props":182,"children":184},"h2",{"id":183},"第-1-步加载数据-认识-anndata",[185],{"type":24,"value":186},"第 1 步：加载数据 + 认识 AnnData",{"type":18,"tag":27,"props":188,"children":189},{},[190,195],{"type":18,"tag":33,"props":191,"children":192},{},[193],{"type":24,"value":194},"干什么",{"type":24,"value":196},"：下载 PBMC 3k（外周血单个核细胞，~2700 个，Scanpy 官方入门标配），并把基因名去重。",{"type":18,"tag":27,"props":198,"children":199},{},[200,205,207],{"type":18,"tag":33,"props":201,"children":202},{},[203],{"type":24,"value":204},"看什么",{"type":24,"value":206},"：输出 ",{"type":18,"tag":46,"props":208,"children":210},{"className":209},[],[211],{"type":24,"value":212},"AnnData object with n_obs × n_vars = 2700 × 32738",{"type":18,"tag":63,"props":214,"children":215},{},[216,227],{"type":18,"tag":67,"props":217,"children":218},{},[219,225],{"type":18,"tag":46,"props":220,"children":222},{"className":221},[],[223],{"type":24,"value":224},"n_obs",{"type":24,"value":226}," = 细胞数（行）",{"type":18,"tag":67,"props":228,"children":229},{},[230,236],{"type":18,"tag":46,"props":231,"children":233},{"className":232},[],[234],{"type":24,"value":235},"n_vars",{"type":24,"value":237}," = 
基因数（列）",{"type":18,"tag":27,"props":239,"children":240},{},[241,246],{"type":18,"tag":33,"props":242,"children":243},{},[244],{"type":24,"value":245},"动手感受数据结构",{"type":24,"value":247},"（把注释逐行打开跑一下）：",{"type":18,"tag":63,"props":249,"children":250},{},[251,269,280],{"type":18,"tag":67,"props":252,"children":253},{},[254,260,262,267],{"type":18,"tag":46,"props":255,"children":257},{"className":256},[],[258],{"type":24,"value":259},"adata.X[:5,:5].toarray()",{"type":24,"value":261}," —— 稀疏矩阵的一角，绝大多数是 0（其中一部分是 ",{"type":18,"tag":33,"props":263,"children":264},{},[265],{"type":24,"value":266},"dropout",{"type":24,"value":268},"：基因其实有表达，只是没被测到；其余才是真的不表达）",{"type":18,"tag":67,"props":270,"children":271},{},[272,278],{"type":18,"tag":46,"props":273,"children":275},{"className":274},[],[276],{"type":24,"value":277},"adata.obs.head()",{"type":24,"value":279}," —— 此刻几乎是空的，后面每一步会往这里加列",{"type":18,"tag":67,"props":281,"children":282},{},[283,289],{"type":18,"tag":46,"props":284,"children":286},{"className":285},[],[287],{"type":24,"value":288},"adata.var.head()",{"type":24,"value":290}," —— 每个基因的信息（现在只有 gene_ids）",{"type":18,"tag":125,"props":292,"children":293},{},[294],{"type":18,"tag":27,"props":295,"children":296},{},[297,299,304],{"type":24,"value":298},"对比 bulk RNA-seq：以前是把上百万细胞",{"type":18,"tag":33,"props":300,"children":301},{},[302],{"type":24,"value":303},"混在一起测平均",{"type":24,"value":305},"，丢了异质性。单细胞把这个平均「拆」到每个细胞——这才是能区分组织里有哪些细胞类型的基础。",{"type":18,"tag":41,"props":307,"children":309},{"code":308,"language":136,"meta":7,"className":137,"style":7},"adata = sc.datasets.pbmc3k()      # 自动下载\nadata.var_names_make_unique()     # 基因名去重，养成习惯\nprint(adata)\n# print(adata.X[:5,:5].toarray())  # 稀疏矩阵的一角，大多是 0\n# print(adata.obs.head())   # 每细胞元数据\nprint(adata.var.head())  # 
每基因元数据\n",[310],{"type":18,"tag":46,"props":311,"children":312},{"__ignoreMap":7},[313,321,329,337,345,354],{"type":18,"tag":143,"props":314,"children":315},{"class":145,"line":146},[316],{"type":18,"tag":143,"props":317,"children":318},{},[319],{"type":24,"value":320},"adata = sc.datasets.pbmc3k()      # 自动下载\n",{"type":18,"tag":143,"props":322,"children":323},{"class":145,"line":155},[324],{"type":18,"tag":143,"props":325,"children":326},{},[327],{"type":24,"value":328},"adata.var_names_make_unique()     # 基因名去重，养成习惯\n",{"type":18,"tag":143,"props":330,"children":331},{"class":145,"line":164},[332],{"type":18,"tag":143,"props":333,"children":334},{},[335],{"type":24,"value":336},"print(adata)\n",{"type":18,"tag":143,"props":338,"children":339},{"class":145,"line":173},[340],{"type":18,"tag":143,"props":341,"children":342},{},[343],{"type":24,"value":344},"# print(adata.X[:5,:5].toarray())  # 稀疏矩阵的一角，大多是 0\n",{"type":18,"tag":143,"props":346,"children":348},{"class":145,"line":347},5,[349],{"type":18,"tag":143,"props":350,"children":351},{},[352],{"type":24,"value":353},"# print(adata.obs.head())   # 每细胞元数据\n",{"type":18,"tag":143,"props":355,"children":357},{"class":145,"line":356},6,[358],{"type":18,"tag":143,"props":359,"children":360},{},[361],{"type":24,"value":362},"print(adata.var.head())  # 每基因元数据\n",{"type":18,"tag":181,"props":364,"children":366},{"id":365},"第-2-步质控-qc-扔掉不是真细胞的行",[367],{"type":24,"value":368},"第 2 步：质控 QC —— 扔掉「不是真细胞」的行",{"type":18,"tag":27,"props":370,"children":371},{},[372,374,379],{"type":24,"value":373},"液滴测序里混着空泡、濒死细胞、还有 ",{"type":18,"tag":33,"props":375,"children":376},{},[377],{"type":24,"value":378},"doublet",{"type":24,"value":380},"（两个细胞挤进一个液滴被当成一个）。这些是技术噪声，不清掉会污染后面所有步骤。",{"type":18,"tag":27,"props":382,"children":383},{},[384,389],{"type":18,"tag":33,"props":385,"children":386},{},[387],{"type":24,"value":388},"先算三个每细胞指标",{"type":24,"value":390},"（下面这个 
cell）：",{"type":18,"tag":63,"props":392,"children":393},{},[394,412,423],{"type":18,"tag":67,"props":395,"children":396},{},[397,403,405,410],{"type":18,"tag":46,"props":398,"children":400},{"className":399},[],[401],{"type":24,"value":402},"n_genes_by_counts",{"type":24,"value":404}," —— 这个细胞测到多少",{"type":18,"tag":33,"props":406,"children":407},{},[408],{"type":24,"value":409},"种",{"type":24,"value":411},"基因。太少 = 空液滴\u002F破细胞",{"type":18,"tag":67,"props":413,"children":414},{},[415,421],{"type":18,"tag":46,"props":416,"children":418},{"className":417},[],[419],{"type":24,"value":420},"total_counts",{"type":24,"value":422}," —— 总分子数，即测序深度",{"type":18,"tag":67,"props":424,"children":425},{},[426,432,434,439],{"type":18,"tag":46,"props":427,"children":429},{"className":428},[],[430],{"type":24,"value":431},"pct_counts_mt",{"type":24,"value":433}," —— ",{"type":18,"tag":33,"props":435,"children":436},{},[437],{"type":24,"value":438},"线粒体基因占比",{"type":24,"value":440},"。太高 = 细胞在死（膜破了，胞质 mRNA 漏走，线粒体 mRNA 还在，占比飙高）",{"type":18,"tag":125,"props":442,"children":443},{},[444],{"type":18,"tag":27,"props":445,"children":446},{},[447,453,455,461],{"type":18,"tag":46,"props":448,"children":450},{"className":449},[],[451],{"type":24,"value":452},"adata.var['mt'] = adata.var_names.str.startswith('MT-')",{"type":24,"value":454}," 是在标记哪些基因是线粒体基因（人类基因名以 ",{"type":18,"tag":46,"props":456,"children":458},{"className":457},[],[459],{"type":24,"value":460},"MT-",{"type":24,"value":462}," 开头），有了这个标记才能算出 mt 占比。",{"type":18,"tag":41,"props":464,"children":466},{"code":465,"language":136,"meta":7,"className":137,"style":7},"# 给每个基因打一个布尔标签，名字以MT-开头 -> True(线粒体基因)，其余的是False\n# 先告诉程序\"哪些基因是线粒体基因\"，再让 scanpy 一次性算出所有 QC 指标，为后续过滤低质量细胞做准备。\nadata.var['mt'] = adata.var_names.str.startswith('MT-')   # 线粒体基因（人类是 MT- 开头）\nsc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, 
inplace=True)\n",[467],{"type":18,"tag":46,"props":468,"children":469},{"__ignoreMap":7},[470,478,486,494],{"type":18,"tag":143,"props":471,"children":472},{"class":145,"line":146},[473],{"type":18,"tag":143,"props":474,"children":475},{},[476],{"type":24,"value":477},"# 给每个基因打一个布尔标签，名字以MT-开头 -> True(线粒体基因)，其余的是False\n",{"type":18,"tag":143,"props":479,"children":480},{"class":145,"line":155},[481],{"type":18,"tag":143,"props":482,"children":483},{},[484],{"type":24,"value":485},"# 先告诉程序\"哪些基因是线粒体基因\"，再让 scanpy 一次性算出所有 QC 指标，为后续过滤低质量细胞做准备。\n",{"type":18,"tag":143,"props":487,"children":488},{"class":145,"line":164},[489],{"type":18,"tag":143,"props":490,"children":491},{},[492],{"type":24,"value":493},"adata.var['mt'] = adata.var_names.str.startswith('MT-')   # 线粒体基因（人类是 MT- 开头）\n",{"type":18,"tag":143,"props":495,"children":496},{"class":145,"line":173},[497],{"type":18,"tag":143,"props":498,"children":499},{},[500],{"type":24,"value":501},"sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)\n",{"type":18,"tag":181,"props":503,"children":505},{"id":504},"看分布先看再砍-怎么读这几张图",[506],{"type":24,"value":507},"看分布（先看再砍）—— 怎么读这几张图",{"type":18,"tag":509,"props":510,"children":512},"h4",{"id":511},"先搞懂小提琴图violin在画什么",[513],{"type":24,"value":514},"先搞懂：小提琴图（violin）在画什么",{"type":18,"tag":27,"props":516,"children":517},{},[518,520,525],{"type":24,"value":519},"把每张小提琴图想象成一个",{"type":18,"tag":33,"props":521,"children":522},{},[523],{"type":24,"value":524},"竖着的、对称的「烛台\u002F陀螺」",{"type":24,"value":526},"：",{"type":18,"tag":63,"props":528,"children":529},{},[530,538,555],{"type":18,"tag":67,"props":531,"children":532},{},[533],{"type":18,"tag":33,"props":534,"children":535},{},[536],{"type":24,"value":537},"纵轴 = 指标数值；横轴没含义。",{"type":18,"tag":67,"props":539,"children":540},{},[541,546,548,553],{"type":18,"tag":33,"props":542,"children":543},{},[544],{"type":24,"value":545},"某高度上「胖」还是「瘦」= 有多少细胞落在那个数值",{"type":24,"value":547},"。肚子最胖处 = 
大多数细胞扎堆的值；越往上越细 = 越少细胞有那么高的值。本质就是",{"type":18,"tag":33,"props":549,"children":550},{},[551],{"type":24,"value":552},"直方图竖过来再左右镜像",{"type":24,"value":554},"（两边一样是镜像，不是两组数据）。",{"type":18,"tag":67,"props":556,"children":557},{},[558,563,565,570,572,577],{"type":18,"tag":33,"props":559,"children":560},{},[561],{"type":24,"value":562},"黑点 = 一个个细胞",{"type":24,"value":564},"；它们左右散开（jitter）",{"type":18,"tag":33,"props":566,"children":567},{},[568],{"type":24,"value":569},"只为不重叠，横向位置随机、没意义",{"type":24,"value":571},"——只看每个点的",{"type":18,"tag":33,"props":573,"children":574},{},[575],{"type":24,"value":576},"高度",{"type":24,"value":578},"。",{"type":18,"tag":125,"props":580,"children":581},{},[582],{"type":18,"tag":27,"props":583,"children":584},{},[585,587,592],{"type":24,"value":586},"一句话：",{"type":18,"tag":33,"props":588,"children":589},{},[590],{"type":24,"value":591},"胖肚子 = 正常细胞扎堆处；向上拖出的细尾巴 + 稀疏黑点 = 数值异常的细胞",{"type":24,"value":593},"，QC 要砍的就是这些尾巴。",{"type":18,"tag":509,"props":595,"children":597},{"id":596},"逐张看",[598],{"type":24,"value":596},{"type":18,"tag":63,"props":600,"children":601},{},[602,637,652],{"type":18,"tag":67,"props":603,"children":604},{},[605,615,617,621,623,627,629,635],{"type":18,"tag":33,"props":606,"children":607},{},[608,610],{"type":24,"value":609},"图1 ",{"type":18,"tag":46,"props":611,"children":613},{"className":612},[],[614],{"type":24,"value":402},{"type":24,"value":616},"（每细胞测到多少",{"type":18,"tag":33,"props":618,"children":619},{},[620],{"type":24,"value":409},{"type":24,"value":622},"基因）：胖肚子 ~500-1200（正常）；少数点拖到 2500-3500 → 可疑 ",{"type":18,"tag":33,"props":624,"children":625},{},[626],{"type":24,"value":378},{"type":24,"value":628},"（两个细胞挤一起，基因种类翻倍）。→ 下一步 ",{"type":18,"tag":46,"props":630,"children":632},{"className":631},[],[633],{"type":24,"value":634},"\u003C 2500",{"type":24,"value":636}," 
剪掉这条尾巴。",{"type":18,"tag":67,"props":638,"children":639},{},[640,650],{"type":18,"tag":33,"props":641,"children":642},{},[643,645],{"type":24,"value":644},"图2 ",{"type":18,"tag":46,"props":646,"children":648},{"className":647},[],[649],{"type":24,"value":420},{"type":24,"value":651},"（每细胞总分子数 = 测序深度）：胖肚子 ~1500-3500，尾巴到 16000。同理，特别高的也常是 doublet（和图1联动）。",{"type":18,"tag":67,"props":653,"children":654},{},[655,665,667,672,674,680],{"type":18,"tag":33,"props":656,"children":657},{},[658,660],{"type":24,"value":659},"图3 ",{"type":18,"tag":46,"props":661,"children":663},{"className":662},[],[664],{"type":24,"value":431},{"type":24,"value":666},"（线粒体占比 %）：胖肚子 ~1-3%（健康）；点拖到 5\u002F10\u002F20% → ",{"type":18,"tag":33,"props":668,"children":669},{},[670],{"type":24,"value":671},"濒死细胞",{"type":24,"value":673},"（膜破，胞质 mRNA 漏走，线粒体的还在，占比飙高）。→ 下一步 ",{"type":18,"tag":46,"props":675,"children":677},{"className":676},[],[678],{"type":24,"value":679},"\u003C 5",{"type":24,"value":681}," 砍掉死细胞。",{"type":18,"tag":509,"props":683,"children":685},{"id":684},"两张散点图每点仍是一个细胞看两指标的关系",[686],{"type":24,"value":687},"两张散点图（每点仍是一个细胞，看「两指标的关系」）",{"type":18,"tag":63,"props":689,"children":690},{},[691,713],{"type":18,"tag":67,"props":692,"children":693},{},[694,711],{"type":18,"tag":33,"props":695,"children":696},{},[697,702,704,709],{"type":18,"tag":46,"props":698,"children":700},{"className":699},[],[701],{"type":24,"value":420},{"type":24,"value":703},"(x) vs ",{"type":18,"tag":46,"props":705,"children":707},{"className":706},[],[708],{"type":24,"value":431},{"type":24,"value":710},"(y)",{"type":24,"value":712},"：看上方点（线粒体高）。典型死细胞 = 总数低 + 线粒体高。确认 5% 
这条线合不合理。",{"type":18,"tag":67,"props":714,"children":715},{},[716,731,733,738],{"type":18,"tag":33,"props":717,"children":718},{},[719,724,725,730],{"type":18,"tag":46,"props":720,"children":722},{"className":721},[],[723],{"type":24,"value":420},{"type":24,"value":703},{"type":18,"tag":46,"props":726,"children":728},{"className":727},[],[729],{"type":24,"value":402},{"type":24,"value":710},{"type":24,"value":732},"：正常应",{"type":18,"tag":33,"props":734,"children":735},{},[736],{"type":24,"value":737},"正相关",{"type":24,"value":739},"（点排成往右上的曲线带）；偏离主带的点 = 可疑 doublet。确认数据行为正常、揪离群点。",{"type":18,"tag":509,"props":741,"children":743},{"id":742},"看图的目的-用眼睛定阈值",[744],{"type":24,"value":745},"看图的目的 = 用眼睛定阈值",{"type":18,"tag":747,"props":748,"children":749},"table",{},[750,774],{"type":18,"tag":751,"props":752,"children":753},"thead",{},[754],{"type":18,"tag":755,"props":756,"children":757},"tr",{},[758,764,769],{"type":18,"tag":759,"props":760,"children":761},"th",{},[762],{"type":24,"value":763},"看哪张",{"type":18,"tag":759,"props":765,"children":766},{},[767],{"type":24,"value":768},"看到什么",{"type":18,"tag":759,"props":770,"children":771},{},[772],{"type":24,"value":773},"定哪个阈值",{"type":18,"tag":775,"props":776,"children":777},"tbody",{},[778,801],{"type":18,"tag":755,"props":779,"children":780},{},[781,787,792],{"type":18,"tag":782,"props":783,"children":784},"td",{},[785],{"type":24,"value":786},"图1 n_genes",{"type":18,"tag":782,"props":788,"children":789},{},[790],{"type":24,"value":791},"主群到 ~2000，再上是稀疏尾巴",{"type":18,"tag":782,"props":793,"children":794},{},[795],{"type":18,"tag":46,"props":796,"children":798},{"className":797},[],[799],{"type":24,"value":800},"n_genes_by_counts \u003C 2500",{"type":18,"tag":755,"props":802,"children":803},{},[804,809,814],{"type":18,"tag":782,"props":805,"children":806},{},[807],{"type":24,"value":808},"图3 pct_mt",{"type":18,"tag":782,"props":810,"children":811},{},[812],{"type":24,"value":813},"主群在 3% 以下，5% 
以上零星",{"type":18,"tag":782,"props":815,"children":816},{},[817],{"type":18,"tag":46,"props":818,"children":820},{"className":819},[],[821],{"type":24,"value":822},"pct_counts_mt \u003C 5",{"type":18,"tag":125,"props":824,"children":825},{},[826],{"type":18,"tag":27,"props":827,"children":828},{},[829,834],{"type":18,"tag":33,"props":830,"children":831},{},[832],{"type":24,"value":833},"「主群在哪结束、垃圾尾巴从哪开始」这条线是拿眼睛估的，没有标准答案",{"type":24,"value":835},"——这正是清单 3.0 的 L2 判断点。将来把这步做成 Agent 的 tool，这两个数就是要暴露给用户调、或写护栏（「砍掉超过 X% 细胞就报警」）的参数。",{"type":18,"tag":41,"props":837,"children":839},{"code":838,"language":136,"meta":7,"className":137,"style":7},"# 看图定阈值\nsc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'], jitter=0.4, multi_panel=True)\nsc.pl.scatter(adata, x='total_counts', y='pct_counts_mt')\nsc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts')\n    \n",[840],{"type":18,"tag":46,"props":841,"children":842},{"__ignoreMap":7},[843,851,859,867],{"type":18,"tag":143,"props":844,"children":845},{"class":145,"line":146},[846],{"type":18,"tag":143,"props":847,"children":848},{},[849],{"type":24,"value":850},"# 看图定阈值\n",{"type":18,"tag":143,"props":852,"children":853},{"class":145,"line":155},[854],{"type":18,"tag":143,"props":855,"children":856},{},[857],{"type":24,"value":858},"sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'], jitter=0.4, multi_panel=True)\n",{"type":18,"tag":143,"props":860,"children":861},{"class":145,"line":164},[862],{"type":18,"tag":143,"props":863,"children":864},{},[865],{"type":24,"value":866},"sc.pl.scatter(adata, x='total_counts', y='pct_counts_mt')\n",{"type":18,"tag":143,"props":868,"children":869},{"class":145,"line":173},[870],{"type":18,"tag":143,"props":871,"children":872},{},[873],{"type":24,"value":874},"sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts')\n",{"type":18,"tag":27,"props":876,"children":877},{},[878],{"type":24,"value":879},"过滤\n看什么：细胞数从 2700 降到 ~2638，基因数降到 
~13714。\n坑\u002F决策点：2500 和 5% 这两个阈值不是金科玉律，是看着上面的图定的。砍太狠会丢稀有真细胞，太松留垃圾。← 这就是清单 3.0 说的 L2 判断；将来这就是你 tool 里要暴露\u002F加护栏的参数。",{"type":18,"tag":41,"props":881,"children":883},{"code":882,"language":136,"meta":7,"className":137,"style":7},"sc.pp.filter_cells(adata, min_genes=200)      # 细胞至少测到200种基因\nsc.pp.filter_genes(adata, min_cells=3)        # 基因至少在3个细胞里出现\nadata = adata[adata.obs.n_genes_by_counts \u003C 2500, :]   # 砍掉疑似doublet\nadata = adata[adata.obs.pct_counts_mt \u003C 5, :]          # 砍掉濒死细胞\nadata\n",[884],{"type":18,"tag":46,"props":885,"children":886},{"__ignoreMap":7},[887,895,903,911,919],{"type":18,"tag":143,"props":888,"children":889},{"class":145,"line":146},[890],{"type":18,"tag":143,"props":891,"children":892},{},[893],{"type":24,"value":894},"sc.pp.filter_cells(adata, min_genes=200)      # 细胞至少测到200种基因\n",{"type":18,"tag":143,"props":896,"children":897},{"class":145,"line":155},[898],{"type":18,"tag":143,"props":899,"children":900},{},[901],{"type":24,"value":902},"sc.pp.filter_genes(adata, min_cells=3)        # 基因至少在3个细胞里出现\n",{"type":18,"tag":143,"props":904,"children":905},{"class":145,"line":164},[906],{"type":18,"tag":143,"props":907,"children":908},{},[909],{"type":24,"value":910},"adata = adata[adata.obs.n_genes_by_counts \u003C 2500, :]   # 砍掉疑似doublet\n",{"type":18,"tag":143,"props":912,"children":913},{"class":145,"line":173},[914],{"type":18,"tag":143,"props":915,"children":916},{},[917],{"type":24,"value":918},"adata = adata[adata.obs.pct_counts_mt \u003C 5, :]          # 砍掉濒死细胞\n",{"type":18,"tag":143,"props":920,"children":921},{"class":145,"line":347},[922],{"type":18,"tag":143,"props":923,"children":924},{},[925],{"type":24,"value":926},"adata\n",{"type":18,"tag":181,"props":928,"children":930},{"id":929},"第-3-步归一化-让细胞之间可比",[931],{"type":24,"value":932},"第 3 步：归一化 —— 
让细胞之间可比",{"type":18,"tag":27,"props":934,"children":935},{},[936,941],{"type":18,"tag":33,"props":937,"children":938},{},[939],{"type":24,"value":940},"问题",{"type":24,"value":942},"：每个细胞捕获的总分子数天差地别（纯技术原因）。细胞 A 测到 5000 条、B 测到 500 条，直接比没意义。",{"type":18,"tag":27,"props":944,"children":945},{},[946,951],{"type":18,"tag":33,"props":947,"children":948},{},[949],{"type":24,"value":950},"两步",{"type":24,"value":526},{"type":18,"tag":953,"props":954,"children":955},"ol",{},[956,967],{"type":18,"tag":67,"props":957,"children":958},{},[959,965],{"type":18,"tag":46,"props":960,"children":962},{"className":961},[],[963],{"type":24,"value":964},"normalize_total(target_sum=1e4)",{"type":24,"value":966}," —— 每个细胞缩放到相同总量（1 万）",{"type":18,"tag":67,"props":968,"children":969},{},[970,976,978,984,986],{"type":18,"tag":46,"props":971,"children":973},{"className":972},[],[974],{"type":24,"value":975},"log1p",{"type":24,"value":977}," —— 取 ",{"type":18,"tag":46,"props":979,"children":981},{"className":980},[],[982],{"type":24,"value":983},"log(1+x)",{"type":24,"value":985},"。基因表达跨好几个数量级，log 压缩量纲、让方差更均匀、防止高表达基因主导。",{"type":18,"tag":33,"props":987,"children":988},{},[989],{"type":24,"value":990},"等价于你做 ML 前的特征缩放。",{"type":18,"tag":27,"props":992,"children":993},{},[994,1000,1002],{"type":18,"tag":46,"props":995,"children":997},{"className":996},[],[998],{"type":24,"value":999},"adata.raw = adata",{"type":24,"value":1001}," —— 存一份当前（全基因、已归一化）状态的备份。第 4 步会把基因裁到 ~2000 个，但第 9 步画 marker 图想用全部基因，靠这份备份。",{"type":18,"tag":33,"props":1003,"children":1004},{},[1005],{"type":24,"value":1006},"别漏。",{"type":18,"tag":125,"props":1008,"children":1009},{},[1010],{"type":18,"tag":27,"props":1011,"children":1012},{},[1013,1015,1021,1023,1028],{"type":24,"value":1014},"⚠️ 你这次跑出了 ",{"type":18,"tag":46,"props":1016,"children":1018},{"className":1017},[],[1019],{"type":24,"value":1020},"WARNING: adata.X seems to be already log-transformed",{"type":24,"value":1022},"。这是 scanpy 
的保守提示——通常是",{"type":18,"tag":33,"props":1024,"children":1025},{},[1026],{"type":24,"value":1027},"这个 cell 被重复执行了",{"type":24,"value":1029},"（log 又套了一层）。干净做法：从第 1 步重新跑一遍（菜单 Kernel → Restart Kernel and Run All Cells），保证每个预处理步骤只执行一次。本次教学不影响，但要养成「预处理步骤不重复跑」的习惯。",{"type":18,"tag":41,"props":1031,"children":1033},{"code":1032,"language":136,"meta":7,"className":137,"style":7},"sc.pp.normalize_total(adata, target_sum=1e4)   # 每细胞归一到总量1万\nsc.pp.log1p(adata)                             # log(1+x)\nadata.raw = adata                              # 存一份当前状态（注释画图时用全基因）\n",[1034],{"type":18,"tag":46,"props":1035,"children":1036},{"__ignoreMap":7},[1037,1045,1053],{"type":18,"tag":143,"props":1038,"children":1039},{"class":145,"line":146},[1040],{"type":18,"tag":143,"props":1041,"children":1042},{},[1043],{"type":24,"value":1044},"sc.pp.normalize_total(adata, target_sum=1e4)   # 每细胞归一到总量1万\n",{"type":18,"tag":143,"props":1046,"children":1047},{"class":145,"line":155},[1048],{"type":18,"tag":143,"props":1049,"children":1050},{},[1051],{"type":24,"value":1052},"sc.pp.log1p(adata)                             # log(1+x)\n",{"type":18,"tag":143,"props":1054,"children":1055},{"class":145,"line":164},[1056],{"type":18,"tag":143,"props":1057,"children":1058},{},[1059],{"type":24,"value":1060},"adata.raw = adata                              # 存一份当前状态（注释画图时用全基因）\n",{"type":18,"tag":181,"props":1062,"children":1064},{"id":1063},"第-4-步选高变基因-hvg-只留有区分力的列",[1065],{"type":24,"value":1066},"第 4 步：选高变基因 HVG —— 只留有区分力的列",{"type":18,"tag":27,"props":1068,"children":1069},{},[1070],{"type":24,"value":1071},"~2 万基因里，大部分是「管家基因」（每个细胞都差不多，对区分类型没用）。挑出变化最大的 ~2000 个。",{"type":18,"tag":27,"props":1073,"children":1074},{},[1075,1080,1082],{"type":18,"tag":33,"props":1076,"children":1077},{},[1078],{"type":24,"value":1079},"为什么",{"type":24,"value":1081},"：降维 + 
去噪，只盯真正能区分细胞的基因。",{"type":18,"tag":33,"props":1083,"children":1084},{},[1085],{"type":24,"value":1086},"就是特征选择。",{"type":18,"tag":27,"props":1088,"children":1089},{},[1090,1094,1095,1101,1103,1108],{"type":18,"tag":33,"props":1091,"children":1092},{},[1093],{"type":24,"value":204},{"type":24,"value":526},{"type":18,"tag":46,"props":1096,"children":1098},{"className":1097},[],[1099],{"type":24,"value":1100},"sc.pl.highly_variable_genes",{"type":24,"value":1102}," 的图里，黑点 = 高变（保留）、灰点 = 不变（丢弃）。最后裁完 ",{"type":18,"tag":46,"props":1104,"children":1106},{"className":1105},[],[1107],{"type":24,"value":235},{"type":24,"value":1109}," 从 ~13714 降到 ~1800-2000。",{"type":18,"tag":41,"props":1111,"children":1113},{"code":1112,"language":136,"meta":7,"className":137,"style":7},"sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)\nsc.pl.highly_variable_genes(adata)\nadata = adata[:, adata.var.highly_variable]    # 只留高变基因\nadata\n",[1114],{"type":18,"tag":46,"props":1115,"children":1116},{"__ignoreMap":7},[1117,1125,1133,1141],{"type":18,"tag":143,"props":1118,"children":1119},{"class":145,"line":146},[1120],{"type":18,"tag":143,"props":1121,"children":1122},{},[1123],{"type":24,"value":1124},"sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)\n",{"type":18,"tag":143,"props":1126,"children":1127},{"class":145,"line":155},[1128],{"type":18,"tag":143,"props":1129,"children":1130},{},[1131],{"type":24,"value":1132},"sc.pl.highly_variable_genes(adata)\n",{"type":18,"tag":143,"props":1134,"children":1135},{"class":145,"line":164},[1136],{"type":18,"tag":143,"props":1137,"children":1138},{},[1139],{"type":24,"value":1140},"adata = adata[:, adata.var.highly_variable]    # 
只留高变基因\n",{"type":18,"tag":143,"props":1142,"children":1143},{"class":145,"line":173},[1144],{"type":18,"tag":143,"props":1145,"children":1146},{},[1147],{"type":24,"value":926},{"type":18,"tag":181,"props":1149,"children":1151},{"id":1150},"第-5-步缩放-pca-降到几十维",[1152],{"type":24,"value":1153},"第 5 步：缩放 + PCA —— 降到几十维",{"type":18,"tag":953,"props":1155,"children":1156},{},[1157,1175,1193],{"type":18,"tag":67,"props":1158,"children":1159},{},[1160,1166,1168,1173],{"type":18,"tag":46,"props":1161,"children":1163},{"className":1162},[],[1164],{"type":24,"value":1165},"regress_out",{"type":24,"value":1167},"（",{"type":18,"tag":33,"props":1169,"children":1170},{},[1171],{"type":24,"value":1172},"可选、较慢",{"type":24,"value":1174},"）—— 回归掉 total_counts、线粒体比例这些技术变量的残留影响。想快可以先跳过。",{"type":18,"tag":67,"props":1176,"children":1177},{},[1178,1184,1186,1191],{"type":18,"tag":46,"props":1179,"children":1181},{"className":1180},[],[1182],{"type":24,"value":1183},"scale(max_value=10)",{"type":24,"value":1185}," —— 每个基因做 z-score 标准化，截断到 10（防极端值）。之后 ",{"type":18,"tag":46,"props":1187,"children":1189},{"className":1188},[],[1190],{"type":24,"value":75},{"type":24,"value":1192}," 变成有正有负的标准化值。",{"type":18,"tag":67,"props":1194,"children":1195},{},[1196,1202],{"type":18,"tag":46,"props":1197,"children":1199},{"className":1198},[],[1200],{"type":24,"value":1201},"pca",{"type":24,"value":1203}," —— 把 ~2000 维压到几十个主成分，抓主要变异轴、去噪。",{"type":18,"tag":27,"props":1205,"children":1206},{},[1207,1211,1212,1218,1220,1226,1228,1233],{"type":18,"tag":33,"props":1208,"children":1209},{},[1210],{"type":24,"value":204},{"type":24,"value":526},{"type":18,"tag":46,"props":1213,"children":1215},{"className":1214},[],[1216],{"type":24,"value":1217},"pca_variance_ratio",{"type":24,"value":1219}," 图找「肘部」（曲线拐弯变平处），决定下一步 ",{"type":18,"tag":46,"props":1221,"children":1223},{"className":1222},[],[1224],{"type":24,"value":1225},"n_pcs",{"type":24,"value":1227}," 用多少。PBMC3k 大概 
",{"type":18,"tag":33,"props":1229,"children":1230},{},[1231],{"type":24,"value":1232},"30-40",{"type":24,"value":1234}," 个就够。",{"type":18,"tag":41,"props":1236,"children":1238},{"code":1237,"language":136,"meta":7,"className":137,"style":7},"sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt'])  # 可选：回归掉技术变量\nsc.pp.scale(adata, max_value=10)                             # z-score，截断到10\nsc.tl.pca(adata, svd_solver='arpack')\nsc.pl.pca_variance_ratio(adata, log=True)\n   \n",[1239],{"type":18,"tag":46,"props":1240,"children":1241},{"__ignoreMap":7},[1242,1250,1258,1266],{"type":18,"tag":143,"props":1243,"children":1244},{"class":145,"line":146},[1245],{"type":18,"tag":143,"props":1246,"children":1247},{},[1248],{"type":24,"value":1249},"sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt'])  # 可选：回归掉技术变量\n",{"type":18,"tag":143,"props":1251,"children":1252},{"class":145,"line":155},[1253],{"type":18,"tag":143,"props":1254,"children":1255},{},[1256],{"type":24,"value":1257},"sc.pp.scale(adata, max_value=10)                             # z-score，截断到10\n",{"type":18,"tag":143,"props":1259,"children":1260},{"class":145,"line":164},[1261],{"type":18,"tag":143,"props":1262,"children":1263},{},[1264],{"type":24,"value":1265},"sc.tl.pca(adata, svd_solver='arpack')\n",{"type":18,"tag":143,"props":1267,"children":1268},{"class":145,"line":173},[1269],{"type":18,"tag":143,"props":1270,"children":1271},{},[1272],{"type":24,"value":1273},"sc.pl.pca_variance_ratio(adata, log=True)\n",{"type":18,"tag":181,"props":1275,"children":1277},{"id":1276},"第-6-步邻居图-umap",[1278],{"type":24,"value":1279},"第 6 步：邻居图 + UMAP",{"type":18,"tag":953,"props":1281,"children":1282},{},[1283,1308],{"type":18,"tag":67,"props":1284,"children":1285},{},[1286,1292,1294,1299,1301,1306],{"type":18,"tag":46,"props":1287,"children":1289},{"className":1288},[],[1290],{"type":24,"value":1291},"neighbors(n_neighbors=10, n_pcs=40)",{"type":24,"value":1293}," —— 在 PCA 空间建 k 
近邻图。",{"type":18,"tag":33,"props":1295,"children":1296},{},[1297],{"type":24,"value":1298},"这张图是后面聚类和 UMAP 共同的基础。",{"type":24,"value":1300}," ",{"type":18,"tag":46,"props":1302,"children":1304},{"className":1303},[],[1305],{"type":24,"value":1225},{"type":24,"value":1307}," 用上一步看的肘部值。",{"type":18,"tag":67,"props":1309,"children":1310},{},[1311,1317,1319,1324],{"type":18,"tag":46,"props":1312,"children":1314},{"className":1313},[],[1315],{"type":24,"value":1316},"umap",{"type":24,"value":1318}," —— 再压到 2 维",{"type":18,"tag":33,"props":1320,"children":1321},{},[1322],{"type":24,"value":1323},"只为了画图看",{"type":24,"value":578},{"type":18,"tag":27,"props":1326,"children":1327},{},[1328,1332],{"type":18,"tag":33,"props":1329,"children":1330},{},[1331],{"type":24,"value":204},{"type":24,"value":1333},"：一团点散成几坨——已经有结构了，但还没分组、没颜色。",{"type":18,"tag":125,"props":1335,"children":1336},{},[1337],{"type":18,"tag":27,"props":1338,"children":1339},{},[1340,1342,1347,1349,1354],{"type":24,"value":1341},"⚠️ ",{"type":18,"tag":33,"props":1343,"children":1344},{},[1345],{"type":24,"value":1346},"铁律 caveat",{"type":24,"value":1348},"：UMAP ",{"type":18,"tag":33,"props":1350,"children":1351},{},[1352],{"type":24,"value":1353},"只能看、不能量",{"type":24,"value":1355},"。两坨在图上挨得近 ≠ 生物学相近，UMAP 会扭曲全局距离。任何定量结论都不能基于 UMAP 的二维坐标。",{"type":18,"tag":41,"props":1357,"children":1359},{"code":1358,"language":136,"meta":7,"className":137,"style":7},"sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)   # n_pcs 用上一步看的肘部值\nsc.tl.umap(adata)\nsc.pl.umap(adata)\n    \n",[1360],{"type":18,"tag":46,"props":1361,"children":1362},{"__ignoreMap":7},[1363,1371,1379],{"type":18,"tag":143,"props":1364,"children":1365},{"class":145,"line":146},[1366],{"type":18,"tag":143,"props":1367,"children":1368},{},[1369],{"type":24,"value":1370},"sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)   # n_pcs 
用上一步看的肘部值\n",{"type":18,"tag":143,"props":1372,"children":1373},{"class":145,"line":155},[1374],{"type":18,"tag":143,"props":1375,"children":1376},{},[1377],{"type":24,"value":1378},"sc.tl.umap(adata)\n",{"type":18,"tag":143,"props":1380,"children":1381},{"class":145,"line":164},[1382],{"type":18,"tag":143,"props":1383,"children":1384},{},[1385],{"type":24,"value":1386},"sc.pl.umap(adata)\n",{"type":18,"tag":181,"props":1388,"children":1390},{"id":1389},"第-7-步聚类-leiden-把相似细胞分组",[1391],{"type":24,"value":1392},"第 7 步：聚类 Leiden —— 把相似细胞分组",{"type":18,"tag":27,"props":1394,"children":1395},{},[1396,1398,1403,1405,1410],{"type":24,"value":1397},"在上一步那张近邻图上做",{"type":18,"tag":33,"props":1399,"children":1400},{},[1401],{"type":24,"value":1402},"社区发现",{"type":24,"value":1404},"（Leiden 算法），把抱团的细胞分成一簇簇 = ",{"type":18,"tag":33,"props":1406,"children":1407},{},[1408],{"type":24,"value":1409},"候选细胞类型\u002F状态",{"type":24,"value":578},{"type":18,"tag":27,"props":1412,"children":1413},{},[1414,1425,1427,1432],{"type":18,"tag":33,"props":1415,"children":1416},{},[1417,1419],{"type":24,"value":1418},"最重要的旋钮 ",{"type":18,"tag":46,"props":1420,"children":1422},{"className":1421},[],[1423],{"type":24,"value":1424},"resolution",{"type":24,"value":1426},"：调高 = 簇更多更细，调低 = 簇更少更粗。PBMC3k 在 1.0 附近大约 8 个簇（但",{"type":18,"tag":33,"props":1428,"children":1429},{},[1430],{"type":24,"value":1431},"随 scanpy\u002Fleiden 版本会有出入",{"type":24,"value":1433},"，第 9 步贴标签时要按你实际的簇数来）。",{"type":18,"tag":27,"props":1435,"children":1436},{},[1437,1442,1444,1449,1451,1457,1459,1465,1467,1472],{"type":18,"tag":33,"props":1438,"children":1439},{},[1440],{"type":24,"value":1441},"动手体会",{"type":24,"value":1443},"：把 ",{"type":18,"tag":46,"props":1445,"children":1447},{"className":1446},[],[1448],{"type":24,"value":1424},{"type":24,"value":1450}," 改成 ",{"type":18,"tag":46,"props":1452,"children":1454},{"className":1453},[],[1455],{"type":24,"value":1456},"0.5",{"type":24,"value":1458}," 和 
",{"type":18,"tag":46,"props":1460,"children":1462},{"className":1461},[],[1463],{"type":24,"value":1464},"2.0",{"type":24,"value":1466}," 各跑一遍，直观看到簇变少\u002F变多。这就是「欠聚类 \u002F 过度聚类」——",{"type":18,"tag":33,"props":1468,"children":1469},{},[1470],{"type":24,"value":1471},"没有标准答案",{"type":24,"value":1473},"，要靠下一步 marker 验证。",{"type":18,"tag":125,"props":1475,"children":1476},{},[1477],{"type":18,"tag":27,"props":1478,"children":1479},{},[1480,1482,1488,1490,1496],{"type":24,"value":1481},"小提示：新版 scanpy 可能提示 ",{"type":18,"tag":46,"props":1483,"children":1485},{"className":1484},[],[1486],{"type":24,"value":1487},"flavor",{"type":24,"value":1489}," 的 FutureWarning，不影响。想消掉写 ",{"type":18,"tag":46,"props":1491,"children":1493},{"className":1492},[],[1494],{"type":24,"value":1495},"sc.tl.leiden(adata, resolution=1.0, flavor=\"igraph\", n_iterations=2, directed=False)",{"type":24,"value":578},{"type":18,"tag":41,"props":1498,"children":1500},{"code":1499,"language":136,"meta":7,"className":137,"style":7},"sc.tl.leiden(adata, resolution=1.0)\nsc.pl.umap(adata, color=['leiden'])\n  \n",[1501],{"type":18,"tag":46,"props":1502,"children":1503},{"__ignoreMap":7},[1504,1512],{"type":18,"tag":143,"props":1505,"children":1506},{"class":145,"line":146},[1507],{"type":18,"tag":143,"props":1508,"children":1509},{},[1510],{"type":24,"value":1511},"sc.tl.leiden(adata, resolution=1.0)\n",{"type":18,"tag":143,"props":1513,"children":1514},{"class":145,"line":155},[1515],{"type":18,"tag":143,"props":1516,"children":1517},{},[1518],{"type":24,"value":1519},"sc.pl.umap(adata, color=['leiden'])\n",{"type":18,"tag":181,"props":1521,"children":1523},{"id":1522},"第-8-步找每簇的特征基因-为贴标签做准备",[1524],{"type":24,"value":1525},"第 8 步：找每簇的特征基因 —— 为贴标签做准备",{"type":18,"tag":27,"props":1527,"children":1528},{},[1529,1535],{"type":18,"tag":46,"props":1530,"children":1532},{"className":1531},[],[1533],{"type":24,"value":1534},"rank_genes_groups",{"type":24,"value":1536}," 找出每个簇相比其他簇显著高表达的基因（Wilcoxon 
检验）。这些就是给簇「贴标签」的线索。",{"type":18,"tag":27,"props":1538,"children":1539},{},[1540,1544],{"type":18,"tag":33,"props":1541,"children":1542},{},[1543],{"type":24,"value":204},{"type":24,"value":1545},"：每个簇排名最高的一串基因名。",{"type":18,"tag":125,"props":1547,"children":1548},{},[1549],{"type":18,"tag":27,"props":1550,"children":1551},{},[1552,1554,1559],{"type":24,"value":1553},"⚠️ caveat：scRNA-seq 的 DE p 值普遍",{"type":18,"tag":33,"props":1555,"children":1556},{},[1557],{"type":24,"value":1558},"偏乐观",{"type":24,"value":1560},"（用同一份数据先聚类、又在同一份数据上检验，叫 double dipping）。当线索用，别当统计铁证。",{"type":18,"tag":41,"props":1562,"children":1564},{"code":1563,"language":136,"meta":7,"className":137,"style":7},"sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon')\nsc.pl.rank_genes_groups(adata, n_genes=25, sharey=False)\n  \n",[1565],{"type":18,"tag":46,"props":1566,"children":1567},{"__ignoreMap":7},[1568,1576],{"type":18,"tag":143,"props":1569,"children":1570},{"class":145,"line":146},[1571],{"type":18,"tag":143,"props":1572,"children":1573},{},[1574],{"type":24,"value":1575},"sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon')\n",{"type":18,"tag":143,"props":1577,"children":1578},{"class":145,"line":155},[1579],{"type":18,"tag":143,"props":1580,"children":1581},{},[1582],{"type":24,"value":1583},"sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False)\n",{"type":18,"tag":181,"props":1585,"children":1587},{"id":1586},"第-9-步注释-给每个簇贴上这是什么细胞",[1588],{"type":24,"value":1589},"第 9 步：注释 —— 给每个簇贴上「这是什么细胞」",{"type":18,"tag":27,"props":1591,"children":1592},{},[1593,1595,1600],{"type":24,"value":1594},"看每个簇高表达哪些",{"type":18,"tag":33,"props":1596,"children":1597},{},[1598],{"type":24,"value":1599},"已知 marker 基因",{"type":24,"value":1601},"，对照贴标签。用 dotplot 看 marker 在各簇的表达（点越大 = 表达该基因的细胞比例越高，越红 = 
平均表达越强）：",{"type":18,"tag":747,"props":1603,"children":1604},{},[1605,1621],{"type":18,"tag":751,"props":1606,"children":1607},{},[1608],{"type":18,"tag":755,"props":1609,"children":1610},{},[1611,1616],{"type":18,"tag":759,"props":1612,"children":1613},{},[1614],{"type":24,"value":1615},"marker",{"type":18,"tag":759,"props":1617,"children":1618},{},[1619],{"type":24,"value":1620},"细胞类型",{"type":18,"tag":775,"props":1622,"children":1623},{},[1624,1637,1650,1663,1676,1689,1702,1715],{"type":18,"tag":755,"props":1625,"children":1626},{},[1627,1632],{"type":18,"tag":782,"props":1628,"children":1629},{},[1630],{"type":24,"value":1631},"IL7R",{"type":18,"tag":782,"props":1633,"children":1634},{},[1635],{"type":24,"value":1636},"CD4 T 细胞",{"type":18,"tag":755,"props":1638,"children":1639},{},[1640,1645],{"type":18,"tag":782,"props":1641,"children":1642},{},[1643],{"type":24,"value":1644},"CD14, LYZ",{"type":18,"tag":782,"props":1646,"children":1647},{},[1648],{"type":24,"value":1649},"CD14+ 单核细胞",{"type":18,"tag":755,"props":1651,"children":1652},{},[1653,1658],{"type":18,"tag":782,"props":1654,"children":1655},{},[1656],{"type":24,"value":1657},"MS4A1",{"type":18,"tag":782,"props":1659,"children":1660},{},[1661],{"type":24,"value":1662},"B 细胞",{"type":18,"tag":755,"props":1664,"children":1665},{},[1666,1671],{"type":18,"tag":782,"props":1667,"children":1668},{},[1669],{"type":24,"value":1670},"CD8A",{"type":18,"tag":782,"props":1672,"children":1673},{},[1674],{"type":24,"value":1675},"CD8 T 细胞",{"type":18,"tag":755,"props":1677,"children":1678},{},[1679,1684],{"type":18,"tag":782,"props":1680,"children":1681},{},[1682],{"type":24,"value":1683},"GNLY, NKG7",{"type":18,"tag":782,"props":1685,"children":1686},{},[1687],{"type":24,"value":1688},"NK 细胞",{"type":18,"tag":755,"props":1690,"children":1691},{},[1692,1697],{"type":18,"tag":782,"props":1693,"children":1694},{},[1695],{"type":24,"value":1696},"FCGR3A, 
MS4A7",{"type":18,"tag":782,"props":1698,"children":1699},{},[1700],{"type":24,"value":1701},"FCGR3A+ 单核细胞",{"type":18,"tag":755,"props":1703,"children":1704},{},[1705,1710],{"type":18,"tag":782,"props":1706,"children":1707},{},[1708],{"type":24,"value":1709},"FCER1A, CST3",{"type":18,"tag":782,"props":1711,"children":1712},{},[1713],{"type":24,"value":1714},"树突状细胞 (DC)",{"type":18,"tag":755,"props":1716,"children":1717},{},[1718,1723],{"type":18,"tag":782,"props":1719,"children":1720},{},[1721],{"type":24,"value":1722},"PPBP",{"type":18,"tag":782,"props":1724,"children":1725},{},[1726],{"type":24,"value":1727},"巨核细胞",{"type":18,"tag":27,"props":1729,"children":1730},{},[1731,1736],{"type":18,"tag":33,"props":1732,"children":1733},{},[1734],{"type":24,"value":1735},"这步最吃判断",{"type":24,"value":1737},"：marker 模糊、簇对不上、或某簇像 doublet 都正常——回到清单 3.0，「生物学上成不成立」该拉师兄师姐\u002F导师把关，不归你。",{"type":18,"tag":41,"props":1739,"children":1741},{"code":1740,"language":136,"meta":7,"className":137,"style":7},"marker_genes = ['IL7R','CD14','LYZ','MS4A1','CD8A','GNLY','NKG7','FCGR3A','MS4A7','FCER1A','CST3','PPBP']\nsc.pl.dotplot(adata, marker_genes, groupby='leiden')\n   \n",[1742],{"type":18,"tag":46,"props":1743,"children":1744},{"__ignoreMap":7},[1745,1753],{"type":18,"tag":143,"props":1746,"children":1747},{"class":145,"line":146},[1748],{"type":18,"tag":143,"props":1749,"children":1750},{},[1751],{"type":24,"value":1752},"marker_genes = ['IL7R','CD14','LYZ','MS4A1','CD8A','GNLY','NKG7','FCGR3A','MS4A7','FCER1A','CST3','PPBP']\n",{"type":18,"tag":143,"props":1754,"children":1755},{"class":145,"line":155},[1756],{"type":18,"tag":143,"props":1757,"children":1758},{},[1759],{"type":24,"value":1760},"sc.pl.dotplot(adata, marker_genes, groupby='leiden')\n",{"type":18,"tag":181,"props":1762,"children":1764},{"id":1763},"️-下面那个-cell-报错了new-categories-need-to-have-the-same-number-of-items",[1765,1767],{"type":24,"value":1766},"⚠️ 下面那个 cell 
报错了：",{"type":18,"tag":46,"props":1768,"children":1770},{"className":1769},[],[1771],{"type":24,"value":1772},"new categories need to have the same number of items",{"type":18,"tag":27,"props":1774,"children":1775},{},[1776,1781,1782,1788,1790,1795,1797,1802,1804],{"type":18,"tag":33,"props":1777,"children":1778},{},[1779],{"type":24,"value":1780},"原因",{"type":24,"value":526},{"type":18,"tag":46,"props":1783,"children":1785},{"className":1784},[],[1786],{"type":24,"value":1787},"new_names",{"type":24,"value":1789}," 写了 ",{"type":18,"tag":33,"props":1791,"children":1792},{},[1793],{"type":24,"value":1794},"8",{"type":24,"value":1796}," 个名字，但你这次 Leiden 实际聚出的簇",{"type":18,"tag":33,"props":1798,"children":1799},{},[1800],{"type":24,"value":1801},"不是 8 个",{"type":24,"value":1803},"（簇数随 scanpy\u002Fleiden 版本和 resolution 变化）。盲抄别人的 8 个名字必然对不上——",{"type":18,"tag":33,"props":1805,"children":1806},{},[1807],{"type":24,"value":1808},"这正是清单 3.0 说的：不能照搬，要对应你自己的结果。",{"type":18,"tag":27,"props":1810,"children":1811},{},[1812,1817],{"type":18,"tag":33,"props":1813,"children":1814},{},[1815],{"type":24,"value":1816},"第一步，先看你到底有几个簇、各簇高表达什么基因",{"type":24,"value":526},{"type":18,"tag":41,"props":1819,"children":1821},{"code":1820,"language":136,"meta":7,"className":137,"style":7},"print(adata.obs['leiden'].cat.categories)        # 你实际有几个簇、编号是什么\nimport pandas as pd\npd.DataFrame(adata.uns['rank_genes_groups']['names']).head(10)   # 每簇 Top10 基因，对照第 9 步 marker 表\n",[1822],{"type":18,"tag":46,"props":1823,"children":1824},{"__ignoreMap":7},[1825,1833,1841],{"type":18,"tag":143,"props":1826,"children":1827},{"class":145,"line":146},[1828],{"type":18,"tag":143,"props":1829,"children":1830},{},[1831],{"type":24,"value":1832},"print(adata.obs['leiden'].cat.categories)        # 你实际有几个簇、编号是什么\n",{"type":18,"tag":143,"props":1834,"children":1835},{"class":145,"line":155},[1836],{"type":18,"tag":143,"props":1837,"children":1838},{},[1839],{"type":24,"value":1840},"import pandas as 
pd\n",{"type":18,"tag":143,"props":1842,"children":1843},{"class":145,"line":164},[1844],{"type":18,"tag":143,"props":1845,"children":1846},{},[1847],{"type":24,"value":1848},"pd.DataFrame(adata.uns['rank_genes_groups']['names']).head(10)   # 每簇 Top10 基因，对照第 9 步 marker 表\n",{"type":18,"tag":27,"props":1850,"children":1851},{},[1852,1857],{"type":18,"tag":33,"props":1853,"children":1854},{},[1855],{"type":24,"value":1856},"第二步，用「按簇编号映射」的稳健写法",{"type":24,"value":1858},"（不依赖簇的数量和顺序）：",{"type":18,"tag":41,"props":1860,"children":1862},{"code":1861,"language":136,"meta":7,"className":137,"style":7},"# 按 dotplot + 上面 Top 基因的判断逐个填；键=簇编号字符串，值=细胞类型\ncluster2type = {\n    '0': 'CD4 T',\n    '1': 'CD14+ Mono',\n    # ... 把你实际的每一个簇都填上（有几个填几个）\n}\nadata.obs['cell_type'] = adata.obs['leiden'].map(cluster2type).astype('category')\nsc.pl.umap(adata, color='cell_type', legend_loc='on data')\n",[1863],{"type":18,"tag":46,"props":1864,"children":1865},{"__ignoreMap":7},[1866,1874,1882,1890,1898,1906,1914,1923],{"type":18,"tag":143,"props":1867,"children":1868},{"class":145,"line":146},[1869],{"type":18,"tag":143,"props":1870,"children":1871},{},[1872],{"type":24,"value":1873},"# 按 dotplot + 上面 Top 基因的判断逐个填；键=簇编号字符串，值=细胞类型\n",{"type":18,"tag":143,"props":1875,"children":1876},{"class":145,"line":155},[1877],{"type":18,"tag":143,"props":1878,"children":1879},{},[1880],{"type":24,"value":1881},"cluster2type = {\n",{"type":18,"tag":143,"props":1883,"children":1884},{"class":145,"line":164},[1885],{"type":18,"tag":143,"props":1886,"children":1887},{},[1888],{"type":24,"value":1889},"    '0': 'CD4 T',\n",{"type":18,"tag":143,"props":1891,"children":1892},{"class":145,"line":173},[1893],{"type":18,"tag":143,"props":1894,"children":1895},{},[1896],{"type":24,"value":1897},"    '1': 'CD14+ Mono',\n",{"type":18,"tag":143,"props":1899,"children":1900},{"class":145,"line":347},[1901],{"type":18,"tag":143,"props":1902,"children":1903},{},[1904],{"type":24,"value":1905},"    # ... 
把你实际的每一个簇都填上（有几个填几个）\n",{"type":18,"tag":143,"props":1907,"children":1908},{"class":145,"line":356},[1909],{"type":18,"tag":143,"props":1910,"children":1911},{},[1912],{"type":24,"value":1913},"}\n",{"type":18,"tag":143,"props":1915,"children":1917},{"class":145,"line":1916},7,[1918],{"type":18,"tag":143,"props":1919,"children":1920},{},[1921],{"type":24,"value":1922},"adata.obs['cell_type'] = adata.obs['leiden'].map(cluster2type).astype('category')\n",{"type":18,"tag":143,"props":1924,"children":1926},{"class":145,"line":1925},8,[1927],{"type":18,"tag":143,"props":1928,"children":1929},{},[1930],{"type":24,"value":1931},"sc.pl.umap(adata, color='cell_type', legend_loc='on data')\n",{"type":18,"tag":125,"props":1933,"children":1934},{},[1935],{"type":18,"tag":27,"props":1936,"children":1937},{},[1938,1940,1946,1948,1954,1956,1961,1963,1969],{"type":24,"value":1939},"这样即使簇数是 7 或 9，也只是 ",{"type":18,"tag":46,"props":1941,"children":1943},{"className":1942},[],[1944],{"type":24,"value":1945},"cluster2type",{"type":24,"value":1947}," 多写少写几行，不会再因「数量对不上」报错。原来的 ",{"type":18,"tag":46,"props":1949,"children":1951},{"className":1950},[],[1952],{"type":24,"value":1953},"rename_categories(new_names)",{"type":24,"value":1955}," 要求名字数量 == 簇数量且",{"type":18,"tag":33,"props":1957,"children":1958},{},[1959],{"type":24,"value":1960},"按顺序",{"type":24,"value":1962},"，很脆；换成 ",{"type":18,"tag":46,"props":1964,"children":1966},{"className":1965},[],[1967],{"type":24,"value":1968},"map",{"type":24,"value":1970}," 字典更工程化——这也呼应你「领域是广度、工程是深度」的思路：用工程手段把脆弱的领域操作变稳。",{"type":18,"tag":41,"props":1972,"children":1974},{"code":1973,"language":136,"meta":7,"className":137,"style":7},"new_names = ['CD4 T', 'CD14+ Mono', 'B', 'CD8 T', 'NK', 'FCGR3A+ Mono', 'DC', 'Megakaryocyte']\nadata.rename_categories('leiden', new_names)\nsc.pl.umap(adata, color='leiden', legend_loc='on data') 
\n",[1975],{"type":18,"tag":46,"props":1976,"children":1977},{"__ignoreMap":7},[1978,1986,1994],{"type":18,"tag":143,"props":1979,"children":1980},{"class":145,"line":146},[1981],{"type":18,"tag":143,"props":1982,"children":1983},{},[1984],{"type":24,"value":1985},"new_names = ['CD4 T', 'CD14+ Mono', 'B', 'CD8 T', 'NK', 'FCGR3A+ Mono', 'DC', 'Megakaryocyte']\n",{"type":18,"tag":143,"props":1987,"children":1988},{"class":145,"line":155},[1989],{"type":18,"tag":143,"props":1990,"children":1991},{},[1992],{"type":24,"value":1993},"adata.rename_categories('leiden', new_names)\n",{"type":18,"tag":143,"props":1995,"children":1996},{"class":145,"line":164},[1997],{"type":18,"tag":143,"props":1998,"children":1999},{},[2000],{"type":24,"value":2001},"sc.pl.umap(adata, color='leiden', legend_loc='on data')\n",{"type":18,"tag":181,"props":2003,"children":2005},{"id":2004},"第-10-步存盘",[2006],{"type":24,"value":2007},"第 10 步：存盘",{"type":18,"tag":27,"props":2009,"children":2010},{},[2011,2017,2019,2025],{"type":18,"tag":46,"props":2012,"children":2014},{"className":2013},[],[2015],{"type":24,"value":2016},"adata.write('pbmc3k_annotated.h5ad')",{"type":24,"value":2018}," 把带注释的结果存成 ",{"type":18,"tag":46,"props":2020,"children":2022},{"className":2021},[],[2023],{"type":24,"value":2024},".h5ad",{"type":24,"value":578},{"type":18,"tag":125,"props":2027,"children":2028},{},[2029,2055],{"type":18,"tag":27,"props":2030,"children":2031},{},[2032,2034,2039,2041,2046,2048,2053],{"type":24,"value":2033},"这就是单细胞分析的",{"type":18,"tag":33,"props":2035,"children":2036},{},[2037],{"type":24,"value":2038},"标准产物",{"type":24,"value":2040},"。后面学空转（Squidpy + 10x Visium）时，数据格式",{"type":18,"tag":33,"props":2042,"children":2043},{},[2044],{"type":24,"value":2045},"完全一样",{"type":24,"value":2047},"是 ",{"type":18,"tag":46,"props":2049,"children":2051},{"className":2050},[],[2052],{"type":24,"value":2024},{"type":24,"value":2054},"，80% 
的步骤你已经会了——只多了「空间坐标」那一维。",{"type":18,"tag":27,"props":2056,"children":2057},{},[2058,2060,2065],{"type":24,"value":2059},"✅ 跑到这里、能看到一张 UMAP 上每坨细胞标着 T\u002FB\u002FNK\u002F单核\u002FDC 等名字，你就完成了学习清单 ",{"type":18,"tag":33,"props":2061,"children":2062},{},[2063],{"type":24,"value":2064},"3.4 检查点第 1 条",{"type":24,"value":2066},"：用 Scanpy 把 scRNA-seq 从 QC 跑到细胞类型注释。",{"type":18,"tag":41,"props":2068,"children":2070},{"code":2069,"language":136,"meta":7,"className":137,"style":7},"adata.write('pbmc3k_annotated.h5ad')\n",[2071],{"type":18,"tag":46,"props":2072,"children":2073},{"__ignoreMap":7},[2074],{"type":18,"tag":143,"props":2075,"children":2076},{"class":145,"line":146},[2077],{"type":18,"tag":143,"props":2078,"children":2079},{},[2080],{"type":24,"value":2069},{"type":18,"tag":2082,"props":2083,"children":2084},"style",{},[2085],{"type":24,"value":2086},"html .default .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}",{"title":7,"searchDepth":155,"depth":155,"links":2088},[2089,2090,2091,2092,2093,2094,2095,2096,2097,2098,2099,2101],{"id":183,"depth":155,"text":186},{"id":365,"depth":155,"text":368},{"id":504,"depth":155,"text":507},{"id":929,"depth":155,"text":932},{"id":1063,"depth":155,"text":1066},{"id":1150,"depth":155,"text":1153},{"id":1276,"depth":155,"text":1279},{"id":1389,"depth":155,"text":1392},{"id":1522,"depth":155,"text":1525},{"id":1586,"depth":155,"text":1589},{"id":1763,"depth":155,"text":2100},"⚠️ 下面那个 cell 报错了：new categories need to have the same number of 
items",{"id":2004,"depth":155,"text":2007},"markdown","content:articles:AI制药理论:single-cell.md","content","articles\u002FAI制药理论\u002Fsingle-cell.md","articles\u002FAI制药理论\u002Fsingle-cell","md",1783051875974]