🌟 2025: Biology AI's "DeepSeek Moment"
While the entire Chinese internet was cheering for the homegrown large language model DeepSeek, the life sciences were quietly staging a revolution of their own: led by the Arc Institute, with Stanford, UC Berkeley, Columbia, and UCSF joining forces with NVIDIA and other top AI companies, the tens-of-billions-parameter biological sequence-understanding engine Evo2 has arrived! This remarkable model "reads" the language of nucleotides directly, and it is redefining how we make sense of the genetic code 🧬
🌟 Model highlights at a glance:
🧬 Directly parses nucleotide sequences — the native "language" of biology
🚀 Ultra-long genomic fragments: 8,192-token context for the base models, up to 1 million for the full models
🌍 Upgraded cross-species genomic understanding
🔓 Fully open source! Supports NVIDIA NIM cloud deployment as well as local runs
(👉 Tip: want a deep dive into the architectural leap from Evo1 to Evo2? Ask for it in the comments!)
🔍 This issue's hands-on goals:
🛠️ A step-by-step, beginner-friendly tutorial from scratch
🚀 Predict BRCA1 variant effects with Embeddings + a DNN
📊 Performance jump: a baseline that only uses the difference of the model's sequence scores reaches about 0.7 AUROC (a "Fair" grade), while the Embedding + DNN approach pushes that to about 0.9 AUROC (a "Good" grade) 📈 — see the sketch of the baseline right after this list
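For reference, that 0.7-AUROC baseline scores each variant by the difference between the model's likelihood of the variant window and of the reference window. Below is a minimal sketch of it; it assumes the score_sequences helper exposed by the Evo2 class in the ArcInstitute repo, and it reuses the ref_seqs/var_seqs/ref_seq_indexes arrays built later in section 2.3:

# Zero-shot baseline sketch: delta log-likelihood between variant and reference windows.
# Assumes model.score_sequences from the evo2 repo; the inputs are built in section 2.3.
import numpy as np
from sklearn.metrics import roc_auc_score

ref_scores = np.array(model.score_sequences(ref_seqs))   # one score per unique reference window
var_scores = np.array(model.score_sequences(var_seqs))   # one score per variant window
delta = var_scores - ref_scores[ref_seq_indexes]         # more negative = more likely damaging
y = brca1_df['class'].map({'FUNC/INT': 0, 'LOF': 1}).values
print(roc_auc_score(y, -delta))                          # roughly the 0.7 AUROC quoted above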
✨ Tip:
I started completely from scratch for this post: rented a fresh Auto-dl server, set up the environment, and re-ran all the code, so that even a total beginner can run everything in one pass without a single error. A ready-to-use Auto-dl image is prepared — just @zylAK (my Cnblogs handle) in the comments to get it 🚀 If you like this post, feel free to share zylAK's posts with your friends; your support is what keeps me writing. And if you want to see more Evo2 applications, such as designing new nucleic-acid sequences or extracting interpretable representations from the model, ask for them in the comments.
Enough preamble. Using an Auto-dl cloud server as the example, straight to the code:
Part 1: Preliminaries
1.1 Cloud server configuration:
1.2 Enable Auto-dl's academic acceleration, clone the Evo2 project from GitHub, and install it:
The commands are:
source /etc/network_turbo  # enable academic acceleration
git clone https://github.com/ArcInstitute/evo2.git  # download the Evo2 project
cd evo2  # enter the project directory
git clone https://github.com/Zymrael/vortex.git  # manually fetch the vortex dependency
python setup.py install  # install the project
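Before downloading any weights, you can quickly confirm the install succeeded. This is a minimal check built on the same import that section 2.2 uses:

# If this import succeeds, the evo2 package (and its vortex dependency) installed correctly
from evo2.models import Evo2
print("evo2 installed OK")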
1.3 Download the desired Evo2 model from the Hugging Face mirror hf-mirror and store it locally for later use. A 4090 machine can currently run the 1b and 7b models (both full and base versions); the 40b model needs a machine with more memory and a multi-GPU deployment, which we won't cover in this issue. In practice the three model sizes do not differ all that much in effectiveness.
The commands are:
mkdir -p /root/autodl-tmp/evo2/evo2_models  # a dedicated folder inside the project for downloaded weights, so you never re-download on later runs
cd /root/autodl-tmp/evo2/evo2_models

# point huggingface-cli at the mirror
export HF_ENDPOINT=https://hf-mirror.com

# download evo2_1b_base as the example
huggingface-cli download --resume-download arcinstitute/evo2_1b_base --local-dir /root/autodl-tmp/evo2/evo2_models

While the download is running correctly you will see output like the following (progress bars gradually filling up):
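Once the download finishes, it is worth verifying that the checkpoint sits where section 2.2 expects it. A minimal check — adjust the path if huggingface-cli placed the file one directory level up:

# Verify the downloaded checkpoint exists and has a plausible size
from pathlib import Path

ckpt = Path("/root/autodl-tmp/evo2/evo2_models/evo2_1b_base/evo2_1b_base.pt")
assert ckpt.exists(), f"checkpoint not found at {ckpt}"
print(f"{ckpt.stat().st_size / 1e9:.2f} GB")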
Part 2: Model Loading and Embedding Extraction
2.1 Import all the packages the project needs
from Bio import SeqIO
import gzip
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import seaborn as sns
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from pathlib import Path
from tqdm.notebook import tqdm
import transformer_engine.pytorch as te
from transformer_engine.common import recipe
An error message may pop up at this point, but it does not affect any of the subsequent code. If a future flash-attn upgrade causes a conflict, pin it to version 2.7.4 (pip install flash-attn==2.7.4).
2.2 Load the model
os.chdir('/root/autodl-tmp/evo2/evo2')
model_path = "/root/autodl-tmp/evo2/evo2_models/evo2_1b_base/evo2_1b_base.pt"

from evo2.models import Evo2
model = Evo2(model_name='evo2_1b_base', local_path=model_path)
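Before embedding thousands of 8 kb windows, a quick smoke test on a toy sequence confirms that the model loaded and returns 1920-dimensional embeddings. This reuses exactly the tokenize/forward calls that section 2.4 relies on:

# Smoke test: embed a short sequence and check the hidden size (1920 for evo2_1b_base)
tokens = model.tokenizer.tokenize("ACGTACGTACGT")
input_ids = torch.tensor(tokens, dtype=torch.long).unsqueeze(0).to(next(model.model.parameters()).device)
with torch.no_grad():
    _, emb = model.forward(input_ids, return_embeddings=True, layer_names=["blocks.12.pre_norm"])
print(emb["blocks.12.pre_norm"].shape)  # expect (1, 12, 1920) — one token per nucleotide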
2.3 Load and parse the input data — the BRCA1 dataset: sequences, variant positions, and variant-effect labels. For details, see the Evo2 project's example: https://github.com/ArcInstitute/evo2/tree/main/notebooks/brca1
os.chdir('/root/autodl-tmp/evo2')

brca1_df = pd.read_excel(
    os.path.join('notebooks', 'brca1', '41586_2018_461_MOESM3_ESM.xlsx'),
    header=2,
)
brca1_df = brca1_df[[
    'chromosome', 'position (hg19)', 'reference', 'alt', 'function.score.mean', 'func.class',
]]

# Rename the columns
brca1_df.rename(columns={
    'chromosome': 'chrom',
    'position (hg19)': 'pos',
    'reference': 'ref',
    'alt': 'alt',
    'function.score.mean': 'score',
    'func.class': 'class',
}, inplace=True)

# Collapse the variant effects into binary labels
brca1_df['class'] = brca1_df['class'].replace(['FUNC', 'INT'], 'FUNC/INT')

WINDOW_SIZE = 8192

# Read the chromosome 17 reference genome sequence
with gzip.open(os.path.join('notebooks', 'brca1', 'GRCh37.p13_chr17.fna.gz'), "rt") as handle:
    for record in SeqIO.parse(handle, "fasta"):
        seq_chr17 = str(record.seq)
        break

def parse_sequences(pos, ref, alt):
    """
    Build the reference (unmutated) window and the variant window around a SNV.
    """
    p = pos - 1  # convert to 0-indexed position
    full_seq = seq_chr17

    ref_seq_start = max(0, p - WINDOW_SIZE//2)
    ref_seq_end = min(len(full_seq), p + WINDOW_SIZE//2)
    ref_seq = seq_chr17[ref_seq_start:ref_seq_end]
    snv_pos_in_ref = min(WINDOW_SIZE//2, p)
    var_seq = ref_seq[:snv_pos_in_ref] + alt + ref_seq[snv_pos_in_ref+1:]

    # Sanity checks
    assert len(var_seq) == len(ref_seq)
    assert ref_seq[snv_pos_in_ref] == ref
    assert var_seq[snv_pos_in_ref] == alt

    return ref_seq, var_seq

# Index the unique reference windows
ref_seqs = []
ref_seq_to_index = {}

# Parse all variants and record each one's reference-window index
ref_seq_indexes = []
var_seqs = []

for _, row in brca1_df.iterrows():
    ref_seq, var_seq = parse_sequences(row['pos'], row['ref'], row['alt'])

    # Get/create the index for this reference window
    if ref_seq not in ref_seq_to_index:
        ref_seq_to_index[ref_seq] = len(ref_seqs)
        ref_seqs.append(ref_seq)

    ref_seq_indexes.append(ref_seq_to_index[ref_seq])
    var_seqs.append(var_seq)

ref_seq_indexes = np.array(ref_seq_indexes)
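As a quick illustration of what parse_sequences produces, you can parse the first variant in the table and inspect the windows:

# Example: parse the first variant and check the resulting windows
row = brca1_df.iloc[0]
ref_seq, var_seq = parse_sequences(row['pos'], row['ref'], row['alt'])
print(len(ref_seq), len(var_seq))   # both 8192 for positions away from the chromosome ends
print(row['ref'], '->', row['alt'], 'at position', row['pos'])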
2.4 Feed the BRCA1 windows through the model, extract the embeddings from all layers, and save them to disk
# ========== Configuration ==========
device = next(model.model.parameters()).device
candidate_layers = [f"blocks.{i}.pre_norm" for i in range(25)]
batch_size = 8  # adjust to your GPU memory
save_dir = "extract_embeddings"
os.makedirs(save_dir, exist_ok=True)

# ========== Batched embedding extraction ==========
def process_sequences(seq_list, layer_name, desc, prefix="ref"):
    """Embed sequences in batches and make sure the file gets written out."""
    # Standardized file name
    sanitized_layer = layer_name.replace('.', '_')
    memmap_path = os.path.join(save_dir, f"{prefix}_{sanitized_layer}.npy")

    # Create the memory-mapped file (the header is written immediately)
    emb_mmap = np.lib.format.open_memmap(
        memmap_path,
        dtype=np.float32,
        mode='w+',
        shape=(len(seq_list), 1920)
    )

    try:
        # Process in batches
        for i in tqdm(range(0, len(seq_list), batch_size), desc=desc, leave=False):
            batch_seqs = seq_list[i:i+batch_size]

            # Tokenize and pad
            batch_tokens = []
            for seq in batch_seqs:
                tokens = model.tokenizer.tokenize(seq)
                batch_tokens.append(torch.tensor(tokens, dtype=torch.long))

            max_len = max(len(t) for t in batch_tokens)
            padded_tokens = torch.stack([
                torch.nn.functional.pad(t, (0, max_len - len(t))) for t in batch_tokens
            ]).to(device)

            # Forward pass
            with torch.no_grad():
                _, emb_dict = model.forward(
                    padded_tokens,
                    return_embeddings=True,
                    layer_names=[layer_name]
                )

            # Mean-pool over the sequence dimension (padded positions are included
            # in the mean) and write into the memmap
            batch_emb = emb_dict[layer_name].float().mean(dim=1).cpu().numpy()
            emb_mmap[i:i+len(batch_emb)] = batch_emb

            # Flush to disk right away
            emb_mmap.flush()

    finally:
        # Make sure the file handle is released
        del emb_mmap

    return memmap_path

# ========== Main loop ==========
# Save the global index file once up front
np.save(os.path.join(save_dir, "ref_idx.npy"), ref_seq_indexes)

for layer_name in tqdm(candidate_layers, desc="🔍 Processing Layers"):
    # Reference windows (produces e.g. ref_blocks_0_pre_norm.npy)
    _ = process_sequences(
        ref_seqs,
        layer_name,
        f"🧬 Ref {layer_name}",
        prefix="ref"
    )

    # Variant windows (produces e.g. var_blocks_0_pre_norm.npy)
    _ = process_sequences(
        var_seqs,
        layer_name,
        f"🧬 Var {layer_name}",
        prefix="var"
    )
When this runs correctly you will see output like the following:
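A note on sizing: each of the 25 layers yields two float32 matrices of shape (N, 1920), one for the unique reference windows and one for the variants. Reusing the variables defined above, a back-of-the-envelope disk estimate:

# Rough disk footprint of the saved embeddings (float32 = 4 bytes per value)
bytes_total = len(candidate_layers) * (len(ref_seqs) + len(var_seqs)) * 1920 * 4
print(f"~{bytes_total / 1e9:.1f} GB of .npy files in {save_dir}")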
Part 3: Building a Downstream Variant-Effect Predictor on the Saved Embeddings
3.1 Define the embedding-loading function
# ========== Additional configuration ==========
embed_dir = Path("extract_embeddings")
layers_to_train = [f"blocks.{i}.pre_norm" for i in range(25)]  # train on all 25 layers
results_dir = Path("training_results")
results_dir.mkdir(exist_ok=True)

# ========== Data loading ==========
def load_layer_data(layer_name):
    """Load the embeddings and labels for a given layer."""
    sanitized = layer_name.replace('.', '_')

    # Load the embeddings (memory-mapped)
    ref_emb = np.load(embed_dir/f"ref_{sanitized}.npy", mmap_mode='r')
    var_emb = np.load(embed_dir/f"var_{sanitized}.npy", mmap_mode='r')
    ref_idx = np.load(embed_dir/"ref_idx.npy")

    # Concatenate reference and variant embeddings into one feature vector
    X = np.concatenate([ref_emb[ref_idx], var_emb], axis=1)

    # Labels from the original dataframe
    y = brca1_df['class'].map({'FUNC/INT': 0, 'LOF': 1}).values

    return X, y
3.2 Sanity-check the saved embeddings: verify the files exist and that they match the hidden size of Evo2_1b_base's intermediate layers (1920 dimensions)
# Example: check that the layer-24 files exist
assert os.path.exists("extract_embeddings/ref_blocks_24_pre_norm.npy")
assert os.path.exists("extract_embeddings/var_blocks_24_pre_norm.npy")

# Verify the embedding dimensions
ref_emb = np.load("extract_embeddings/ref_blocks_24_pre_norm.npy", mmap_mode='r')
var_emb = np.load("extract_embeddings/var_blocks_24_pre_norm.npy", mmap_mode='r')
print(f"Reference embedding shape: {ref_emb.shape}")  # expect (N_ref, 1920)
print(f"Variant embedding shape: {var_emb.shape}")    # expect (N_var, 1920)

for layer_name in layers_to_train:
    print(layer_name)
    X, y = load_layer_data(layer_name)
    print(X.shape)
3.3 (Important) Define the classifier DNN's architecture (modeled on the DNN in the original paper)
# ========== Model architecture ==========
class BRCA1Classifier(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Dropout(0.3),

            nn.Linear(512, 128),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Dropout(0.3),

            nn.Linear(128, 32),
            nn.ReLU(),
            nn.BatchNorm1d(32),

            nn.Linear(32, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.net(x)
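To get a feel for what this architecture amounts to, you can instantiate it with the concatenated feature size from section 3.1 (1920 reference dims + 1920 variant dims = 3840) and count its parameters:

# Instantiate the classifier on the concatenated ref+var embedding (2 x 1920 = 3840 dims)
clf = BRCA1Classifier(input_dim=3840)
n_params = sum(p.numel() for p in clf.parameters())
print(f"{n_params:,} trainable parameters")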
3.4 (Important) Define the training loop
# ========== Training ==========
def train_for_layer(layer_name):
    """Train a classifier on a single layer's embeddings."""
    print(f"\n=== Training on layer {layer_name} ===")

    # Load the data
    X, y = load_layer_data(layer_name)

    # Split: 60% train / 20% validation / 20% test
    X_temp, X_test, y_temp, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_temp, y_temp, test_size=0.25, random_state=42, stratify=y_temp
    )

    # Wrap in PyTorch Datasets
    train_dataset = TensorDataset(torch.FloatTensor(X_train), torch.FloatTensor(y_train).unsqueeze(1))
    val_dataset = TensorDataset(torch.FloatTensor(X_val), torch.FloatTensor(y_val).unsqueeze(1))
    test_dataset = TensorDataset(torch.FloatTensor(X_test), torch.FloatTensor(y_test).unsqueeze(1))

    # ========== Training configuration ==========
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = BRCA1Classifier(X.shape[1]).to(device)

    # Optimizer and loss
    optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
    criterion = nn.BCELoss()

    # Learning-rate scheduler
    scheduler = ReduceLROnPlateau(
        optimizer,
        mode='max',
        factor=0.5,
        patience=20,
        min_lr=1e-6
    )

    # Data loaders
    train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=128)
    test_loader = DataLoader(test_dataset, batch_size=128)

    # ========== Training loop ==========
    best_auc = 0
    patience_counter = 0
    max_patience = 100

    for epoch in range(500):
        # Training phase
        model.train()
        train_loss = 0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()

            # Gradient clipping (after backward, before the optimizer step)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()
            train_loss += loss.item() * inputs.size(0)

        # Validation phase
        model.eval()
        val_loss = 0
        y_true, y_pred = [], []
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                val_loss += criterion(outputs, labels).item() * inputs.size(0)
                y_true.extend(labels.cpu().numpy())
                y_pred.extend(outputs.cpu().numpy())

        # Metrics
        train_loss /= len(train_loader.dataset)
        val_loss /= len(val_loader.dataset)
        val_auc = roc_auc_score(y_true, y_pred)

        # Learning-rate adjustment
        scheduler.step(val_auc)

        # Early stopping
        if val_auc > best_auc:
            best_auc = val_auc
            patience_counter = 0
            torch.save(model.state_dict(), 'best_model.pth')
        else:
            patience_counter += 1
            if patience_counter >= max_patience:
                print(f"Early stopping at epoch {epoch}")
                break

        # Progress
        print(f"Epoch {epoch+1}: "
              f"Train Loss: {train_loss:.4f} | "
              f"Val Loss: {val_loss:.4f} | "
              f"Val AUROC: {val_auc:.4f}")

    # ========== Final evaluation ==========
    model.load_state_dict(torch.load('best_model.pth'))
    model.eval()
    y_test_true, y_test_pred = [], []
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            y_test_true.extend(labels.cpu().numpy())
            y_test_pred.extend(outputs.cpu().numpy())

    test_auc = roc_auc_score(y_test_true, y_test_pred)
    print(f"\nFinal test-set AUROC: {test_auc:.4f}")

    # Save the results
    sanitized = layer_name.replace('.', '_')
    torch.save(model.state_dict(), results_dir/f"best_model_{sanitized}.pth")
    np.save(results_dir/f"test_pred_{sanitized}.npy", y_test_pred)

    return test_auc
3.5 Run the training
# ========== Main ==========
if __name__ == "__main__":
    results = {}
    for layer in tqdm(layers_to_train, desc="Training Layers"):
        try:
            auc = train_for_layer(layer)
            results[layer] = auc
        except Exception as e:
            print(f"Error while training layer {layer}: {str(e)}")
            results[layer] = None

    # Save the summary (guard against layers that failed, where auc is None)
    with open(results_dir/"summary.txt", "w") as f:
        for layer, auc in results.items():
            f.write(f"{layer}: {auc:.4f}\n" if auc is not None else f"{layer}: failed\n")
When everything runs successfully you will see output similar to this:
Finally, check summary.txt to see which layer's embeddings produced the best predictor (a small helper for this is sketched below). In my run, the DNN trained on layer-12 embeddings performed best. You can also experiment with different models and different DNN architectures to see which layer yields the best predictions.
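As a convenience, here is a minimal helper that parses summary.txt (using the "layer: auc" format written in section 3.5) and reports the best layer:

# Parse training_results/summary.txt and report the best-performing layer
best_layer, best_auc = None, 0.0
with open("training_results/summary.txt") as f:
    for line in f:
        layer, _, auc = line.strip().partition(": ")
        if auc and auc != "failed" and float(auc) > best_auc:
            best_layer, best_auc = layer, float(auc)

print(f"Best layer: {best_layer} (AUROC {best_auc:.4f})")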