# This script tokenizes DNA sequences with a sliding-window scheme and feeds them
# to a pre-trained DNABERT model to extract embeddings. NOTE: the DNABERT model is
# NOT fine-tuned here -- embeddings are taken directly from the pre-trained weights.
# For downstream tasks such as classification, fine-tuning on labeled data is
# generally required.
#==========================================
# This code was written by Sihua Peng, PhD.
#==========================================
from Bio import SeqIO
from transformers import AutoTokenizer, AutoModel
import torch
def read_fasta(file):
    """Parse a FASTA file and return every record's sequence as a plain string."""
    return [str(record.seq) for record in SeqIO.parse(file, "fasta")]
def sliding_window(sequence, window_size, overlap):
    """Split *sequence* into overlapping fixed-size windows.

    Consecutive windows share `overlap` characters, i.e. the window start
    advances by `window_size - overlap` each step. A trailing fragment
    shorter than `window_size` is dropped (so a sequence shorter than one
    window yields an empty list).

    Args:
        sequence: the string to split.
        window_size: length of each window (must exceed `overlap`).
        overlap: number of characters shared by adjacent windows.

    Returns:
        List of window substrings, each exactly `window_size` long.

    Raises:
        ValueError: if `overlap >= window_size`. (The original code silently
            returned [] for overlap > window_size because the range step went
            negative, and raised an opaque "range() arg 3 must not be zero"
            when they were equal.)
    """
    step = window_size - overlap
    if step <= 0:
        raise ValueError("overlap must be smaller than window_size")
    return [sequence[i:i + window_size]
            for i in range(0, len(sequence) - window_size + 1, step)]
# Load the pre-trained DNABERT-2 model and tokenizer.
# trust_remote_code=True is required: DNABERT-2 ships custom modeling/tokenizer
# code on the Hugging Face Hub, and from_pretrained fails to load it otherwise
# (see the zhihan1996/DNABERT-2-117M model card).
tokenizer = AutoTokenizer.from_pretrained(
    "zhihan1996/DNABERT-2-117M", trust_remote_code=True
)
model = AutoModel.from_pretrained(
    "zhihan1996/DNABERT-2-117M", trust_remote_code=True
)
model.eval()  # inference only: disable dropout so embeddings are deterministic

# Read all sequences from the input FASTA file.
sequences = read_fasta("sihua.fasta")
# Tokenize each sequence (sliding windows joined by spaces) and extract one
# mean-pooled embedding per sequence.
embeddings = []
window_size = 15
overlap = 5
for seq in sequences:
    windows = sliding_window(seq, window_size, overlap)
    tokenized_sequence = tokenizer(
        ' '.join(windows),
        return_tensors="pt",
        padding=True,
        truncation=True,  # sequences longer than 512 tokens are cut off
        max_length=512,
    )
    with torch.no_grad():  # no gradients needed for feature extraction
        outputs = model(**tokenized_sequence)
    # DNABERT-2's custom model code returns a plain tuple whose first element
    # is the hidden states, while standard HF models return a ModelOutput with
    # .last_hidden_state -- the original `outputs.last_hidden_state` raises
    # AttributeError on the tuple. Handle both forms.
    last_hidden_states = getattr(outputs, "last_hidden_state", None)
    if last_hidden_states is None:
        last_hidden_states = outputs[0]
    # Mean over the token dimension -> one (1, hidden_size) vector per sequence.
    # NOTE(review): this mean includes any padding positions; with a single
    # sequence per batch padding=True adds none, but for batched input the
    # attention mask should be used to exclude them.
    embeddings.append(last_hidden_states.mean(dim=1))
# Report every sequence's embedding tensor together with its shape.
for seq_no, vector in enumerate(embeddings, start=1):
    print(f"Embedding of sequence {seq_no}:")
    print(vector)
    print(f"Shape of the embedding for sequence {seq_no}:")
    print(vector.shape)