给定accession number,批量下载fasta格式的DNA序列,并附带国家和采样日期信息。
#==========================================
# This code was written by Sihua Peng, PhD.
#==========================================
import csv
from time import sleep
from Bio import Entrez, SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
Entrez.email = "sihuapeng@gmail.com"
def fetch_record(accession):
"""从GenBank获取给定访问号的记录"""
try:
handle = Entrez.efetch(db="nucleotide", id=accession, rettype="gb", retmode="text")
return SeqIO.read(handle, "genbank")
except Exception as e:
print(f"Connection error when fetching {accession}. Retrying in 5 seconds...")
sleep(5)
return fetch_record(accession)
def get_env_sequence_date_and_country(accession):
"""提取env基因的序列、收集日期和国家信息"""
record = fetch_record(accession)
env_sequence = None
date = None
country = None
for feature in record.features:
if feature.type == "source":
date = feature.qualifiers.get("collection_date", [None])[0]
country = feature.qualifiers.get("country", [None])[0]
elif feature.type == "gene" and "gene" in feature.qualifiers:
if feature.qualifiers["gene"][0] == "env":
start, end = int(feature.location.start), int(feature.location.end)
env_sequence = record.seq[start:end]
break # 如果找到env基因,就停止搜索
if not env_sequence:
print(f"No env gene sequence found for {accession}. Skipping...")
return None
new_record = SeqRecord(env_sequence, id=accession, description=f"{date} | {country}")
return new_record
if __name__ == "__main__":
with open("sample-651-acces.csv", "r") as file:
reader = csv.reader(file)
accession_numbers = [row[0] for row in reader]
records = [get_env_sequence_date_and_country(accession) for accession in accession_numbers]
records = [record for record in records if record] # Remove any None values
# Save the sequences with collection dates in a fasta file
with open("env-gene-651-samples.fasta", "w") as output_file:
SeqIO.write(records, output_file, "fasta")