给定accession number,批量下载fasta格式的DNA序列,并附带国家和采样日期信息。

    
#==========================================
# This code was written by Sihua Peng, PhD.
#==========================================
import csv
from time import sleep
from Bio import Entrez, SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

Entrez.email = "sihuapeng@gmail.com"  

def fetch_record(accession):
    """从GenBank获取给定访问号的记录"""
    try:
        handle = Entrez.efetch(db="nucleotide", id=accession, rettype="gb", retmode="text")
        return SeqIO.read(handle, "genbank")
    except Exception as e:
        print(f"Connection error when fetching {accession}. Retrying in 5 seconds...")
        sleep(5)
        return fetch_record(accession)

def get_env_sequence_date_and_country(accession):
    """提取env基因的序列、收集日期和国家信息"""
    record = fetch_record(accession)

    env_sequence = None
    date = None
    country = None

    for feature in record.features:
        if feature.type == "source":
            date = feature.qualifiers.get("collection_date", [None])[0]
            country = feature.qualifiers.get("country", [None])[0]
        elif feature.type == "gene" and "gene" in feature.qualifiers:
            if feature.qualifiers["gene"][0] == "env":
                start, end = int(feature.location.start), int(feature.location.end)
                env_sequence = record.seq[start:end]
                break  # 如果找到env基因,就停止搜索

    if not env_sequence:
        print(f"No env gene sequence found for {accession}. Skipping...")
        return None

    new_record = SeqRecord(env_sequence, id=accession, description=f"{date} | {country}")
    return new_record

if __name__ == "__main__":
    with open("sample-651-acces.csv", "r") as file:
        reader = csv.reader(file)
        accession_numbers = [row[0] for row in reader]

    records = [get_env_sequence_date_and_country(accession) for accession in accession_numbers]
    records = [record for record in records if record]  # Remove any None values

    # Save the sequences with collection dates in a fasta file
    with open("env-gene-651-samples.fasta", "w") as output_file:
        SeqIO.write(records, output_file, "fasta")