给定 accession number,批量下载 FASTA 格式的 DNA 序列,并附带国家和采样日期信息。输入的 CSV 文件每行第一列是 accession number,第二列是起始下载位置,第三列是终止位置(位置从 1 开始计数,起止位置均包含在内)。

    
#==========================================
# This code was written by Sihua Peng, PhD.
#==========================================

import csv
from Bio import Entrez, SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

# Download one GenBank record from NCBI and cut out the requested region.
def fetch_sequence_from_ncbi(accession, start, end):
    """Fetch a nucleotide record from NCBI and return a sub-sequence of it.

    The record is downloaded in GenBank format so that the country and
    collection date can be read from its ``source`` feature.

    Args:
        accession: Accession number passed to ``Entrez.efetch`` as ``id``.
        start: 1-based position of the first base to keep (int or numeric
            string).
        end: 1-based position of the last base to keep, inclusive (int or
            numeric string).

    Returns:
        A ``SeqRecord`` holding bases ``start..end`` of the record, with id
        ``"<accession>|<country>|<collection_date>"`` and description
        ``"<accession> <country> <collection_date>"``. Missing metadata is
        represented by an empty string.

    Raises:
        ValueError: If ``start``/``end`` are not integers or ``start`` is
            smaller than 1.
    """
    # Convert the coordinates before the network round trip so that bad input
    # fails fast instead of after the download.
    first = int(start)
    last = int(end)
    if first < 1:
        # first - 1 would become a negative index and silently slice from the
        # end of the sequence.
        raise ValueError(f"start must be >= 1, got {start!r}")

    Entrez.email = "sihuapeng@gmail.com"
    handle = Entrez.efetch(db="nucleotide", id=accession, rettype="gb", retmode="text")
    try:
        record = SeqIO.read(handle, "genbank")
    finally:
        # Release the connection even when parsing fails.
        handle.close()

    # 1-based inclusive [first, last] -> 0-based half-open slice.
    sequence_str = str(record.seq[first - 1:last])

    # Look up /country and /collection_date on the first "source" feature.
    # Defaults keep the names bound when the record has no such feature.
    country = ""
    collection_date = ""
    for feature in record.features:
        if feature.type == "source":
            qualifiers = feature.qualifiers
            # Newer GenBank flat files expose the location as /geo_loc_name
            # instead of /country; accept either, preferring /country.
            country = qualifiers.get("country", qualifiers.get("geo_loc_name", [""]))[0]
            collection_date = qualifiers.get("collection_date", [""])[0]
            break

    # Build the FASTA id and description from accession plus metadata.
    seq_id = f"{accession}|{country}|{collection_date}"
    description = f"{accession} {country} {collection_date}"

    # Wrap the extracted bases in a SeqRecord so SeqIO can write them.
    return SeqRecord(Seq(sequence_str), id=seq_id, description=description)

# Read (accession, start, end) from each CSV row and download the matching
# sub-sequence.
sequences = []
# newline="" lets the csv module handle line endings itself, as its
# documentation requires for file objects passed to csv.reader.
with open("Sample-561-with-position.csv", "r", newline="") as csv_file:
    csv_reader = csv.reader(csv_file)
    for row in csv_reader:
        fields = [field.strip() for field in row]
        # Blank lines (e.g. a trailing newline) and rows of empty cells carry
        # no request; skip them instead of failing on unpacking.
        if not any(fields):
            continue
        if len(fields) < 3:
            raise ValueError(
                f"line {csv_reader.line_num}: expected accession, start, end "
                f"but got {row!r}"
            )
        # Only the first three columns are used; extra columns are ignored.
        accession, start, end = fields[:3]
        sequence = fetch_sequence_from_ncbi(accession, start, end)
        sequences.append(sequence)

# Save all downloaded records to Sample-561.fasta.
with open("Sample-561.fasta", "w") as output_file:
    SeqIO.write(sequences, output_file, "fasta")

print("所有的序列已保存到 Sample-561.fasta 文件中")