# Download DNA sequences from NCBI

    
#==========================================
# This code was written by Sihua Peng, PhD.
#==========================================

import os
import random

import pandas as pd
from Bio import SeqIO, Seq
from Bio.SeqRecord import SeqRecord

# Load the list of FASTA file paths. The CSV is read without a header row and
# the paths are taken from its first column.
csv_file = "L-segment.csv"
file_data = pd.read_csv(csv_file, header=None)
fasta_files = file_data[0].tolist()

# Maps each FASTA file's base name (directory and extension removed) to the
# single sequence picked from that file.
selected_sequences = {}

for fasta_path in fasta_files:
    parsed = list(SeqIO.parse(fasta_path, "fasta"))

    # Files holding fewer than four records are reported and left out.
    if len(parsed) < 4:
        print(f"Skipping {fasta_path} due to insufficient sequences")
        continue

    # Pick one record uniformly at random from this file.
    picked = random.choice(parsed)

    # Key the result by the file name without its directory or extension.
    base_name = os.path.basename(fasta_path)
    stem = os.path.splitext(base_name)[0]
    selected_sequences[stem] = picked.seq

# Write every selected sequence to its own FASTA file inside output_dir.
output_dir = "GeneratedSamples-L-500"
# n FASTA files (one per segment) are produced in this GeneratedSamples-L-500
# directory; the next step merges these n segments in order to obtain a single
# sample FASTA file.
os.makedirs(output_dir, exist_ok=True)

for filename, sequence in selected_sequences.items():
    output_file = os.path.join(output_dir, f"{filename}_merged.fas")

    # Build the record with the public Bio.SeqRecord.SeqRecord class instead of
    # the incidental SeqIO.SeqRecord attribute, and do so before opening the
    # output file so a construction failure does not leave an empty file behind.
    record = SeqRecord(
        sequence,
        id=filename,
        description=f"Merged DNA sequence for {filename}",
    )

    with open(output_file, "w") as output_handle:
        SeqIO.write(record, output_handle, "fasta")

    print(f"Merged DNA sequence for {filename} saved to {output_file}")