# Randomly select one line from each of the m files of the n segments (m lines in total) and save them to n FASTA-format files.
#==========================================
# This code was written by Sihua Peng, PhD.
#==========================================
import os
import random

import pandas as pd
from Bio import SeqIO, Seq
from Bio.SeqRecord import SeqRecord
# Read the list of input FASTA files: one path per row, taken from the first
# column of a CSV file that has no header row.
csv_file = "L-segment.csv"
file_data = pd.read_csv(csv_file, header=None)
fasta_files = file_data[0].tolist()

# Files holding fewer records than this are skipped instead of sampled.
# NOTE(review): the reason for a threshold of 4 is not visible here -- confirm.
MIN_RECORDS_PER_FILE = 4

# Maps each input file's base name (extension removed) to the sequence that was
# randomly drawn from that file.
selected_sequences = {}

# Draw one random record from every FASTA file that has enough records.
for fasta_file in fasta_files:
    records = list(SeqIO.parse(fasta_file, "fasta"))
    if len(records) < MIN_RECORDS_PER_FILE:
        print(f"Skipping {fasta_file} due to insufficient sequences")
        continue
    selected_record = random.choice(records)
    file_name_without_extension = os.path.splitext(os.path.basename(fasta_file))[0]
    # Keyed by base name only: two inputs sharing a base name in different
    # directories would overwrite each other here.
    selected_sequences[file_name_without_extension] = selected_record.seq

# Write every selected sequence to its own FASTA file in the output directory.
# One FASTA file per input (segment) is produced there; the next step is to
# merge these files in order to obtain a single sample FASTA file.
output_dir = "GeneratedSamples-L-500"
os.makedirs(output_dir, exist_ok=True)
for filename, sequence in selected_sequences.items():
    output_file = os.path.join(output_dir, f"{filename}_merged.fas")
    # Only the sequence is carried over; the id and description of the sampled
    # record are replaced by the input file's base name and a fixed description.
    record = SeqRecord(
        sequence,
        id=filename,
        description=f"Merged DNA sequence for {filename}",
    )
    with open(output_file, "w") as output_handle:
        SeqIO.write(record, output_handle, "fasta")
    print(f"Merged DNA sequence for {filename} saved to {output_file}")