把比对后的m个fasta格式文件进行分段。输入文件是csv文件,每行一个fasta格式文件的文件名(共m行)。分成n段,结果保存到n个csv文件,一段一个文件,一个文件中有m行DNA序列,每行的DNA序列长度相同。K-first-segment.csv中有m行DNA片段,供下一步随机选取。
#==========================================
# This code was written by Sihua Peng, PhD.
#==========================================
from Bio import SeqIO
import csv
# 定义函数将DNA序列分成n段
def split_sequence(dna_sequence, segment_length=250, num_segments=10):
    """Split a DNA sequence into consecutive fixed-width segments.

    The first ``num_segments - 1`` segments each hold ``segment_length``
    characters; the final segment holds everything that remains, so no part
    of the input is dropped. With the default arguments the cuts fall at
    0, 250, 500, ..., 2250, giving ten segments.

    Args:
        dna_sequence: The sequence to cut (anything sliceable, normally a
            ``str``).
        segment_length: Width of every segment except the last. Must be
            at least 1.
        num_segments: Number of segments to return. Must be at least 1.

    Returns:
        A list of exactly ``num_segments`` slices of ``dna_sequence``, in
        order. Segments that start beyond the end of the input are empty.

    Raises:
        ValueError: If ``segment_length`` or ``num_segments`` is below 1.
    """
    if segment_length < 1 or num_segments < 1:
        raise ValueError("segment_length and num_segments must both be >= 1")

    # Offset at which the open-ended final segment begins.
    last_start = segment_length * (num_segments - 1)
    segments = [
        dna_sequence[start:start + segment_length]
        for start in range(0, last_start, segment_length)
    ]
    # The last segment is deliberately unbounded so the tail is kept.
    segments.append(dna_sequence[last_start:])
    return segments
# 清空CSV文件
def clear_csv_files(output_files=None):
    """Create or truncate the segment CSV files so that they are empty.

    Args:
        output_files: Optional iterable of file paths to empty. When
            omitted, the ten default ``K-*-segment.csv`` files in the
            current working directory are used.

    Returns:
        None. Each listed file exists and has zero length afterwards.
    """
    if output_files is None:
        output_files = (
            'K-first-segment.csv',
            'K-second-segment.csv',
            'K-third-segment.csv',
            'K-fourth-segment.csv',
            'K-fifth-segment.csv',
            'K-sixth-segment.csv',
            'K-seventh-segment.csv',
            'K-eighth-segment.csv',
            'K-nineth-segment.csv',
            'K-tenth-segment.csv',
        )
    for path in output_files:
        # Opening in 'w' mode truncates an existing file or creates a new
        # one; nothing is written to it.
        with open(path, 'w'):
            pass
# Start from empty output files, because the loop below only appends to them.
clear_csv_files()

# One output CSV per segment position, in the order split_sequence returns
# the segments. Built once instead of once per record.
output_files = [
    'K-first-segment.csv',
    'K-second-segment.csv',
    'K-third-segment.csv',
    'K-fourth-segment.csv',
    'K-fifth-segment.csv',
    'K-sixth-segment.csv',
    'K-seventh-segment.csv',
    'K-eighth-segment.csv',
    'K-nineth-segment.csv',
    'K-tenth-segment.csv',
]

# Read DNA_file-5.csv; the first column of each row names one FASTA file.
# newline='' lets the csv module do its own newline handling.
with open('DNA_file-5.csv', 'r', newline='') as list_file:
    for row in csv.reader(list_file):
        fasta_file = row[0].strip() if row else ""
        if not fasta_file:
            continue  # skip blank rows
        for record in SeqIO.parse(fasta_file, 'fasta'):
            segments = split_sequence(str(record.seq))
            # Append each segment, as a single-column row, to the CSV file
            # for its position.
            for out_path, segment in zip(output_files, segments):
                with open(out_path, 'a', newline='') as out_file:
                    csv.writer(out_file).writerow([segment])