这是对源代码(predict_seq.py)的订正,感谢代码原作者Bonnie I-Man Ng的贡献。


#==================================================#
#The original author is Bonnie I-Ma.
#The original code was revised by Sihua Peng, PhD. 
#==================================================#
import numpy as np
import re
import random
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

def DNA_matrix(seq):
    tem2 = ['[aA]','[cC]','[gG]','[tT]']
    for i in range(len(tem2)):
        ind = [m.start() for m in re.finditer(tem2[i], seq)]
        tem = np.zeros(len(seq),dtype=int)
        tem[ind] = 1
        if i == 0:
            a = np.zeros((len(seq),4))
        a[..., i] = tem
    return a

# Read labels
with open('humanvsran_label', 'r') as f:
    lines = f.readlines()
dat_y = np.zeros((len(lines), 2))
for i in range(len(lines)):
    dat_y[i, int(lines[i].rstrip())] = 1

# Read sequences
file_path = 'filename.txt'  # Replace with your actual filename
with open(file_path, 'r') as f:
    lines = f.readlines()
for i in range(len(lines)):
    tem = lines[i].rstrip()
    if i == 0:
        dat_x = np.zeros((len(lines), len(tem), 4))
    dat_x[i, ] = DNA_matrix(tem)

ind = list(range(20000))
random.shuffle(ind)
x_train = dat_x[ind[0:14000]]
y_train = dat_y[ind[0:14000]]
x_val = dat_x[ind[14001:20001]]
y_val = dat_y[ind[14001:20001]]

x_train = x_train.astype('float32')
x_val = x_val.astype('float32')

model = Sequential()
model.add(Conv1D(filters=300, kernel_size=19, strides=1, padding='valid', input_shape=(250,4)))
model.add(MaxPooling1D(pool_size=10, strides=5, padding='same'))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(2, activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=2, mode='min')
history = model.fit(x_train, y_train, batch_size=16, epochs=5, verbose=1, callbacks=[early_stopping])

print(history.history)

weights = model.layers[0].get_weights()
wt = np.transpose(weights[0][:, :, 0])
wtm = np.transpose(weights[0][:, :, 0]).min(axis=0)
wtp = wt-wtm
wtps = np.sum(wtp, axis=0)
print(np.round((wtp/wtps)*100))

y_score = model.predict(x_val)
loss, acc = model.evaluate(x_val, y_val, verbose=0)
print('\nTesting loss: {}, acc: {}\n'.format(loss, acc))

def generate_results(y_test, y_score):
    fpr, tpr, _ = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)
    plt.figure()
    plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.05])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic curve')
    plt.show()
    print('AUC: %f' % roc_auc)

generate_results(y_val[:, 0], y_score[:, 0])

    


请到原来的地址下载两个数据文件,并把数据文件放在代码的当前目录下。

humanvsran_label是标签文件,filename.txt是DNA序列文件。注意:DNA序列文件不是fasta格式,是一行一段DNA序列。