Hitherto, peak ND energies (peak $E$s) were used as the data points in the correlation test with experimental folding rate. The results show promise for two-state folders --a significant ($\alpha$ = 0.05) Pearson correlation with coefficient around -0.7 was achieved--, but less so for multi-state folders --a significant Pearson correlation with coefficient around -0.5 was achieved only after excluding an outlier.
In this notebook, we explore how much these results can be improved by selecting different data points from the choices presented by a ND variant, using a linear regression model. The regression function's main role here is to assist in the adjustment of ND energy data points to improve correlation with experimental folding rate.
Adjust Method:
import os
import numpy as np
import pandas as pd
from scipy.stats import pearsonr, norm
import matplotlib.pyplot as plt
from sklearn import linear_model
import NDutils as nd # my lib
# constrain the candidate Es to those associated with peak Q
# shift in Q affected by considerable overlapping Gaussians
# new E is the mean of the Gaussian at new Q
def adjust1(reg, pids, y, peakEs, peakQs, ndstats: dict, nd_Eparams: dict, \
min_Q=0.1, max_Q=0.9):
"""Does one round of adjustments."""
a = nd.Qs.index(min_Q); b = nd.Qs.index(max_Q)
new_Es = peakEs.copy(); new_Qs = peakQs.copy()
num_changes = 0
for i, pid in enumerate(pids):
peak_q = peakQs[i] # peak Q for a pid
Es = ndstats[pid].loc[peak_q].sc_mje.values # E values associated with peak E Gaussian
pred_y = reg.predict(Es.reshape(-1,1))
j = np.argmin(np.abs(pred_y - y[i])) # get the index of the smallest residual
new_E = Es[j]; # x that produces the smallest residual
# find Q that most likely generates new_E
q_params = [ nd_Eparams[pid, q] for q in nd.Qs[a : b+1] ]
dens = [norm.pdf(new_E, m, s) for m, s in q_params] # Gaussian density of x at each q
new_Q = nd.Qs[np.argmax(dens) + a]
if new_Q != peak_q:
new_Qs[i] = new_Q
new_Es[i] = nd_Eparams[pid, new_Q][0] # take the mean of the Gaussian at new_Q
num_changes += 1
return new_Qs, new_Es, num_changes
def print_stats(Qs: list, Es: list, frates: np.array, peakQs: list):
print("Q stats:", np.round(np.mean(Qs), 4), np.round(np.std(Qs), 4), \
np.min(Qs), np.max(Qs), np.sum(np.array(Qs) > 0.5))
print("MAD from peak Qs:", np.mean(np.abs(np.array(peakQs) - np.array(Qs))))
print("E range:", np.round(nd.mjes_range(Es), 4))
x = np.array(Es)
print("Corr with exp. fold rate:", np.round(pearsonr(x, frates), 4))
y_true = frates.reshape(-1, 1)
reg.fit(x.reshape(-1,1), y_true)
y_pred = reg.predict(x.reshape(-1,1))
print("MSE, R2, adj_R2:", np.round(nd.mra(y_true, y_pred), 4)) # mse, r2, adj_r2
print("predicted range:", np.ptp(y_true), np.round(np.ptp(y_pred), 4))
return y_pred
fn = os.path.join('UZ_c', 'UZ_frates.txt')
df = nd.load_frates(fn)
PIDs = df.pid.tolist()
fold_rates = df.k_f.values
print(len(PIDs))
print("Range of fold rates:", np.ptp(fold_rates))
y_true = fold_rates.reshape(-1, 1)
reg = linear_model.LinearRegression()
# load stats for the three ND variants
c_ndstats = nd.get_NDstats('UZ_c', PIDs, 'c')
x_ndstats = nd.get_NDstats('UZ_x', PIDs, 'x')
r_ndstats = nd.get_NDstats('UZ_r', PIDs, 'r')
c_mje_params = nd.get_params(c_ndstats, 'sc_mje')
c_peakQs, c_peakEs = nd.get_peaks(c_mje_params, PIDs)
print("'c' peak (unadjusted).")
peak_c_y_pred = print_stats(c_peakQs, c_peakEs, fold_rates, c_peakQs)
c_newQs, c_newEs, num_changes = adjust1(reg, PIDs, y_true, c_peakEs, c_peakQs,\
c_ndstats, c_mje_params)
print("'c' new (adjusted). ", num_changes)
new_c_y_pred = print_stats(c_newQs, c_newEs, fold_rates, c_peakQs)
x_mje_params = nd.get_params(x_ndstats, 'sc_mje')
x_peakQs, x_peakEs = nd.get_peaks(x_mje_params, PIDs)
print("'x' peak (unadjusted).")
peak_x_y_pred = print_stats(x_peakQs, x_peakEs, fold_rates, x_peakQs)
x_newQs, x_newEs, num_changes = adjust1(reg, PIDs, y_true, x_peakEs, x_peakQs,\
x_ndstats, x_mje_params)
print("'x' new (adjusted). ", num_changes)
new_x_y_pred = print_stats(x_newQs, x_newEs, fold_rates, x_peakQs)
r_mje_params = nd.get_params(r_ndstats, 'sc_mje')
r_peakQs, r_peakEs = nd.get_peaks(r_mje_params, PIDs)
print("'r' peak (unadjusted).")
peak_r_y_pred = print_stats(r_peakQs, r_peakEs, fold_rates, r_peakQs)
r_newQs, r_newEs, num_changes = adjust1(reg, PIDs, y_true, r_peakEs, r_peakQs,\
r_ndstats, r_mje_params)
print("'r' new (adjusted). ", num_changes)
new_r_y_pred = print_stats(r_newQs, r_newEs, fold_rates, r_peakQs)
fig, axes = plt.subplots(1, 3, figsize=(12, 4), constrained_layout=True, sharey=True)
plt.suptitle("Change in Qs, Uzunoglo", fontsize=14)
ax = axes[0]
ax.scatter(PIDs, c_peakQs, marker='^', alpha=0.5, label='peak')
ax.scatter(PIDs, c_newQs, marker='x', label="new")
ax.set_title("'c' ND variant")
ax.set_xlabel('Proteins'); ax.set_xticks([])
ax.set_ylabel('Q', fontsize=14); ax.legend()
ax = axes[1]
ax.scatter(PIDs, x_peakQs, marker='^', alpha=0.5, label='peak')
ax.scatter(PIDs, x_newQs, marker='x', label="new")
ax.set_title("'x' ND variant")
ax.set_xlabel('Proteins'); ax.set_xticks([])
ax.legend()
ax = axes[2]
ax.scatter(PIDs, r_peakQs, marker='^', alpha=0.5, label='peak')
ax.scatter(PIDs, r_newQs, marker='x', label="new")
ax.set_title("'r' ND variant")
ax.set_xlabel('Proteins'); ax.set_xticks([])
_ = ax.legend()
fig, axes = plt.subplots(2, 3, figsize=(12, 8), constrained_layout=True)
fig.suptitle("Change in Es, Uzunoglo", fontsize=14)
ax = axes[0][0]
ax.scatter(c_peakEs, y_true, color='tab:blue', alpha=0.5, label='c peak')
ax.plot(c_peakEs, peak_c_y_pred, color='tab:blue', alpha=0.5)
ax.scatter(c_newEs, y_true, marker='.', color='tab:blue', label='c adj')
ax.plot(c_newEs, new_c_y_pred, color='tab:blue')
ax.set_title("'c' ND variant"); ax.set_xlabel("ND energy")
ax.set_ylabel("Predicted fold rate", fontsize=14); ax.legend()
ax = axes[0][1]
ax.scatter(x_peakEs, y_true, color='tab:orange', alpha=0.5, label='x peak')
ax.plot(x_peakEs, peak_x_y_pred, color='tab:orange', alpha=0.5)
ax.scatter(x_newEs, y_true, marker='.', color='tab:orange', label='x adj')
ax.plot(x_newEs, new_x_y_pred, color='tab:orange')
ax.set_title("'x' ND variant"); ax.set_xlabel("ND energy"); ax.legend()
ax = axes[0][2]
ax.scatter(r_peakEs, y_true, color='tab:green', alpha=0.5, label='r peak')
ax.plot(r_peakEs, peak_r_y_pred, color='tab:green', alpha=0.5)
ax.scatter(r_newEs, y_true, marker='.', color='tab:green', label='r adj')
ax.plot(r_newEs, new_r_y_pred, color='tab:green')
ax.set_title("'r' ND variant"); ax.set_xlabel("ND energy"); ax.legend()
ax = axes[1][0]
ax.scatter(c_peakEs, c_newEs, marker='.',color='tab:blue', label='c')
ax.set_xlabel('peak ND energy'); ax.legend()
ax.set_ylabel('adjusted ND energy', fontsize=14)
ax = axes[1][1]
ax.scatter(x_peakEs, x_newEs, marker='.',color='tab:orange', label='x')
ax.set_xlabel('peak ND energy'); ax.legend()
ax = axes[1][2]
ax.scatter(r_peakEs, r_newEs, marker='.', color='tab:green', label='r')
ax.set_xlabel('peak ND energy'); _ = ax.legend()
fn = os.path.join('K_c', 'K_frates.txt')
df = nd.load_frates(fn)
PIDs = df.pid.tolist()
fold_rates = df.k_f.values
print(len(PIDs))
print("Range of fold rates:", np.ptp(fold_rates))
y_true = fold_rates.reshape(-1, 1)
reg = linear_model.LinearRegression()
# load stats for the three ND variants
c_ndstats = nd.get_NDstats('K_c', PIDs, 'c')
x_ndstats = nd.get_NDstats('K_x', PIDs, 'x')
r_ndstats = nd.get_NDstats('K_r', PIDs, 'r')
c_mje_params = nd.get_params(c_ndstats, 'sc_mje')
c_peakQs, c_peakEs = nd.get_peaks(c_mje_params, PIDs)
print("'c' peak (unadjusted).")
peak_c_y_pred = print_stats(c_peakQs, c_peakEs, fold_rates, c_peakQs)
c_newQs, c_newEs, num_changes = adjust1(reg, PIDs, y_true, c_peakEs, c_peakQs,\
c_ndstats, c_mje_params)
print("'c' new (adjusted). ", num_changes)
new_c_y_pred = print_stats(c_newQs, c_newEs, fold_rates, c_peakQs)
x_mje_params = nd.get_params(x_ndstats, 'sc_mje')
x_peakQs, x_peakEs = nd.get_peaks(x_mje_params, PIDs)
print("'x' peak (unadjusted).")
peak_x_y_pred = print_stats(x_peakQs, x_peakEs, fold_rates, x_peakQs)
x_newQs, x_newEs, num_changes = adjust1(reg, PIDs, y_true, x_peakEs, x_peakQs,\
x_ndstats, x_mje_params)
print("'x' new (adjusted). ", num_changes)
new_x_y_pred = print_stats(x_newQs, x_newEs, fold_rates, x_peakQs)
r_mje_params = nd.get_params(r_ndstats, 'sc_mje')
r_peakQs, r_peakEs = nd.get_peaks(r_mje_params, PIDs)
print("'r' peak (unadjusted).")
peak_r_y_pred = print_stats(r_peakQs, r_peakEs, fold_rates, r_peakQs)
r_newQs, r_newEs, num_changes = adjust1(reg, PIDs, y_true, r_peakEs, r_peakQs,\
r_ndstats, r_mje_params)
print("'r' new (adjusted). ", num_changes)
new_r_y_pred = print_stats(r_newQs, r_newEs, fold_rates, r_peakQs)
fig, axes = plt.subplots(1, 3, figsize=(12, 4), constrained_layout=True, sharey=True)
plt.suptitle("Change in Qs, Kamagata", fontsize=14)
ax = axes[0]
ax.scatter(PIDs, c_peakQs, marker='^', alpha=0.5, label='peak')
ax.scatter(PIDs, c_newQs, marker='x', label="new")
ax.set_title("'c' ND variant")
ax.set_xlabel('Proteins'); ax.set_xticks([])
ax.set_ylabel('Q', fontsize=14); ax.legend()
ax = axes[1]
ax.scatter(PIDs, x_peakQs, marker='^', alpha=0.5, label='peak')
ax.scatter(PIDs, x_newQs, marker='x', label="new")
ax.set_title("'x' ND variant")
ax.set_xlabel('Proteins'); ax.set_xticks([])
ax.legend()
ax = axes[2]
ax.scatter(PIDs, r_peakQs, marker='^', alpha=0.5, label='peak')
ax.scatter(PIDs, r_newQs, marker='x', label="new")
ax.set_title("'r' ND variant")
ax.set_xlabel('Proteins'); ax.set_xticks([])
_ = ax.legend()
fig, axes = plt.subplots(2, 3, figsize=(12, 8), constrained_layout=True)
fig.suptitle("Change in Es, Kamagata", fontsize=14)
ax = axes[0][0]
ax.scatter(c_peakEs, y_true, color='tab:blue', alpha=0.5, label='c peak')
ax.plot(c_peakEs, peak_c_y_pred, color='tab:blue', alpha=0.5)
ax.scatter(c_newEs, y_true, marker='.', color='tab:blue', label='c adj')
ax.plot(c_newEs, new_c_y_pred, color='tab:blue')
ax.set_title("'c' ND variant"); ax.set_xlabel("ND energy")
ax.set_ylabel("Predicted fold rate", fontsize=14); ax.legend()
ax = axes[0][1]
ax.scatter(x_peakEs, y_true, color='tab:orange', alpha=0.5, label='x peak')
ax.plot(x_peakEs, peak_x_y_pred, color='tab:orange', alpha=0.5)
ax.scatter(x_newEs, y_true, marker='.', color='tab:orange', label='x adj')
ax.plot(x_newEs, new_x_y_pred, color='tab:orange')
ax.set_title("'x' ND variant"); ax.set_xlabel("ND energy"); ax.legend()
ax = axes[0][2]
ax.scatter(r_peakEs, y_true, color='tab:green', alpha=0.5, label='r peak')
ax.plot(r_peakEs, peak_r_y_pred, color='tab:green', alpha=0.5)
ax.scatter(r_newEs, y_true, marker='.', color='tab:green', label='r adj')
ax.plot(r_newEs, new_r_y_pred, color='tab:green')
ax.set_title("'r' ND variant"); ax.set_xlabel("ND energy"); ax.legend()
ax = axes[1][0]
ax.scatter(c_peakEs, c_newEs, marker='.',color='tab:blue', label='c')
ax.set_xlabel('peak ND energy'); ax.legend()
ax.set_ylabel('adjusted ND energy', fontsize=14)
ax = axes[1][1]
ax.scatter(x_peakEs, x_newEs, marker='.',color='tab:orange', label='x')
ax.set_xlabel('peak ND energy'); ax.legend()
ax = axes[1][2]
ax.scatter(r_peakEs, r_newEs, marker='.', color='tab:green', label='r')
ax.set_xlabel('peak ND energy'); _ = ax.legend()
SKLC September 2020