Adjusting ND energies.¶

Hitherto, peak ND energies (peak $E$s) were used as the data points in the correlation test with experimental folding rate. The results show promise for two-state folders --a significant ($\alpha$ = 0.05) Pearson correlation with coefficient around -0.7 was achieved--, but less so for multi-state folders --a significant Pearson correlation with coefficient around -0.5 was achieved only after excluding an outlier.

In this notebook, we explore how much these results can be improved by selecting different data points from the choices presented by a ND variant, using a linear regression model. The regression function's main role here is to assist in the adjustment of ND energy data points to improve correlation with experimental folding rate.

The idea is better ND variants require fewer adjustments to achieve stronger correlations.

Adjust Method:

Obtain $\hat Y = a X + b$, the regression function relating peak ND energies ($X$) with fold rates ($Y$) for a set of pids (train set).
For each pid, obtain a "new" ND energy point from the ND energy values generated at peak $Q$ that will maximize the linear relationship between ND energy and fold rate.
- Let $E_{peakQ}$ be the ND energies at peak $Q$. For each $x \in E_{peakQ}$, evaluate $\hat y = ax + b$, and find the $x$ that minimizes the difference between $\hat y$ and the fold rate $y$.
For each "new" ND energy point, find the ND energy Gaussian that maximizes its likelihood, and assign this $Q$ as the "new peak" $Q$ for a pid.
- Sample space for $Q$ is set as wide as reasonable to $[0.1, 0.9]$.
- We are interested in which $Q$ the method will choose, particularly for multi-state pids. Will they stay within the ND-TS region ($Q < 0.5$)? This depends on how the ND energy Gaussians overlap.
The means of the ND energy Gaussians associated with the "new peak" $Q$s are the "adjusted" ND energies ($\tilde X$).
Obtain a new regression function based on the "adjusted" ND energies: $\hat Y = a \tilde X + b$.

import os
import numpy as np
import pandas as pd
from scipy.stats import pearsonr, norm
import matplotlib.pyplot as plt
from sklearn import linear_model
import NDutils as nd # my lib

# constrain the candidate Es to those associated with peak Q
# shift in Q affected by considerable overlapping Gaussians
# new E is the mean of the Gaussian at new Q
def adjust1(reg, pids, y, peakEs, peakQs, ndstats: dict, nd_Eparams: dict, \
            min_Q=0.1, max_Q=0.9):
    """Does one round of adjustments."""
    a = nd.Qs.index(min_Q); b = nd.Qs.index(max_Q)    
    new_Es = peakEs.copy(); new_Qs = peakQs.copy()
    num_changes = 0    
    for i, pid in enumerate(pids):
        peak_q = peakQs[i] # peak Q for a pid
        Es = ndstats[pid].loc[peak_q].sc_mje.values # E values associated with peak E Gaussian
        pred_y = reg.predict(Es.reshape(-1,1))        
        j = np.argmin(np.abs(pred_y - y[i])) # get the index of the smallest residual        
        new_E = Es[j]; # x that produces the smallest residual
        # find Q that most likely generates new_E        
        q_params = [ nd_Eparams[pid, q] for q in nd.Qs[a : b+1] ]
        dens = [norm.pdf(new_E, m, s) for m, s in q_params] # Gaussian density of x at each q
        new_Q = nd.Qs[np.argmax(dens) + a]
        if new_Q != peak_q: 
            new_Qs[i] = new_Q
            new_Es[i] = nd_Eparams[pid, new_Q][0] # take the mean of the Gaussian at new_Q
            num_changes += 1    
    return new_Qs, new_Es, num_changes

def print_stats(Qs: list, Es: list, frates: np.array, peakQs: list):    
    print("Q stats:", np.round(np.mean(Qs), 4), np.round(np.std(Qs), 4), \
          np.min(Qs), np.max(Qs), np.sum(np.array(Qs) > 0.5))
    print("MAD from peak Qs:", np.mean(np.abs(np.array(peakQs) - np.array(Qs))))
    print("E range:", np.round(nd.mjes_range(Es), 4))
    x = np.array(Es)
    print("Corr with exp. fold rate:", np.round(pearsonr(x, frates), 4))
    y_true = frates.reshape(-1, 1)
    reg.fit(x.reshape(-1,1), y_true)
    y_pred = reg.predict(x.reshape(-1,1))
    print("MSE, R2, adj_R2:", np.round(nd.mra(y_true, y_pred), 4)) # mse, r2, adj_r2
    print("predicted range:", np.ptp(y_true), np.round(np.ptp(y_pred), 4))
    return y_pred

Uzunoglo¶

fn = os.path.join('UZ_c', 'UZ_frates.txt')
df = nd.load_frates(fn)
PIDs = df.pid.tolist()
fold_rates = df.k_f.values
print(len(PIDs))
print("Range of fold rates:", np.ptp(fold_rates))
y_true = fold_rates.reshape(-1, 1)
reg = linear_model.LinearRegression()

52
Range of fold rates: 5.96

# load stats for the three ND variants
c_ndstats = nd.get_NDstats('UZ_c', PIDs, 'c')
x_ndstats = nd.get_NDstats('UZ_x', PIDs, 'x')
r_ndstats = nd.get_NDstats('UZ_r', PIDs, 'r')

'c' ND variant¶

c_mje_params = nd.get_params(c_ndstats, 'sc_mje')
c_peakQs, c_peakEs = nd.get_peaks(c_mje_params, PIDs)
print("'c' peak (unadjusted).")
peak_c_y_pred = print_stats(c_peakQs, c_peakEs, fold_rates, c_peakQs)

'c' peak (unadjusted).
Q stats: 0.2442 0.0718 0.1 0.4 0
MAD from peak Qs: 0.0
E range: 4.3814
Corr with exp. fold rate: [-0.7145  0.    ]
MSE, R2, adj_R2: [0.9853 0.5104 0.4905]
predicted range: 5.96 4.758

c_newQs, c_newEs, num_changes = adjust1(reg, PIDs, y_true, c_peakEs, c_peakQs,\
                                      c_ndstats, c_mje_params) 
print("'c' new (adjusted). ", num_changes)
new_c_y_pred = print_stats(c_newQs, c_newEs, fold_rates, c_peakQs)

'c' new (adjusted).  31
Q stats: 0.2269 0.136 0.1 0.6 3
MAD from peak Qs: 0.09807692307692308
E range: 4.3985
Corr with exp. fold rate: [-0.8935  0.    ]
MSE, R2, adj_R2: [0.4059 0.7983 0.7901]
predicted range: 5.96 6.0148

'x' ND variant¶

x_mje_params = nd.get_params(x_ndstats, 'sc_mje')
x_peakQs, x_peakEs = nd.get_peaks(x_mje_params, PIDs)
print("'x' peak (unadjusted).")
peak_x_y_pred = print_stats(x_peakQs, x_peakEs, fold_rates, x_peakQs)

'x' peak (unadjusted).
Q stats: 0.2577 0.0716 0.1 0.4 0
MAD from peak Qs: 0.0
E range: 4.2422
Corr with exp. fold rate: [-0.7048  0.    ]
MSE, R2, adj_R2: [1.0129 0.4967 0.4762]
predicted range: 5.96 4.5231

x_newQs, x_newEs, num_changes = adjust1(reg, PIDs, y_true, x_peakEs, x_peakQs,\
                                      x_ndstats, x_mje_params) 
print("'x' new (adjusted). ", num_changes)
new_x_y_pred = print_stats(x_newQs, x_newEs, fold_rates, x_peakQs)

'x' new (adjusted).  27
Q stats: 0.2423 0.1306 0.1 0.6 3
MAD from peak Qs: 0.08846153846153845
E range: 4.2227
Corr with exp. fold rate: [-0.9109  0.    ]
MSE, R2, adj_R2: [0.3427 0.8297 0.8228]
predicted range: 5.96 5.5904

'r' ND variant¶

r_mje_params = nd.get_params(r_ndstats, 'sc_mje')
r_peakQs, r_peakEs = nd.get_peaks(r_mje_params, PIDs)
print("'r' peak (unadjusted).")
peak_r_y_pred = print_stats(r_peakQs, r_peakEs, fold_rates, r_peakQs)

'r' peak (unadjusted).
Q stats: 0.1596 0.0491 0.1 0.2 0
MAD from peak Qs: 0.0
E range: 1.9358
Corr with exp. fold rate: [-0.2101  0.135 ]
MSE, R2, adj_R2: [1.9238 0.0441 0.0051]
predicted range: 5.96 1.4201

r_newQs, r_newEs, num_changes = adjust1(reg, PIDs, y_true, r_peakEs, r_peakQs,\
                                      r_ndstats, r_mje_params) 
print("'r' new (adjusted). ", num_changes)
new_r_y_pred = print_stats(r_newQs, r_newEs, fold_rates, r_peakQs)

'r' new (adjusted).  30
Q stats: 0.2577 0.1182 0.1 0.5 0
MAD from peak Qs: 0.10576923076923078
E range: 4.2256
Corr with exp. fold rate: [-0.7376  0.    ]
MSE, R2, adj_R2: [0.9177 0.544  0.5254]
predicted range: 5.96 4.2997

The new $Q$s mainly stay within the ND-TS region, with a few going as far as $Q = 0.6$.
After one adjustment round, r2 score improves from about 0.5 to about 0.8 for both 'c' and 'x' ND variants.
The 'r' ND variant is able to achieve a r2 score of around 0.5, only after one adjustment round.

fig, axes = plt.subplots(1, 3, figsize=(12, 4), constrained_layout=True, sharey=True)
plt.suptitle("Change in Qs, Uzunoglo", fontsize=14)

ax = axes[0]
ax.scatter(PIDs, c_peakQs, marker='^', alpha=0.5, label='peak')
ax.scatter(PIDs, c_newQs, marker='x', label="new")
ax.set_title("'c' ND variant")
ax.set_xlabel('Proteins'); ax.set_xticks([])
ax.set_ylabel('Q', fontsize=14); ax.legend()

ax = axes[1]
ax.scatter(PIDs, x_peakQs, marker='^', alpha=0.5, label='peak')
ax.scatter(PIDs, x_newQs, marker='x', label="new")
ax.set_title("'x' ND variant")
ax.set_xlabel('Proteins'); ax.set_xticks([])
ax.legend()

ax = axes[2]
ax.scatter(PIDs, r_peakQs, marker='^', alpha=0.5, label='peak')
ax.scatter(PIDs, r_newQs, marker='x', label="new")
ax.set_title("'r' ND variant")
ax.set_xlabel('Proteins'); ax.set_xticks([])
_ = ax.legend()

fig, axes = plt.subplots(2, 3, figsize=(12, 8), constrained_layout=True)
fig.suptitle("Change in Es, Uzunoglo", fontsize=14)

ax = axes[0][0]
ax.scatter(c_peakEs, y_true, color='tab:blue', alpha=0.5, label='c peak')
ax.plot(c_peakEs, peak_c_y_pred, color='tab:blue', alpha=0.5)
ax.scatter(c_newEs, y_true, marker='.', color='tab:blue', label='c adj')
ax.plot(c_newEs, new_c_y_pred, color='tab:blue')
ax.set_title("'c' ND variant"); ax.set_xlabel("ND energy")
ax.set_ylabel("Predicted fold rate", fontsize=14); ax.legend()

ax = axes[0][1]
ax.scatter(x_peakEs, y_true, color='tab:orange', alpha=0.5, label='x peak')
ax.plot(x_peakEs, peak_x_y_pred, color='tab:orange', alpha=0.5)
ax.scatter(x_newEs, y_true, marker='.', color='tab:orange', label='x adj')
ax.plot(x_newEs, new_x_y_pred, color='tab:orange')
ax.set_title("'x' ND variant"); ax.set_xlabel("ND energy"); ax.legend()

ax = axes[0][2]
ax.scatter(r_peakEs, y_true, color='tab:green', alpha=0.5, label='r peak')
ax.plot(r_peakEs, peak_r_y_pred, color='tab:green', alpha=0.5)
ax.scatter(r_newEs, y_true, marker='.', color='tab:green', label='r adj')
ax.plot(r_newEs, new_r_y_pred, color='tab:green')
ax.set_title("'r' ND variant"); ax.set_xlabel("ND energy"); ax.legend()

ax = axes[1][0]
ax.scatter(c_peakEs, c_newEs, marker='.',color='tab:blue', label='c')
ax.set_xlabel('peak ND energy'); ax.legend()
ax.set_ylabel('adjusted ND energy', fontsize=14)

ax = axes[1][1]
ax.scatter(x_peakEs, x_newEs, marker='.',color='tab:orange', label='x')
ax.set_xlabel('peak ND energy'); ax.legend()

ax = axes[1][2]
ax.scatter(r_peakEs, r_newEs, marker='.', color='tab:green', label='r')
ax.set_xlabel('peak ND energy'); _ = ax.legend()

Kamagata¶

fn = os.path.join('K_c', 'K_frates.txt')
df = nd.load_frates(fn)
PIDs = df.pid.tolist()
fold_rates = df.k_f.values
print(len(PIDs))
print("Range of fold rates:", np.ptp(fold_rates))
y_true = fold_rates.reshape(-1, 1)
reg = linear_model.LinearRegression()

21
Range of fold rates: 4.55

# load stats for the three ND variants
c_ndstats = nd.get_NDstats('K_c', PIDs, 'c')
x_ndstats = nd.get_NDstats('K_x', PIDs, 'x')
r_ndstats = nd.get_NDstats('K_r', PIDs, 'r')

'c' ND variant¶

c_mje_params = nd.get_params(c_ndstats, 'sc_mje')
c_peakQs, c_peakEs = nd.get_peaks(c_mje_params, PIDs)
print("'c' peak (unadjusted).")
peak_c_y_pred = print_stats(c_peakQs, c_peakEs, fold_rates, c_peakQs)

'c' peak (unadjusted).
Q stats: 0.2524 0.0587 0.2 0.4 0
MAD from peak Qs: 0.0
E range: 4.7971
Corr with exp. fold rate: [-0.384   0.0857]
MSE, R2, adj_R2: [1.0917 0.1475 0.0527]
predicted range: 4.55 2.1069

c_newQs, c_newEs, num_changes = adjust1(reg, PIDs, y_true, c_peakEs, c_peakQs,\
                                      c_ndstats, c_mje_params) 
print("'c' new (adjusted). ", num_changes)
new_c_y_pred = print_stats(c_newQs, c_newEs, fold_rates, c_peakQs)

'c' new (adjusted).  14
Q stats: 0.3476 0.1367 0.1 0.6 1
MAD from peak Qs: 0.12380952380952381
E range: 4.6851
Corr with exp. fold rate: [-7.243e-01  2.000e-04]
MSE, R2, adj_R2: [0.6088 0.5246 0.4718]
predicted range: 4.55 3.3635

'x' ND variant¶

x_mje_params = nd.get_params(x_ndstats, 'sc_mje')
x_peakQs, x_peakEs = nd.get_peaks(x_mje_params, PIDs)
print("'x' peak (unadjusted).")
peak_x_y_pred = print_stats(x_peakQs, x_peakEs, fold_rates, x_peakQs)

'x' peak (unadjusted).
Q stats: 0.3 0.0873 0.1 0.5 0
MAD from peak Qs: 0.0
E range: 4.7393
Corr with exp. fold rate: [-0.3704  0.0984]
MSE, R2, adj_R2: [1.1049 0.1372 0.0413]
predicted range: 4.55 2.0197

x_newQs, x_newEs, num_changes = adjust1(reg, PIDs, y_true, x_peakEs, x_peakQs,\
                                      x_ndstats, x_mje_params) 
print("'x' new (adjusted). ", num_changes)
new_x_y_pred = print_stats(x_newQs, x_newEs, fold_rates, x_peakQs)

'x' new (adjusted).  14
Q stats: 0.3143 0.1283 0.1 0.5 0
MAD from peak Qs: 0.11904761904761907
E range: 4.278
Corr with exp. fold rate: [-0.7901  0.    ]
MSE, R2, adj_R2: [0.4813 0.6242 0.5824]
predicted range: 4.55 3.2042

'r' ND variant¶

r_mje_params = nd.get_params(r_ndstats, 'sc_mje')
r_peakQs, r_peakEs = nd.get_peaks(r_mje_params, PIDs)
print("'r' peak (unadjusted).")
peak_r_y_pred = print_stats(r_peakQs, r_peakEs, fold_rates, r_peakQs)

'r' peak (unadjusted).
Q stats: 0.1667 0.0471 0.1 0.2 0
MAD from peak Qs: 0.0
E range: 2.0637
Corr with exp. fold rate: [-0.2295  0.317 ]
MSE, R2, adj_R2: [ 1.2131  0.0526 -0.0526]
predicted range: 4.55 1.0328

r_newQs, r_newEs, num_changes = adjust1(reg, PIDs, y_true, r_peakEs, r_peakQs,\
                                      r_ndstats, r_mje_params) 
print("'r' new (adjusted). ", num_changes)
new_r_y_pred = print_stats(r_newQs, r_newEs, fold_rates, r_peakQs)

'r' new (adjusted).  12
Q stats: 0.2476 0.1096 0.1 0.4 0
MAD from peak Qs: 0.09047619047619047
E range: 4.448
Corr with exp. fold rate: [-0.5548  0.009 ]
MSE, R2, adj_R2: [0.8864 0.3078 0.2309]
predicted range: 4.55 1.8772

The new $Q$s mainly stay within the ND-TS region, with a few going as far as $Q = 0.6$.
After one adjustment round:
- Pearson correlation between simulated folding rate (ND energies) and experimental folding rate become significant (pvalue < 0.05) for all three ND variants, with coefficients of around -0.72, -0.79 and -0.55 for 'c', 'x', and 'r', respectively.
- r2 scores are 0.52, 0.62 and 0.31 for 'c', 'x', and 'r', respectively.

fig, axes = plt.subplots(1, 3, figsize=(12, 4), constrained_layout=True, sharey=True)
plt.suptitle("Change in Qs, Kamagata", fontsize=14)

ax = axes[0]
ax.scatter(PIDs, c_peakQs, marker='^', alpha=0.5, label='peak')
ax.scatter(PIDs, c_newQs, marker='x', label="new")
ax.set_title("'c' ND variant")
ax.set_xlabel('Proteins'); ax.set_xticks([])
ax.set_ylabel('Q', fontsize=14); ax.legend()

ax = axes[1]
ax.scatter(PIDs, x_peakQs, marker='^', alpha=0.5, label='peak')
ax.scatter(PIDs, x_newQs, marker='x', label="new")
ax.set_title("'x' ND variant")
ax.set_xlabel('Proteins'); ax.set_xticks([])
ax.legend()

ax = axes[2]
ax.scatter(PIDs, r_peakQs, marker='^', alpha=0.5, label='peak')
ax.scatter(PIDs, r_newQs, marker='x', label="new")
ax.set_title("'r' ND variant")
ax.set_xlabel('Proteins'); ax.set_xticks([])
_ = ax.legend()

fig, axes = plt.subplots(2, 3, figsize=(12, 8), constrained_layout=True)
fig.suptitle("Change in Es, Kamagata", fontsize=14)

ax = axes[0][0]
ax.scatter(c_peakEs, y_true, color='tab:blue', alpha=0.5, label='c peak')
ax.plot(c_peakEs, peak_c_y_pred, color='tab:blue', alpha=0.5)
ax.scatter(c_newEs, y_true, marker='.', color='tab:blue', label='c adj')
ax.plot(c_newEs, new_c_y_pred, color='tab:blue')
ax.set_title("'c' ND variant"); ax.set_xlabel("ND energy")
ax.set_ylabel("Predicted fold rate", fontsize=14); ax.legend()

ax = axes[0][1]
ax.scatter(x_peakEs, y_true, color='tab:orange', alpha=0.5, label='x peak')
ax.plot(x_peakEs, peak_x_y_pred, color='tab:orange', alpha=0.5)
ax.scatter(x_newEs, y_true, marker='.', color='tab:orange', label='x adj')
ax.plot(x_newEs, new_x_y_pred, color='tab:orange')
ax.set_title("'x' ND variant"); ax.set_xlabel("ND energy"); ax.legend()

ax = axes[0][2]
ax.scatter(r_peakEs, y_true, color='tab:green', alpha=0.5, label='r peak')
ax.plot(r_peakEs, peak_r_y_pred, color='tab:green', alpha=0.5)
ax.scatter(r_newEs, y_true, marker='.', color='tab:green', label='r adj')
ax.plot(r_newEs, new_r_y_pred, color='tab:green')
ax.set_title("'r' ND variant"); ax.set_xlabel("ND energy"); ax.legend()

ax = axes[1][0]
ax.scatter(c_peakEs, c_newEs, marker='.',color='tab:blue', label='c')
ax.set_xlabel('peak ND energy'); ax.legend()
ax.set_ylabel('adjusted ND energy', fontsize=14)

ax = axes[1][1]
ax.scatter(x_peakEs, x_newEs, marker='.',color='tab:orange', label='x')
ax.set_xlabel('peak ND energy'); ax.legend()

ax = axes[1][2]
ax.scatter(r_peakEs, r_newEs, marker='.', color='tab:green', label='r')
ax.set_xlabel('peak ND energy'); _ = ax.legend()

End of notebook¶

SKLC September 2020