Linear correlation with experimental folding rate

In [1]:
import os
import numpy as np
import pandas as pd
from scipy.stats import pearsonr
from sklearn.metrics import r2_score
from sklearn import linear_model
import matplotlib.pyplot as plt
In [2]:
import NDutils as nd # my lib
  • ND variants:

    • 'r': a contact forms with 0.5 probability, random node selection.
    • 'y': prefer long over short-range contacts, random node selection.
    • 'x': prefer short over long-range contacts, random node selection.
    • 'c': prefer short over long-range contacts, remaining degree node selection.
    • 'a': 'c' with crony (local neighborhood) node selection.
  • EDS variants:

    • 'a': C$_\alpha$-C$_\alpha$ distance
    • 'b': C$_\beta$-C$_\beta$ distance
    • 's': shorter of C$_\alpha$-C$_\alpha$ and C$_\beta$-C$_\beta$ distance
    • 'l': longer of C$_\alpha$-C$_\alpha$ and C$_\beta$-C$_\beta$ distance
    • 'v': average of C$_\alpha$-C$_\alpha$ and C$_\beta$-C$_\beta$ distance
  • Variables:

    • $E$ of ND networks.
    • Contact-Order of ND shortcut networks.
    • Network clustering of ND shortcut networks.
In [3]:
# plot colors for the five ND variants
NDVs = ['c', 'x', 'r', 'a', 'y']
ndv_colors = {'c': 'tab:blue', 'x': 'tab:orange', 'r': 'tab:green', 
              'a': 'tab:red', 'y': 'tab:purple'}
In [4]:
def analyze_E(nd_type, ndstats: dict, mje_params: dict, peakQs: list, peakEs: list,
              y_pred: np.array)->np.array:
    """Inputs: nd_type, ndstats.
    Outputs: mje_params, peakQs, peakEs, y_pred whose shape is (N, 1)
    Returns: list of values.
    """
    mje_params.update(nd.get_params(ndstats, 'sc_mje'))
    peakQs[:], peakEs[:] = nd.get_peaks(mje_params, PIDs)
    print(nd_type, np.mean(peakQs), np.std(peakQs), nd.mjes_range(peakEs))
    x = np.array(peakEs)
    reg.fit(x.reshape(-1,1), y_true)
    # y_pred[:] = reg.predict(x.reshape(-1,1)) # column-vector
    np.copyto(y_pred, reg.predict(x.reshape(-1,1))) # numpy 1.7.0
    print(pearsonr(fold_rates, x), r2_score(fold_rates, y_pred), np.ptp(y_pred))
    print(nd.mra(y_true, y_pred)) # mse, r2, adj_r2
    # follow Table columns in paper    
    c1, c2, c3, c4, c5 = pearsonr(fold_rates, x), np.mean(peakQs), nd.mjes_range(peakEs), \
    r2_score(fold_rates, y_pred), np.ptp(y_pred)
    return np.round([c1[0], c1[1], c2, c3, c4, c5], 3)

def plot_E(plt_title: str): # could choose which to plot
    fig = plt.figure(figsize=(8, 6))
    plt.title(plt_title, fontsize=14)
    plt.scatter(c_peakEs, fold_rates, color=ndv_colors['c'], alpha=0.5)
    plt.plot(c_peakEs, c_y_pred, color=ndv_colors['c'], label="'c' " + str(c_res[0]))
    plt.scatter(x_peakEs, fold_rates, color=ndv_colors['x'], alpha=0.5)
    plt.plot(x_peakEs, x_y_pred, color=ndv_colors['x'], label="'x' " + str(x_res[0]))
    plt.scatter(r_peakEs, fold_rates, color=ndv_colors['r'], alpha=0.5)
    plt.plot(r_peakEs, r_y_pred, color=ndv_colors['r'], label="'r' " + str(r_res[0]))
    plt.scatter(a_peakEs, fold_rates, color=ndv_colors['a'], alpha=0.5)
    plt.plot(a_peakEs, a_y_pred, color=ndv_colors['a'], label="'a' " + str(a_res[0]))    
    plt.scatter(y_peakEs, fold_rates, color=ndv_colors['y'], alpha=0.5)
    plt.plot(y_peakEs, y_y_pred, color=ndv_colors['y'], label="'y' " + str(y_res[0]))    
    plt.xlabel("Peak ND energy $E$", fontsize=14)
    plt.ylabel("Experimental folding rate.", fontsize=14)
    plt.legend(title="ndv corr"); plt.grid()     
    return plt

def fait_analyze_E():
    global c_mje_params, c_peakQs, c_peakEs, c_y_pred, c_res
    c_mje_params = {}; c_peakQs, c_peakEs = [],[]; c_y_pred = np.empty((num_pids, 1))
    c_res = analyze_E('c', c_ndstats, c_mje_params, c_peakQs, c_peakEs, c_y_pred)
    global x_mje_params, x_peakQs, x_peakEs, x_y_pred, x_res
    x_mje_params = {}; x_peakQs, x_peakEs = [],[]; x_y_pred = np.empty((num_pids, 1))
    x_res = analyze_E('x', x_ndstats, x_mje_params, x_peakQs, x_peakEs, x_y_pred)
    global r_mje_params, r_peakQs, r_peakEs, r_y_pred, r_res
    r_mje_params = {}; r_peakQs, r_peakEs = [],[]; r_y_pred = np.empty((num_pids, 1))
    r_res = analyze_E('r', r_ndstats, r_mje_params, r_peakQs, r_peakEs, r_y_pred)
    global a_mje_params, a_peakQs, a_peakEs, a_y_pred, a_res
    a_mje_params = {}; a_peakQs, a_peakEs = [],[]; a_y_pred = np.empty((num_pids, 1))
    a_res = analyze_E('a', a_ndstats, a_mje_params, a_peakQs, a_peakEs, a_y_pred)
    global y_mje_params, y_peakQs, y_peakEs, y_y_pred, y_res
    y_mje_params = {}; y_peakQs, y_peakEs = [],[]; y_y_pred = np.empty((num_pids, 1))
    y_res = analyze_E('y', y_ndstats, y_mje_params, y_peakQs, y_peakEs, y_y_pred)

    vals = np.vstack([c_res, x_res, r_res, a_res, y_res])
    df = pd.DataFrame(vals).assign(ndv = ['c', 'x', 'r', 'a', 'y']).set_index('ndv')
    df.columns = ['corr', 'pval', 'peakQ', 'E_range', 'r2_score', 'pred_range']    
    return df
In [5]:
def analyze_CO(nd_type: chr, CO_type:str):
    """Inputs: nd_type, CO_type.
    CO_type is one of scn0_CO, scn0_RCO and scn0_lnCo and must be a column in a ndstats df.    
    Returns: a tuple of two lists (correl coefficients, p-values).
    """
    ndstats = eval(nd_type + "_ndstats", globals()) # no error checking!
    CO_params = nd.get_params(ndstats, CO_type)
    CO_rhos, CO_pvals = nd.corrs(CO_params, fold_rates, PIDs)
    return CO_rhos, CO_pvals

def plot_CO(plt_title: str):
    fig, axes = plt.subplots(2, 3, figsize=(12, 8), constrained_layout=True)
    plt.suptitle("Contact-Order of ND native shortcuts and exp. fold rate, " + plt_title, 
                 fontsize=14)
    for i, CO_type in enumerate(["scn0_CO", "scn0_lnCO", "scn0_RCO"]): # match col names
        ax = axes[0][i]; axp = axes[1][i]
        ax.set_title(CO_type[:4].upper() + CO_type[4:], fontsize=14)
        for nd_type in NDVs:
            CO_rhos, CO_pvals = analyze_CO(nd_type, CO_type)
            ax.plot(nd.Qs, CO_rhos, '.--', color=ndv_colors[nd_type], label=nd_type)
            axp.plot(nd.Qs, CO_pvals, '.--', color=ndv_colors[nd_type], label=nd_type)        
        ax.set_ylabel("Pearson correlation coefficient", fontsize=14)
        ax.set_xlabel("Q"); ax.set_xticks(nd.Qs)
        ax.legend(); ax.grid()          
        axp.set_ylabel("p-values", fontsize=14)
        # axp.plot([0.1, 1.0], [0.05, 0.05], color='gray', label="pval=0.05")
        axp.set_xlabel("Q", fontsize=14); axp.set_xticks(nd.Qs)
        axp.legend(); axp.grid()
    return plt
In [6]:
def analyze_C(nd_type: chr, C_type: str):
    """Inputs: nd_type, C_type.
    C_type is scn0_C and must be a column in a ndstats df.    
    Returns: a tuple of two lists (correl coefficients, p-values).
    """
    ndstats = eval(nd_type + "_ndstats", globals()) # no error checking!
    C_params = nd.get_params(ndstats, C_type)
    C_rhos, C_pvals = nd.corrs(C_params, fold_rates, PIDs)
    return C_rhos, C_pvals

Uzunoglo

  • 52 protein chains that fold in a two-state manner.
In [7]:
dirname = os.path.join("UZ_ND3")
fn = os.path.join(dirname, 'UZ_frates.txt')
df = nd.load_frates(fn)
PIDs = df.pid.tolist(); num_pids = len(PIDs)
fold_rates = df.k_f.values
print(num_pids)
print("Range of fold rates:", np.ptp(fold_rates))
y_true = fold_rates.reshape(-1, 1)
reg = linear_model.LinearRegression()
52
Range of fold rates: 5.96
In [8]:
# load stats for the five ND variants, dict of df keyed by pid
edv = 'a'
dirn = os.path.join(dirname, "edv_" + edv)
c_ndstats = nd.get_NDstats(dirn, PIDs, 'c', edv)
x_ndstats = nd.get_NDstats(dirn, PIDs, 'x', edv)
r_ndstats = nd.get_NDstats(dirn, PIDs, 'r', edv)
a_ndstats = nd.get_NDstats(dirn, PIDs, 'a', edv)
y_ndstats = nd.get_NDstats(dirn, PIDs, 'y', edv)

ND long-range native shortcuts

  • Compare appearance of long-range native shortcuts (>10 sequence separation) in ND variants.
In [9]:
# dict keyed by (pid, q) -> avg stdev scle per q
c_scle_params = nd.get_params(c_ndstats, 'scn0_le')
x_scle_params = nd.get_params(x_ndstats, 'scn0_le')
r_scle_params = nd.get_params(r_ndstats, 'scn0_le')
a_scle_params = nd.get_params(a_ndstats, 'scn0_le')
y_scle_params = nd.get_params(y_ndstats, 'scn0_le')
In [10]:
import random
In [11]:
# random sample of 10 pids
sample_pids = random.sample(PIDs, 10)

fig, axes = plt.subplots(2, 5, figsize=(20, 8), constrained_layout=True)
fig.suptitle("ND long-range native shortcuts for 10 chains in Uzunoglo dataset.", fontsize=14)

for i, pid in enumerate(sample_pids):
    r = i//5; c = i%5; ax = axes[r][c]
    ax.set_title(pid, fontsize=14)
    avg_scles = [ c_scle_params[(pid, q)][0] for q in nd.Qs ]
    ax.plot(nd.Qs, avg_scles, color=ndv_colors['c'], label='c')
    avg_scles = [ x_scle_params[(pid, q)][0] for q in nd.Qs ]
    ax.plot(nd.Qs, avg_scles, color=ndv_colors['x'], label='x')
    avg_scles = [ r_scle_params[(pid, q)][0] for q in nd.Qs ]
    ax.plot(nd.Qs, avg_scles, color=ndv_colors['r'], label='r') 
    avg_scles = [ a_scle_params[(pid, q)][0] for q in nd.Qs ]
    ax.plot(nd.Qs, avg_scles, color=ndv_colors['a'], label='a')    
    avg_scles = [ y_scle_params[(pid, q)][0] for q in nd.Qs ]
    ax.plot(nd.Qs, avg_scles, color=ndv_colors['y'], label='y')
    ax.set_ylabel("Mean SCLE", fontsize=14); ax.set_xlabel("Q", fontsize=14)
    ax.legend()

# ofn = os.path.join(dirname, "UZ_ndv_SCLEsample_" + edv + ".png")
# plt.savefig(ofn)
  • Compare 'a' 'c' 'x' ND scles; it's clear ND scles appear earlier in 'r' and 'y'.
In [12]:
agc, agx, cgx = [], [], [] # a > c, a > x, c > x
for q in nd.Qs:
    a_avg_scles = np.array([ a_scle_params[(pid, q)][0] for pid in PIDs ])
    c_avg_scles = np.array([ c_scle_params[(pid, q)][0] for pid in PIDs ])
    x_avg_scles = np.array([ x_scle_params[(pid, q)][0] for pid in PIDs ])    
    count = np.sum(a_avg_scles > c_avg_scles)
    frac = count/num_pids; agc.append(frac)
    # print(q, count, frac)
    count = np.sum(a_avg_scles > x_avg_scles)
    frac = count/num_pids; agx.append(frac)
    # print(q, count, frac)
    count = np.sum(c_avg_scles > x_avg_scles)
    frac = count/num_pids; cgx.append(frac)
    # print(q, count, frac); print()

plt.title("Comparison of long-range native shortcuts between ND variants \n (EDS variant '" + 
          edv + "'), Uzunoglo dataset")
plt.plot(nd.Qs, cgx, '.--', label="c > x")
plt.plot(nd.Qs, agx, '+--', color="tab:gray", label="a > x")
plt.plot(nd.Qs, agc, 'x--', color="tab:red", label="a > c")
plt.xlabel("Q"); plt.xticks(nd.Qs); 
plt.ylabel("Proportion of pids"); plt.yticks([0.0] + nd.Qs)
plt.legend(); plt.grid()

# ofn = os.path.join(dirname, "UZ_ndv_SCLE_" + edv + ".png")
# plt.savefig(ofn)
  • Since the proportions ($Q < 1.0$) are all above 0.6, we can say that
    • ND SCLE appear earlier in 'c' than 'x' ND variants (c > x),
    • ND SCLE appear earlier in 'a' than 'x' ND variants (a > x), and
    • ND SCLE appear earlier in 'a' than 'c' ND variants (a > c).
  • ND variants in early to late order of ND SCLE appearance is 'y', 'r', 'a', 'c', 'x', for a typical pid.

ND peak energies and peak Qs

In [13]:
df = fait_analyze_E()
ofn = os.path.join(dirname, "UZ_ndv_fratecorr_" + edv + ".csv")
df.to_csv(ofn)
df
c 0.2384615384615385 0.062492603112584304 4.331000867750622
(-0.7105938238310499, 3.6072970313305357e-09) 0.5049435824668331 4.7446057256074745
(0.996360121167872, 0.504943582466833, 0.4847371980777241)
x 0.25000000000000006 0.06044705248269888 4.3073383215487775
(-0.6995284684532532, 7.951351574866814e-09) 0.489340078176554 4.773488346833786
(1.0277640357010358, 0.489340078176554, 0.4684968160613112)
r 0.17499999999999996 0.04330127018922193 1.9282582276763653
(-0.21449836324481997, 0.12676000867152618) 0.04600954783470668 1.5070378162038274
(1.9200196358402395, 0.046009547834706566, 0.007071162032041611)
a 0.2269230769230769 0.06237413361625713 4.241093244873331
(-0.7267077457611376, 1.066350035925498e-09) 0.5281041477492344 4.655290394542063
(0.9497467195154355, 0.5281041477492343, 0.5088430925553256)
y 0.1423076923076923 0.04940474068717357 1.8660569137746168
(0.06635396404725291, 0.6402356178121927) 0.0044028485447842325 0.38845463066613517
(2.003758083576098, 0.0044028485447840104, -0.03623376988195948)
Out[13]:
corr pval peakQ E_range r2_score pred_range
ndv
c -0.711 0.000 0.238 4.331 0.505 4.745
x -0.700 0.000 0.250 4.307 0.489 4.773
r -0.214 0.127 0.175 1.928 0.046 1.507
a -0.727 0.000 0.227 4.241 0.528 4.655
y 0.066 0.640 0.142 1.866 0.004 0.388
  • ND variant 'a' produces a slightly stronger correlation, and earlier peak $Q$s on average, than both 'c' and 'x'.
  • The results show influence of timing of long-range contacts on peak $Q$ - ealier appearance seem to hasten peak $Q$s.
In [14]:
plt = plot_E("EDS variant '"+ edv + "', Uzunoglo dataset.")
ofn = os.path.join(dirname, "UZ_ndv_fratecorr_" + edv + ".png")
plt.savefig(ofn)
  • A look at folding rate correlation with ND energy by $Q$.
In [15]:
fig, axes = plt.subplots(2, 2, figsize=(10, 8), constrained_layout=True)
plt.suptitle("ND energy and exp. fold rate, EDS variant '" + edv + "', Uzunoglo dataset.")

ax = axes[0][0]; axp = axes[1][0]
for nd_type in ['a', 'c', 'x']:
    mje_params = eval(nd_type + "_mje_params", globals())
    E_rhos, E_pvals = nd.corrs(mje_params, fold_rates, PIDs)    
    ax.plot(nd.Qs, E_rhos, '.--', color=ndv_colors[nd_type], label=nd_type)
    axp.plot(nd.Qs, E_pvals, '.--', color=ndv_colors[nd_type], label=nd_type) 
    res = eval(nd_type + "_res", globals())
    ax.scatter(res[2], res[0], marker='^', s=80, color=ndv_colors[nd_type], 
               label=nd_type + " peak")
ax.set_ylabel("Pearson correlation coefficient")
ax.set_xlabel("Q"); ax.set_xticks(nd.Qs)
ax.legend(); ax.grid()                
axp.set_ylabel("p-values")
axp.set_xlabel("Q"); axp.set_xticks(nd.Qs)
axp.legend(); axp.grid()

ax = axes[0][1]; axp = axes[1][1]
for nd_type in ['r', 'y']:
    mje_params = eval(nd_type + "_mje_params", globals())
    E_rhos, E_pvals = nd.corrs(mje_params, fold_rates, PIDs)    
    ax.plot(nd.Qs, E_rhos, '.--', color=ndv_colors[nd_type], label=nd_type)
    axp.plot(nd.Qs, E_pvals, '.--', color=ndv_colors[nd_type], label=nd_type) 
    res = eval(nd_type + "_res", globals())
    ax.scatter(res[2], res[0], marker='^', s=80, color=ndv_colors[nd_type], 
               label=nd_type + " peak")
ax.set_ylabel("Pearson correlation coefficient")
ax.set_xlabel("Q"); ax.set_xticks(nd.Qs)
ax.legend(); ax.grid()                
axp.set_ylabel("p-values")
axp.set_xlabel("Q"); axp.set_xticks(nd.Qs)
axp.legend(); axp.grid()

# ofn = os.path.join(dirname, "UZ_ndv_fratecorr1_" + edv + ".png")
# plt.savefig(ofn)

Contact-Order of ND native shortcut networks (SCN0)

  • Unlike RCO, but like (absolute) CO, lnCO is not affected by number of nodes $N$.
  • For ND networks, $N$ is the number of nodes with at least one native shortcut attached.
In [16]:
plt = plot_CO("EDS variant '" + edv + "', Uzunoglo dataset")
# ofn = os.path.join(dirname, "UZ_ndv_SCN0CO_" + edv + ".png")
# plt.savefig(ofn)

Network clustering of ND SCN0

In [17]:
fig, axes = plt.subplots(2, 2, figsize=(10, 8), constrained_layout=True)
plt.suptitle("Network clustering $C$ of ND native shortcuts and exp. fold rate, " + 
             "EDS variant '" + edv + "', Uzunoglo dataset")

ax = axes[0][0]; axp = axes[1][0]
for nd_type in ['c', 'x', 'r', 'a']:
    C_rhos, C_pvals = analyze_C(nd_type, "scn0_C")
    ax.plot(nd.Qs, C_rhos, '.--', color=ndv_colors[nd_type], label=nd_type)
    axp.plot(nd.Qs, C_pvals, '.--', color=ndv_colors[nd_type], label=nd_type)
ax.set_ylabel("Pearson correlation coefficient")
ax.set_xlabel("Q"); ax.set_xticks(nd.Qs)
ax.legend(); ax.grid()        
axp.set_ylabel("p-values")
axp.set_xlabel("Q"); axp.set_xticks(nd.Qs)
axp.legend(); axp.grid()
    
# plot 'y' separately because of its dramatically different behavior
ax = axes[0][1]; axp = axes[1][1]; nd_type = 'y'
C_rhos, C_pvals = analyze_C(nd_type, "scn0_C")
ax.plot(nd.Qs, C_rhos, '.--', color=ndv_colors[nd_type], label=nd_type)
axp.plot(nd.Qs, C_pvals, '.--', color=ndv_colors[nd_type], label=nd_type)
ax.set_ylabel("Pearson correlation coefficient")
ax.set_xlabel("Q"); ax.set_xticks(nd.Qs)
ax.legend(); ax.grid()        
axp.set_ylabel("p-values")
axp.set_xlabel("Q"); axp.set_xticks(nd.Qs)
axp.legend(); axp.grid()

# ofn = os.path.join(dirname, "UZ_ndv_SCN0Clus_" + edv + ".png")
# plt.savefig(ofn)
  • ND variants 'a', 'c', and 'x' produce significant (p-value < 0.01) Pearson correlation between $C_{SCN0}$ and exp. fold rate within the ND-TS region ($Q < 0.5$). This correlation is stronger than the correlation with native-state SCN ($Q=1.0$).
  • Within the ND-TS region, the correlation is strongest with 'a' and 'c' ND variants.

Kamagata

  • 20 multi-state protein chains.
In [18]:
dirname = "K_ND3"
fn = os.path.join(dirname, 'K_frates.txt')
df = nd.load_frates(fn)
PIDs = df.pid.tolist(); fold_rates = df.k_f.values

# remove outlier: 1ael [12]
print(PIDs[12], fold_rates[12])
print(PIDs.pop(12))
fold_rates = np.delete(fold_rates, 12)

num_pids = len(PIDs)
print(num_pids)
print("Range of fold rates:", np.ptp(fold_rates))
y_true = fold_rates.reshape(-1, 1)
reg = linear_model.LinearRegression()
1ael 0.88
1ael
20
Range of fold rates: 4.55
In [19]:
# load stats for the five ND variants, dict of df keyed by pid
edv = 'a'
dirn = os.path.join(dirname, "edv_"+edv)
c_ndstats = nd.get_NDstats(dirn, PIDs, 'c', edv)
x_ndstats = nd.get_NDstats(dirn, PIDs, 'x', edv)
r_ndstats = nd.get_NDstats(dirn, PIDs, 'r', edv)
a_ndstats = nd.get_NDstats(dirn, PIDs, 'a', edv)
y_ndstats = nd.get_NDstats(dirn, PIDs, 'y', edv)
In [20]:
df = fait_analyze_E()
# ofn = os.path.join(dirname, "K_ndv_fratecorr_" + edv + ".csv")
# df.to_csv(ofn)
df
c 0.25999999999999995 0.066332495807108 4.2284430914571125
(-0.5524559199054105, 0.0115375232517589) 0.3052075434385336 2.5156105600958947
(0.9327052002164994, 0.30520754343853373, 0.22346725443130244)
x 0.26999999999999996 0.06403124237432847 4.198016621792634
(-0.5542535228757642, 0.011215330762224491) 0.3071969676201953 2.573928594049516
(0.9300345519356366, 0.30719696762019544, 0.22569072851668903)
r 0.18500000000000005 0.035707142142714254 1.6890110343349
(-0.2779788103139703, 0.23535373942124632) 0.07727221898357028 1.301252795661096
(1.2386907652934736, 0.0772722189835704, -0.0312839905477742)
a 0.22000000000000003 0.059999999999999984 4.081307512542651
(-0.5617825478692583, 0.00994408494965854) 0.31559963109047584 2.262539507768007
(0.9187546253325382, 0.31559963109047595, 0.2350819406305319)
y 0.16000000000000003 0.048989794855663564 1.7265352000186733
(-0.09091545949860291, 0.7030567116917189) 0.008265620775842253 0.3820139858904721
(1.3313267926276369, 0.008265620775842364, -0.1084090120740584)
Out[20]:
corr pval peakQ E_range r2_score pred_range
ndv
c -0.552 0.012 0.260 4.228 0.305 2.516
x -0.554 0.011 0.270 4.198 0.307 2.574
r -0.278 0.235 0.185 1.689 0.077 1.301
a -0.562 0.010 0.220 4.081 0.316 2.263
y -0.091 0.703 0.160 1.727 0.008 0.382
In [21]:
plt = plot_E("EDS variant '" + edv + "', Kamagata dataset.")
# ofn = os.path.join(dirname, "K_ndv_fratecorr_" + edv + ".png")
# plt.savefig(ofn)

Paci

  • 10 small proteins that fold in a two-state manner.
In [22]:
dirname = "P_ND3"
fn = os.path.join(dirname, 'P_frates.txt')
df = nd.load_frates(fn)
PIDs = df.pid.tolist(); fold_rates = df.k_f.values
num_pids = len(PIDs)
print(num_pids)
print("Range of fold rates:", np.ptp(fold_rates))
y_true = fold_rates.reshape(-1, 1)
reg = linear_model.LinearRegression()
10
Range of fold rates: 4.328
In [23]:
# load stats for the five ND variants, dict of df keyed by pid
edv = 'a'
dirn = os.path.join(dirname, "edv_" + edv)
c_ndstats = nd.get_NDstats(dirn, PIDs, 'c', edv)
x_ndstats = nd.get_NDstats(dirn, PIDs, 'x', edv)
r_ndstats = nd.get_NDstats(dirn, PIDs, 'r', edv)
a_ndstats = nd.get_NDstats(dirn, PIDs, 'a', edv)
y_ndstats = nd.get_NDstats(dirn, PIDs, 'y', edv)
In [24]:
df = fait_analyze_E()
# ofn = os.path.join(dirname, "P_ndv_fratecorr_" + edv + ".csv")
# df.to_csv(ofn)
df
c 0.26 0.04898979485566354 1.8711453949497967
(-0.8326901471750437, 0.0027867362980185874) 0.6933728812023958 3.433472881557701
(0.4487325647194381, 0.6933728812023958, 0.605765132974509)
x 0.25000000000000006 0.04999999999999999 2.2487040312412834
(-0.8955232013098765, 0.0004587141873736154) 0.8019618040842895 3.4530118250572412
(0.2898184215216963, 0.8019618040842895, 0.7453794623940866)
r 0.19 0.03 1.4174191962184999
(-0.05563325121177174, 0.8786782256482984) 0.0030950586403921676 0.22256014582324868
(1.4589176354393387, 0.0030950586403921676, -0.2817349246052101)
a 0.25000000000000006 0.04999999999999999 1.84833757867314
(-0.8479417111836289, 0.0019386037169391614) 0.7190051455650203 3.4822538727884833
(0.4112211020278447, 0.7190051455650203, 0.6387209014407405)
y 0.15000000000000005 0.05000000000000001 1.621999990733038
(0.2567066594223849, 0.47401892205094687) 0.06589830899180027 1.0028241139338674
(1.367008401470029, 0.06589830899180027, -0.20098788843911386)
Out[24]:
corr pval peakQ E_range r2_score pred_range
ndv
c -0.833 0.003 0.26 1.871 0.693 3.433
x -0.896 0.000 0.25 2.249 0.802 3.453
r -0.056 0.879 0.19 1.417 0.003 0.223
a -0.848 0.002 0.25 1.848 0.719 3.482
y 0.257 0.474 0.15 1.622 0.066 1.003
In [25]:
plt = plot_E("EDS variant '" + edv + "', Paci dataset")
# ofn = os.path.join(dirname, "P_ndv_fratecorr_" + edv + ".png")
# plt.savefig(ofn)
In [26]:
print("EDV", edv)
print('a', a_peakQs)
print('c', c_peakQs)
print('x', x_peakQs)
print('r', r_peakQs)
print('y', y_peakQs)
EDV a
a [0.2, 0.3, 0.3, 0.3, 0.3, 0.3, 0.2, 0.2, 0.2, 0.2]
c [0.2, 0.3, 0.3, 0.3, 0.3, 0.3, 0.2, 0.2, 0.2, 0.3]
x [0.2, 0.3, 0.3, 0.3, 0.3, 0.3, 0.2, 0.2, 0.2, 0.2]
r [0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.1, 0.2, 0.2]
y [0.2, 0.1, 0.2, 0.2, 0.2, 0.2, 0.1, 0.1, 0.1, 0.1]
In [27]:
peakQs = np.vstack([a_peakQs, c_peakQs, x_peakQs, r_peakQs, y_peakQs]).T
df = pd.DataFrame(peakQs)
df.columns = ['a', 'c', 'x', 'r', 'y']
df.mean()
Out[27]:
a    0.25
c    0.26
x    0.25
r    0.19
y    0.15
dtype: float64
In [28]:
ax = df.boxplot(showmeans=True)
ax.set_title("EDS variant " + edv + ", Paci dataset", fontsize=14)
ax.set_ylabel("peak Q", fontsize=14)
ax.set_xlabel("ND variant", fontsize=14)
_ = ax.set_xticklabels(df.columns.values, fontsize=14)
In [29]:
plt = plot_CO("EDS variant '" + edv + "', Paci dataset")
# ofn = os.path.join(dirname, "Paci_ndv_SCN0CO_" + edv + ".png")
# plt.savefig(ofn)

SingleDomain

  • no exp. fold rates: ignore corr, r2_score, simulated and predicted fold rates, only interested in peak Q.
In [30]:
dirname = "SingleDomain_ND3"
PIDs = [ "2f4k", "1bdd", "2abd_0", "1gb1_0", "1mi0_swissmin", "1mhx_swissmin", "2ptl_0", 
        "1shg", "1srm_1", "2ci2", "1aps_0", "2kjv_0", "2kjw_0", "1qys" ]

fold_rates = np.arange(0.1, 1.5, step=0.1) # dummy fold rates

num_pids = len(PIDs)
y_true = fold_rates.reshape(-1, 1)
reg = linear_model.LinearRegression()
In [31]:
# load stats for the five ND variants, dict of df keyed by pid
edv = 'a'
dirn = os.path.join(dirname, "edv_" + edv)
c_ndstats = nd.get_NDstats(dirn, PIDs, 'c', edv)
x_ndstats = nd.get_NDstats(dirn, PIDs, 'x', edv)
r_ndstats = nd.get_NDstats(dirn, PIDs, 'r', edv)
a_ndstats = nd.get_NDstats(dirn, PIDs, 'a', edv)
y_ndstats = nd.get_NDstats(dirn, PIDs, 'y', edv)
In [32]:
df = fait_analyze_E() # fake exp fold rate, called to get peakQs and peak Es
c 0.23571428571428574 0.0610285981808395 4.095552512275916
(0.6792993576922217, 0.007540343723719449) 0.4614476173610652 1.000104854847991
(0.08751476217882694, 0.4614476173610652, 0.36352900233580443)
x 0.2714285714285714 0.06998542122237651 3.9818323959605197
(0.6759930311785395, 0.007954368267447755) 0.4569665782019502 0.9045842378077681
(0.08824293104218313, 0.4569665782019502, 0.358233228784123)
r 0.15714285714285717 0.049487165930539354 1.3672986676123369
(0.34952298610103605, 0.2205877684299158) 0.12216631781298515 0.4275490738146568
(0.14264797335538998, 0.12216631781298515, -0.037439806221017724)
a 0.21428571428571433 0.05150787536377127 3.924462990312584
(0.6498605938248719, 0.01187919428494625) 0.42231879140641515 0.9172420902196368
(0.09387319639645757, 0.42231879140641515, 0.3172858443893998)
y 0.14285714285714285 0.049487165930539354 1.4478470070259966
(0.10046088777912489, 0.73256862395031) 0.01009238997337003 0.13163148451662998
(0.16085998662932743, 0.01009238997337003, -0.1698908118496536)
In [33]:
print("EDV", edv)
print('a', a_peakQs)
print('c', c_peakQs)
print('x', x_peakQs)
print('r', r_peakQs)
print('y', y_peakQs)
EDV a
a [0.1, 0.2, 0.2, 0.2, 0.2, 0.2, 0.3, 0.2, 0.2, 0.3, 0.3, 0.2, 0.2, 0.2]
c [0.1, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3, 0.2, 0.2, 0.3, 0.3, 0.3, 0.2, 0.2]
x [0.1, 0.4, 0.3, 0.2, 0.3, 0.3, 0.3, 0.3, 0.2, 0.3, 0.3, 0.3, 0.3, 0.2]
r [0.1, 0.2, 0.2, 0.2, 0.1, 0.1, 0.2, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.1]
y [0.1, 0.2, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.1]
In [34]:
peakQs = np.vstack([a_peakQs, c_peakQs, x_peakQs, r_peakQs, y_peakQs]).T
df = pd.DataFrame(peakQs)
df.columns = ['a', 'c', 'x', 'r', 'y']
df.mean()
Out[34]:
a    0.214286
c    0.235714
x    0.271429
r    0.157143
y    0.142857
dtype: float64
In [35]:
ax = df.boxplot(showmeans=True)
ax.set_title("EDS variant " + edv, fontsize=14)
ax.set_ylabel("peak Q", fontsize=14)
ax.set_xlabel("ND variant", fontsize=14)
_ = ax.set_xticklabels(df.columns.values, fontsize=14)
In [36]:
plt.plot(PIDs, c_peakEs, label='c')
plt.plot(PIDs, x_peakEs, label='x')
plt.plot(PIDs, r_peakEs, label='r')
plt.plot(PIDs, a_peakEs, label='a')
plt.plot(PIDs, y_peakEs, label='y')
plt.xticks(PIDs, [ p[:4] for p in PIDs ], rotation=90)
_ = plt.legend()

End of notebook

SKLC September 2020