Calculated phi-values from native-state (NS) and transition-state (TS) PRNs.

  • TS protein residue networks are PRNs built from Transition-State ensembles in the Paci dataset.
In [1]:
import os
import numpy as np
import pandas as pd
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
from scipy.stats import ttest_rel # samples have same variance, paired 
from scipy.stats import ttest_ind_from_stats as tts # different variance, paired?
# check with R, use CI
In [2]:
# import matplotlib
# matplotlib.use('AGG')
# https://matplotlib.org/tutorials/introductory/usage.html?highlight=saving%20figures%20file
In [3]:
Qs =[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
dirname = "Paci"
PIDs = ["1imq", "1lmb4", "1bf4", "2ptl", "2ci2", "1aps", "1fmk", "1bk2", "1shf", "1ten"]

Long-range contacts in TSEs for residues with experimental phi-values

In [4]:
pid = "2ci2"
fn0 = os.path.join(dirname, pid + "_nodestats.out") # Native-state (PRN0) node statistics
df0 = pd.read_table(fn0, sep="\s+") # nid exp_phi deg0 cent0
df0.head()
Out[4]:
nid exp_phi deg0 cent0
0 0 -1.00 1 147
1 1 -1.00 5 183
2 2 0.00 10 308
3 3 0.05 11 244
4 4 -1.00 8 178
In [5]:
dp = df0.query("exp_phi > -1")[['nid', 'exp_phi']]
nids = dp.nid.values # nids with phi-values
print(len(nids))
37
In [6]:
fn1 = os.path.join(dirname, pid + "_ts_nodestats2.out")
df1 = pd.read_table(fn1, sep=' ')
df1.head()
Out[6]:
ts_pid nid le scle scn_deg
0 2073 0 0 0 0
1 2073 1 2 0 0
2 2073 2 1 0 0
3 2073 3 3 0 0
4 2073 4 0 0 0
In [7]:
dg = df1.groupby('nid')
len(dg)
Out[7]:
65
In [8]:
dg.le.mean() # average number of long-range PRN edges in TSE for a pid
Out[8]:
nid
0     0.000000
1     0.282609
2     0.298913
3     0.521739
4     0.146739
        ...   
60    1.119565
61    0.739130
62    1.152174
63    0.750000
64    0.717391
Name: le, Length: 65, dtype: float64
In [9]:
fig, axes = plt.subplots(4, 3, figsize=(12, 16), constrained_layout=True)
plt.suptitle("Average number of long-range edges and shortcuts per node in TSE PRNs.")
plt.delaxes(axes[3][1])
plt.delaxes(axes[3][2])

for i, pid in enumerate(PIDs):
    fn0 = os.path.join(dirname, pid + "_nodestats.out")
    df0 = pd.read_table(fn0, sep="\s+") # nid exp_phi deg0 cent0
    # df0.head()
    dp = df0.query("exp_phi > -1")[['nid', 'exp_phi']]
    nids = dp.nid.values
    fn1 = os.path.join(dirname, pid + "_ts_nodestats2.out")
    df1 = pd.read_table(fn1, sep=' ')
    # df1.head()
    dg = df1.groupby('nid')
    dle = dg.le.mean() # mean le per node
    dscle = dg.scle.mean() # mean scle per node
    dphi = dle.iloc[nids]
    r = i//3; c = i%3; ax = axes[r][c]
    ax.plot(dle.index.values+1, dle.values, label="Long-range edge in TSE PRN0")
    ax.plot(dscle.index.values+1, dscle.values, label="Long-range shortcut in TSE SCN0")    
    ax.scatter(dphi.index.values+1, dphi.values, label="Residue with experimenal phi-value")
    dphi = dscle.iloc[nids]
    ax.scatter(dphi.index.values+1, dphi.values, label="Residue with experimental phi-value")
    ax.plot(dle.index.values, [1.0]*len(dle.index.values), linewidth=0.8, color="tab:gray",
           label="y=1")
    ax.set_title(pid); ax.set_xlabel("Node")
    if 0 == c:
        ax.set_ylabel("Average number of contacts per node")

h, l = axes[0][0].get_legend_handles_labels()
_ = fig.legend(h, l, loc="upper left", bbox_to_anchor=(0.4, 0.2))

# plt.savefig("TSE_scle.png")

Calculate phi-values from NS and TSE PRNs

  • Node statistics: degree and (EDS path) centrality/betweenness.
  • Both normalized by dividing with maximum value of a sequence. Interpretation of normalized values as importance of a node relative to other nodes in the network, with larger values having more importance.
  • Experimental phi-values are in [0.0, 1.0].
  • Normalization differs from the usual interpretation of experimental phi-values as the extent to which a node's native contacts are formed at the transition-state, i.e. (TS node degree) / (NS node degree).
    • Cannot use this formulation for NS node statistics.
    • Does not work readily with centrality because TS and ND node centrality can be larger than NS node centrality; resultant range is not bounded in [0.0, 1.0].
      • Could scale the resultant range for each structure, or compute node centrality values as the fraction of all paths.
    • Implication of dividing by a constant (per node) across all $Q$s for ND node statistics is examined in another notebook.
In [10]:
pid = "1imq"
fn0 = os.path.join(dirname, pid + "_nodestats.out")
df0 = pd.read_table(fn0, sep="\s+") # nid exp_phi deg0 cent0
df0.head()
Out[10]:
nid exp_phi deg0 cent0
0 0 -1.0 1 237
1 1 -1.0 2 185
2 2 -1.0 14 480
3 3 -1.0 13 307
4 4 -1.0 9 243
In [11]:
fn1 = os.path.join(dirname, pid + "_ts_nodestats.out")
df1 = pd.read_table(fn1, sep="\s+", header=1) # nid deg_avg cent_avg  
df1.head()
Out[11]:
nid deg_avg cent_avg
0 0 1.0000 226.375
1 1 1.3750 261.063
2 2 4.2500 613.813
3 3 4.6875 575.938
4 4 3.6875 393.250
In [12]:
c_norm = df1.cent_avg.values/df0.cent0.values
print(min(c_norm), max(c_norm))
0.9105477855477856 2.844799043062201
In [13]:
# The two ways of normalizing TS node degree are at best weakly correlated.
# Range of calculated phi-values is wider with normalization with max. 
for pid in PIDs:
    fn0 = os.path.join(dirname, pid + "_nodestats.out")
    df0 = pd.read_table(fn0, sep="\s+") # nid exp_phi deg0 cent0
    fn1 = os.path.join(dirname, pid + "_ts_nodestats.out")
    df1 = pd.read_table(fn1, sep="\s+", header=1) # nid deg_avg cent_avg      
    m = df1.deg_avg.max()     # normalization with max
    d_norm = df1.deg_avg.values/m
    print(min(d_norm), max(d_norm), end='\t')    
    d_norm2 = df1.deg_avg.values/df0.deg0.values # normalization with NS statistic
    print(min(d_norm2), max(d_norm2))
    print(pid, pearsonr(d_norm, d_norm2))
0.05734767025089606 1.0	0.2801724137931034 1.0
1imq (-0.215831042712451, 0.04595175252214858)
0.1443785648262957 1.0	0.36712625 1.0
1lmb4 (-0.10076319972025095, 0.33922007933052833)
0.07028480762647145 1.0	0.13344625000000002 1.0
1bf4 (0.5167851780080396, 1.4501023914945505e-05)
0.1661738537437799 1.0	0.21164037037037037 0.8565307692307692
2ptl (0.27464193460545694, 0.028073860428368513)
0.0698558175924891 1.0	0.125 1.0
2ci2 (0.4084987349365345, 0.0007296363371582195)
0.1266664088894474 1.0	0.3264366666666667 0.8817714285714285
1aps (0.571450279811207, 7.988652162868365e-10)
0.07797334576679488 1.0	0.12040799999999999 0.9954633333333334
1fmk (0.5834920292623135, 1.9015876522821423e-06)
0.1546185688941455 1.0	0.35483857142857145 0.9239642857142857
1bk2 (0.4972804327321004, 8.312062657586012e-05)
0.066288125 1.0	0.1991342857142857 0.9886362499999999
1shf (0.49544197348405283, 6.617512094727042e-05)
0.09993067309553998 1.0	0.09411764705882353 0.9904757142857143
1ten (0.49630689286424917, 7.567563131927387e-07)
In [14]:
# inputs: exp_phi, ns_deg0, ns_cent0, tse_deg_avg, tse_cent_avg
# output: degcent_normed.out file for each pid contains normed deg_avg and cent_avg
corr_ns, corr_ts = [], [] # deg0~cent0 and deg_avg~cent_avg correlations
for pid in PIDs:
    print(pid)
    fn0 = os.path.join(dirname, pid + "_nodestats.out")
    df0 = pd.read_table(fn0, sep="\s+") # nid exp_phi deg0 cent0
    r = pearsonr(df0.deg0.values, df0.cent0.values)
    print(r); corr_ns.append(r[0])
    df0 = df0.assign(deg0_norm = df0.deg0.values/df0.deg0.max())
    df0 = df0.assign(cent0_norm = df0.cent0.values/df0.cent0.max())
    
    fn1 = os.path.join(dirname, pid + "_ts_nodestats.out")
    df1 = pd.read_table(fn1, sep="\s+", header=1) # nid deg_avg cent_avg  
    r = pearsonr(df1.deg_avg.values, df1.cent_avg.values)
    print(r); corr_ts.append(r[0])
    m = df1.deg_avg.max()
    df1 = df1.assign(deg_avg_norm = df1.deg_avg.values/m)
    df1 = df1.assign(deg_avg_norm2 = df1.deg_avg.values/df0.deg0.values)
    df1 = df1.assign(cent_avg_norm = df1.cent_avg.values/df1.cent_avg.max())    

    df = df0.join(df1, rsuffix="_ts")    
    fn2 = os.path.join(dirname, pid + "_degcent_normed.out")
    df.to_csv(fn2, sep=' ', index=False)
1imq
(0.7558969268804924, 4.012079504672762e-17)
(0.7121122380072538, 1.502821945881671e-14)
1lmb4
(0.734286145453802, 8.182254971540257e-17)
(0.42777376163150493, 2.1066871340310242e-05)
1bf4
(0.8331380770622324, 2.4725153462041907e-17)
(0.2141558155358244, 0.09191415825395269)
2ptl
(0.8089544527164008, 6.138400200971352e-16)
(0.761844808708787, 2.660766356539596e-13)
2ci2
(0.7242935697029357, 9.16428859268319e-12)
(0.6412812418394419, 8.657914734671257e-09)
1aps
(0.832243017371372, 2.485047464677856e-26)
(0.7188158008970563, 7.790675718849913e-17)
1fmk
(0.7790290667014961, 9.608908036252283e-13)
(0.4358500966932449, 0.0007018681735138559)
1bk2
(0.8427664795977194, 2.0250960611567958e-16)
(0.7072244406109104, 7.773380930867111e-10)
1shf
(0.8576796838781358, 4.168120250965983e-18)
(0.4780795256370186, 0.00012841377932345095)
1ten
(0.7237901178849055, 1.1167776049285434e-15)
(0.6715019435547845, 5.921829012607581e-13)
In [15]:
fn = os.path.join(dirname, pid + "_degcent_normed.out")
print(fn)
df = pd.read_table(fn, sep="\s+")
df.head()
Paci\1ten_degcent_normed.out
Out[15]:
nid exp_phi deg0 cent0 deg0_norm cent0_norm nid_ts deg_avg cent_avg deg_avg_norm deg_avg_norm2 cent_avg_norm
0 0 0.04 17 220 0.739130 0.307263 0 1.60000 266.800 0.099931 0.094118 0.286584
1 1 -1.00 15 242 0.652174 0.337989 1 2.77778 286.344 0.173491 0.185185 0.307577
2 2 -1.00 16 295 0.695652 0.412011 2 4.35556 552.244 0.272034 0.272222 0.593194
3 3 0.04 19 399 0.826087 0.557263 3 4.25556 425.178 0.265788 0.223977 0.456706
4 4 -1.00 11 287 0.478261 0.400838 4 5.40000 475.900 0.337266 0.490909 0.511189
In [16]:
print(pearsonr(df.deg0.values, df.deg_avg.values))
print(pearsonr(df.cent0.values, df.cent_avg.values))
print(pearsonr(df.deg0_norm.values, df.deg_avg_norm.values))
print(pearsonr(df.cent0_norm.values, df.cent_avg_norm.values))
print(pearsonr(df.deg_avg_norm.values, df.deg_avg_norm2.values))
(0.6347314241420272, 2.398485663616192e-11)
(0.8097733568515959, 7.498918886312766e-22)
(0.6347314241420272, 2.398485663616192e-11)
(0.809773356851596, 7.498918886312611e-22)
(0.4963068928642492, 7.567563131927356e-07)
In [17]:
fig, axes = plt.subplots(4, 3, figsize=(12, 18), constrained_layout=True)
plt.suptitle("Normalized node degree and centrality from NS PRNs.")
plt.delaxes(axes[3, 1])
plt.delaxes(axes[3, 2])

for i, pid in enumerate(PIDs):    
    r = i//3; c = i%3; ax = axes[r][c]
    fn = os.path.join(dirname, pid + "_degcent_normed.out")
    df = pd.read_table(fn, sep="\s+")
    de = df.query("exp_phi > -1")
    mad_d = np.round(np.mean(np.abs(de.deg0_norm.values - de.exp_phi.values)), 2)
    mad_c = np.round(np.mean(np.abs(de.cent0_norm.values - de.exp_phi.values)), 2)
    print(pid, mad_d, mad_c)
    # print(pearsonr(de.deg0_norm.values, de.exp_phi.values))
    # print(pearsonr(de.cent0_norm.values, de.exp_phi.values))
    ax.plot(df.nid.values+1, df.deg0_norm.values, color="tab:blue", alpha=0.5, 
            label='NS deg ' + str(mad_d))
    ax.plot(df.nid.values+1, df.cent0_norm.values, color="tab:orange", alpha=0.5, 
            label='NS cent '  + str(mad_c))    
    ax.plot(de.nid.values+1, de.exp_phi.values, "o--g", alpha=0.5, label='exp_phi')
    ax.set_title(pid + ' ' + str(np.round(corr_ns[i], 2)), loc='left')
    ax.set_xlabel('Residue'); ax.set_yticks(np.arange(0, 1.1, 0.1))
    ax.legend(loc="lower left", bbox_to_anchor=(0.5, 1.0))

# plt.savefig("NS_degcentphi.png")
1imq 0.48 0.44
1lmb4 0.34 0.3
1bf4 0.49 0.41
2ptl 0.35 0.29
2ci2 0.34 0.32
1aps 0.37 0.3
1fmk 0.45 0.43
1bk2 0.35 0.35
1shf 0.51 0.44
1ten 0.41 0.36
  • Figure above: Calculated phi-values as normalized node degree (deg) and as normalized node centrality (cent) of native-state (NS) PRNs. The numbers in the legend is the mean absolute difference between experimental phi-values with deg and cent respectively. The number in the title of each subplot is the Pearson correlation coefficient between un-normalized NS node degree and node centrality.
In [18]:
fig, axes = plt.subplots(4, 3, figsize=(12, 18), constrained_layout=True)
plt.suptitle("Normalized node degree and centrality computed from TSE PRNs.")
plt.delaxes(axes[3, 1])
plt.delaxes(axes[3, 2])

for i, pid in enumerate(PIDs):    
    r = i//3; c = i%3; ax = axes[r][c]
    fn = os.path.join(dirname, pid + "_degcent_normed.out")
    df = pd.read_table(fn, sep="\s+")
    de = df.query("exp_phi > -1")
    mad_d = np.round(np.mean(np.abs(de.deg_avg_norm.values - de.exp_phi.values)), 2)
    mad_d2 = np.round(np.mean(np.abs(de.deg_avg_norm2.values - de.exp_phi.values)), 2)
    mad_c = np.round(np.mean(np.abs(de.cent_avg_norm.values - de.exp_phi.values)), 2)
    print(pid, mad_d, mad_d2, mad_c)    
    # print(pearsonr(de.deg_avg_norm.values, de.exp_phi.values))
    # print(pearsonr(de.cent_avg_norm.values, de.exp_phi.values))    
    ax.plot(df.nid.values+1, df.deg_avg_norm.values, color="tab:blue", alpha=0.5, 
           label='TSE deg ' + str(mad_d))
    ax.plot(df.nid.values+1, df.deg_avg_norm2.values, color="tab:gray", alpha=0.75, 
           label='TSE deg2 ' + str(mad_d2))
    ax.plot(df.nid.values+1, df.cent_avg_norm.values, color="tab:orange", alpha=0.5,
           label='TSE cent '  + str(mad_c))
    ax.plot(de.nid.values+1, de.exp_phi.values, "o--g", alpha=0.5, label='exp_phi')    
    ax.set_title(pid + ' ' + str(np.round(corr_ts[i], 2)), loc='left')
    ax.set_xlabel('Residue'); ax.set_yticks(np.arange(0, 1.1, 0.1))    
    ax.legend(loc="lower left", bbox_to_anchor=(0.5, 1.0))

# plt.savefig("TSE_deg2centphi.png")
1imq 0.46 0.25 0.41
1lmb4 0.24 0.21 0.34
1bf4 0.35 0.4 0.37
2ptl 0.29 0.29 0.34
2ci2 0.28 0.36 0.29
1aps 0.26 0.35 0.26
1fmk 0.3 0.28 0.42
1bk2 0.28 0.31 0.26
1shf 0.3 0.22 0.43
1ten 0.3 0.32 0.4
  • Figure above: Calculated phi-values as normalized node degree (deg, deg2), and as normalized node centrality (cent) of transition-state (TSE) PRNs. TSE node statistics are ensemble averages. The numbers in the legend is the mean absolute difference between experimental phi-values with deg, deg2, and cent respectively. The number in the title of each subplot is the Pearson correlation coefficient between un-normalized TSE node degree and TSE node centrality.

Local node centrality

  • Set of paths to compute local node centrality is restricted to those involving the candidate initial fold substructure with the highest substructure centrality.
In [19]:
fn1 = os.path.join(dirname, pid + "_locent.out") # local cent
print(fn1)
df1 = pd.read_table(fn1, sep="\s+", header=2)     
df1.head()
Paci\1ten_locent.out
Out[19]:
nid walk_locent path_locent
0 0 22 22
1 1 23 23
2 2 18 18
3 3 35 35
4 4 23 23
In [20]:
np.all(df1.walk_locent.values == df1.path_locent.values)
Out[20]:
False
In [21]:
for pid in PIDs:
    fn0 = os.path.join(dirname, pid + "_deg0.out")
    df0 = pd.read_table(fn0, sep="\s+")
    # df0.head()
    fn1 = os.path.join(dirname, pid + "_locent.out") # local cent
    df1 = pd.read_table(fn1, sep="\s+", header=2)     
    # df1.head()    
    m = df1.walk_locent.max()    
    wloc = df1.walk_locent.values/m
    m = df1.path_locent.max()    
    ploc = df1.path_locent.values/m    
    df = pd.DataFrame({"nid": df1.nid.values, "exp_phi": df0.exp_phi.values,
                       "wloc_norm": wloc, "ploc_norm": ploc})
    # df.head()
    fn2 = os.path.join(dirname, pid + "_locent_normed.out")
    df.to_csv(fn2, sep=' ', index=False)
In [22]:
fn2 = os.path.join(dirname, pid + "_locent_normed.out")
df1 = pd.read_table(fn2, sep="\s+")
df1.head()
Out[22]:
nid exp_phi wloc_norm ploc_norm
0 0 0.04 0.060606 0.060606
1 1 -1.00 0.063361 0.063361
2 2 -1.00 0.049587 0.049587
3 3 0.04 0.096419 0.096419
4 4 -1.00 0.063361 0.063361
In [23]:
fig, axes = plt.subplots(4, 3, figsize=(12, 18), constrained_layout=True)
plt.suptitle("Normalized local node centrality computed from NS PRN.")
plt.delaxes(axes[3, 1])
plt.delaxes(axes[3, 2])

for i, pid in enumerate(PIDs):    
    r = i//3; c = i%3; ax = axes[r][c]
    fn2 = os.path.join(dirname, pid + "_locent_normed.out")
    df1 = pd.read_table(fn2, sep="\s+")
    # df1.head()
    de = df1.query("exp_phi > -1")    
    mad_w = np.round(np.mean(np.abs(de.wloc_norm.values - de.exp_phi.values)), 2)
    mad_p = np.round(np.mean(np.abs(de.ploc_norm.values - de.exp_phi.values)), 2)    
    print(pid, mad_w, pearsonr(de.wloc_norm.values, de.exp_phi.values))
    print(pid, mad_p, pearsonr(de.ploc_norm.values, de.exp_phi.values))
    # wloc and ploc results are pretty much the same (backtrack little effect on locent)
    # use ploc
    ax.plot(df1.nid.values+1, df1.ploc_norm.values, color="tab:orange", alpha=0.5, 
            label='NS locent ' + str(mad_p))
    ax.plot(de.nid.values+1, de.exp_phi.values, "o--g", alpha=0.5, label='exp_phi')
    ax.set_title(pid, loc='left'); ax.set_xlabel('Residue')
    ax.set_yticks(np.arange(0, 1.1, 0.1))
    ax.legend(loc="lower left", bbox_to_anchor=(0.5, 1.0)) 

# plt.savefig("NS_locentphi.png")
1imq 0.33 (-0.025648230668538403, 0.9145241387030871)
1imq 0.33 (-0.025604563664157774, 0.9146691555757938)
1lmb4 0.5 (-0.5171574690974813, 0.23457821660661568)
1lmb4 0.5 (-0.5171574690974814, 0.23457821660661554)
1bf4 0.2 (0.4201774365017078, 0.0931032650714083)
1bf4 0.2 (0.42015242250513896, 0.09312477391591585)
2ptl 0.19 (0.5155735492095647, 0.0002455063634980463)
2ptl 0.19 (0.5124972428212422, 0.00027126442922180183)
2ci2 0.24 (-0.07821318144363226, 0.6454234991171355)
2ci2 0.24 (-0.09375632368399073, 0.5809883661597818)
1aps 0.22 (0.2567557505075375, 0.22584744820397268)
1aps 0.22 (0.24984368581197983, 0.239025239460789)
1fmk 0.3 (0.2382363823097531, 0.23143884215584937)
1fmk 0.3 (0.24128985526371682, 0.22533404197732862)
1bk2 0.17 (0.8243988135527502, 0.0009713195061109973)
1bk2 0.15 (0.8448885886452391, 0.0005417040867987477)
1shf 0.24 (0.4413417132768633, 0.15092434355704548)
1shf 0.24 (0.4408792947413657, 0.15140322582866247)
1ten 0.16 (0.5633977326697418, 0.002726431719489984)
1ten 0.16 (0.5626012840233487, 0.0027729402229222312)
  • Figure above: Calculated phi-values as normalized local node centrality (locent) of native-state (NS) PRNs. The number in the legend is the mean absolute difference between experimental phi-values and locent.

Summary

In [24]:
# MAD values
ns_deg = np.array([0.48, 0.34, 0.49, 0.35, 0.34, 0.37, 0.45, 0.35, 0.51, 0.41])
ns_cent = np.array([0.44, 0.3, 0.41, 0.29, 0.32, 0.3, 0.43, 0.35, 0.44, 0.36])
ns_locent = np.array([0.33, 0.5, 0.2, 0.19, 0.24, 0.22, 0.3, 0.15, 0.24, 0.16])

ts_deg = np.array([0.46, 0.24, 0.35, 0.29, 0.28, 0.26, 0.3, 0.28, 0.3, 0.3])
ts_deg2 = np.array([0.25, 0.21, 0.4, 0.29, 0.36, 0.35, 0.28, 0.31, 0.22, 0.32])
ts_cent = np.array([0.41, 0.34, 0.37, 0.34, 0.29, 0.26, 0.42, 0.26, 0.43, 0.4])
In [25]:
print("ns_deg:", ns_deg.mean(), ns_deg.std(ddof=1))
print("ns_cent:", ns_cent.mean(), ns_cent.std(ddof=1))
print("ns_locent:", ns_locent.mean(), ns_locent.std(ddof=1)) # larger std than the others!
print()
print("ts_deg:", ts_deg.mean(), ts_deg.std(ddof=1))
print("ts_deg2:", ts_deg2.mean(), ts_deg2.std(ddof=1))
print("ts_cent:", ts_cent.mean(), ts_cent.std(ddof=1))
ns_deg: 0.409 0.06789698078707182
ns_cent: 0.364 0.06131883886702357
ns_locent: 0.253 0.10360716405946283

ts_deg: 0.30599999999999994 0.06131883886702358
ts_deg2: 0.29900000000000004 0.061544924874255696
ts_cent: 0.352 0.0647731082746193
In [26]:
fig, axes = plt.subplots(2, 2, figsize=(9, 8), constrained_layout=True, sharey=True)
plt.suptitle("MAD between calc. and exp. $\phi$", fontsize=14)

ax = axes[0][0]
ax.plot(PIDs, ns_deg, 'o--', color='tab:purple', label='NS deg ' + str(0.409))
ax.plot(PIDs, ns_cent, 'o-', color='tab:purple', label='NS cent ' + str(0.364))
ax.plot(PIDs, ns_locent, 'o-', color='tab:pink', label='NS locent ' + str(0.253))
ax.set_title("Native State")
ax.set_xticks(PIDs); ax.set_xticklabels(PIDs, rotation=90)
ax.set_ylabel("MAD from exp_phi", fontsize=14)
ax.legend(loc="upper left", bbox_to_anchor=(0.27, 1.0)); ax.grid()

ax = axes[0][1]
ax.plot(PIDs, ts_deg, 'o--', color='tab:olive', label='TSE deg ' + str(0.306))
ax.plot(PIDs, ts_deg2, 's--', color='tab:gray', label='TSE deg2 ' + str(0.300))
ax.plot(PIDs, ts_cent, 'o-', color='tab:olive', label='TSE cent ' + str(0.352)) 
ax.set_title("Transition-State Ensemble")
ax.set_xticks(PIDs); ax.set_xticklabels(PIDs, rotation=90)
ax.legend(); ax.grid()

ax = axes[1][0]
ax.plot(PIDs, ns_cent, 'o-', color='tab:purple', label='NS cent 0.364')
ax.plot(PIDs, ns_locent, 'o-', color='tab:pink', label='NS locent 0.253')
ax.plot(PIDs, ts_cent, 'o-', color='tab:olive', label='TSE cent 0.352')
# ax.plot(PIDs, nd_cent, 'o-', color='tab:blue', label="ND 'c' cent 0.31")
ax.set_ylabel("MAD from exp_phi", fontsize=14)
ax.set_title("Node centrality")
ax.set_xticks(PIDs); ax.set_xticklabels(PIDs, rotation=90)
ax.legend(loc='upper right', ncol=2); ax.grid()

ax = axes[1][1]
ax.plot(PIDs, ns_deg, 'o--', color='tab:purple', label='NS deg 0.409')
ax.plot(PIDs, ts_deg, 'o--', color='tab:olive', label='TSE deg 0.306')
ax.plot(PIDs, ts_deg2, 's--', color='tab:gray', label='TSE deg2 0.300')
# ax.plot(PIDs, nd_deg, 'o--', color='tab:blue', label="ND 'c' deg 0.36")
ax.set_title("Node degree")
ax.set_xticks(PIDs); ax.set_xticklabels(PIDs, rotation=90)
ax.legend(); ax.grid()

# plt.savefig("MAD_phi_NSTSE.png")
  • There is no significant difference between TSE deg (node degree normalized by max of a sequence) and TSE deg2 (node degree normalized by NS value).
    • MAD scores for TSE deg average at 0.306, which is not significantly different from the MAD scores for TSE deg2, which average at 0.300. However, a substantial difference between the two is observed for 1IMQ.
  • Experimental phi-values correspond better with NS local node centrality (NS locent) than with TSE deg at $\alpha$ = 0.0924 with the Welch Two Sample t-test.
In [27]:
ts_deg - ts_deg2
Out[27]:
array([ 0.21,  0.03, -0.05,  0.  , -0.08, -0.09,  0.02, -0.03,  0.08,
       -0.02])
In [28]:
# two-sided pvalues
print("ts_deg2 ts_deg")
print(ttest_rel(ts_deg2, ts_deg))
print(tts(ts_deg2.mean(), ts_deg2.std(ddof=1), 10, 
          ts_deg.mean(), ts_deg.std(ddof=1), 10, equal_var=False))
print("\nts_deg ts_cent")
print(ttest_rel(ts_deg, ts_cent))
print(tts(ts_cent.mean(), ts_cent.std(ddof=1), 10, 
          ts_deg.mean(), ts_deg.std(ddof=1), 10, equal_var=False))
print("\nns_locent ts_deg")
print(ttest_rel(ns_locent, ts_deg))
print(tts(ns_locent.mean(), ns_locent.std(ddof=1), 10, 
          ts_deg.mean(), ts_deg.std(ddof=1), 10, equal_var=False))
print("\nns_locent ns_cent")
print(ttest_rel(ns_locent, ns_cent))
print(tts(ns_locent.mean(), ns_locent.std(ddof=1), 10, 
          ns_cent.mean(), ns_cent.std(ddof=1), 10, equal_var=False))
ts_deg2 ts_deg
Ttest_relResult(statistic=-0.251700152021801, pvalue=0.8069261071191446)
Ttest_indResult(statistic=-0.25479358381213374, pvalue=0.8017708685555922)

ts_deg ts_cent
Ttest_relResult(statistic=-2.2987232860520543, pvalue=0.04709767501428968)
Ttest_indResult(statistic=1.6308821344023772, pvalue=0.12033957753441075)

ns_locent ts_deg
Ttest_relResult(statistic=-1.3828178332567058, pvalue=0.20006245739055756)
Ttest_indResult(statistic=-1.3921149025414925, pvalue=0.18470868371979515)

ns_locent ns_cent
Ttest_relResult(statistic=-2.8908471964114986, pvalue=0.017859607278043747)
Ttest_indResult(statistic=-2.915561399662374, pvalue=0.010885345852836424)
ns_deg = c(0.48, 0.34, 0.49, 0.35, 0.34, 0.37, 0.45, 0.35, 0.51, 0.41)
ns_cent = c(0.44, 0.3, 0.41, 0.29, 0.32, 0.3, 0.43, 0.35, 0.44, 0.36)
ns_locent = c(0.33, 0.5, 0.2, 0.19, 0.24, 0.22, 0.3, 0.15, 0.24, 0.16)

ts_deg = c(0.46, 0.24, 0.35, 0.29, 0.28, 0.26, 0.3, 0.28, 0.3, 0.3)
ts_deg2 = c(0.25, 0.21, 0.4, 0.29, 0.36, 0.35, 0.28, 0.31, 0.22, 0.32)
ts_cent = c(0.41, 0.34, 0.37, 0.34, 0.29, 0.26, 0.42, 0.26, 0.43, 0.4)

> t.test(ts_deg2, ts_deg, paired=T, alt="less")$p.value
[1] 0.4034631
> t.test(ts_deg, ts_cent, paired=T, alt="less")$p.value
[1] 0.02354884
> t.test(ns_locent, ts_deg, paired=T, alt="less")$p.value
[1] 0.1000312
> t.test(ns_locent, ts_deg, alt="less")$p.value # Welch Two Sample t-test.
[1] 0.09235434
In [29]:
import random
random.seed()
# mean_95CI with resampling/bootstrap
def bootstrap(vals):
    means = np.zeros(10000); stdevs = np.zeros(10000)
    for i in np.arange(10000):
        resample = random.choices(vals, k=10) # with replacement
        means[i], stdevs[i] = np.mean(resample), np.std(resample, ddof=1)
    a, z = np.percentile(means, 2.5), np.percentile(means, 97.5) 
    moe = (z-a)/2.0
    return moe, a, z

# calc the 95%CI of the mean, with sample stdev, n >= 30
def mean_95CI(sample_mean, sample_stdev, sample_size):
    moe = (1.96 * sample_stdev)/np.sqrt(sample_size) # margin of error
    return moe, sample_mean - moe, sample_mean + moe

def summary_MADstats(vals):
    mu = np.mean(vals); snu = np.std(vals, ddof=1)
    gmoe, ga, gz = mean_95CI(mu, snu, 10) # Gaussian estimated
    bmoe, ba, bz = bootstrap(vals) # bootstrap
    return np.round([mu, snu, np.ptp(vals), gmoe, ga, gz, bmoe, ba, bz], 4)
In [30]:
# sample size is too small for CLT which 95%meanCI assumes (should use bootstrap, resampling?)
print("MAD between experimental and calculated phi-values")
df = pd.DataFrame({ "MAD_stats": ["sample_avg", "sample_stdev", "range", \
                                  "gMOE", "95meanCI_gleft", "95meanCI_gright",
                                  "bMOE", "95meanCI_bleft", "95meanCI_bright",],
                "NS_deg": summary_MADstats(ns_deg), "NS_cent": summary_MADstats(ns_cent),
                "NS_locent": summary_MADstats(ns_locent), "TSE_deg": summary_MADstats(ts_deg),
                "TSE_deg2": summary_MADstats(ts_deg2), "TSE_cent": summary_MADstats(ts_cent) })
df = df.set_index("MAD_stats")
df
MAD between experimental and calculated phi-values
Out[30]:
NS_deg NS_cent NS_locent TSE_deg TSE_deg2 TSE_cent
MAD_stats
sample_avg 0.4090 0.3640 0.2530 0.3060 0.2990 0.3520
sample_stdev 0.0679 0.0613 0.1036 0.0613 0.0615 0.0648
range 0.1700 0.1500 0.3500 0.2200 0.1900 0.1700
gMOE 0.0421 0.0380 0.0642 0.0380 0.0381 0.0401
95meanCI_gleft 0.3669 0.3260 0.1888 0.2680 0.2609 0.3119
95meanCI_gright 0.4511 0.4020 0.3172 0.3440 0.3371 0.3921
bMOE 0.0405 0.0365 0.0605 0.0350 0.0360 0.0380
95meanCI_bleft 0.3690 0.3290 0.2000 0.2760 0.2640 0.3130
95meanCI_bright 0.4500 0.4020 0.3210 0.3460 0.3360 0.3890
In [31]:
methods = df.columns.values; n = len(methods)
plt_colors = plt.get_cmap("tab20")
gcolors = plt_colors(list(range(1, n*2, 2))) # odd
bcolors = plt_colors(list(range(0, n*2, 2))) # even

# both ways produce similar CIs
ga = df.loc['95meanCI_gleft'].values; gz = df.loc['95meanCI_gright'].values
# gmoe = df.loc['gMOE'].values
ba = df.loc['95meanCI_bleft'].values; bz = df.loc['95meanCI_bright'].values
# bmoe = df.loc['bMOE'].values
mu = df.loc["sample_avg"]

for i, p in enumerate(zip(ga, gz, ba, bz)):
    plt.plot([i, i], [p[0], p[1]], marker='o', color=gcolors[i])
    plt.plot([i+0.2, i+0.2], [p[2], p[3]], marker='o', color=bcolors[i], label=str(p[3])) 
    plt.scatter(i+0.2, mu[i], marker='^')

plt.title("Calculating $\phi$ values from native-state and TSE PRNs.")
plt.xticks(list(range(0, n, 1)), methods, rotation=45, fontsize=12)
plt.xlabel("Method to calculate phi-values", fontsize=12)
plt.ylabel("95% confidence interval for mean MAD", fontsize=12)
plt.legend(title="97.5 percentile", loc="upper left", bbox_to_anchor=(1.0,1.0))
plt.grid()
  • A Confidence Interval (CI) as above gives a range estimate for the average MAD of a method; the average MAD will lie somewhere in the interval 95% of the time the method is used.
  • Prefer a method that yields smaller largest average MAD, i.e. the best worst case, which we take to be the 97.5 percentile estimate for average MAD.
  • If calculating phi-values from TSE PRNs, better to use node degree than node centrality.
  • If calculating phi-values from NS PRNs, better to use local node centrality (locent) than (global) node centrality or node degree.
    • But local centrality has additional overhead compared to (global) centrality.
    • NS_locent also has the widest margin-of-error of the methods; more uncertainty associated with its performance.
  • NS_cent is next best, but more expensive than NS_deg.

Explore local centrality of all candidate initial fold substructures

  • Choice of using the max (as above) is satisfying (close to optimal with clear selection criterion).
In [32]:
fn1 = os.path.join(dirname, pid + "_locents.out") # explore all local cent
print(fn1)
df1 = pd.read_table(fn1, sep="\s+", header=1)
df1.head()
Paci\1ten_locents.out
Out[32]:
nid 17 18 19 20 21 22 23
0 0 14 13 22 3 31 16 58
1 1 26 35 23 2 17 7 35
2 2 41 6 18 1 19 15 36
3 3 104 79 35 1 33 20 2
4 4 172 70 23 1 10 25 0
In [33]:
# combine exp_phi with degseq
for pid in PIDs:
    fn0 = os.path.join(dirname, pid + "_deg0.out")
    df0 = pd.read_table(fn0, sep="\s+") # to get nids with exp_phi values, and exp_phi values
    dp = df0.query("exp_phi > -1")[['nid', 'exp_phi']]
    phis = dp.exp_phi.values # phi-values
    nids = dp.nid.values # nids with phi-values
    fn1 = os.path.join(dirname, pid + "_locents.out") # explore all local cent
    df1 = pd.read_table(fn1, sep="\s+", header=1)
    mads = []
    ncols = df1.shape[1] # 1..n are subs: candidate initial fold substructures
    for c in np.arange(1, ncols, 1):
        a = df1.iloc[:, c].values  # select column-wise
        normed_a = a/np.max(a)
        mad = np.round(np.mean(np.abs(normed_a.take(nids) - phis)), 3)
        mads.append(mad)
    print(pid, np.min(mads), np.argmin(mads), mads)
1imq 0.306 2 [0.329, 0.363, 0.306]
1lmb4 0.5 4 [0.53, 0.583, 0.501, 0.507, 0.5]
1bf4 0.204 3 [0.345, 0.373, 0.308, 0.204, 0.246, 0.242, 0.285]
2ptl 0.19 0 [0.19, 0.243, 0.329, 0.336, 0.334]
2ci2 0.186 3 [0.254, 0.244, 0.203, 0.186, 0.236, 0.262, 0.304]
1aps 0.21 0 [0.21, 0.246, 0.223, 0.234, 0.333, 0.277]
1fmk 0.207 5 [0.439, 0.355, 0.378, 0.322, 0.3, 0.207, 0.316, 0.399]
1bk2 0.153 5 [0.41, 0.307, 0.397, 0.312, 0.299, 0.153, 0.295, 0.341]
1shf 0.236 5 [0.39, 0.384, 0.487, 0.418, 0.24, 0.236, 0.396, 0.436]
1ten 0.142 4 [0.268, 0.21, 0.156, 0.217, 0.142, 0.209, 0.27]
In [34]:
# MAD scores with different criterion for local centrality
max_cent = [0.306, 0.501, 0.204, 0.19, 0.236, 0.223, 0.30, 0.153, 0.24, 0.156]
init_fold = [0.363, 0.507, 0.345, 0.19, 0.236, 0.223, 0.355, 0.153, 0.384, 0.209]
min_mad = [0.306, 0.50, 0.204, 0.19, 0.186, 0.21, 0.207, 0.153, 0.236, 0.142]
print(np.mean(max_cent), np.mean(init_fold), np.mean(min_mad))
0.2509 0.2965 0.2334
In [35]:
plt.title("Local centrality with different EDS path criterion", fontsize=12)
plt.plot(PIDs, max_cent, 'o-', color='tab:pink', label='max_cent 0.251')
plt.plot(PIDs, init_fold, 'x--', color='tab:brown', label='init_fold 0.296')
plt.plot(PIDs, min_mad, '.--', color='black', label='min_mad 0.233')

plt.xticks(PIDs, rotation=90)
plt.ylabel("MAD between calc. and exp. $\phi$", fontsize=12)
plt.legend(loc='upper right'); plt.grid()

# plt.savefig("MAD_phi_locents.png")

End of notebook

SKLC November 2020