Calculated phi-values from native-state (NS) and transition-state (TS) PRNs.¶

TS protein residue networks are PRNs built from Transition-State ensembles in the Paci dataset.

import os
import numpy as np
import pandas as pd
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
from scipy.stats import ttest_rel # samples have same variance, paired 
from scipy.stats import ttest_ind_from_stats as tts # different variance, paired?
# check with R, use CI

# import matplotlib
# matplotlib.use('AGG')
# https://matplotlib.org/tutorials/introductory/usage.html?highlight=saving%20figures%20file

Qs =[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
dirname = "Paci"
PIDs = ["1imq", "1lmb4", "1bf4", "2ptl", "2ci2", "1aps", "1fmk", "1bk2", "1shf", "1ten"]

Long-range contacts in TSEs for residues with experimental phi-values¶

pid = "2ci2"
fn0 = os.path.join(dirname, pid + "_nodestats.out") # Native-state (PRN0) node statistics
df0 = pd.read_table(fn0, sep="\s+") # nid exp_phi deg0 cent0
df0.head()

dp = df0.query("exp_phi > -1")[['nid', 'exp_phi']]
nids = dp.nid.values # nids with phi-values
print(len(nids))

37

fn1 = os.path.join(dirname, pid + "_ts_nodestats2.out")
df1 = pd.read_table(fn1, sep=' ')
df1.head()

dg = df1.groupby('nid')
len(dg)

65

dg.le.mean() # average number of long-range PRN edges in TSE for a pid

nid
0     0.000000
1     0.282609
2     0.298913
3     0.521739
4     0.146739
        ...   
60    1.119565
61    0.739130
62    1.152174
63    0.750000
64    0.717391
Name: le, Length: 65, dtype: float64

fig, axes = plt.subplots(4, 3, figsize=(12, 16), constrained_layout=True)
plt.suptitle("Average number of long-range edges and shortcuts per node in TSE PRNs.")
plt.delaxes(axes[3][1])
plt.delaxes(axes[3][2])

for i, pid in enumerate(PIDs):
    fn0 = os.path.join(dirname, pid + "_nodestats.out")
    df0 = pd.read_table(fn0, sep="\s+") # nid exp_phi deg0 cent0
    # df0.head()
    dp = df0.query("exp_phi > -1")[['nid', 'exp_phi']]
    nids = dp.nid.values
    fn1 = os.path.join(dirname, pid + "_ts_nodestats2.out")
    df1 = pd.read_table(fn1, sep=' ')
    # df1.head()
    dg = df1.groupby('nid')
    dle = dg.le.mean() # mean le per node
    dscle = dg.scle.mean() # mean scle per node
    dphi = dle.iloc[nids]
    r = i//3; c = i%3; ax = axes[r][c]
    ax.plot(dle.index.values+1, dle.values, label="Long-range edge in TSE PRN0")
    ax.plot(dscle.index.values+1, dscle.values, label="Long-range shortcut in TSE SCN0")    
    ax.scatter(dphi.index.values+1, dphi.values, label="Residue with experimenal phi-value")
    dphi = dscle.iloc[nids]
    ax.scatter(dphi.index.values+1, dphi.values, label="Residue with experimental phi-value")
    ax.plot(dle.index.values, [1.0]*len(dle.index.values), linewidth=0.8, color="tab:gray",
           label="y=1")
    ax.set_title(pid); ax.set_xlabel("Node")
    if 0 == c:
        ax.set_ylabel("Average number of contacts per node")

h, l = axes[0][0].get_legend_handles_labels()
_ = fig.legend(h, l, loc="upper left", bbox_to_anchor=(0.4, 0.2))

# plt.savefig("TSE_scle.png")

Calculate phi-values from NS and TSE PRNs¶

Node statistics: degree and (EDS path) centrality/betweenness.
Both normalized by dividing with maximum value of a sequence. Interpretation of normalized values as importance of a node relative to other nodes in the network, with larger values having more importance.
Experimental phi-values are in [0.0, 1.0].
Normalization differs from the usual interpretation of experimental phi-values as the extent to which a node's native contacts are formed at the transition-state, i.e. (TS node degree) / (NS node degree).
- Cannot use this formulation for NS node statistics.
- Does not work readily with centrality because TS and ND node centrality can be larger than NS node centrality; resultant range is not bounded in [0.0, 1.0].
  - Could scale the resultant range for each structure, or compute node centrality values as the fraction of all paths.
- Implication of dividing by a constant (per node) across all $Q$s for ND node statistics is examined in another notebook.

pid = "1imq"
fn0 = os.path.join(dirname, pid + "_nodestats.out")
df0 = pd.read_table(fn0, sep="\s+") # nid exp_phi deg0 cent0
df0.head()

fn1 = os.path.join(dirname, pid + "_ts_nodestats.out")
df1 = pd.read_table(fn1, sep="\s+", header=1) # nid deg_avg cent_avg  
df1.head()

c_norm = df1.cent_avg.values/df0.cent0.values
print(min(c_norm), max(c_norm))

0.9105477855477856 2.844799043062201

# The two ways of normalizing TS node degree are at best weakly correlated.
# Range of calculated phi-values is wider with normalization with max. 
for pid in PIDs:
    fn0 = os.path.join(dirname, pid + "_nodestats.out")
    df0 = pd.read_table(fn0, sep="\s+") # nid exp_phi deg0 cent0
    fn1 = os.path.join(dirname, pid + "_ts_nodestats.out")
    df1 = pd.read_table(fn1, sep="\s+", header=1) # nid deg_avg cent_avg      
    m = df1.deg_avg.max()     # normalization with max
    d_norm = df1.deg_avg.values/m
    print(min(d_norm), max(d_norm), end='\t')    
    d_norm2 = df1.deg_avg.values/df0.deg0.values # normalization with NS statistic
    print(min(d_norm2), max(d_norm2))
    print(pid, pearsonr(d_norm, d_norm2))

0.05734767025089606 1.0	0.2801724137931034 1.0
1imq (-0.215831042712451, 0.04595175252214858)
0.1443785648262957 1.0	0.36712625 1.0
1lmb4 (-0.10076319972025095, 0.33922007933052833)
0.07028480762647145 1.0	0.13344625000000002 1.0
1bf4 (0.5167851780080396, 1.4501023914945505e-05)
0.1661738537437799 1.0	0.21164037037037037 0.8565307692307692
2ptl (0.27464193460545694, 0.028073860428368513)
0.0698558175924891 1.0	0.125 1.0
2ci2 (0.4084987349365345, 0.0007296363371582195)
0.1266664088894474 1.0	0.3264366666666667 0.8817714285714285
1aps (0.571450279811207, 7.988652162868365e-10)
0.07797334576679488 1.0	0.12040799999999999 0.9954633333333334
1fmk (0.5834920292623135, 1.9015876522821423e-06)
0.1546185688941455 1.0	0.35483857142857145 0.9239642857142857
1bk2 (0.4972804327321004, 8.312062657586012e-05)
0.066288125 1.0	0.1991342857142857 0.9886362499999999
1shf (0.49544197348405283, 6.617512094727042e-05)
0.09993067309553998 1.0	0.09411764705882353 0.9904757142857143
1ten (0.49630689286424917, 7.567563131927387e-07)

# inputs: exp_phi, ns_deg0, ns_cent0, tse_deg_avg, tse_cent_avg
# output: degcent_normed.out file for each pid contains normed deg_avg and cent_avg
corr_ns, corr_ts = [], [] # deg0~cent0 and deg_avg~cent_avg correlations
for pid in PIDs:
    print(pid)
    fn0 = os.path.join(dirname, pid + "_nodestats.out")
    df0 = pd.read_table(fn0, sep="\s+") # nid exp_phi deg0 cent0
    r = pearsonr(df0.deg0.values, df0.cent0.values)
    print(r); corr_ns.append(r[0])
    df0 = df0.assign(deg0_norm = df0.deg0.values/df0.deg0.max())
    df0 = df0.assign(cent0_norm = df0.cent0.values/df0.cent0.max())
    
    fn1 = os.path.join(dirname, pid + "_ts_nodestats.out")
    df1 = pd.read_table(fn1, sep="\s+", header=1) # nid deg_avg cent_avg  
    r = pearsonr(df1.deg_avg.values, df1.cent_avg.values)
    print(r); corr_ts.append(r[0])
    m = df1.deg_avg.max()
    df1 = df1.assign(deg_avg_norm = df1.deg_avg.values/m)
    df1 = df1.assign(deg_avg_norm2 = df1.deg_avg.values/df0.deg0.values)
    df1 = df1.assign(cent_avg_norm = df1.cent_avg.values/df1.cent_avg.max())    

    df = df0.join(df1, rsuffix="_ts")    
    fn2 = os.path.join(dirname, pid + "_degcent_normed.out")
    df.to_csv(fn2, sep=' ', index=False)

1imq
(0.7558969268804924, 4.012079504672762e-17)
(0.7121122380072538, 1.502821945881671e-14)
1lmb4
(0.734286145453802, 8.182254971540257e-17)
(0.42777376163150493, 2.1066871340310242e-05)
1bf4
(0.8331380770622324, 2.4725153462041907e-17)
(0.2141558155358244, 0.09191415825395269)
2ptl
(0.8089544527164008, 6.138400200971352e-16)
(0.761844808708787, 2.660766356539596e-13)
2ci2
(0.7242935697029357, 9.16428859268319e-12)
(0.6412812418394419, 8.657914734671257e-09)
1aps
(0.832243017371372, 2.485047464677856e-26)
(0.7188158008970563, 7.790675718849913e-17)
1fmk
(0.7790290667014961, 9.608908036252283e-13)
(0.4358500966932449, 0.0007018681735138559)
1bk2
(0.8427664795977194, 2.0250960611567958e-16)
(0.7072244406109104, 7.773380930867111e-10)
1shf
(0.8576796838781358, 4.168120250965983e-18)
(0.4780795256370186, 0.00012841377932345095)
1ten
(0.7237901178849055, 1.1167776049285434e-15)
(0.6715019435547845, 5.921829012607581e-13)

fn = os.path.join(dirname, pid + "_degcent_normed.out")
print(fn)
df = pd.read_table(fn, sep="\s+")
df.head()

Paci\1ten_degcent_normed.out

print(pearsonr(df.deg0.values, df.deg_avg.values))
print(pearsonr(df.cent0.values, df.cent_avg.values))
print(pearsonr(df.deg0_norm.values, df.deg_avg_norm.values))
print(pearsonr(df.cent0_norm.values, df.cent_avg_norm.values))
print(pearsonr(df.deg_avg_norm.values, df.deg_avg_norm2.values))

(0.6347314241420272, 2.398485663616192e-11)
(0.8097733568515959, 7.498918886312766e-22)
(0.6347314241420272, 2.398485663616192e-11)
(0.809773356851596, 7.498918886312611e-22)
(0.4963068928642492, 7.567563131927356e-07)

fig, axes = plt.subplots(4, 3, figsize=(12, 18), constrained_layout=True)
plt.suptitle("Normalized node degree and centrality from NS PRNs.")
plt.delaxes(axes[3, 1])
plt.delaxes(axes[3, 2])

for i, pid in enumerate(PIDs):    
    r = i//3; c = i%3; ax = axes[r][c]
    fn = os.path.join(dirname, pid + "_degcent_normed.out")
    df = pd.read_table(fn, sep="\s+")
    de = df.query("exp_phi > -1")
    mad_d = np.round(np.mean(np.abs(de.deg0_norm.values - de.exp_phi.values)), 2)
    mad_c = np.round(np.mean(np.abs(de.cent0_norm.values - de.exp_phi.values)), 2)
    print(pid, mad_d, mad_c)
    # print(pearsonr(de.deg0_norm.values, de.exp_phi.values))
    # print(pearsonr(de.cent0_norm.values, de.exp_phi.values))
    ax.plot(df.nid.values+1, df.deg0_norm.values, color="tab:blue", alpha=0.5, 
            label='NS deg ' + str(mad_d))
    ax.plot(df.nid.values+1, df.cent0_norm.values, color="tab:orange", alpha=0.5, 
            label='NS cent '  + str(mad_c))    
    ax.plot(de.nid.values+1, de.exp_phi.values, "o--g", alpha=0.5, label='exp_phi')
    ax.set_title(pid + ' ' + str(np.round(corr_ns[i], 2)), loc='left')
    ax.set_xlabel('Residue'); ax.set_yticks(np.arange(0, 1.1, 0.1))
    ax.legend(loc="lower left", bbox_to_anchor=(0.5, 1.0))

# plt.savefig("NS_degcentphi.png")

1imq 0.48 0.44
1lmb4 0.34 0.3
1bf4 0.49 0.41
2ptl 0.35 0.29
2ci2 0.34 0.32
1aps 0.37 0.3
1fmk 0.45 0.43
1bk2 0.35 0.35
1shf 0.51 0.44
1ten 0.41 0.36

Figure above: Calculated phi-values as normalized node degree (deg) and as normalized node centrality (cent) of native-state (NS) PRNs. The numbers in the legend is the mean absolute difference between experimental phi-values with deg and cent respectively. The number in the title of each subplot is the Pearson correlation coefficient between un-normalized NS node degree and node centrality.

fig, axes = plt.subplots(4, 3, figsize=(12, 18), constrained_layout=True)
plt.suptitle("Normalized node degree and centrality computed from TSE PRNs.")
plt.delaxes(axes[3, 1])
plt.delaxes(axes[3, 2])

for i, pid in enumerate(PIDs):    
    r = i//3; c = i%3; ax = axes[r][c]
    fn = os.path.join(dirname, pid + "_degcent_normed.out")
    df = pd.read_table(fn, sep="\s+")
    de = df.query("exp_phi > -1")
    mad_d = np.round(np.mean(np.abs(de.deg_avg_norm.values - de.exp_phi.values)), 2)
    mad_d2 = np.round(np.mean(np.abs(de.deg_avg_norm2.values - de.exp_phi.values)), 2)
    mad_c = np.round(np.mean(np.abs(de.cent_avg_norm.values - de.exp_phi.values)), 2)
    print(pid, mad_d, mad_d2, mad_c)    
    # print(pearsonr(de.deg_avg_norm.values, de.exp_phi.values))
    # print(pearsonr(de.cent_avg_norm.values, de.exp_phi.values))    
    ax.plot(df.nid.values+1, df.deg_avg_norm.values, color="tab:blue", alpha=0.5, 
           label='TSE deg ' + str(mad_d))
    ax.plot(df.nid.values+1, df.deg_avg_norm2.values, color="tab:gray", alpha=0.75, 
           label='TSE deg2 ' + str(mad_d2))
    ax.plot(df.nid.values+1, df.cent_avg_norm.values, color="tab:orange", alpha=0.5,
           label='TSE cent '  + str(mad_c))
    ax.plot(de.nid.values+1, de.exp_phi.values, "o--g", alpha=0.5, label='exp_phi')    
    ax.set_title(pid + ' ' + str(np.round(corr_ts[i], 2)), loc='left')
    ax.set_xlabel('Residue'); ax.set_yticks(np.arange(0, 1.1, 0.1))    
    ax.legend(loc="lower left", bbox_to_anchor=(0.5, 1.0))

# plt.savefig("TSE_deg2centphi.png")

1imq 0.46 0.25 0.41
1lmb4 0.24 0.21 0.34
1bf4 0.35 0.4 0.37
2ptl 0.29 0.29 0.34
2ci2 0.28 0.36 0.29
1aps 0.26 0.35 0.26
1fmk 0.3 0.28 0.42
1bk2 0.28 0.31 0.26
1shf 0.3 0.22 0.43
1ten 0.3 0.32 0.4

Figure above: Calculated phi-values as normalized node degree (deg, deg2), and as normalized node centrality (cent) of transition-state (TSE) PRNs. TSE node statistics are ensemble averages. The numbers in the legend is the mean absolute difference between experimental phi-values with deg, deg2, and cent respectively. The number in the title of each subplot is the Pearson correlation coefficient between un-normalized TSE node degree and TSE node centrality.

Local node centrality¶

Set of paths to compute local node centrality is restricted to those involving the candidate initial fold substructure with the highest substructure centrality.

fn1 = os.path.join(dirname, pid + "_locent.out") # local cent
print(fn1)
df1 = pd.read_table(fn1, sep="\s+", header=2)     
df1.head()

Paci\1ten_locent.out

np.all(df1.walk_locent.values == df1.path_locent.values)

False

for pid in PIDs:
    fn0 = os.path.join(dirname, pid + "_deg0.out")
    df0 = pd.read_table(fn0, sep="\s+")
    # df0.head()
    fn1 = os.path.join(dirname, pid + "_locent.out") # local cent
    df1 = pd.read_table(fn1, sep="\s+", header=2)     
    # df1.head()    
    m = df1.walk_locent.max()    
    wloc = df1.walk_locent.values/m
    m = df1.path_locent.max()    
    ploc = df1.path_locent.values/m    
    df = pd.DataFrame({"nid": df1.nid.values, "exp_phi": df0.exp_phi.values,
                       "wloc_norm": wloc, "ploc_norm": ploc})
    # df.head()
    fn2 = os.path.join(dirname, pid + "_locent_normed.out")
    df.to_csv(fn2, sep=' ', index=False)

fn2 = os.path.join(dirname, pid + "_locent_normed.out")
df1 = pd.read_table(fn2, sep="\s+")
df1.head()

fig, axes = plt.subplots(4, 3, figsize=(12, 18), constrained_layout=True)
plt.suptitle("Normalized local node centrality computed from NS PRN.")
plt.delaxes(axes[3, 1])
plt.delaxes(axes[3, 2])

for i, pid in enumerate(PIDs):    
    r = i//3; c = i%3; ax = axes[r][c]
    fn2 = os.path.join(dirname, pid + "_locent_normed.out")
    df1 = pd.read_table(fn2, sep="\s+")
    # df1.head()
    de = df1.query("exp_phi > -1")    
    mad_w = np.round(np.mean(np.abs(de.wloc_norm.values - de.exp_phi.values)), 2)
    mad_p = np.round(np.mean(np.abs(de.ploc_norm.values - de.exp_phi.values)), 2)    
    print(pid, mad_w, pearsonr(de.wloc_norm.values, de.exp_phi.values))
    print(pid, mad_p, pearsonr(de.ploc_norm.values, de.exp_phi.values))
    # wloc and ploc results are pretty much the same (backtrack little effect on locent)
    # use ploc
    ax.plot(df1.nid.values+1, df1.ploc_norm.values, color="tab:orange", alpha=0.5, 
            label='NS locent ' + str(mad_p))
    ax.plot(de.nid.values+1, de.exp_phi.values, "o--g", alpha=0.5, label='exp_phi')
    ax.set_title(pid, loc='left'); ax.set_xlabel('Residue')
    ax.set_yticks(np.arange(0, 1.1, 0.1))
    ax.legend(loc="lower left", bbox_to_anchor=(0.5, 1.0)) 

# plt.savefig("NS_locentphi.png")

1imq 0.33 (-0.025648230668538403, 0.9145241387030871)
1imq 0.33 (-0.025604563664157774, 0.9146691555757938)
1lmb4 0.5 (-0.5171574690974813, 0.23457821660661568)
1lmb4 0.5 (-0.5171574690974814, 0.23457821660661554)
1bf4 0.2 (0.4201774365017078, 0.0931032650714083)
1bf4 0.2 (0.42015242250513896, 0.09312477391591585)
2ptl 0.19 (0.5155735492095647, 0.0002455063634980463)
2ptl 0.19 (0.5124972428212422, 0.00027126442922180183)
2ci2 0.24 (-0.07821318144363226, 0.6454234991171355)
2ci2 0.24 (-0.09375632368399073, 0.5809883661597818)
1aps 0.22 (0.2567557505075375, 0.22584744820397268)
1aps 0.22 (0.24984368581197983, 0.239025239460789)
1fmk 0.3 (0.2382363823097531, 0.23143884215584937)
1fmk 0.3 (0.24128985526371682, 0.22533404197732862)
1bk2 0.17 (0.8243988135527502, 0.0009713195061109973)
1bk2 0.15 (0.8448885886452391, 0.0005417040867987477)
1shf 0.24 (0.4413417132768633, 0.15092434355704548)
1shf 0.24 (0.4408792947413657, 0.15140322582866247)
1ten 0.16 (0.5633977326697418, 0.002726431719489984)
1ten 0.16 (0.5626012840233487, 0.0027729402229222312)

Figure above: Calculated phi-values as normalized local node centrality (locent) of native-state (NS) PRNs. The number in the legend is the mean absolute difference between experimental phi-values and locent.

Summary¶

# MAD values
ns_deg = np.array([0.48, 0.34, 0.49, 0.35, 0.34, 0.37, 0.45, 0.35, 0.51, 0.41])
ns_cent = np.array([0.44, 0.3, 0.41, 0.29, 0.32, 0.3, 0.43, 0.35, 0.44, 0.36])
ns_locent = np.array([0.33, 0.5, 0.2, 0.19, 0.24, 0.22, 0.3, 0.15, 0.24, 0.16])

ts_deg = np.array([0.46, 0.24, 0.35, 0.29, 0.28, 0.26, 0.3, 0.28, 0.3, 0.3])
ts_deg2 = np.array([0.25, 0.21, 0.4, 0.29, 0.36, 0.35, 0.28, 0.31, 0.22, 0.32])
ts_cent = np.array([0.41, 0.34, 0.37, 0.34, 0.29, 0.26, 0.42, 0.26, 0.43, 0.4])

print("ns_deg:", ns_deg.mean(), ns_deg.std(ddof=1))
print("ns_cent:", ns_cent.mean(), ns_cent.std(ddof=1))
print("ns_locent:", ns_locent.mean(), ns_locent.std(ddof=1)) # larger std than the others!
print()
print("ts_deg:", ts_deg.mean(), ts_deg.std(ddof=1))
print("ts_deg2:", ts_deg2.mean(), ts_deg2.std(ddof=1))
print("ts_cent:", ts_cent.mean(), ts_cent.std(ddof=1))

ns_deg: 0.409 0.06789698078707182
ns_cent: 0.364 0.06131883886702357
ns_locent: 0.253 0.10360716405946283

ts_deg: 0.30599999999999994 0.06131883886702358
ts_deg2: 0.29900000000000004 0.061544924874255696
ts_cent: 0.352 0.0647731082746193

fig, axes = plt.subplots(2, 2, figsize=(9, 8), constrained_layout=True, sharey=True)
plt.suptitle("MAD between calc. and exp. $\phi$", fontsize=14)

ax = axes[0][0]
ax.plot(PIDs, ns_deg, 'o--', color='tab:purple', label='NS deg ' + str(0.409))
ax.plot(PIDs, ns_cent, 'o-', color='tab:purple', label='NS cent ' + str(0.364))
ax.plot(PIDs, ns_locent, 'o-', color='tab:pink', label='NS locent ' + str(0.253))
ax.set_title("Native State")
ax.set_xticks(PIDs); ax.set_xticklabels(PIDs, rotation=90)
ax.set_ylabel("MAD from exp_phi", fontsize=14)
ax.legend(loc="upper left", bbox_to_anchor=(0.27, 1.0)); ax.grid()

ax = axes[0][1]
ax.plot(PIDs, ts_deg, 'o--', color='tab:olive', label='TSE deg ' + str(0.306))
ax.plot(PIDs, ts_deg2, 's--', color='tab:gray', label='TSE deg2 ' + str(0.300))
ax.plot(PIDs, ts_cent, 'o-', color='tab:olive', label='TSE cent ' + str(0.352)) 
ax.set_title("Transition-State Ensemble")
ax.set_xticks(PIDs); ax.set_xticklabels(PIDs, rotation=90)
ax.legend(); ax.grid()

ax = axes[1][0]
ax.plot(PIDs, ns_cent, 'o-', color='tab:purple', label='NS cent 0.364')
ax.plot(PIDs, ns_locent, 'o-', color='tab:pink', label='NS locent 0.253')
ax.plot(PIDs, ts_cent, 'o-', color='tab:olive', label='TSE cent 0.352')
# ax.plot(PIDs, nd_cent, 'o-', color='tab:blue', label="ND 'c' cent 0.31")
ax.set_ylabel("MAD from exp_phi", fontsize=14)
ax.set_title("Node centrality")
ax.set_xticks(PIDs); ax.set_xticklabels(PIDs, rotation=90)
ax.legend(loc='upper right', ncol=2); ax.grid()

ax = axes[1][1]
ax.plot(PIDs, ns_deg, 'o--', color='tab:purple', label='NS deg 0.409')
ax.plot(PIDs, ts_deg, 'o--', color='tab:olive', label='TSE deg 0.306')
ax.plot(PIDs, ts_deg2, 's--', color='tab:gray', label='TSE deg2 0.300')
# ax.plot(PIDs, nd_deg, 'o--', color='tab:blue', label="ND 'c' deg 0.36")
ax.set_title("Node degree")
ax.set_xticks(PIDs); ax.set_xticklabels(PIDs, rotation=90)
ax.legend(); ax.grid()

# plt.savefig("MAD_phi_NSTSE.png")

There is no significant difference between TSE deg (node degree normalized by max of a sequence) and TSE deg2 (node degree normalized by NS value).
- MAD scores for TSE deg average at 0.306, which is not significantly different from the MAD scores for TSE deg2, which average at 0.300. However, a substantial difference between the two is observed for 1IMQ.
Experimental phi-values correspond better with NS local node centrality (NS locent) than with TSE deg at $\alpha$ = 0.0924 with the Welch Two Sample t-test.

ts_deg - ts_deg2

array([ 0.21,  0.03, -0.05,  0.  , -0.08, -0.09,  0.02, -0.03,  0.08,
       -0.02])

# two-sided pvalues
print("ts_deg2 ts_deg")
print(ttest_rel(ts_deg2, ts_deg))
print(tts(ts_deg2.mean(), ts_deg2.std(ddof=1), 10, 
          ts_deg.mean(), ts_deg.std(ddof=1), 10, equal_var=False))
print("\nts_deg ts_cent")
print(ttest_rel(ts_deg, ts_cent))
print(tts(ts_cent.mean(), ts_cent.std(ddof=1), 10, 
          ts_deg.mean(), ts_deg.std(ddof=1), 10, equal_var=False))
print("\nns_locent ts_deg")
print(ttest_rel(ns_locent, ts_deg))
print(tts(ns_locent.mean(), ns_locent.std(ddof=1), 10, 
          ts_deg.mean(), ts_deg.std(ddof=1), 10, equal_var=False))
print("\nns_locent ns_cent")
print(ttest_rel(ns_locent, ns_cent))
print(tts(ns_locent.mean(), ns_locent.std(ddof=1), 10, 
          ns_cent.mean(), ns_cent.std(ddof=1), 10, equal_var=False))

ts_deg2 ts_deg
Ttest_relResult(statistic=-0.251700152021801, pvalue=0.8069261071191446)
Ttest_indResult(statistic=-0.25479358381213374, pvalue=0.8017708685555922)

ts_deg ts_cent
Ttest_relResult(statistic=-2.2987232860520543, pvalue=0.04709767501428968)
Ttest_indResult(statistic=1.6308821344023772, pvalue=0.12033957753441075)

ns_locent ts_deg
Ttest_relResult(statistic=-1.3828178332567058, pvalue=0.20006245739055756)
Ttest_indResult(statistic=-1.3921149025414925, pvalue=0.18470868371979515)

ns_locent ns_cent
Ttest_relResult(statistic=-2.8908471964114986, pvalue=0.017859607278043747)
Ttest_indResult(statistic=-2.915561399662374, pvalue=0.010885345852836424)

ns_deg = c(0.48, 0.34, 0.49, 0.35, 0.34, 0.37, 0.45, 0.35, 0.51, 0.41)
ns_cent = c(0.44, 0.3, 0.41, 0.29, 0.32, 0.3, 0.43, 0.35, 0.44, 0.36)
ns_locent = c(0.33, 0.5, 0.2, 0.19, 0.24, 0.22, 0.3, 0.15, 0.24, 0.16)

ts_deg = c(0.46, 0.24, 0.35, 0.29, 0.28, 0.26, 0.3, 0.28, 0.3, 0.3)
ts_deg2 = c(0.25, 0.21, 0.4, 0.29, 0.36, 0.35, 0.28, 0.31, 0.22, 0.32)
ts_cent = c(0.41, 0.34, 0.37, 0.34, 0.29, 0.26, 0.42, 0.26, 0.43, 0.4)

> t.test(ts_deg2, ts_deg, paired=T, alt="less")$p.value
[1] 0.4034631
> t.test(ts_deg, ts_cent, paired=T, alt="less")$p.value
[1] 0.02354884
> t.test(ns_locent, ts_deg, paired=T, alt="less")$p.value
[1] 0.1000312
> t.test(ns_locent, ts_deg, alt="less")$p.value # Welch Two Sample t-test.
[1] 0.09235434

import random
random.seed()
# mean_95CI with resampling/bootstrap
def bootstrap(vals):
    means = np.zeros(10000); stdevs = np.zeros(10000)
    for i in np.arange(10000):
        resample = random.choices(vals, k=10) # with replacement
        means[i], stdevs[i] = np.mean(resample), np.std(resample, ddof=1)
    a, z = np.percentile(means, 2.5), np.percentile(means, 97.5) 
    moe = (z-a)/2.0
    return moe, a, z

# calc the 95%CI of the mean, with sample stdev, n >= 30
def mean_95CI(sample_mean, sample_stdev, sample_size):
    moe = (1.96 * sample_stdev)/np.sqrt(sample_size) # margin of error
    return moe, sample_mean - moe, sample_mean + moe

def summary_MADstats(vals):
    mu = np.mean(vals); snu = np.std(vals, ddof=1)
    gmoe, ga, gz = mean_95CI(mu, snu, 10) # Gaussian estimated
    bmoe, ba, bz = bootstrap(vals) # bootstrap
    return np.round([mu, snu, np.ptp(vals), gmoe, ga, gz, bmoe, ba, bz], 4)

# sample size is too small for CLT which 95%meanCI assumes (should use bootstrap, resampling?)
print("MAD between experimental and calculated phi-values")
df = pd.DataFrame({ "MAD_stats": ["sample_avg", "sample_stdev", "range", \
                                  "gMOE", "95meanCI_gleft", "95meanCI_gright",
                                  "bMOE", "95meanCI_bleft", "95meanCI_bright",],
                "NS_deg": summary_MADstats(ns_deg), "NS_cent": summary_MADstats(ns_cent),
                "NS_locent": summary_MADstats(ns_locent), "TSE_deg": summary_MADstats(ts_deg),
                "TSE_deg2": summary_MADstats(ts_deg2), "TSE_cent": summary_MADstats(ts_cent) })
df = df.set_index("MAD_stats")
df

MAD between experimental and calculated phi-values

methods = df.columns.values; n = len(methods)
plt_colors = plt.get_cmap("tab20")
gcolors = plt_colors(list(range(1, n*2, 2))) # odd
bcolors = plt_colors(list(range(0, n*2, 2))) # even

# both ways produce similar CIs
ga = df.loc['95meanCI_gleft'].values; gz = df.loc['95meanCI_gright'].values
# gmoe = df.loc['gMOE'].values
ba = df.loc['95meanCI_bleft'].values; bz = df.loc['95meanCI_bright'].values
# bmoe = df.loc['bMOE'].values
mu = df.loc["sample_avg"]

for i, p in enumerate(zip(ga, gz, ba, bz)):
    plt.plot([i, i], [p[0], p[1]], marker='o', color=gcolors[i])
    plt.plot([i+0.2, i+0.2], [p[2], p[3]], marker='o', color=bcolors[i], label=str(p[3])) 
    plt.scatter(i+0.2, mu[i], marker='^')

plt.title("Calculating $\phi$ values from native-state and TSE PRNs.")
plt.xticks(list(range(0, n, 1)), methods, rotation=45, fontsize=12)
plt.xlabel("Method to calculate phi-values", fontsize=12)
plt.ylabel("95% confidence interval for mean MAD", fontsize=12)
plt.legend(title="97.5 percentile", loc="upper left", bbox_to_anchor=(1.0,1.0))
plt.grid()

A Confidence Interval (CI) as above gives a range estimate for the average MAD of a method; the average MAD will lie somewhere in the interval 95% of the time the method is used.
Prefer a method that yields smaller largest average MAD, i.e. the best worst case, which we take to be the 97.5 percentile estimate for average MAD.
If calculating phi-values from TSE PRNs, better to use node degree than node centrality.
If calculating phi-values from NS PRNs, better to use local node centrality (locent) than (global) node centrality or node degree.
- But local centrality has additional overhead compared to (global) centrality.
- NS_locent also has the widest margin-of-error of the methods; more uncertainty associated with its performance.
NS_cent is next best, but more expensive than NS_deg.

Explore local centrality of all candidate initial fold substructures¶

Choice of using the max (as above) is satisfying (close to optimal with clear selection criterion).

fn1 = os.path.join(dirname, pid + "_locents.out") # explore all local cent
print(fn1)
df1 = pd.read_table(fn1, sep="\s+", header=1)
df1.head()

Paci\1ten_locents.out

# combine exp_phi with degseq
for pid in PIDs:
    fn0 = os.path.join(dirname, pid + "_deg0.out")
    df0 = pd.read_table(fn0, sep="\s+") # to get nids with exp_phi values, and exp_phi values
    dp = df0.query("exp_phi > -1")[['nid', 'exp_phi']]
    phis = dp.exp_phi.values # phi-values
    nids = dp.nid.values # nids with phi-values
    fn1 = os.path.join(dirname, pid + "_locents.out") # explore all local cent
    df1 = pd.read_table(fn1, sep="\s+", header=1)
    mads = []
    ncols = df1.shape[1] # 1..n are subs: candidate initial fold substructures
    for c in np.arange(1, ncols, 1):
        a = df1.iloc[:, c].values  # select column-wise
        normed_a = a/np.max(a)
        mad = np.round(np.mean(np.abs(normed_a.take(nids) - phis)), 3)
        mads.append(mad)
    print(pid, np.min(mads), np.argmin(mads), mads)

1imq 0.306 2 [0.329, 0.363, 0.306]
1lmb4 0.5 4 [0.53, 0.583, 0.501, 0.507, 0.5]
1bf4 0.204 3 [0.345, 0.373, 0.308, 0.204, 0.246, 0.242, 0.285]
2ptl 0.19 0 [0.19, 0.243, 0.329, 0.336, 0.334]
2ci2 0.186 3 [0.254, 0.244, 0.203, 0.186, 0.236, 0.262, 0.304]
1aps 0.21 0 [0.21, 0.246, 0.223, 0.234, 0.333, 0.277]
1fmk 0.207 5 [0.439, 0.355, 0.378, 0.322, 0.3, 0.207, 0.316, 0.399]
1bk2 0.153 5 [0.41, 0.307, 0.397, 0.312, 0.299, 0.153, 0.295, 0.341]
1shf 0.236 5 [0.39, 0.384, 0.487, 0.418, 0.24, 0.236, 0.396, 0.436]
1ten 0.142 4 [0.268, 0.21, 0.156, 0.217, 0.142, 0.209, 0.27]

# MAD scores with different criterion for local centrality
max_cent = [0.306, 0.501, 0.204, 0.19, 0.236, 0.223, 0.30, 0.153, 0.24, 0.156]
init_fold = [0.363, 0.507, 0.345, 0.19, 0.236, 0.223, 0.355, 0.153, 0.384, 0.209]
min_mad = [0.306, 0.50, 0.204, 0.19, 0.186, 0.21, 0.207, 0.153, 0.236, 0.142]
print(np.mean(max_cent), np.mean(init_fold), np.mean(min_mad))

0.2509 0.2965 0.2334

plt.title("Local centrality with different EDS path criterion", fontsize=12)
plt.plot(PIDs, max_cent, 'o-', color='tab:pink', label='max_cent 0.251')
plt.plot(PIDs, init_fold, 'x--', color='tab:brown', label='init_fold 0.296')
plt.plot(PIDs, min_mad, '.--', color='black', label='min_mad 0.233')

plt.xticks(PIDs, rotation=90)
plt.ylabel("MAD between calc. and exp. $\phi$", fontsize=12)
plt.legend(loc='upper right'); plt.grid()

# plt.savefig("MAD_phi_locents.png")

End of notebook¶

SKLC November 2020

	ts_pid	nid	le
0	2073	0	0
1	2073	1	2
2	2073	2	1
3	2073	3	3
4	2073	4	0

	nid	deg_avg	cent_avg
0	0	1.0000	226.375
1	1	1.3750	261.063
2	2	4.2500	613.813
3	3	4.6875	575.938
4	4	3.6875	393.250

	nid	exp_phi	deg0	cent0	deg0_norm	cent0_norm	nid_ts	deg_avg	cent_avg	deg_avg_norm	deg_avg_norm2	cent_avg_norm
0	0	0.04	17	220	0.739130	0.307263	0	1.60000	266.800	0.099931	0.094118	0.286584
1	1	-1.00	15	242	0.652174	0.337989	1	2.77778	286.344	0.173491	0.185185	0.307577
2	2	-1.00	16	295	0.695652	0.412011	2	4.35556	552.244	0.272034	0.272222	0.593194
3	3	0.04	19	399	0.826087	0.557263	3	4.25556	425.178	0.265788	0.223977	0.456706
4	4	-1.00	11	287	0.478261	0.400838	4	5.40000	475.900	0.337266	0.490909	0.511189

	nid	exp_phi	wloc_norm	ploc_norm
0	0	0.04	0.060606	0.060606
1	1	-1.00	0.063361	0.063361
2	2	-1.00	0.049587	0.049587
3	3	0.04	0.096419	0.096419
4	4	-1.00	0.063361	0.063361

	NS_deg	NS_cent	NS_locent	TSE_deg	TSE_deg2	TSE_cent
MAD_stats
sample_avg	0.4090	0.3640	0.2530	0.3060	0.2990	0.3520
sample_stdev	0.0679	0.0613	0.1036	0.0613	0.0615	0.0648
range	0.1700	0.1500	0.3500	0.2200	0.1900	0.1700
gMOE	0.0421	0.0380	0.0642	0.0380	0.0381	0.0401
95meanCI_gleft	0.3669	0.3260	0.1888	0.2680	0.2609	0.3119
95meanCI_gright	0.4511	0.4020	0.3172	0.3440	0.3371	0.3921
bMOE	0.0405	0.0365	0.0605	0.0350	0.0360	0.0380
95meanCI_bleft	0.3690	0.3290	0.2000	0.2760	0.2640	0.3130
95meanCI_bright	0.4500	0.4020	0.3210	0.3460	0.3360	0.3890

	nid	17	18	19	20	21	22	23
0	0	14	13	22	3	31	16	58
1	1	26	35	23	2	17	7	35
2	2	41	6	18	1	19	15	36
3	3	104	79	35	1	33	20	2
4	4	172	70	23	1	10	25	0

	nid	exp_phi	deg0	cent0
0	0	-1.00	1	147
1	1	-1.00	5	183
2	2	0.00	10	308
3	3	0.05	11	244
4	4	-1.00	8	178

	nid	exp_phi	deg0	cent0
0	0	-1.0	1	237
1	1	-1.0	2	185
2	2	-1.0	14	480
3	3	-1.0	13	307
4	4	-1.0	9	243

	nid	walk_locent	path_locent
0	0	22	22
1	1	23	23
2	2	18	18
3	3	35	35
4	4	23	23

	nid	17	18	19	20	21	22	23
0	0	14	13	22	3	31	16	58
1	1	26	35	23	2	17	7	35
2	2	41	6	18	1	19	15	36
3	3	104	79	35	1	33	20	2
4	4	172	70	23	1	10	25	0

	nid	17	18	19	20	21	22	23
0	0	14	13	22	3	31	16	58
1	1	26	35	23	2	17	7	35
2	2	41	6	18	1	19	15	36
3	3	104	79	35	1	33	20	2
4	4	172	70	23	1	10	25	0