Locating TSE nets with features of ND native shortcuts.

  • Features of ND native shortcut nets (SCN0):
    • Contact-Order: Absolute and Relative.
    • Network clustering $C$.

  • Approach: k-nearest neighbors.
    • For each TSE net, find $mode(Q_7)$, the most common $Q$ of the 7 nearest (distance is absolute difference) ND SCN0s. Ties are biased toward smaller $Q$ values.
    • TSE nets of a protein are located at $E[mode(Q_7)]$, the average $mode(Q_7)$ over all TSE nets for a protein.
In [1]:
import numpy as np
import pandas as pd
import os
from scipy.stats import pearsonr, mode
import matplotlib.pyplot as plt
import NDutils as nd # my lib 
In [2]:
def get_NDstats2(dirname, nd_type) -> dict:
    ndstats2 = {} # keyed by pid    
    for pid in PIDs:    
        fn = os.path.join(dirname, pid + '_stats2_' + nd_type + '.out')
        df = pd.read_table(fn, sep='\s+')
        ndstats2[pid] = df[['Q', 'SCN0_CO', 'SCN0_RCO', 'SCN0_C']].set_index('Q')
    return ndstats2

def get_TSEstats():
    tse_stats = {}
    for pid in PIDs:        
        fn = os.path.join("P_pts", pid + '_TSstats.out')
        df = pd.read_table(fn, sep='\s+')[['SCN0_CO', 'SCN0_RCO', 'SCN0_C']]
        tse_stats[pid] = df
    return tse_stats
In [3]:
def locate_TSE(tse_stats: dict, nd_stats2: dict, feature: str, k=7):
    """Locate TSE nets with feature.
    tse_stats and nd_stats2 are dicts of dataframes.
    Assumes feature is a column in both tse and nd dataframes. 
    """
    TSE_Qs = [] 
    for pid in PIDs:
        tse_df = tse_stats[pid]
        nd_df = nd_stats2[pid].query("Q < 1.0")        
        nn_qs = []
        for x in tse_df[feature].values:
            abs_diff = abs(nd_df[feature] - x) # series indexed on Q
            abs_diff.sort_values(inplace=True)
            nearest_qs = abs_diff.iloc[:k].index.to_numpy()
            mode_q = mode(nearest_qs)[0].item() # most frequent q in the k nearest
            nn_qs.append(mode_q)  
        mean_q = np.mean(nn_qs)
        print(pid, np.round(mean_q, 3), np.round(np.std(nn_qs), 3))
        TSE_Qs.append(np.round(mean_q, 1))        
    return TSE_Qs

Paci

In [4]:
fn = os.path.join('P_c', 'P_frates.txt')
df = nd.load_frates(fn)
PIDs = df.pid.tolist(); num_pids = len(PIDs)
fold_rates = df.k_f.values
In [5]:
TSE_stats = get_TSEstats()
In [6]:
TSE_stats['1bk2'].head()
Out[6]:
SCN0_CO SCN0_RCO SCN0_C
0 12.88640 0.314302 0.117544
1 10.13640 0.247228 0.160819
2 8.76923 0.208791 0.035673
3 9.77500 0.238415 0.044444
4 8.51282 0.212821 0.111111
In [7]:
# load stats2 for the three ND variants, dict of dfs
c_ndstats2 = get_NDstats2('P_c', 'c')
x_ndstats2 = get_NDstats2('P_x', 'x')
r_ndstats2 = get_NDstats2('P_r', 'r')
In [8]:
r_ndstats2['1bk2'].sample(5)
Out[8]:
SCN0_CO SCN0_RCO SCN0_C
Q
0.1 16.1818 1.244760 0.000000
0.8 14.4198 0.267032 0.158312
0.2 13.7727 0.510101 0.040936
0.2 16.1429 0.645714 0.064327
0.6 13.3651 0.267302 0.247995
In [9]:
# sets k (number of nearest neighbors) for the notebook
num_nn = 7
In [10]:
# load stats for the three ND variants, dict of dfs
c_ndstats = nd.get_NDstats('P_c', PIDs, 'c')
x_ndstats = nd.get_NDstats('P_x', PIDs, 'x')
r_ndstats = nd.get_NDstats('P_r', PIDs, 'r')

# ND peak energies and peak Qs
c_mje_params = nd.get_params(c_ndstats, 'sc_mje')
c_peakQs, c_peakEs = nd.get_peaks(c_mje_params, PIDs)
print("'c'", c_peakQs, np.mean(c_peakQs), np.std(c_peakQs))

x_mje_params = nd.get_params(x_ndstats, 'sc_mje')
x_peakQs, x_peakEs = nd.get_peaks(x_mje_params, PIDs)
print("'x'", x_peakQs, np.mean(x_peakQs), np.std(x_peakQs))

r_mje_params = nd.get_params(r_ndstats, 'sc_mje')
r_peakQs, r_peakEs = nd.get_peaks(r_mje_params, PIDs)
print("'r'", r_peakQs, np.mean(r_peakQs), np.std(r_peakQs))
'c' [0.2, 0.3, 0.3, 0.3, 0.3, 0.3, 0.2, 0.2, 0.2, 0.3] 0.26 0.04898979485566354
'x' [0.2, 0.3, 0.3, 0.3, 0.3, 0.3, 0.2, 0.2, 0.2, 0.2] 0.25000000000000006 0.04999999999999999
'r' [0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.1, 0.2, 0.2] 0.19 0.03
In [11]:
c_TSE_CO_Qs = locate_TSE(TSE_stats.copy(), c_ndstats2.copy(), "SCN0_CO")
print("'c'", np.mean(c_TSE_CO_Qs), np.std(c_TSE_CO_Qs), nd.mad(c_peakQs, c_TSE_CO_Qs), "\n")

x_TSE_CO_Qs = locate_TSE(TSE_stats.copy(), x_ndstats2.copy(), "SCN0_CO")
print("'x'", np.mean(x_TSE_CO_Qs), np.std(x_TSE_CO_Qs), nd.mad(x_peakQs, x_TSE_CO_Qs), "\n")

r_TSE_CO_Qs = locate_TSE(TSE_stats.copy(), r_ndstats2.copy(), "SCN0_CO")
print("'r'", np.mean(r_TSE_CO_Qs), np.std(r_TSE_CO_Qs), nd.mad(r_peakQs, r_TSE_CO_Qs), "\n")
1imq 0.744 0.252
1lmb4 0.485 0.253
1bf4 0.804 0.183
2ptl 0.667 0.171
2ci2 0.432 0.178
1aps 0.766 0.071
1fmk 0.514 0.197
1bk2 0.635 0.175
1shf 0.515 0.212
1ten 0.672 0.147
'c' 0.62 0.132664991614216 0.36 

1imq 0.744 0.267
1lmb4 0.467 0.233
1bf4 0.795 0.19
2ptl 0.685 0.151
2ci2 0.449 0.191
1aps 0.786 0.063
1fmk 0.522 0.191
1bk2 0.665 0.17
1shf 0.606 0.176
1ten 0.688 0.133
'x' 0.64 0.12806248474865697 0.39 

1imq 0.188 0.105
1lmb4 0.151 0.114
1bf4 0.347 0.214
2ptl 0.244 0.19
2ci2 0.103 0.03
1aps 0.138 0.085
1fmk 0.122 0.076
1bk2 0.148 0.134
1shf 0.115 0.05
1ten 0.109 0.032
'r' 0.15000000000000002 0.0670820393249937 0.06 

In [12]:
c_TSE_RCO_Qs = locate_TSE(TSE_stats.copy(), c_ndstats2.copy(), "SCN0_RCO")
print("'c'", np.mean(c_TSE_RCO_Qs), np.std(c_TSE_RCO_Qs), nd.mad(c_peakQs, c_TSE_RCO_Qs), "\n")

x_TSE_RCO_Qs = locate_TSE(TSE_stats.copy(), x_ndstats2.copy(), "SCN0_RCO")
print("'x'", np.mean(x_TSE_RCO_Qs), np.std(x_TSE_RCO_Qs), nd.mad(x_peakQs, x_TSE_RCO_Qs), "\n")

r_TSE_RCO_Qs = locate_TSE(TSE_stats.copy(), r_ndstats2.copy(), "SCN0_RCO")
print("'r'", np.mean(r_TSE_RCO_Qs), np.std(r_TSE_RCO_Qs), nd.mad(r_peakQs, r_TSE_RCO_Qs), "\n")
1imq 0.4 0.35
1lmb4 0.575 0.171
1bf4 0.282 0.16
2ptl 0.34 0.223
2ci2 0.476 0.267
1aps 0.679 0.33
1fmk 0.467 0.319
1bk2 0.277 0.297
1shf 0.421 0.266
1ten 0.342 0.332
'c' 0.43 0.1345362404707371 0.16999999999999998 

1imq 0.3 0.292
1lmb4 0.577 0.191
1bf4 0.238 0.126
2ptl 0.294 0.223
2ci2 0.449 0.277
1aps 0.762 0.27
1fmk 0.47 0.327
1bk2 0.323 0.308
1shf 0.433 0.301
1ten 0.293 0.309
'x' 0.41 0.17 0.18 

1imq 0.431 0.236
1lmb4 0.66 0.095
1bf4 0.516 0.148
2ptl 0.563 0.179
2ci2 0.795 0.046
1aps 0.679 0.106
1fmk 0.608 0.138
1bk2 0.723 0.154
1shf 0.748 0.093
1ten 0.698 0.175
'r' 0.64 0.11135528725660043 0.45 

In [13]:
c_TSE_C_Qs = locate_TSE(TSE_stats.copy(), c_ndstats2.copy(), "SCN0_C")
print("'c'", np.mean(c_TSE_C_Qs), np.std(c_TSE_C_Qs), nd.mad(c_peakQs, c_TSE_C_Qs), "\n")

x_TSE_C_Qs = locate_TSE(TSE_stats.copy(), x_ndstats2.copy(), "SCN0_C")
print("'x'", np.mean(x_TSE_C_Qs), np.std(x_TSE_C_Qs), nd.mad(x_peakQs, x_TSE_C_Qs), "\n")

r_TSE_C_Qs = locate_TSE(TSE_stats.copy(), r_ndstats2.copy(), "SCN0_C")
print("'r'", np.mean(r_TSE_C_Qs), np.std(r_TSE_C_Qs), nd.mad(r_peakQs, r_TSE_C_Qs), "\n")
1imq 0.262 0.105
1lmb4 0.353 0.139
1bf4 0.309 0.141
2ptl 0.336 0.131
2ci2 0.179 0.107
1aps 0.259 0.11
1fmk 0.305 0.128
1bk2 0.339 0.118
1shf 0.397 0.114
1ten 0.353 0.134
'c' 0.31999999999999995 0.06000000000000001 0.08 

1imq 0.244 0.093
1lmb4 0.364 0.133
1bf4 0.295 0.142
2ptl 0.337 0.184
2ci2 0.16 0.108
1aps 0.262 0.122
1fmk 0.33 0.126
1bk2 0.316 0.132
1shf 0.355 0.123
1ten 0.364 0.134
'x' 0.31 0.06999999999999999 0.08 

1imq 0.325 0.13
1lmb4 0.422 0.148
1bf4 0.389 0.153
2ptl 0.383 0.145
2ci2 0.183 0.123
1aps 0.334 0.135
1fmk 0.297 0.164
1bk2 0.319 0.157
1shf 0.439 0.137
1ten 0.356 0.136
'r' 0.33999999999999997 0.06633249580710801 0.14999999999999997 

In [14]:
fig, axes = plt.subplots(1, 3, figsize=(12, 4), constrained_layout=True, sharey=True)
fig.suptitle("Locating TSE nets with ND native shortcuts.")

ax = axes[0]
ax.boxplot([c_TSE_CO_Qs, x_TSE_CO_Qs, r_TSE_CO_Qs ], showmeans=True)
ax.set_title("Absolute CO")
ax.set_xlabel("ND variant"); ax.set_xticklabels(['c', 'x', 'r'], fontsize=14)
ax.set_ylabel('Q', fontsize=14); ax.set_yticks(nd.Qs)
ax.grid()

ax = axes[1]
ax.boxplot([c_TSE_RCO_Qs, x_TSE_RCO_Qs, r_TSE_RCO_Qs ], showmeans=True)
ax.set_title("Relative CO")
ax.set_xlabel("ND variant"); ax.set_xticklabels(['c', 'x', 'r'], fontsize=14)
ax.grid()

ax = axes[2]
ax.boxplot([c_TSE_C_Qs, x_TSE_C_Qs, r_TSE_C_Qs ], showmeans=True)
ax.set_title("Clustering")
ax.set_xlabel("ND variant"); ax.set_xticklabels(['c', 'x', 'r'], fontsize=14)
ax.grid()

# plt.savefig("tse_location.png")

Correlation with exp. fold rate

In [15]:
CO_dict, RCO_dict, C_dict = {}, {}, {} # dict of dfs, one for each pid
for pid in PIDs:    
    dg = c_ndstats2[pid].groupby('Q')
    c_df = dg[["SCN0_CO", "SCN0_RCO", "SCN0_C"]].mean()
    dg = x_ndstats2[pid].groupby('Q')
    x_df = dg[["SCN0_CO", "SCN0_RCO", "SCN0_C"]].mean()
    dg = r_ndstats2[pid].groupby('Q')
    r_df = dg[["SCN0_CO", "SCN0_RCO", "SCN0_C"]].mean()    
    CO_dict[pid] = pd.DataFrame({'c': c_df.SCN0_CO, 'x': x_df.SCN0_CO, 'r': r_df.SCN0_CO})
    RCO_dict[pid] = pd.DataFrame({'c': c_df.SCN0_RCO, 'x': x_df.SCN0_RCO, 'r': r_df.SCN0_RCO})
    C_dict[pid] = pd.DataFrame({'c': c_df.SCN0_C, 'x': x_df.SCN0_C, 'r': r_df.SCN0_C})

CO_df = pd.concat(CO_dict)
RCO_df = pd.concat(RCO_dict)
C_df = pd.concat(C_dict)
In [16]:
dh = CO_df.query("Q==0.1")
dh
Out[16]:
c x r
Q
1imq 0.1 2.897350 2.853559 10.515476
1lmb4 0.1 2.872390 2.943813 5.435359
1bf4 0.1 3.771312 3.783685 6.744593
2ptl 0.1 4.105726 3.723213 7.598658
2ci2 0.1 4.043320 4.067840 18.200258
1aps 0.1 3.799357 3.194515 27.955507
1fmk 0.1 5.341808 5.122292 15.283126
1bk2 0.1 4.996181 5.081266 14.410316
1shf 0.1 6.256845 6.080101 14.947906
1ten 0.1 7.258595 6.969065 23.375072
In [17]:
res = dh.apply(lambda x: pearsonr(x, fold_rates))
res
Out[17]:
c      (-0.47628176842040404, 0.16404486156249984)
x      (-0.35943390661293534, 0.30768584467568894)
r    (-0.9172721960189688, 0.00018526970942597747)
dtype: object
In [18]:
CO_dict, RCO_dict, C_dict = {}, {}, {}
for q in nd.Qs:
    dq = CO_df.query("Q == @q")
    res = dq.apply(lambda x: pearsonr(x, fold_rates)) # get corr for each col with fold_rates
    CO_dict[q] = res
    dq = RCO_df.query("Q == @q")
    res = dq.apply(lambda x: pearsonr(x, fold_rates))
    RCO_dict[q] = res
    dq = C_df.query("Q == @q")
    res = dq.apply(lambda x: pearsonr(x, fold_rates))
    C_dict[q] = res

CO_dp = pd.concat(CO_dict) # dataframe of correlations as a function of Q
RCO_dp = pd.concat(RCO_dict)
C_dp = pd.concat(C_dict)
In [19]:
CO_dp.head()
Out[19]:
0.1  c      (-0.47628176842040404, 0.16404486156249984)
     x      (-0.35943390661293534, 0.30768584467568894)
     r    (-0.9172721960189688, 0.00018526970942597747)
0.2  c       (-0.5047032337724919, 0.13681367871285707)
     x      (-0.44555826079503225, 0.19687464973874505)
dtype: object
In [20]:
CO_dp[:,'c'] # extract correlation coefficients and pvalues for 'c'
Out[20]:
0.1      (-0.47628176842040404, 0.16404486156249984)
0.2       (-0.5047032337724919, 0.13681367871285707)
0.3        (-0.5145644179604214, 0.1280626470644844)
0.4       (-0.5845353715657238, 0.07594579654093217)
0.5        (-0.731684264384943, 0.01615963724559916)
0.6     (-0.8845966058623458, 0.0006736040512887011)
0.7      (-0.9355132085578669, 6.99601455363325e-05)
0.8    (-0.9535866105008435, 1.9193527692994492e-05)
0.9     (-0.9464131068658188, 3.380709198910606e-05)
1.0     (-0.8918429487895234, 0.0005244290154618247)
dtype: object
In [21]:
fig, axes = plt.subplots(3, 2, figsize=(9, 12), constrained_layout=True)
plt.suptitle("Correlation with exp. fold rate, Paci dataset")

c_rhos, c_pvals = zip(* CO_dp[:,'c'].values)
x_rhos, x_pvals = zip(* CO_dp[:,'x'].values)
r_rhos, r_pvals = zip(* CO_dp[:,'r'].values)
ax = axes[0][0]
ax.plot(nd.Qs, c_rhos, 'o--', label="'c'")
ax.plot(nd.Qs, x_rhos, 'o--', label="'x'")
ax.plot(nd.Qs, r_rhos, '.--', label="'r'")
ax.set_title("SCN0_CO")
ax.set_ylabel("Pearson correlation coefficient")
ax.set_xlabel("Q"); ax.set_xticks(nd.Qs); ax.legend(); ax.grid()
ax = axes[0][1]
ax.plot(nd.Qs, c_pvals, 'o--', label="'c'")
ax.plot(nd.Qs, x_pvals, 'o--', label="'x'")
ax.plot(nd.Qs, r_pvals, '.--', label="'r'")
ax.set_ylabel("p-values")
ax.set_xlabel("Q"); ax.set_xticks(nd.Qs); ax.legend(); ax.grid()

c_rhos, c_pvals = zip(* RCO_dp[:,'c'].values)
x_rhos, x_pvals = zip(* RCO_dp[:,'x'].values)
r_rhos, r_pvals = zip(* RCO_dp[:,'r'].values)
ax = axes[1][0]
ax.plot(nd.Qs, c_rhos, 'o--', label="'c'")
ax.plot(nd.Qs, x_rhos, 'o--', label="'x'")
ax.plot(nd.Qs, r_rhos, '.--', label="'r'")
ax.set_title("SCN0_RCO")
ax.set_ylabel("Pearson correlation coefficient")
ax.set_xlabel("Q"); ax.set_xticks(nd.Qs); ax.legend(); ax.grid()
ax = axes[1][1]
ax.plot(nd.Qs, c_pvals, 'o--', label="'c'")
ax.plot(nd.Qs, x_pvals, 'o--', label="'x'")
ax.plot(nd.Qs, r_pvals, '.--', label="'r'")
ax.set_ylabel("p-values")
ax.set_xlabel("Q"); ax.set_xticks(nd.Qs); ax.legend(); ax.grid()

c_rhos, c_pvals = zip(* C_dp[:,'c'].values)
x_rhos, x_pvals = zip(* C_dp[:,'x'].values)
r_rhos, r_pvals = zip(* C_dp[:,'r'].values)
ax = axes[2][0]
ax.plot(nd.Qs, c_rhos, 'o--', label="'c'")
ax.plot(nd.Qs, x_rhos, 'o--', label="'x'")
ax.plot(nd.Qs, r_rhos, '.--', label="'r'")
ax.set_title("SCN0_C")
ax.set_ylabel("Pearson correlation coefficient")
ax.set_xlabel("Q"); ax.set_xticks(nd.Qs); ax.legend(); ax.grid()
ax = axes[2][1]
ax.plot(nd.Qs, c_pvals, 'o--', label="'c'")
ax.plot(nd.Qs, x_pvals, 'o--', label="'x'")
ax.plot(nd.Qs, r_pvals, '.--', label="'r'")
ax.set_ylabel("p-values")
ax.set_xlabel("Q"); ax.set_xticks(nd.Qs); ax.legend(); ax.grid()

End of notebook

SKLC September 2020