Locating TSE nets with features of ND native shortcuts.¶

Features of ND native shortcut nets (SCN0):
- Contact-Order: Absolute and Relative.
- Network clustering $C$.
Approach: k-nearest neighbors.
- For each TSE net, find $mode(Q_7)$, the most common $Q$ of the 7 nearest (distance is absolute difference) ND SCN0s. Ties are biased toward smaller $Q$ values.
- TSE nets of a protein are located at $E[mode(Q_7)]$, the average $mode(Q_7)$ over all TSE nets for a protein.

import numpy as np
import pandas as pd
import os
from scipy.stats import pearsonr, mode
import matplotlib.pyplot as plt
import NDutils as nd # my lib

def get_NDstats2(dirname, nd_type) -> dict:
    ndstats2 = {} # keyed by pid    
    for pid in PIDs:    
        fn = os.path.join(dirname, pid + '_stats2_' + nd_type + '.out')
        df = pd.read_table(fn, sep='\s+')
        ndstats2[pid] = df[['Q', 'SCN0_CO', 'SCN0_RCO', 'SCN0_C']].set_index('Q')
    return ndstats2

def get_TSEstats():
    tse_stats = {}
    for pid in PIDs:        
        fn = os.path.join("P_pts", pid + '_TSstats.out')
        df = pd.read_table(fn, sep='\s+')[['SCN0_CO', 'SCN0_RCO', 'SCN0_C']]
        tse_stats[pid] = df
    return tse_stats

def locate_TSE(tse_stats: dict, nd_stats2: dict, feature: str, k=7):
    """Locate TSE nets with feature.
    tse_stats and nd_stats2 are dicts of dataframes.
    Assumes feature is a column in both tse and nd dataframes. 
    """
    TSE_Qs = [] 
    for pid in PIDs:
        tse_df = tse_stats[pid]
        nd_df = nd_stats2[pid].query("Q < 1.0")        
        nn_qs = []
        for x in tse_df[feature].values:
            abs_diff = abs(nd_df[feature] - x) # series indexed on Q
            abs_diff.sort_values(inplace=True)
            nearest_qs = abs_diff.iloc[:k].index.to_numpy()
            mode_q = mode(nearest_qs)[0].item() # most frequent q in the k nearest
            nn_qs.append(mode_q)  
        mean_q = np.mean(nn_qs)
        print(pid, np.round(mean_q, 3), np.round(np.std(nn_qs), 3))
        TSE_Qs.append(np.round(mean_q, 1))        
    return TSE_Qs

Paci¶

fn = os.path.join('P_c', 'P_frates.txt')
df = nd.load_frates(fn)
PIDs = df.pid.tolist(); num_pids = len(PIDs)
fold_rates = df.k_f.values

TSE_stats = get_TSEstats()

TSE_stats['1bk2'].head()

# load stats2 for the three ND variants, dict of dfs
c_ndstats2 = get_NDstats2('P_c', 'c')
x_ndstats2 = get_NDstats2('P_x', 'x')
r_ndstats2 = get_NDstats2('P_r', 'r')

r_ndstats2['1bk2'].sample(5)

# sets k (number of nearest neighbors) for the notebook
num_nn = 7

# load stats for the three ND variants, dict of dfs
c_ndstats = nd.get_NDstats('P_c', PIDs, 'c')
x_ndstats = nd.get_NDstats('P_x', PIDs, 'x')
r_ndstats = nd.get_NDstats('P_r', PIDs, 'r')

# ND peak energies and peak Qs
c_mje_params = nd.get_params(c_ndstats, 'sc_mje')
c_peakQs, c_peakEs = nd.get_peaks(c_mje_params, PIDs)
print("'c'", c_peakQs, np.mean(c_peakQs), np.std(c_peakQs))

x_mje_params = nd.get_params(x_ndstats, 'sc_mje')
x_peakQs, x_peakEs = nd.get_peaks(x_mje_params, PIDs)
print("'x'", x_peakQs, np.mean(x_peakQs), np.std(x_peakQs))

r_mje_params = nd.get_params(r_ndstats, 'sc_mje')
r_peakQs, r_peakEs = nd.get_peaks(r_mje_params, PIDs)
print("'r'", r_peakQs, np.mean(r_peakQs), np.std(r_peakQs))

'c' [0.2, 0.3, 0.3, 0.3, 0.3, 0.3, 0.2, 0.2, 0.2, 0.3] 0.26 0.04898979485566354
'x' [0.2, 0.3, 0.3, 0.3, 0.3, 0.3, 0.2, 0.2, 0.2, 0.2] 0.25000000000000006 0.04999999999999999
'r' [0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.1, 0.2, 0.2] 0.19 0.03

c_TSE_CO_Qs = locate_TSE(TSE_stats.copy(), c_ndstats2.copy(), "SCN0_CO")
print("'c'", np.mean(c_TSE_CO_Qs), np.std(c_TSE_CO_Qs), nd.mad(c_peakQs, c_TSE_CO_Qs), "\n")

x_TSE_CO_Qs = locate_TSE(TSE_stats.copy(), x_ndstats2.copy(), "SCN0_CO")
print("'x'", np.mean(x_TSE_CO_Qs), np.std(x_TSE_CO_Qs), nd.mad(x_peakQs, x_TSE_CO_Qs), "\n")

r_TSE_CO_Qs = locate_TSE(TSE_stats.copy(), r_ndstats2.copy(), "SCN0_CO")
print("'r'", np.mean(r_TSE_CO_Qs), np.std(r_TSE_CO_Qs), nd.mad(r_peakQs, r_TSE_CO_Qs), "\n")

1imq 0.744 0.252
1lmb4 0.485 0.253
1bf4 0.804 0.183
2ptl 0.667 0.171
2ci2 0.432 0.178
1aps 0.766 0.071
1fmk 0.514 0.197
1bk2 0.635 0.175
1shf 0.515 0.212
1ten 0.672 0.147
'c' 0.62 0.132664991614216 0.36 

1imq 0.744 0.267
1lmb4 0.467 0.233
1bf4 0.795 0.19
2ptl 0.685 0.151
2ci2 0.449 0.191
1aps 0.786 0.063
1fmk 0.522 0.191
1bk2 0.665 0.17
1shf 0.606 0.176
1ten 0.688 0.133
'x' 0.64 0.12806248474865697 0.39 

1imq 0.188 0.105
1lmb4 0.151 0.114
1bf4 0.347 0.214
2ptl 0.244 0.19
2ci2 0.103 0.03
1aps 0.138 0.085
1fmk 0.122 0.076
1bk2 0.148 0.134
1shf 0.115 0.05
1ten 0.109 0.032
'r' 0.15000000000000002 0.0670820393249937 0.06

c_TSE_RCO_Qs = locate_TSE(TSE_stats.copy(), c_ndstats2.copy(), "SCN0_RCO")
print("'c'", np.mean(c_TSE_RCO_Qs), np.std(c_TSE_RCO_Qs), nd.mad(c_peakQs, c_TSE_RCO_Qs), "\n")

x_TSE_RCO_Qs = locate_TSE(TSE_stats.copy(), x_ndstats2.copy(), "SCN0_RCO")
print("'x'", np.mean(x_TSE_RCO_Qs), np.std(x_TSE_RCO_Qs), nd.mad(x_peakQs, x_TSE_RCO_Qs), "\n")

r_TSE_RCO_Qs = locate_TSE(TSE_stats.copy(), r_ndstats2.copy(), "SCN0_RCO")
print("'r'", np.mean(r_TSE_RCO_Qs), np.std(r_TSE_RCO_Qs), nd.mad(r_peakQs, r_TSE_RCO_Qs), "\n")

1imq 0.4 0.35
1lmb4 0.575 0.171
1bf4 0.282 0.16
2ptl 0.34 0.223
2ci2 0.476 0.267
1aps 0.679 0.33
1fmk 0.467 0.319
1bk2 0.277 0.297
1shf 0.421 0.266
1ten 0.342 0.332
'c' 0.43 0.1345362404707371 0.16999999999999998 

1imq 0.3 0.292
1lmb4 0.577 0.191
1bf4 0.238 0.126
2ptl 0.294 0.223
2ci2 0.449 0.277
1aps 0.762 0.27
1fmk 0.47 0.327
1bk2 0.323 0.308
1shf 0.433 0.301
1ten 0.293 0.309
'x' 0.41 0.17 0.18 

1imq 0.431 0.236
1lmb4 0.66 0.095
1bf4 0.516 0.148
2ptl 0.563 0.179
2ci2 0.795 0.046
1aps 0.679 0.106
1fmk 0.608 0.138
1bk2 0.723 0.154
1shf 0.748 0.093
1ten 0.698 0.175
'r' 0.64 0.11135528725660043 0.45

c_TSE_C_Qs = locate_TSE(TSE_stats.copy(), c_ndstats2.copy(), "SCN0_C")
print("'c'", np.mean(c_TSE_C_Qs), np.std(c_TSE_C_Qs), nd.mad(c_peakQs, c_TSE_C_Qs), "\n")

x_TSE_C_Qs = locate_TSE(TSE_stats.copy(), x_ndstats2.copy(), "SCN0_C")
print("'x'", np.mean(x_TSE_C_Qs), np.std(x_TSE_C_Qs), nd.mad(x_peakQs, x_TSE_C_Qs), "\n")

r_TSE_C_Qs = locate_TSE(TSE_stats.copy(), r_ndstats2.copy(), "SCN0_C")
print("'r'", np.mean(r_TSE_C_Qs), np.std(r_TSE_C_Qs), nd.mad(r_peakQs, r_TSE_C_Qs), "\n")

1imq 0.262 0.105
1lmb4 0.353 0.139
1bf4 0.309 0.141
2ptl 0.336 0.131
2ci2 0.179 0.107
1aps 0.259 0.11
1fmk 0.305 0.128
1bk2 0.339 0.118
1shf 0.397 0.114
1ten 0.353 0.134
'c' 0.31999999999999995 0.06000000000000001 0.08 

1imq 0.244 0.093
1lmb4 0.364 0.133
1bf4 0.295 0.142
2ptl 0.337 0.184
2ci2 0.16 0.108
1aps 0.262 0.122
1fmk 0.33 0.126
1bk2 0.316 0.132
1shf 0.355 0.123
1ten 0.364 0.134
'x' 0.31 0.06999999999999999 0.08 

1imq 0.325 0.13
1lmb4 0.422 0.148
1bf4 0.389 0.153
2ptl 0.383 0.145
2ci2 0.183 0.123
1aps 0.334 0.135
1fmk 0.297 0.164
1bk2 0.319 0.157
1shf 0.439 0.137
1ten 0.356 0.136
'r' 0.33999999999999997 0.06633249580710801 0.14999999999999997

fig, axes = plt.subplots(1, 3, figsize=(12, 4), constrained_layout=True, sharey=True)
fig.suptitle("Locating TSE nets with ND native shortcuts.")

ax = axes[0]
ax.boxplot([c_TSE_CO_Qs, x_TSE_CO_Qs, r_TSE_CO_Qs ], showmeans=True)
ax.set_title("Absolute CO")
ax.set_xlabel("ND variant"); ax.set_xticklabels(['c', 'x', 'r'], fontsize=14)
ax.set_ylabel('Q', fontsize=14); ax.set_yticks(nd.Qs)
ax.grid()

ax = axes[1]
ax.boxplot([c_TSE_RCO_Qs, x_TSE_RCO_Qs, r_TSE_RCO_Qs ], showmeans=True)
ax.set_title("Relative CO")
ax.set_xlabel("ND variant"); ax.set_xticklabels(['c', 'x', 'r'], fontsize=14)
ax.grid()

ax = axes[2]
ax.boxplot([c_TSE_C_Qs, x_TSE_C_Qs, r_TSE_C_Qs ], showmeans=True)
ax.set_title("Clustering")
ax.set_xlabel("ND variant"); ax.set_xticklabels(['c', 'x', 'r'], fontsize=14)
ax.grid()

# plt.savefig("tse_location.png")

Correlation with exp. fold rate¶

CO_dict, RCO_dict, C_dict = {}, {}, {} # dict of dfs, one for each pid
for pid in PIDs:    
    dg = c_ndstats2[pid].groupby('Q')
    c_df = dg[["SCN0_CO", "SCN0_RCO", "SCN0_C"]].mean()
    dg = x_ndstats2[pid].groupby('Q')
    x_df = dg[["SCN0_CO", "SCN0_RCO", "SCN0_C"]].mean()
    dg = r_ndstats2[pid].groupby('Q')
    r_df = dg[["SCN0_CO", "SCN0_RCO", "SCN0_C"]].mean()    
    CO_dict[pid] = pd.DataFrame({'c': c_df.SCN0_CO, 'x': x_df.SCN0_CO, 'r': r_df.SCN0_CO})
    RCO_dict[pid] = pd.DataFrame({'c': c_df.SCN0_RCO, 'x': x_df.SCN0_RCO, 'r': r_df.SCN0_RCO})
    C_dict[pid] = pd.DataFrame({'c': c_df.SCN0_C, 'x': x_df.SCN0_C, 'r': r_df.SCN0_C})

CO_df = pd.concat(CO_dict)
RCO_df = pd.concat(RCO_dict)
C_df = pd.concat(C_dict)

dh = CO_df.query("Q==0.1")
dh

res = dh.apply(lambda x: pearsonr(x, fold_rates))
res

c      (-0.47628176842040404, 0.16404486156249984)
x      (-0.35943390661293534, 0.30768584467568894)
r    (-0.9172721960189688, 0.00018526970942597747)
dtype: object

CO_dict, RCO_dict, C_dict = {}, {}, {}
for q in nd.Qs:
    dq = CO_df.query("Q == @q")
    res = dq.apply(lambda x: pearsonr(x, fold_rates)) # get corr for each col with fold_rates
    CO_dict[q] = res
    dq = RCO_df.query("Q == @q")
    res = dq.apply(lambda x: pearsonr(x, fold_rates))
    RCO_dict[q] = res
    dq = C_df.query("Q == @q")
    res = dq.apply(lambda x: pearsonr(x, fold_rates))
    C_dict[q] = res

CO_dp = pd.concat(CO_dict) # dataframe of correlations as a function of Q
RCO_dp = pd.concat(RCO_dict)
C_dp = pd.concat(C_dict)

CO_dp.head()

0.1  c      (-0.47628176842040404, 0.16404486156249984)
     x      (-0.35943390661293534, 0.30768584467568894)
     r    (-0.9172721960189688, 0.00018526970942597747)
0.2  c       (-0.5047032337724919, 0.13681367871285707)
     x      (-0.44555826079503225, 0.19687464973874505)
dtype: object

CO_dp[:,'c'] # extract correlation coefficients and pvalues for 'c'

0.1      (-0.47628176842040404, 0.16404486156249984)
0.2       (-0.5047032337724919, 0.13681367871285707)
0.3        (-0.5145644179604214, 0.1280626470644844)
0.4       (-0.5845353715657238, 0.07594579654093217)
0.5        (-0.731684264384943, 0.01615963724559916)
0.6     (-0.8845966058623458, 0.0006736040512887011)
0.7      (-0.9355132085578669, 6.99601455363325e-05)
0.8    (-0.9535866105008435, 1.9193527692994492e-05)
0.9     (-0.9464131068658188, 3.380709198910606e-05)
1.0     (-0.8918429487895234, 0.0005244290154618247)
dtype: object

fig, axes = plt.subplots(3, 2, figsize=(9, 12), constrained_layout=True)
plt.suptitle("Correlation with exp. fold rate, Paci dataset")

c_rhos, c_pvals = zip(* CO_dp[:,'c'].values)
x_rhos, x_pvals = zip(* CO_dp[:,'x'].values)
r_rhos, r_pvals = zip(* CO_dp[:,'r'].values)
ax = axes[0][0]
ax.plot(nd.Qs, c_rhos, 'o--', label="'c'")
ax.plot(nd.Qs, x_rhos, 'o--', label="'x'")
ax.plot(nd.Qs, r_rhos, '.--', label="'r'")
ax.set_title("SCN0_CO")
ax.set_ylabel("Pearson correlation coefficient")
ax.set_xlabel("Q"); ax.set_xticks(nd.Qs); ax.legend(); ax.grid()
ax = axes[0][1]
ax.plot(nd.Qs, c_pvals, 'o--', label="'c'")
ax.plot(nd.Qs, x_pvals, 'o--', label="'x'")
ax.plot(nd.Qs, r_pvals, '.--', label="'r'")
ax.set_ylabel("p-values")
ax.set_xlabel("Q"); ax.set_xticks(nd.Qs); ax.legend(); ax.grid()

c_rhos, c_pvals = zip(* RCO_dp[:,'c'].values)
x_rhos, x_pvals = zip(* RCO_dp[:,'x'].values)
r_rhos, r_pvals = zip(* RCO_dp[:,'r'].values)
ax = axes[1][0]
ax.plot(nd.Qs, c_rhos, 'o--', label="'c'")
ax.plot(nd.Qs, x_rhos, 'o--', label="'x'")
ax.plot(nd.Qs, r_rhos, '.--', label="'r'")
ax.set_title("SCN0_RCO")
ax.set_ylabel("Pearson correlation coefficient")
ax.set_xlabel("Q"); ax.set_xticks(nd.Qs); ax.legend(); ax.grid()
ax = axes[1][1]
ax.plot(nd.Qs, c_pvals, 'o--', label="'c'")
ax.plot(nd.Qs, x_pvals, 'o--', label="'x'")
ax.plot(nd.Qs, r_pvals, '.--', label="'r'")
ax.set_ylabel("p-values")
ax.set_xlabel("Q"); ax.set_xticks(nd.Qs); ax.legend(); ax.grid()

c_rhos, c_pvals = zip(* C_dp[:,'c'].values)
x_rhos, x_pvals = zip(* C_dp[:,'x'].values)
r_rhos, r_pvals = zip(* C_dp[:,'r'].values)
ax = axes[2][0]
ax.plot(nd.Qs, c_rhos, 'o--', label="'c'")
ax.plot(nd.Qs, x_rhos, 'o--', label="'x'")
ax.plot(nd.Qs, r_rhos, '.--', label="'r'")
ax.set_title("SCN0_C")
ax.set_ylabel("Pearson correlation coefficient")
ax.set_xlabel("Q"); ax.set_xticks(nd.Qs); ax.legend(); ax.grid()
ax = axes[2][1]
ax.plot(nd.Qs, c_pvals, 'o--', label="'c'")
ax.plot(nd.Qs, x_pvals, 'o--', label="'x'")
ax.plot(nd.Qs, r_pvals, '.--', label="'r'")
ax.set_ylabel("p-values")
ax.set_xlabel("Q"); ax.set_xticks(nd.Qs); ax.legend(); ax.grid()

End of notebook¶

SKLC September 2020

	SCN0_CO	SCN0_RCO	SCN0_C
0	12.88640	0.314302	0.117544
1	10.13640	0.247228	0.160819
2	8.76923	0.208791	0.035673
3	9.77500	0.238415	0.044444
4	8.51282	0.212821	0.111111

	SCN0_CO	SCN0_RCO	SCN0_C
Q
0.1	16.1818	1.244760	0.000000
0.8	14.4198	0.267032	0.158312
0.2	13.7727	0.510101	0.040936
0.2	16.1429	0.645714	0.064327
0.6	13.3651	0.267302	0.247995

		c	x	r
	Q
1imq	0.1	2.897350	2.853559	10.515476
1lmb4	0.1	2.872390	2.943813	5.435359
1bf4	0.1	3.771312	3.783685	6.744593
2ptl	0.1	4.105726	3.723213	7.598658
2ci2	0.1	4.043320	4.067840	18.200258
1aps	0.1	3.799357	3.194515	27.955507
1fmk	0.1	5.341808	5.122292	15.283126
1bk2	0.1	4.996181	5.081266	14.410316
1shf	0.1	6.256845	6.080101	14.947906
1ten	0.1	7.258595	6.969065	23.375072