import numpy as np
import pandas as pd
import os
from scipy.stats import pearsonr, mode
import matplotlib.pyplot as plt
import NDutils as nd # my lib
def get_NDstats2(dirname, nd_type) -> dict:
ndstats2 = {} # keyed by pid
for pid in PIDs:
fn = os.path.join(dirname, pid + '_stats2_' + nd_type + '.out')
df = pd.read_table(fn, sep='\s+')
ndstats2[pid] = df[['Q', 'SCN0_CO', 'SCN0_RCO', 'SCN0_C']].set_index('Q')
return ndstats2
def get_TSEstats():
tse_stats = {}
for pid in PIDs:
fn = os.path.join("P_pts", pid + '_TSstats.out')
df = pd.read_table(fn, sep='\s+')[['SCN0_CO', 'SCN0_RCO', 'SCN0_C']]
tse_stats[pid] = df
return tse_stats
def locate_TSE(tse_stats: dict, nd_stats2: dict, feature: str, k=7):
"""Locate TSE nets with feature.
tse_stats and nd_stats2 are dicts of dataframes.
Assumes feature is a column in both tse and nd dataframes.
"""
TSE_Qs = []
for pid in PIDs:
tse_df = tse_stats[pid]
nd_df = nd_stats2[pid].query("Q < 1.0")
nn_qs = []
for x in tse_df[feature].values:
abs_diff = abs(nd_df[feature] - x) # series indexed on Q
abs_diff.sort_values(inplace=True)
nearest_qs = abs_diff.iloc[:k].index.to_numpy()
mode_q = mode(nearest_qs)[0].item() # most frequent q in the k nearest
nn_qs.append(mode_q)
mean_q = np.mean(nn_qs)
print(pid, np.round(mean_q, 3), np.round(np.std(nn_qs), 3))
TSE_Qs.append(np.round(mean_q, 1))
return TSE_Qs
fn = os.path.join('P_c', 'P_frates.txt')
df = nd.load_frates(fn)
PIDs = df.pid.tolist(); num_pids = len(PIDs)
fold_rates = df.k_f.values
TSE_stats = get_TSEstats()
TSE_stats['1bk2'].head()
# load stats2 for the three ND variants, dict of dfs
c_ndstats2 = get_NDstats2('P_c', 'c')
x_ndstats2 = get_NDstats2('P_x', 'x')
r_ndstats2 = get_NDstats2('P_r', 'r')
r_ndstats2['1bk2'].sample(5)
# sets k (number of nearest neighbors) for the notebook
num_nn = 7
# load stats for the three ND variants, dict of dfs
c_ndstats = nd.get_NDstats('P_c', PIDs, 'c')
x_ndstats = nd.get_NDstats('P_x', PIDs, 'x')
r_ndstats = nd.get_NDstats('P_r', PIDs, 'r')
# ND peak energies and peak Qs
c_mje_params = nd.get_params(c_ndstats, 'sc_mje')
c_peakQs, c_peakEs = nd.get_peaks(c_mje_params, PIDs)
print("'c'", c_peakQs, np.mean(c_peakQs), np.std(c_peakQs))
x_mje_params = nd.get_params(x_ndstats, 'sc_mje')
x_peakQs, x_peakEs = nd.get_peaks(x_mje_params, PIDs)
print("'x'", x_peakQs, np.mean(x_peakQs), np.std(x_peakQs))
r_mje_params = nd.get_params(r_ndstats, 'sc_mje')
r_peakQs, r_peakEs = nd.get_peaks(r_mje_params, PIDs)
print("'r'", r_peakQs, np.mean(r_peakQs), np.std(r_peakQs))
c_TSE_CO_Qs = locate_TSE(TSE_stats.copy(), c_ndstats2.copy(), "SCN0_CO")
print("'c'", np.mean(c_TSE_CO_Qs), np.std(c_TSE_CO_Qs), nd.mad(c_peakQs, c_TSE_CO_Qs), "\n")
x_TSE_CO_Qs = locate_TSE(TSE_stats.copy(), x_ndstats2.copy(), "SCN0_CO")
print("'x'", np.mean(x_TSE_CO_Qs), np.std(x_TSE_CO_Qs), nd.mad(x_peakQs, x_TSE_CO_Qs), "\n")
r_TSE_CO_Qs = locate_TSE(TSE_stats.copy(), r_ndstats2.copy(), "SCN0_CO")
print("'r'", np.mean(r_TSE_CO_Qs), np.std(r_TSE_CO_Qs), nd.mad(r_peakQs, r_TSE_CO_Qs), "\n")
c_TSE_RCO_Qs = locate_TSE(TSE_stats.copy(), c_ndstats2.copy(), "SCN0_RCO")
print("'c'", np.mean(c_TSE_RCO_Qs), np.std(c_TSE_RCO_Qs), nd.mad(c_peakQs, c_TSE_RCO_Qs), "\n")
x_TSE_RCO_Qs = locate_TSE(TSE_stats.copy(), x_ndstats2.copy(), "SCN0_RCO")
print("'x'", np.mean(x_TSE_RCO_Qs), np.std(x_TSE_RCO_Qs), nd.mad(x_peakQs, x_TSE_RCO_Qs), "\n")
r_TSE_RCO_Qs = locate_TSE(TSE_stats.copy(), r_ndstats2.copy(), "SCN0_RCO")
print("'r'", np.mean(r_TSE_RCO_Qs), np.std(r_TSE_RCO_Qs), nd.mad(r_peakQs, r_TSE_RCO_Qs), "\n")
c_TSE_C_Qs = locate_TSE(TSE_stats.copy(), c_ndstats2.copy(), "SCN0_C")
print("'c'", np.mean(c_TSE_C_Qs), np.std(c_TSE_C_Qs), nd.mad(c_peakQs, c_TSE_C_Qs), "\n")
x_TSE_C_Qs = locate_TSE(TSE_stats.copy(), x_ndstats2.copy(), "SCN0_C")
print("'x'", np.mean(x_TSE_C_Qs), np.std(x_TSE_C_Qs), nd.mad(x_peakQs, x_TSE_C_Qs), "\n")
r_TSE_C_Qs = locate_TSE(TSE_stats.copy(), r_ndstats2.copy(), "SCN0_C")
print("'r'", np.mean(r_TSE_C_Qs), np.std(r_TSE_C_Qs), nd.mad(r_peakQs, r_TSE_C_Qs), "\n")
fig, axes = plt.subplots(1, 3, figsize=(12, 4), constrained_layout=True, sharey=True)
fig.suptitle("Locating TSE nets with ND native shortcuts.")
ax = axes[0]
ax.boxplot([c_TSE_CO_Qs, x_TSE_CO_Qs, r_TSE_CO_Qs ], showmeans=True)
ax.set_title("Absolute CO")
ax.set_xlabel("ND variant"); ax.set_xticklabels(['c', 'x', 'r'], fontsize=14)
ax.set_ylabel('Q', fontsize=14); ax.set_yticks(nd.Qs)
ax.grid()
ax = axes[1]
ax.boxplot([c_TSE_RCO_Qs, x_TSE_RCO_Qs, r_TSE_RCO_Qs ], showmeans=True)
ax.set_title("Relative CO")
ax.set_xlabel("ND variant"); ax.set_xticklabels(['c', 'x', 'r'], fontsize=14)
ax.grid()
ax = axes[2]
ax.boxplot([c_TSE_C_Qs, x_TSE_C_Qs, r_TSE_C_Qs ], showmeans=True)
ax.set_title("Clustering")
ax.set_xlabel("ND variant"); ax.set_xticklabels(['c', 'x', 'r'], fontsize=14)
ax.grid()
# plt.savefig("tse_location.png")
CO_dict, RCO_dict, C_dict = {}, {}, {} # dict of dfs, one for each pid
for pid in PIDs:
dg = c_ndstats2[pid].groupby('Q')
c_df = dg[["SCN0_CO", "SCN0_RCO", "SCN0_C"]].mean()
dg = x_ndstats2[pid].groupby('Q')
x_df = dg[["SCN0_CO", "SCN0_RCO", "SCN0_C"]].mean()
dg = r_ndstats2[pid].groupby('Q')
r_df = dg[["SCN0_CO", "SCN0_RCO", "SCN0_C"]].mean()
CO_dict[pid] = pd.DataFrame({'c': c_df.SCN0_CO, 'x': x_df.SCN0_CO, 'r': r_df.SCN0_CO})
RCO_dict[pid] = pd.DataFrame({'c': c_df.SCN0_RCO, 'x': x_df.SCN0_RCO, 'r': r_df.SCN0_RCO})
C_dict[pid] = pd.DataFrame({'c': c_df.SCN0_C, 'x': x_df.SCN0_C, 'r': r_df.SCN0_C})
CO_df = pd.concat(CO_dict)
RCO_df = pd.concat(RCO_dict)
C_df = pd.concat(C_dict)
dh = CO_df.query("Q==0.1")
dh
res = dh.apply(lambda x: pearsonr(x, fold_rates))
res
CO_dict, RCO_dict, C_dict = {}, {}, {}
for q in nd.Qs:
dq = CO_df.query("Q == @q")
res = dq.apply(lambda x: pearsonr(x, fold_rates)) # get corr for each col with fold_rates
CO_dict[q] = res
dq = RCO_df.query("Q == @q")
res = dq.apply(lambda x: pearsonr(x, fold_rates))
RCO_dict[q] = res
dq = C_df.query("Q == @q")
res = dq.apply(lambda x: pearsonr(x, fold_rates))
C_dict[q] = res
CO_dp = pd.concat(CO_dict) # dataframe of correlations as a function of Q
RCO_dp = pd.concat(RCO_dict)
C_dp = pd.concat(C_dict)
CO_dp.head()
CO_dp[:,'c'] # extract correlation coefficients and pvalues for 'c'
fig, axes = plt.subplots(3, 2, figsize=(9, 12), constrained_layout=True)
plt.suptitle("Correlation with exp. fold rate, Paci dataset")
c_rhos, c_pvals = zip(* CO_dp[:,'c'].values)
x_rhos, x_pvals = zip(* CO_dp[:,'x'].values)
r_rhos, r_pvals = zip(* CO_dp[:,'r'].values)
ax = axes[0][0]
ax.plot(nd.Qs, c_rhos, 'o--', label="'c'")
ax.plot(nd.Qs, x_rhos, 'o--', label="'x'")
ax.plot(nd.Qs, r_rhos, '.--', label="'r'")
ax.set_title("SCN0_CO")
ax.set_ylabel("Pearson correlation coefficient")
ax.set_xlabel("Q"); ax.set_xticks(nd.Qs); ax.legend(); ax.grid()
ax = axes[0][1]
ax.plot(nd.Qs, c_pvals, 'o--', label="'c'")
ax.plot(nd.Qs, x_pvals, 'o--', label="'x'")
ax.plot(nd.Qs, r_pvals, '.--', label="'r'")
ax.set_ylabel("p-values")
ax.set_xlabel("Q"); ax.set_xticks(nd.Qs); ax.legend(); ax.grid()
c_rhos, c_pvals = zip(* RCO_dp[:,'c'].values)
x_rhos, x_pvals = zip(* RCO_dp[:,'x'].values)
r_rhos, r_pvals = zip(* RCO_dp[:,'r'].values)
ax = axes[1][0]
ax.plot(nd.Qs, c_rhos, 'o--', label="'c'")
ax.plot(nd.Qs, x_rhos, 'o--', label="'x'")
ax.plot(nd.Qs, r_rhos, '.--', label="'r'")
ax.set_title("SCN0_RCO")
ax.set_ylabel("Pearson correlation coefficient")
ax.set_xlabel("Q"); ax.set_xticks(nd.Qs); ax.legend(); ax.grid()
ax = axes[1][1]
ax.plot(nd.Qs, c_pvals, 'o--', label="'c'")
ax.plot(nd.Qs, x_pvals, 'o--', label="'x'")
ax.plot(nd.Qs, r_pvals, '.--', label="'r'")
ax.set_ylabel("p-values")
ax.set_xlabel("Q"); ax.set_xticks(nd.Qs); ax.legend(); ax.grid()
c_rhos, c_pvals = zip(* C_dp[:,'c'].values)
x_rhos, x_pvals = zip(* C_dp[:,'x'].values)
r_rhos, r_pvals = zip(* C_dp[:,'r'].values)
ax = axes[2][0]
ax.plot(nd.Qs, c_rhos, 'o--', label="'c'")
ax.plot(nd.Qs, x_rhos, 'o--', label="'x'")
ax.plot(nd.Qs, r_rhos, '.--', label="'r'")
ax.set_title("SCN0_C")
ax.set_ylabel("Pearson correlation coefficient")
ax.set_xlabel("Q"); ax.set_xticks(nd.Qs); ax.legend(); ax.grid()
ax = axes[2][1]
ax.plot(nd.Qs, c_pvals, 'o--', label="'c'")
ax.plot(nd.Qs, x_pvals, 'o--', label="'x'")
ax.plot(nd.Qs, r_pvals, '.--', label="'r'")
ax.set_ylabel("p-values")
ax.set_xlabel("Q"); ax.set_xticks(nd.Qs); ax.legend(); ax.grid()
SKLC September 2020