import os
import numpy as np
import pandas as pd
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
from scipy.stats import ttest_rel # samples have same variance, paired
from scipy.stats import ttest_ind_from_stats as tts # different variance, paired?
# check with R, use CI
# import matplotlib
# matplotlib.use('AGG')
# https://matplotlib.org/tutorials/introductory/usage.html?highlight=saving%20figures%20file
Qs =[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
dirname = "Paci"
PIDs = ["1imq", "1lmb4", "1bf4", "2ptl", "2ci2", "1aps", "1fmk", "1bk2", "1shf", "1ten"]
pid = "2ci2"
fn0 = os.path.join(dirname, pid + "_nodestats.out") # Native-state (PRN0) node statistics
df0 = pd.read_table(fn0, sep="\s+") # nid exp_phi deg0 cent0
df0.head()
dp = df0.query("exp_phi > -1")[['nid', 'exp_phi']]
nids = dp.nid.values # nids with phi-values
print(len(nids))
fn1 = os.path.join(dirname, pid + "_ts_nodestats2.out")
df1 = pd.read_table(fn1, sep=' ')
df1.head()
dg = df1.groupby('nid')
len(dg)
dg.le.mean() # average number of long-range PRN edges in TSE for a pid
fig, axes = plt.subplots(4, 3, figsize=(12, 16), constrained_layout=True)
plt.suptitle("Average number of long-range edges and shortcuts per node in TSE PRNs.")
plt.delaxes(axes[3][1])
plt.delaxes(axes[3][2])
for i, pid in enumerate(PIDs):
fn0 = os.path.join(dirname, pid + "_nodestats.out")
df0 = pd.read_table(fn0, sep="\s+") # nid exp_phi deg0 cent0
# df0.head()
dp = df0.query("exp_phi > -1")[['nid', 'exp_phi']]
nids = dp.nid.values
fn1 = os.path.join(dirname, pid + "_ts_nodestats2.out")
df1 = pd.read_table(fn1, sep=' ')
# df1.head()
dg = df1.groupby('nid')
dle = dg.le.mean() # mean le per node
dscle = dg.scle.mean() # mean scle per node
dphi = dle.iloc[nids]
r = i//3; c = i%3; ax = axes[r][c]
ax.plot(dle.index.values+1, dle.values, label="Long-range edge in TSE PRN0")
ax.plot(dscle.index.values+1, dscle.values, label="Long-range shortcut in TSE SCN0")
ax.scatter(dphi.index.values+1, dphi.values, label="Residue with experimenal phi-value")
dphi = dscle.iloc[nids]
ax.scatter(dphi.index.values+1, dphi.values, label="Residue with experimental phi-value")
ax.plot(dle.index.values, [1.0]*len(dle.index.values), linewidth=0.8, color="tab:gray",
label="y=1")
ax.set_title(pid); ax.set_xlabel("Node")
if 0 == c:
ax.set_ylabel("Average number of contacts per node")
h, l = axes[0][0].get_legend_handles_labels()
_ = fig.legend(h, l, loc="upper left", bbox_to_anchor=(0.4, 0.2))
# plt.savefig("TSE_scle.png")
pid = "1imq"
fn0 = os.path.join(dirname, pid + "_nodestats.out")
df0 = pd.read_table(fn0, sep="\s+") # nid exp_phi deg0 cent0
df0.head()
fn1 = os.path.join(dirname, pid + "_ts_nodestats.out")
df1 = pd.read_table(fn1, sep="\s+", header=1) # nid deg_avg cent_avg
df1.head()
c_norm = df1.cent_avg.values/df0.cent0.values
print(min(c_norm), max(c_norm))
# The two ways of normalizing TS node degree are at best weakly correlated.
# Range of calculated phi-values is wider with normalization with max.
for pid in PIDs:
fn0 = os.path.join(dirname, pid + "_nodestats.out")
df0 = pd.read_table(fn0, sep="\s+") # nid exp_phi deg0 cent0
fn1 = os.path.join(dirname, pid + "_ts_nodestats.out")
df1 = pd.read_table(fn1, sep="\s+", header=1) # nid deg_avg cent_avg
m = df1.deg_avg.max() # normalization with max
d_norm = df1.deg_avg.values/m
print(min(d_norm), max(d_norm), end='\t')
d_norm2 = df1.deg_avg.values/df0.deg0.values # normalization with NS statistic
print(min(d_norm2), max(d_norm2))
print(pid, pearsonr(d_norm, d_norm2))
# inputs: exp_phi, ns_deg0, ns_cent0, tse_deg_avg, tse_cent_avg
# output: degcent_normed.out file for each pid contains normed deg_avg and cent_avg
corr_ns, corr_ts = [], [] # deg0~cent0 and deg_avg~cent_avg correlations
for pid in PIDs:
print(pid)
fn0 = os.path.join(dirname, pid + "_nodestats.out")
df0 = pd.read_table(fn0, sep="\s+") # nid exp_phi deg0 cent0
r = pearsonr(df0.deg0.values, df0.cent0.values)
print(r); corr_ns.append(r[0])
df0 = df0.assign(deg0_norm = df0.deg0.values/df0.deg0.max())
df0 = df0.assign(cent0_norm = df0.cent0.values/df0.cent0.max())
fn1 = os.path.join(dirname, pid + "_ts_nodestats.out")
df1 = pd.read_table(fn1, sep="\s+", header=1) # nid deg_avg cent_avg
r = pearsonr(df1.deg_avg.values, df1.cent_avg.values)
print(r); corr_ts.append(r[0])
m = df1.deg_avg.max()
df1 = df1.assign(deg_avg_norm = df1.deg_avg.values/m)
df1 = df1.assign(deg_avg_norm2 = df1.deg_avg.values/df0.deg0.values)
df1 = df1.assign(cent_avg_norm = df1.cent_avg.values/df1.cent_avg.max())
df = df0.join(df1, rsuffix="_ts")
fn2 = os.path.join(dirname, pid + "_degcent_normed.out")
df.to_csv(fn2, sep=' ', index=False)
fn = os.path.join(dirname, pid + "_degcent_normed.out")
print(fn)
df = pd.read_table(fn, sep="\s+")
df.head()
print(pearsonr(df.deg0.values, df.deg_avg.values))
print(pearsonr(df.cent0.values, df.cent_avg.values))
print(pearsonr(df.deg0_norm.values, df.deg_avg_norm.values))
print(pearsonr(df.cent0_norm.values, df.cent_avg_norm.values))
print(pearsonr(df.deg_avg_norm.values, df.deg_avg_norm2.values))
fig, axes = plt.subplots(4, 3, figsize=(12, 18), constrained_layout=True)
plt.suptitle("Normalized node degree and centrality from NS PRNs.")
plt.delaxes(axes[3, 1])
plt.delaxes(axes[3, 2])
for i, pid in enumerate(PIDs):
r = i//3; c = i%3; ax = axes[r][c]
fn = os.path.join(dirname, pid + "_degcent_normed.out")
df = pd.read_table(fn, sep="\s+")
de = df.query("exp_phi > -1")
mad_d = np.round(np.mean(np.abs(de.deg0_norm.values - de.exp_phi.values)), 2)
mad_c = np.round(np.mean(np.abs(de.cent0_norm.values - de.exp_phi.values)), 2)
print(pid, mad_d, mad_c)
# print(pearsonr(de.deg0_norm.values, de.exp_phi.values))
# print(pearsonr(de.cent0_norm.values, de.exp_phi.values))
ax.plot(df.nid.values+1, df.deg0_norm.values, color="tab:blue", alpha=0.5,
label='NS deg ' + str(mad_d))
ax.plot(df.nid.values+1, df.cent0_norm.values, color="tab:orange", alpha=0.5,
label='NS cent ' + str(mad_c))
ax.plot(de.nid.values+1, de.exp_phi.values, "o--g", alpha=0.5, label='exp_phi')
ax.set_title(pid + ' ' + str(np.round(corr_ns[i], 2)), loc='left')
ax.set_xlabel('Residue'); ax.set_yticks(np.arange(0, 1.1, 0.1))
ax.legend(loc="lower left", bbox_to_anchor=(0.5, 1.0))
# plt.savefig("NS_degcentphi.png")
fig, axes = plt.subplots(4, 3, figsize=(12, 18), constrained_layout=True)
plt.suptitle("Normalized node degree and centrality computed from TSE PRNs.")
plt.delaxes(axes[3, 1])
plt.delaxes(axes[3, 2])
for i, pid in enumerate(PIDs):
r = i//3; c = i%3; ax = axes[r][c]
fn = os.path.join(dirname, pid + "_degcent_normed.out")
df = pd.read_table(fn, sep="\s+")
de = df.query("exp_phi > -1")
mad_d = np.round(np.mean(np.abs(de.deg_avg_norm.values - de.exp_phi.values)), 2)
mad_d2 = np.round(np.mean(np.abs(de.deg_avg_norm2.values - de.exp_phi.values)), 2)
mad_c = np.round(np.mean(np.abs(de.cent_avg_norm.values - de.exp_phi.values)), 2)
print(pid, mad_d, mad_d2, mad_c)
# print(pearsonr(de.deg_avg_norm.values, de.exp_phi.values))
# print(pearsonr(de.cent_avg_norm.values, de.exp_phi.values))
ax.plot(df.nid.values+1, df.deg_avg_norm.values, color="tab:blue", alpha=0.5,
label='TSE deg ' + str(mad_d))
ax.plot(df.nid.values+1, df.deg_avg_norm2.values, color="tab:gray", alpha=0.75,
label='TSE deg2 ' + str(mad_d2))
ax.plot(df.nid.values+1, df.cent_avg_norm.values, color="tab:orange", alpha=0.5,
label='TSE cent ' + str(mad_c))
ax.plot(de.nid.values+1, de.exp_phi.values, "o--g", alpha=0.5, label='exp_phi')
ax.set_title(pid + ' ' + str(np.round(corr_ts[i], 2)), loc='left')
ax.set_xlabel('Residue'); ax.set_yticks(np.arange(0, 1.1, 0.1))
ax.legend(loc="lower left", bbox_to_anchor=(0.5, 1.0))
# plt.savefig("TSE_deg2centphi.png")
fn1 = os.path.join(dirname, pid + "_locent.out") # local cent
print(fn1)
df1 = pd.read_table(fn1, sep="\s+", header=2)
df1.head()
np.all(df1.walk_locent.values == df1.path_locent.values)
for pid in PIDs:
fn0 = os.path.join(dirname, pid + "_deg0.out")
df0 = pd.read_table(fn0, sep="\s+")
# df0.head()
fn1 = os.path.join(dirname, pid + "_locent.out") # local cent
df1 = pd.read_table(fn1, sep="\s+", header=2)
# df1.head()
m = df1.walk_locent.max()
wloc = df1.walk_locent.values/m
m = df1.path_locent.max()
ploc = df1.path_locent.values/m
df = pd.DataFrame({"nid": df1.nid.values, "exp_phi": df0.exp_phi.values,
"wloc_norm": wloc, "ploc_norm": ploc})
# df.head()
fn2 = os.path.join(dirname, pid + "_locent_normed.out")
df.to_csv(fn2, sep=' ', index=False)
fn2 = os.path.join(dirname, pid + "_locent_normed.out")
df1 = pd.read_table(fn2, sep="\s+")
df1.head()
fig, axes = plt.subplots(4, 3, figsize=(12, 18), constrained_layout=True)
plt.suptitle("Normalized local node centrality computed from NS PRN.")
plt.delaxes(axes[3, 1])
plt.delaxes(axes[3, 2])
for i, pid in enumerate(PIDs):
r = i//3; c = i%3; ax = axes[r][c]
fn2 = os.path.join(dirname, pid + "_locent_normed.out")
df1 = pd.read_table(fn2, sep="\s+")
# df1.head()
de = df1.query("exp_phi > -1")
mad_w = np.round(np.mean(np.abs(de.wloc_norm.values - de.exp_phi.values)), 2)
mad_p = np.round(np.mean(np.abs(de.ploc_norm.values - de.exp_phi.values)), 2)
print(pid, mad_w, pearsonr(de.wloc_norm.values, de.exp_phi.values))
print(pid, mad_p, pearsonr(de.ploc_norm.values, de.exp_phi.values))
# wloc and ploc results are pretty much the same (backtrack little effect on locent)
# use ploc
ax.plot(df1.nid.values+1, df1.ploc_norm.values, color="tab:orange", alpha=0.5,
label='NS locent ' + str(mad_p))
ax.plot(de.nid.values+1, de.exp_phi.values, "o--g", alpha=0.5, label='exp_phi')
ax.set_title(pid, loc='left'); ax.set_xlabel('Residue')
ax.set_yticks(np.arange(0, 1.1, 0.1))
ax.legend(loc="lower left", bbox_to_anchor=(0.5, 1.0))
# plt.savefig("NS_locentphi.png")
# MAD values
ns_deg = np.array([0.48, 0.34, 0.49, 0.35, 0.34, 0.37, 0.45, 0.35, 0.51, 0.41])
ns_cent = np.array([0.44, 0.3, 0.41, 0.29, 0.32, 0.3, 0.43, 0.35, 0.44, 0.36])
ns_locent = np.array([0.33, 0.5, 0.2, 0.19, 0.24, 0.22, 0.3, 0.15, 0.24, 0.16])
ts_deg = np.array([0.46, 0.24, 0.35, 0.29, 0.28, 0.26, 0.3, 0.28, 0.3, 0.3])
ts_deg2 = np.array([0.25, 0.21, 0.4, 0.29, 0.36, 0.35, 0.28, 0.31, 0.22, 0.32])
ts_cent = np.array([0.41, 0.34, 0.37, 0.34, 0.29, 0.26, 0.42, 0.26, 0.43, 0.4])
print("ns_deg:", ns_deg.mean(), ns_deg.std(ddof=1))
print("ns_cent:", ns_cent.mean(), ns_cent.std(ddof=1))
print("ns_locent:", ns_locent.mean(), ns_locent.std(ddof=1)) # larger std than the others!
print()
print("ts_deg:", ts_deg.mean(), ts_deg.std(ddof=1))
print("ts_deg2:", ts_deg2.mean(), ts_deg2.std(ddof=1))
print("ts_cent:", ts_cent.mean(), ts_cent.std(ddof=1))
fig, axes = plt.subplots(2, 2, figsize=(9, 8), constrained_layout=True, sharey=True)
plt.suptitle("MAD between calc. and exp. $\phi$", fontsize=14)
ax = axes[0][0]
ax.plot(PIDs, ns_deg, 'o--', color='tab:purple', label='NS deg ' + str(0.409))
ax.plot(PIDs, ns_cent, 'o-', color='tab:purple', label='NS cent ' + str(0.364))
ax.plot(PIDs, ns_locent, 'o-', color='tab:pink', label='NS locent ' + str(0.253))
ax.set_title("Native State")
ax.set_xticks(PIDs); ax.set_xticklabels(PIDs, rotation=90)
ax.set_ylabel("MAD from exp_phi", fontsize=14)
ax.legend(loc="upper left", bbox_to_anchor=(0.27, 1.0)); ax.grid()
ax = axes[0][1]
ax.plot(PIDs, ts_deg, 'o--', color='tab:olive', label='TSE deg ' + str(0.306))
ax.plot(PIDs, ts_deg2, 's--', color='tab:gray', label='TSE deg2 ' + str(0.300))
ax.plot(PIDs, ts_cent, 'o-', color='tab:olive', label='TSE cent ' + str(0.352))
ax.set_title("Transition-State Ensemble")
ax.set_xticks(PIDs); ax.set_xticklabels(PIDs, rotation=90)
ax.legend(); ax.grid()
ax = axes[1][0]
ax.plot(PIDs, ns_cent, 'o-', color='tab:purple', label='NS cent 0.364')
ax.plot(PIDs, ns_locent, 'o-', color='tab:pink', label='NS locent 0.253')
ax.plot(PIDs, ts_cent, 'o-', color='tab:olive', label='TSE cent 0.352')
# ax.plot(PIDs, nd_cent, 'o-', color='tab:blue', label="ND 'c' cent 0.31")
ax.set_ylabel("MAD from exp_phi", fontsize=14)
ax.set_title("Node centrality")
ax.set_xticks(PIDs); ax.set_xticklabels(PIDs, rotation=90)
ax.legend(loc='upper right', ncol=2); ax.grid()
ax = axes[1][1]
ax.plot(PIDs, ns_deg, 'o--', color='tab:purple', label='NS deg 0.409')
ax.plot(PIDs, ts_deg, 'o--', color='tab:olive', label='TSE deg 0.306')
ax.plot(PIDs, ts_deg2, 's--', color='tab:gray', label='TSE deg2 0.300')
# ax.plot(PIDs, nd_deg, 'o--', color='tab:blue', label="ND 'c' deg 0.36")
ax.set_title("Node degree")
ax.set_xticks(PIDs); ax.set_xticklabels(PIDs, rotation=90)
ax.legend(); ax.grid()
# plt.savefig("MAD_phi_NSTSE.png")
ts_deg - ts_deg2
# two-sided pvalues
print("ts_deg2 ts_deg")
print(ttest_rel(ts_deg2, ts_deg))
print(tts(ts_deg2.mean(), ts_deg2.std(ddof=1), 10,
ts_deg.mean(), ts_deg.std(ddof=1), 10, equal_var=False))
print("\nts_deg ts_cent")
print(ttest_rel(ts_deg, ts_cent))
print(tts(ts_cent.mean(), ts_cent.std(ddof=1), 10,
ts_deg.mean(), ts_deg.std(ddof=1), 10, equal_var=False))
print("\nns_locent ts_deg")
print(ttest_rel(ns_locent, ts_deg))
print(tts(ns_locent.mean(), ns_locent.std(ddof=1), 10,
ts_deg.mean(), ts_deg.std(ddof=1), 10, equal_var=False))
print("\nns_locent ns_cent")
print(ttest_rel(ns_locent, ns_cent))
print(tts(ns_locent.mean(), ns_locent.std(ddof=1), 10,
ns_cent.mean(), ns_cent.std(ddof=1), 10, equal_var=False))
ns_deg = c(0.48, 0.34, 0.49, 0.35, 0.34, 0.37, 0.45, 0.35, 0.51, 0.41) ns_cent = c(0.44, 0.3, 0.41, 0.29, 0.32, 0.3, 0.43, 0.35, 0.44, 0.36) ns_locent = c(0.33, 0.5, 0.2, 0.19, 0.24, 0.22, 0.3, 0.15, 0.24, 0.16) ts_deg = c(0.46, 0.24, 0.35, 0.29, 0.28, 0.26, 0.3, 0.28, 0.3, 0.3) ts_deg2 = c(0.25, 0.21, 0.4, 0.29, 0.36, 0.35, 0.28, 0.31, 0.22, 0.32) ts_cent = c(0.41, 0.34, 0.37, 0.34, 0.29, 0.26, 0.42, 0.26, 0.43, 0.4) > t.test(ts_deg2, ts_deg, paired=T, alt="less")$p.value [1] 0.4034631 > t.test(ts_deg, ts_cent, paired=T, alt="less")$p.value [1] 0.02354884 > t.test(ns_locent, ts_deg, paired=T, alt="less")$p.value [1] 0.1000312 > t.test(ns_locent, ts_deg, alt="less")$p.value # Welch Two Sample t-test. [1] 0.09235434
import random
random.seed()
# mean_95CI with resampling/bootstrap
def bootstrap(vals):
means = np.zeros(10000); stdevs = np.zeros(10000)
for i in np.arange(10000):
resample = random.choices(vals, k=10) # with replacement
means[i], stdevs[i] = np.mean(resample), np.std(resample, ddof=1)
a, z = np.percentile(means, 2.5), np.percentile(means, 97.5)
moe = (z-a)/2.0
return moe, a, z
# calc the 95%CI of the mean, with sample stdev, n >= 30
def mean_95CI(sample_mean, sample_stdev, sample_size):
moe = (1.96 * sample_stdev)/np.sqrt(sample_size) # margin of error
return moe, sample_mean - moe, sample_mean + moe
def summary_MADstats(vals):
mu = np.mean(vals); snu = np.std(vals, ddof=1)
gmoe, ga, gz = mean_95CI(mu, snu, 10) # Gaussian estimated
bmoe, ba, bz = bootstrap(vals) # bootstrap
return np.round([mu, snu, np.ptp(vals), gmoe, ga, gz, bmoe, ba, bz], 4)
# sample size is too small for CLT which 95%meanCI assumes (should use bootstrap, resampling?)
print("MAD between experimental and calculated phi-values")
df = pd.DataFrame({ "MAD_stats": ["sample_avg", "sample_stdev", "range", \
"gMOE", "95meanCI_gleft", "95meanCI_gright",
"bMOE", "95meanCI_bleft", "95meanCI_bright",],
"NS_deg": summary_MADstats(ns_deg), "NS_cent": summary_MADstats(ns_cent),
"NS_locent": summary_MADstats(ns_locent), "TSE_deg": summary_MADstats(ts_deg),
"TSE_deg2": summary_MADstats(ts_deg2), "TSE_cent": summary_MADstats(ts_cent) })
df = df.set_index("MAD_stats")
df
methods = df.columns.values; n = len(methods)
plt_colors = plt.get_cmap("tab20")
gcolors = plt_colors(list(range(1, n*2, 2))) # odd
bcolors = plt_colors(list(range(0, n*2, 2))) # even
# both ways produce similar CIs
ga = df.loc['95meanCI_gleft'].values; gz = df.loc['95meanCI_gright'].values
# gmoe = df.loc['gMOE'].values
ba = df.loc['95meanCI_bleft'].values; bz = df.loc['95meanCI_bright'].values
# bmoe = df.loc['bMOE'].values
mu = df.loc["sample_avg"]
for i, p in enumerate(zip(ga, gz, ba, bz)):
plt.plot([i, i], [p[0], p[1]], marker='o', color=gcolors[i])
plt.plot([i+0.2, i+0.2], [p[2], p[3]], marker='o', color=bcolors[i], label=str(p[3]))
plt.scatter(i+0.2, mu[i], marker='^')
plt.title("Calculating $\phi$ values from native-state and TSE PRNs.")
plt.xticks(list(range(0, n, 1)), methods, rotation=45, fontsize=12)
plt.xlabel("Method to calculate phi-values", fontsize=12)
plt.ylabel("95% confidence interval for mean MAD", fontsize=12)
plt.legend(title="97.5 percentile", loc="upper left", bbox_to_anchor=(1.0,1.0))
plt.grid()
fn1 = os.path.join(dirname, pid + "_locents.out") # explore all local cent
print(fn1)
df1 = pd.read_table(fn1, sep="\s+", header=1)
df1.head()
# combine exp_phi with degseq
for pid in PIDs:
fn0 = os.path.join(dirname, pid + "_deg0.out")
df0 = pd.read_table(fn0, sep="\s+") # to get nids with exp_phi values, and exp_phi values
dp = df0.query("exp_phi > -1")[['nid', 'exp_phi']]
phis = dp.exp_phi.values # phi-values
nids = dp.nid.values # nids with phi-values
fn1 = os.path.join(dirname, pid + "_locents.out") # explore all local cent
df1 = pd.read_table(fn1, sep="\s+", header=1)
mads = []
ncols = df1.shape[1] # 1..n are subs: candidate initial fold substructures
for c in np.arange(1, ncols, 1):
a = df1.iloc[:, c].values # select column-wise
normed_a = a/np.max(a)
mad = np.round(np.mean(np.abs(normed_a.take(nids) - phis)), 3)
mads.append(mad)
print(pid, np.min(mads), np.argmin(mads), mads)
# MAD scores with different criterion for local centrality
max_cent = [0.306, 0.501, 0.204, 0.19, 0.236, 0.223, 0.30, 0.153, 0.24, 0.156]
init_fold = [0.363, 0.507, 0.345, 0.19, 0.236, 0.223, 0.355, 0.153, 0.384, 0.209]
min_mad = [0.306, 0.50, 0.204, 0.19, 0.186, 0.21, 0.207, 0.153, 0.236, 0.142]
print(np.mean(max_cent), np.mean(init_fold), np.mean(min_mad))
plt.title("Local centrality with different EDS path criterion", fontsize=12)
plt.plot(PIDs, max_cent, 'o-', color='tab:pink', label='max_cent 0.251')
plt.plot(PIDs, init_fold, 'x--', color='tab:brown', label='init_fold 0.296')
plt.plot(PIDs, min_mad, '.--', color='black', label='min_mad 0.233')
plt.xticks(PIDs, rotation=90)
plt.ylabel("MAD between calc. and exp. $\phi$", fontsize=12)
plt.legend(loc='upper right'); plt.grid()
# plt.savefig("MAD_phi_locents.png")
SKLC November 2020