import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from scipy.stats import spearmanr
from scipy.stats import ttest_rel # samples have same variance, paired
from scipy.stats import ttest_ind_from_stats as tts # different variance, paired?
# check with R, use CI
Qs =[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
dirname = "Paci_ND3"
PIDs = ["1imq", "1lmb4", "1bf4", "2ptl", "2ci2", "1aps", "1fmk", "1bk2", "1shf", "1ten"]
EDVs = ['a'] # EDVs = ['a', 'b', 's', 'l', 'v']
NDVs = ['c', 'x', 'r', 'a', 'y']
ndv_colors = ['tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple']
# computed previously
c_peak_Qs = np.array([0.2, 0.3, 0.3, 0.3, 0.3, 0.3, 0.2, 0.2, 0.2, 0.3])
x_peak_Qs = np.array([0.2, 0.3, 0.3, 0.3, 0.3, 0.3, 0.2, 0.2, 0.2, 0.2])
r_peak_Qs = np.array([0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.1, 0.2, 0.2])
a_peak_Qs = np.array([0.2, 0.3, 0.3, 0.3, 0.3, 0.3, 0.2, 0.2, 0.2, 0.2])
y_peak_Qs = np.array([0.2, 0.1, 0.2, 0.2, 0.2, 0.2, 0.1, 0.1, 0.1, 0.1])
# cxray
peak_Qs = np.column_stack([c_peak_Qs, x_peak_Qs, r_peak_Qs, a_peak_Qs, y_peak_Qs])
peak_Qs
def mad_from_phi(ndv, edv):
dirn = os.path.join(dirname, "edv_a") # ND data
ofn = os.path.join(dirname, ndv + "_phi_mad.out")
fout = open(ofn, "w"); fout.close() # truncates existing file
fout = open(ofn, "a") # appends to file
fout.write("pid Q mad_deg_avg mad_cent_avg\n")
# fout.write("pid Q mad_deg_avg mad_deg_avg2 mad_cent_avg mad_cent_avg2\n")
for pid in PIDs:
fn0 = os.path.join(dirname, pid + "_nodestats.out") # NS
df0 = pd.read_table(fn0, sep="\s+") # nid exp_phi deg0 cent0
N = len(df0)
fn1 = os.path.join(dirn, pid + "_" + ndv + "_nodestats.out") # ND
df1 = pd.read_table(fn1, sep="\s+") # Q nid deg_avg cent_avg
for q in Qs:
dq = df1.query("Q == @q")
m = dq.deg_avg.max() # normalize then select phi residues
dq = dq.assign(deg_avg_norm = dq.deg_avg.values/m)
# prop. of native degree
# dq = dq.assign(deg_avg_norm2 = dq.deg_avg.values/df0.deg0.values)
m = dq.cent_avg.max()
dq = dq.assign(cent_avg_norm = dq.cent_avg.values/m)
# fraction of all paths
# dq = dq.assign(cent_avg_norm2 = dq.cent_avg.values/(N*N-N))
dq = dq.assign(exp_phi = df0.exp_phi.values)
de = dq.query("exp_phi > -1")
a = np.round(np.mean(np.abs(de.deg_avg_norm.values - de.exp_phi.values)), 3)
b = np.round(np.mean(np.abs(de.cent_avg_norm.values - de.exp_phi.values)), 3)
fout.write("{0} {1} {2} {3}\n".format(pid, q, a, b))
# a2 = np.round(np.mean(np.abs(de.deg_avg_norm2.values - de.exp_phi.values)), 3)
# b2 = np.round(np.mean(np.abs(de.cent_avg_norm2.values - de.exp_phi.values)), 3)
# fout.write("{0} {1} {2} {3} {4} {5}\n".format(pid, q, a, a2, b, b2))
fout.close()
df = pd.read_table(ofn, sep=' ')
# in general a2 > a
# count = np.sum(df.mad_deg_avg.values > df.mad_deg_avg2.values) # 21, 22, 19, 21, 14
# in general b2 > b
# count = np.sum(df.mad_cent_avg.values > df.mad_cent_avg2.values) # 16, 16, 56, 20, 64
# in general a > b, desire to minimize mad, use b
count = np.sum(df.mad_deg_avg.values > df.mad_cent_avg.values) # 90, 91, 73, 79, 62
# print(ndv, edv, count, len(df))
return count
# this cell takes some time to run, only need to do this once to create ndv_phi_mad.out files
counts = [] # deg > cent mad across all Qs for each pid
for edv in EDVs:
row = []
for ndv in NDVs:
count = mad_from_phi(ndv, edv)
row.append(count)
counts.append(row)
counts
# formats counts and do summation
df = pd.DataFrame(counts).assign(EDV=EDVs).set_index("EDV")
df.columns = NDVs
s = df.sum(axis=0) # col sum
s.name = "sum"
df = df.append(s)
df = df.assign(sum = df.sum(axis=1)) # row sum
df
# load the ndv_phi_mad.out files
edv = 'a'
cxray_dfs = []
for ndv in NDVs:
fn = os.path.join(dirname, ndv + "_phi_mad.out")
df = pd.read_table(fn, sep=' ')
cxray_dfs.append(df)
import random
random.seed()
# mean_95CI with resampling/bootstrap
def bootstrap(vals):
means = np.zeros(10000); stdevs = np.zeros(10000)
for i in np.arange(10000):
resample = random.choices(vals, k=10) # with replacement
means[i], stdevs[i] = np.mean(resample), np.std(resample, ddof=1)
a, z = np.percentile(means, 2.5), np.percentile(means, 97.5)
moe = (z-a)/2.0
return moe, a, z
# calc the 95%CI of the mean, with sample stdev, n >= 30
def mean_95CI(sample_mean, sample_stdev, sample_size):
moe = (1.96 * sample_stdev)/np.sqrt(sample_size) # margin of error
return moe, sample_mean - moe, sample_mean + moe
def summary_stats(vals):
mu = np.mean(vals); snu = np.std(vals, ddof=1)
gmoe, ga, gz = mean_95CI(mu, snu, 10) # Gaussian estimated
bmoe, ba, bz = bootstrap(vals) # bootstrap
return np.round([mu, snu, np.ptp(vals), gmoe, ga, gz, bmoe, ba, bz], 4)
# compare mads of ND variants
fig, axes = plt.subplots(4, 3, figsize=(12, 16), constrained_layout=True, sharey=True)
plt.suptitle("Mean absolute difference between phi-values from experiments and \n \
phi-values calculated with normalized node centrality from ND nets, EDS variant " + edv)
plt.delaxes(axes[3][1])
plt.delaxes(axes[3][2])
min_mads, min_Qs = [], []
for i, pid in enumerate(PIDs):
r = i//3; c = i%3; ax = axes[r][c]
res = []
for i, ndv in enumerate(NDVs):
dp = cxray_dfs[i].query("pid == @pid")
mad = dp.mad_cent_avg.values
# mad = dp.mad_cent_avg2.values
ax.plot(Qs, mad, 'o--', label=ndv)
res.append(mad)
ax.set_title(pid)
ax.set_xlabel('Q'); ax.set_xticks(np.arange(0, 1.1, 0.1))
ax.legend(); ax.grid()
min_mad = np.round(np.min(res, axis=1), 3) # row-wise per ND variant, over Qs for a pid
min_q = (np.round(np.argmin(res, axis=1), 3) + 1)/10
min_mads.append(min_mad)
min_Qs.append(min_q)
print(pid, min_mad, min_q)
# ofn = os.path.join(dirname, "ND_centphi_" + edv + ".png")
# print(ofn)
# plt.savefig(ofn)
# can skip if file already created
cent_minmad_df = pd.DataFrame(min_mads)
cent_minmad_df.columns = NDVs
cent_minmad_df = cent_minmad_df.assign(pid=PIDs).set_index('pid')
ofn = os.path.join(dirname, "ND_centphi_minmad_" + edv + ".out")
cent_minmad_df.to_csv(ofn, sep=' ')
cent_MADstats_df = pd.DataFrame({ "MAD_stats": ["sample_avg", "sample_stdev", "range", \
"gMOE", "95meanCI_gleft", "95meanCI_gright",
"bMOE", "95meanCI_bleft", "95meanCI_bright",],
"c": summary_stats(cent_minmad_df['c'].values),
"x": summary_stats(cent_minmad_df['x'].values),
"r": summary_stats(cent_minmad_df['r'].values),
"a": summary_stats(cent_minmad_df['a'].values),
"y": summary_stats(cent_minmad_df['y'].values) })
cent_MADstats_df = cent_MADstats_df.set_index("MAD_stats")
# ofn = os.path.join(dirname, "ND_cent_MADstats_" + edv + ".out")
# cent_MADstats_df.to_csv(ofn, sep=' ')
# can skip if file already created
cent_minQ_df = pd.DataFrame(min_Qs)
cent_minQ_df.columns = NDVs
cent_minQ_df = cent_minQ_df.assign(pid=PIDs).set_index('pid')
ofn = os.path.join(dirname, "ND_centphi_minQ_" + edv + ".out")
cent_minQ_df.to_csv(ofn, sep=' ')
cent_minQstats_df = pd.DataFrame({ "minQ_stats": ["sample_avg", "sample_stdev", "range", \
"gMOE", "95meanCI_gleft", "95meanCI_gright",
"bMOE", "95meanCI_bleft", "95meanCI_bright"],
"c": summary_stats(cent_minQ_df['c'].values),
"x": summary_stats(cent_minQ_df['x'].values),
"r": summary_stats(cent_minQ_df['r'].values),
"a": summary_stats(cent_minQ_df['a'].values),
"y": summary_stats(cent_minQ_df['y'].values)})
cent_minQstats_df = cent_minQstats_df.set_index("minQ_stats")
# ofn = os.path.join(dirname, "ND_cent_minQstats_" + edv + ".out")
# cent_minQstats_df.to_csv(ofn, sep=' ')
# edv = 'a'
# fn = os.path.join(dirname, "ND_centphi_minmad_" + edv + ".out")
# cent_minmad_df = pd.read_table(fn, sep=' ').set_index('pid')
print(cent_minmad_df.describe()) # sample std
cent_minmad_df
print(np.mean(cent_minmad_df['x'].values))
print(np.std(cent_minmad_df['x'].values))
print(np.std(cent_minmad_df['x'].values, ddof=1))
# fn = os.path.join(dirname, "ND_centphi_minQ_" + edv + ".out")
# cent_minQ_df = pd.read_table(fn, sep=' ').set_index('pid')
print(cent_minQ_df.describe())
cent_minQ_df
fig, axes = plt.subplots(1, 2, figsize=(8, 4), constrained_layout=True)
plt.suptitle("Calculating $\phi$ values with normalized node centrality from ND nets.",
fontsize=14)
ax0 = axes[0]
cent_minmad_df.boxplot(ax = ax0, showmeans=True)
ax0.set_title('Minimum MAD from exp_phi')
ax0.set_xticklabels(NDVs, fontsize=14); ax0.set_xlabel('ND variant')
# ax0.set_yticks(np.arange(0.2, 0.55, 0.05))
ax1 = axes[1]
cent_minQ_df.boxplot(ax = ax1, showmeans=True)
ax1.set_title('$Q$ with min MAD');
ax1.set_xticklabels(NDVs, fontsize=14); ax1.set_xlabel('ND variant')
_ = ax1.set_yticks(np.arange(0, 1.1, 0.1))
# MAD between min_Qs (Q where MAD from exp_phi is minimum) and peak Qs for cxray NDVs
np.mean(np.abs(cent_minQ_df.values - peak_Qs), axis=0)
print("MAD between experimental and calculated phi-values")
cent_MADstats_df
print("Q with minimum MAD between experimental and calculated phi-values")
cent_minQstats_df
methods = cent_MADstats_df.columns.values; n = len(methods)
plt_colors = plt.get_cmap("tab20")
gcolors = plt_colors(list(range(1, n*2, 2))) # odd
bcolors = plt_colors(list(range(0, n*2, 2))) # even
fig, axes = plt.subplots(2, 1, figsize=(6, 8), constrained_layout=True)
plt.suptitle("Calculating $\phi$ values with normalized ND node centrality.", fontsize=14)
ax = axes[0]
mu = cent_MADstats_df.loc['sample_avg'].values
# both ways produce similar CIs
ga = cent_MADstats_df.loc['95meanCI_gleft'].values
gz = cent_MADstats_df.loc['95meanCI_gright'].values
# gmoe = cent_MADstats_df.loc['gMOE'].values
ba = cent_MADstats_df.loc['95meanCI_bleft'].values
bz = cent_MADstats_df.loc['95meanCI_bright'].values
# bmoe = cent_MADstats_df.loc['bMOE'].values
for i, p in enumerate(zip(ga, gz, ba, bz)):
ax.plot([i, i], [p[0], p[1]], marker='o', color=gcolors[i])
ax.plot([i+0.2, i+0.2], [p[2], p[3]], marker='o', color=bcolors[i], label=str(p[3]))
ax.scatter(i+0.2, mu[i], marker='^')
ax.legend(title="97.5 percentile", loc="upper left", bbox_to_anchor=(1.0, 1.0))
ax.set_title("average MAD from exp_phi", fontsize=14)
ax.set_ylabel("95% confidence interval", fontsize=14)
ax.set_xticks(list(range(0, n, 1))); ax.set_xticklabels(methods, fontsize=14)
# ax.set_xlabel("Method to calculate phi-values", fontsize=12)
ax.grid()
ax = axes[1] # methods = cent_minQstats_df.columns.values
mu = cent_minQstats_df.loc['sample_avg'].values
ga = cent_minQstats_df.loc['95meanCI_gleft'].values
gz = cent_minQstats_df.loc['95meanCI_gright'].values
# gmoe = cent_minQstats_df.loc['gMOE'].values
ba = cent_minQstats_df.loc['95meanCI_bleft'].values
bz = cent_minQstats_df.loc['95meanCI_bright'].values
# bmoe = cent_minQstats_df.loc['bMOE'].values
for i, p in enumerate(zip(ga, gz, ba, bz)):
ax.plot([i, i], [p[0], p[1]], marker='o', color=gcolors[i])
ax.plot([i+0.2, i+0.2], [p[2], p[3]], marker='o', color=bcolors[i], label=str(p[3]))
ax.scatter(i+0.2, mu[i], marker='^')
ax.legend(title="97.5 percentile", loc="upper left", bbox_to_anchor=(1.0, 1.0))
ax.set_title("average minQ", fontsize=14)
ax.set_ylabel("95% confidence interval", fontsize=14)
ax.set_xticks(list(range(0, n, 1))); ax.set_xticklabels(methods, fontsize=14)
# ax.set_xlabel("ND node centrality", fontsize=14)
ax.grid()
ND variant 'a' produces the best calculated $\phi$ values with node centrality for the dataset:
ND variant 'c' yields the smallest MAD between min$Q$s and peak $Q$s (0.14).
ndc_cent = cent_minmad_df['c'].values
ndx_cent = cent_minmad_df['x'].values
nda_cent = cent_minmad_df['a'].values
ndr_cent = cent_minmad_df['r'].values
print("c a")
print(ttest_rel(ndc_cent, nda_cent))
print(tts(ndc_cent.mean(), ndc_cent.std(ddof=1), 10,
nda_cent.mean(), nda_cent.std(ddof=1), 10, equal_var=False))
print("\nx a")
print(ttest_rel(ndx_cent, nda_cent))
print(tts(ndx_cent.mean(), ndx_cent.std(ddof=1), 10,
nda_cent.mean(), nda_cent.std(ddof=1), 10, equal_var=False))
print("\nx c")
print(ttest_rel(ndx_cent, ndc_cent))
print(tts(ndx_cent.mean(), ndx_cent.std(ddof=1), 10,
ndc_cent.mean(), ndc_cent.std(ddof=1), 10, equal_var=False))
print("\nr c")
print(ttest_rel(ndr_cent, ndc_cent))
print(tts(ndr_cent.mean(), ndr_cent.std(ddof=1), 10,
ndc_cent.mean(), ndc_cent.std(ddof=1), 10, equal_var=False))
print("\nr x")
print(ttest_rel(ndr_cent, ndx_cent))
print(tts(ndr_cent.mean(), ndr_cent.std(ddof=1), 10,
ndx_cent.mean(), ndx_cent.std(ddof=1), 10, equal_var=False))
print("\nr a")
print(ttest_rel(ndr_cent, nda_cent))
print(tts(ndr_cent.mean(), ndr_cent.std(ddof=1), 10,
nda_cent.mean(), nda_cent.std(ddof=1), 10, equal_var=False))
minMad_c = cent_minmad_df['c'].values
minMad_a = cent_minmad_df['a'].values
print('minMad c', minMad_c)
print('minMad a', minMad_a)
minQ_c = cent_minQ_df['c'].values
minQ_a = cent_minQ_df['a'].values
print('minQ c', minQ_c)
print('minQ a', minQ_a)
fig, axes = plt.subplots(4, 3, figsize=(12, 18), constrained_layout=True)
plt.suptitle("Calculated $\phi$ values from normalized node centrality, \
ND variants 'c' and 'a' with EDS variant 'a'.", fontsize=14)
plt.delaxes(axes[3, 1])
plt.delaxes(axes[3, 2])
dirn = os.path.join(dirname, "edv_a") # ND data
for i, pid in enumerate(PIDs):
r = i//3; c = i%3; ax = axes[r][c]
fn0 = os.path.join(dirname, pid + "_nodestats.out")
df0 = pd.read_table(fn0, sep="\s+") # nid exp_phi deg0 cent0
fn1 = os.path.join(dirn, pid + "_c_nodestats.out")
df1 = pd.read_table(fn1, sep="\s+")
q_c = minQ_c[i]
dq = df1.query("Q == @q_c")
dq = dq.assign(cent_avg_norm = dq.cent_avg.values/dq.cent_avg.max())
dq = dq.assign(exp_phi = df0.exp_phi.values)
de = dq.query("exp_phi > -1")
mad_c = np.round(np.mean(np.abs(de.cent_avg_norm.values - de.exp_phi.values)), 2)
# print(pid, q, mad_c, spearmanr(de.cent_avg_norm.values, de.exp_phi.values))
ax.plot(dq.nid.values+1, dq.cent_avg_norm.values, color="tab:blue", alpha=0.5,
label="'c' cent " + str(mad_c))
fn1 = os.path.join(dirn, pid + "_a_nodestats.out")
df1 = pd.read_table(fn1, sep="\s+")
q_a = minQ_a[i]
dq = df1.query("Q == @q_a")
dq = dq.assign(cent_avg_norm = dq.cent_avg.values/dq.cent_avg.max())
dq = dq.assign(exp_phi = df0.exp_phi.values)
de = dq.query("exp_phi > -1")
mad_a = np.round(np.mean(np.abs(de.cent_avg_norm.values - de.exp_phi.values)), 2)
# print(pid, q, mad_a, spearmanr(de.cent_avg_norm.values, de.exp_phi.values))
ax.plot(dq.nid.values+1, dq.cent_avg_norm.values, color="tab:red", alpha=0.5,
label="'a' cent " + str(mad_a))
ax.plot(de.nid.values+1, de.exp_phi.values, "o--g", alpha=0.5, label='exp_phi')
ax.set_title(pid + " " + str(q_c) + " " + str(q_a), loc='left')
ax.set_xlabel('Residue'); ax.set_yticks(np.arange(0, 1.1, 0.1))
ax.legend(loc="lower left", bbox_to_anchor=(0.5, 1.0))
# ofn = os.path.join(dirname, "ND_centphi_ca_" + edv + ".png")
# plt.savefig(ofn)
# compare mads of ND variants
fig, axes = plt.subplots(4, 3, figsize=(12, 16), constrained_layout=True, sharey=True)
plt.suptitle("Mean absolute difference between phi-values from experiments and \n \
phi-values calculated with normalized node degree from ND nets, EDS variant " + edv)
plt.delaxes(axes[3][1])
plt.delaxes(axes[3][2])
min_mads, min_Qs = [], []
for i, pid in enumerate(PIDs):
r = i//3; c = i%3; ax = axes[r][c]
res = []
for i, ndv in enumerate(NDVs):
dp = cxray_dfs[i].query("pid == @pid")
mad = dp.mad_deg_avg.values
ax.plot(Qs, mad, 'o--', label=ndv)
res.append(mad)
ax.set_title(pid)
ax.set_xlabel('Q'); ax.set_xticks(np.arange(0, 1.1, 0.1))
ax.legend(); ax.grid()
min_mad = np.round(np.min(res, axis=1), 3) # row-wise per ND variant, over Qs for a pid
min_q = (np.round(np.argmin(res, axis=1), 3) + 1)/10
min_mads.append(min_mad)
min_Qs.append(min_q)
print(pid, min_mad, min_q)
# ofn = os.path.join(dirname, "ND_degphi_" + edv + ".png")
# plt.savefig(ofn)
# can skip if file already exists
deg_minmad_df = pd.DataFrame(min_mads)
deg_minmad_df.columns = NDVs
deg_minmad_df = deg_minmad_df.assign(pid=PIDs).set_index('pid')
ofn = os.path.join(dirname, "ND_degphi_minmad_" + edv + ".out")
deg_minmad_df.to_csv(ofn, sep=' ')
deg_MADstats_df = pd.DataFrame({ "MAD_stats": ["sample_avg", "sample_stdev", "range", \
"gMOE", "95meanCI_gleft", "95meanCI_gright",
"bMOE", "95meanCI_bleft", "95meanCI_bright",],
"c": summary_stats(deg_minmad_df['c'].values),
"x": summary_stats(deg_minmad_df['x'].values),
"r": summary_stats(deg_minmad_df['r'].values),
"a": summary_stats(deg_minmad_df['a'].values),
"y": summary_stats(deg_minmad_df['y'].values) })
deg_MADstats_df = deg_MADstats_df.set_index("MAD_stats")
# ofn = os.path.join(dirname, "ND_deg_MADstats_" + edv + ".out")
# deg_MADstats_df.to_csv(ofn, sep=' ')
# can skip if file already exists
deg_minQ_df = pd.DataFrame(min_Qs)
deg_minQ_df.columns = NDVs
deg_minQ_df = deg_minQ_df.assign(pid=PIDs).set_index('pid')
ofn = os.path.join(dirname, "ND_degphi_minQ_" + edv + ".out")
deg_minQ_df.to_csv(ofn, sep=' ')
deg_minQstats_df = pd.DataFrame({ "minQ_stats": ["sample_avg", "sample_stdev", "range", \
"gMOE", "95meanCI_gleft", "95meanCI_gright",
"bMOE", "95meanCI_bleft", "95meanCI_bright"],
"c": summary_stats(deg_minQ_df['c'].values),
"x": summary_stats(deg_minQ_df['x'].values),
"r": summary_stats(deg_minQ_df['r'].values),
"a": summary_stats(deg_minQ_df['a'].values),
"y": summary_stats(deg_minQ_df['y'].values)})
deg_minQstats_df = deg_minQstats_df.set_index("minQ_stats")
# ofn = os.path.join(dirname, "ND_deg_minQstats_" + edv + ".out")
# deg_minQstats_df.to_csv(ofn, sep=' ')
# fn = os.path.join(dirname, "ND_degphi_minmad_" + edv + ".out")
# deg_minmad_df = pd.read_table(fn, sep=' ').set_index('pid')
print(deg_minmad_df.describe())
deg_minmad_df
# fn = os.path.join(dirname, "ND_degphi_minQ_" + edv + ".out")
# deg_minQ_df = pd.read_table(fn, sep=' ').set_index('pid')
print(deg_minQ_df.describe())
deg_minQ_df
fig, axes = plt.subplots(1, 2, figsize=(8, 4), constrained_layout=True)
plt.suptitle("Calculating $\phi$ values with normalized node degree from ND nets.",
fontsize=14)
ax0 = axes[0]
deg_minmad_df.boxplot(ax = ax0, showmeans=True)
ax0.set_title('Minimum MAD from exp_phi')
ax0.set_xticklabels(NDVs, fontsize=14); ax0.set_xlabel('ND variant')
# ax0.set_yticks(np.arange(0.2, 0.55, 0.05))
ax1 = axes[1]
deg_minQ_df.boxplot(ax = ax1, showmeans=True)
ax1.set_title('$Q$ with min MAD');
ax1.set_xticklabels(NDVs, fontsize=14); ax1.set_xlabel('ND variant')
_ = ax1.set_yticks(np.arange(0, 1.1, 0.1))
# MAD between min_Qs (Q where MAD from exp_phi is minimum) and peak Qs for cxray NDVs
np.mean(np.abs(deg_minQ_df.values - peak_Qs), axis=0)
print("MAD between experimental and calculated phi-values")
deg_MADstats_df
print("Q with minimum MAD between experimental and calculated phi-values")
deg_minQstats_df
methods = deg_MADstats_df.columns.values; n = len(methods)
plt_colors = plt.get_cmap("tab20")
gcolors = plt_colors(list(range(1, n*2, 2))) # odd
bcolors = plt_colors(list(range(0, n*2, 2))) # even
fig, axes = plt.subplots(2, 1, figsize=(6, 8), constrained_layout=True)
plt.suptitle("Calculating $\phi$ values with normalized ND node degree.", fontsize=14)
ax = axes[0]
mu = deg_MADstats_df.loc['sample_avg'].values
# both ways produce similar CIs
ga = deg_MADstats_df.loc['95meanCI_gleft'].values
gz = deg_MADstats_df.loc['95meanCI_gright'].values
# gmoe = deg_MADstats_df.loc['gMOE'].values
ba = deg_MADstats_df.loc['95meanCI_bleft'].values
bz = deg_MADstats_df.loc['95meanCI_bright'].values
# bmoe = deg_MADstats_df.loc['bMOE'].values
for i, p in enumerate(zip(ga, gz, ba, bz)):
ax.plot([i, i], [p[0], p[1]], marker='o', color=gcolors[i])
ax.plot([i+0.2, i+0.2], [p[2], p[3]], marker='o', color=bcolors[i], label=str(p[3]))
ax.scatter(i+0.2, mu[i], marker='^')
ax.set_title("mean MAD from exp_phi", fontsize=14)
ax.set_xticks(list(range(0, n, 1))); ax.set_xticklabels(methods, fontsize=14)
# ax.set_xlabel("Method to calculate phi-values", fontsize=12)
ax.set_ylabel("95% confidence interval", fontsize=12)
ax.legend(title="97.5 percentile", loc="upper left", bbox_to_anchor=(1.0, 1.0))
ax.grid()
ax = axes[1] # methods = deg_minQstats_df.columns.values
mu = deg_minQstats_df.loc['sample_avg'].values
ga = deg_minQstats_df.loc['95meanCI_gleft'].values
gz = deg_minQstats_df.loc['95meanCI_gright'].values
# gmoe = deg_minQstats_df.loc['gMOE'].values
ba = deg_minQstats_df.loc['95meanCI_bleft'].values
bz = deg_minQstats_df.loc['95meanCI_bright'].values
# bmoe = deg_minQstats_df.loc['bMOE'].values
for i, p in enumerate(zip(ga, gz, ba, bz)):
ax.plot([i, i], [p[0], p[1]], marker='o', color=gcolors[i])
ax.plot([i+0.2, i+0.2], [p[2], p[3]], marker='o', color=bcolors[i], label=str(p[3]))
ax.scatter(i+0.2, mu[i], marker='^')
ax.set_title("mean minQ", fontsize=14)
ax.set_xticks(list(range(0, n, 1))); ax.set_xticklabels(methods, fontsize=14)
ax.set_ylabel("95% confidence interval", fontsize=12)
# ax.set_xlabel("ND node degree", fontsize=12)
ax.legend(title="97.5 percentile", loc="upper left", bbox_to_anchor=(1.0, 1.0))
ax.grid()
ndc_deg = deg_minmad_df['c'].values
ndx_deg = deg_minmad_df['x'].values
nda_deg = deg_minmad_df['a'].values
ndr_deg = deg_minmad_df['r'].values
# pvalues are two-sided
print("c a")
print(ttest_rel(ndc_deg, nda_deg))
print(tts(ndc_deg.mean(), ndc_deg.std(ddof=1), 10,
nda_deg.mean(), nda_deg.std(ddof=1), 10, equal_var=False))
print("\nx a")
print(ttest_rel(ndx_deg, nda_deg))
print(tts(ndx_deg.mean(), ndx_deg.std(ddof=1), 10,
nda_deg.mean(), nda_deg.std(ddof=1), 10, equal_var=False))
print("\nr c")
print(ttest_rel(ndr_deg, ndc_deg))
print(tts(ndr_deg.mean(), ndr_deg.std(ddof=1), 10,
ndc_deg.mean(), ndc_deg.std(ddof=1), 10, equal_var=False))
print("\nr a")
print(ttest_rel(ndr_deg, nda_deg))
print(tts(ndr_deg.mean(), ndr_deg.std(ddof=1), 10,
nda_deg.mean(), nda_deg.std(ddof=1), 10, equal_var=False))
minMad_a = deg_minmad_df['a'].values
print('minMad a', minMad_a)
minQ_a = deg_minQ_df['a'].values
print('minQ a', minQ_a)
fig, axes = plt.subplots(4, 3, figsize=(12, 18), constrained_layout=True)
plt.suptitle("Calculated $\phi$ values from normalized node degree, \
ND variant 'a' with EDS variant 'a'.", fontsize=14)
plt.delaxes(axes[3, 1])
plt.delaxes(axes[3, 2])
dirn = os.path.join(dirname, "edv_a") # ND data
for i, pid in enumerate(PIDs):
r = i//3; c = i%3; ax = axes[r][c]
fn0 = os.path.join(dirname, pid + "_nodestats.out")
df0 = pd.read_table(fn0, sep="\s+") # nid exp_phi deg0 cent0
fn1 = os.path.join(dirn, pid + "_a_nodestats.out")
df1 = pd.read_table(fn1, sep="\s+")
q_a = minQ_a[i]
dq = df1.query("Q == @q_a")
dq = dq.assign(deg_avg_norm = dq.deg_avg.values/dq.deg_avg.max())
dq = dq.assign(exp_phi = df0.exp_phi.values)
de = dq.query("exp_phi > -1")
mad_a = np.round(np.mean(np.abs(de.deg_avg_norm.values - de.exp_phi.values)), 2)
ax.plot(dq.nid.values+1, dq.deg_avg_norm.values, color="tab:red", alpha=0.5,
label="'a' deg " + str(mad_a))
ax.plot(de.nid.values+1, de.exp_phi.values, "o--g", alpha=0.5, label='exp_phi')
ax.set_title(pid + " " + str(q_a), loc='left')
ax.set_xlabel('Residue'); ax.set_yticks(np.arange(0, 1.1, 0.1))
ax.legend(loc="lower left", bbox_to_anchor=(0.5, 1.0))
# ofn = os.path.join(dirname, "ND_degphi_a_" + edv + ".png")
# plt.savefig(ofn)
fig, axes = plt.subplots(1, 2, figsize=(10, 5), constrained_layout=True, sharey=True)
plt.suptitle("Calculating $\phi$ values with ND PRNs, EDS variant " + edv, fontsize=14)
ax = axes[0]
ax.set_title("Node centrality", fontsize=14)
for ndv in NDVs:
vals = cent_minmad_df[ndv].values
ax.plot(PIDs, vals, 'o-', label=ndv + ' ' + str(np.round(np.mean(vals), 3)))
ax.set_xticks(PIDs); ax.set_xticklabels(PIDs, rotation=90, fontsize=14)
ax.set_ylabel("Minimum MAD from exp_phi", fontsize=14)
ax.legend(); ax.grid()
ax = axes[1]
ax.set_title("Node degree", fontsize=14)
for ndv in NDVs:
vals = deg_minmad_df[ndv].values
ax.plot(PIDs, vals, 'o--', label=ndv + ' ' + str(np.round(np.mean(vals), 3)))
ax.set_xticks(PIDs); ax.set_xticklabels(PIDs, rotation=90, fontsize=14)
ax.legend(); ax.grid()
# ax.set_ylabel("Minimum MAD ", fontsize=14)
# ofn = os.path.join(dirname, "ND_minMadphi_" + edv + ".png")
# plt.savefig(ofn)
fig, axes = plt.subplots(2, 2, figsize=(8, 8), constrained_layout=True)
plt.suptitle("Calculating $\phi$ values with ND PRNs, EDS variant " + edv, fontsize=14)
ax0 = axes[0][0]
cent_minmad_df.boxplot(ax = ax0, showmeans=True)
ax0.set_title('Node centrality', fontsize=14)
ax0.set_xticklabels(NDVs, fontsize=14)
ax0.set_ylabel('Minimum MAD from exp_phi', fontsize=14)
ax0.set_yticks(np.arange(0.15, 0.6, 0.05))
ax1 = axes[0][1]
deg_minmad_df.boxplot(ax = ax1, showmeans=True)
ax1.set_title('Node degree', fontsize=14)
ax1.set_xticklabels(NDVs, fontsize=14)
ax1.set_yticks(np.arange(0.15, 0.6, 0.05))
ax2 = axes[1][0]
cent_minQ_df.boxplot(ax = ax2, showmeans=True)
ax2.set_xticklabels(NDVs, fontsize=14); ax2.set_xlabel('ND variant', fontsize=14)
ax2.set_ylabel('$Q$ with minimum MAD', fontsize=14)
ax2.set_yticks(np.arange(0, 1.1, 0.1))
ax3 = axes[1][1]
deg_minQ_df.boxplot(ax = ax3, showmeans=True)
ax3.set_xticklabels(NDVs, fontsize=14); ax3.set_xlabel('ND variant', fontsize=14)
_ = ax3.set_yticks(np.arange(0, 1.1, 0.1))
# ofn = os.path.join(dirname, "ND_phi_" + edv + ".png")
# plt.savefig(ofn)
Figure above: Summary of results for calculating phi-values from ND PRNs as normalized node centrality (Left) and normalized node degree (Right). It is desirable for a ND variant to yield smaller minimum MAD scores within the ND-TS region.
Interpretation of box plots: The green triangle marks the mean of the data. The box extends from the lower (Q1) to upper (Q3) quartiles of the data with a line at the median (Q2). The whiskers capture data that lie beyond the box which are within 1.5 times the inter-quartile range (Q3-Q1) in either direction. Points beyond the whiskers are deemed outliers.
Top: Distribution of minimum MAD (between calculated and experimental phi-values) scores for the 10 proteins in the Paci dataset. Notably for the biased ND variants ('a', 'c', 'x'), the averages tend to be smaller for node centrality than for node degree.
Bottom: Distribution of $Q$ values where the minimum MAD scores are found. Notably the $Q$ values are more concentrated within the ND-TS region ($Q$ < 0.5) with node centrality than with node degree, for the biased ND variants ('a', 'c', 'x').
methods = cent_MADstats_df.columns.values; n = len(methods)
plt_colors = plt.get_cmap("tab20")
gcolors = plt_colors(list(range(1, n*2, 2))) # odd
bcolors = plt_colors(list(range(0, n*2, 2))) # even
fig, axes = plt.subplots(2, 2, figsize=(8, 8), constrained_layout=True)
plt.suptitle("Calculating $\phi$ values from ND nets.", fontsize=14)
ax = axes[0][0]
mu = cent_MADstats_df.loc['sample_avg'].values
ga = cent_MADstats_df.loc['95meanCI_gleft'].values
gz = cent_MADstats_df.loc['95meanCI_gright'].values
ba = cent_MADstats_df.loc['95meanCI_bleft'].values
bz = cent_MADstats_df.loc['95meanCI_bright'].values
for i, p in enumerate(zip(ga, gz, ba, bz)):
ax.plot([i, i], [p[0], p[1]], marker='o', color=gcolors[i])
ax.plot([i+0.2, i+0.2], [p[2], p[3]], marker='o', color=bcolors[i], label=str(p[3]))
ax.scatter(i+0.2, mu[i], marker='^')
ax.set_title("Node centrality", fontsize=14)
ax.set_yticks(np.arange(0.24, 0.46, 0.02))
ax.set_ylabel("95% CI mean MAD from exp_phi", fontsize=14)
ax.set_xticks(list(range(0, n, 1))); ax.set_xticklabels(methods, fontsize=14)
ax.legend(); ax.grid()
ax = axes[1][0]
mu = cent_minQstats_df.loc['sample_avg'].values
ga = cent_minQstats_df.loc['95meanCI_gleft'].values
gz = cent_minQstats_df.loc['95meanCI_gright'].values
ba = cent_minQstats_df.loc['95meanCI_bleft'].values
bz = cent_minQstats_df.loc['95meanCI_bright'].values
for i, p in enumerate(zip(ga, gz, ba, bz)):
ax.plot([i, i], [p[0], p[1]], marker='o', color=gcolors[i])
ax.plot([i+0.2, i+0.2], [p[2], p[3]], marker='o', color=bcolors[i], label=str(p[3]))
ax.scatter(i+0.2, mu[i], marker='^')
ax.set_yticks(np.arange(0.0, 0.9, 0.1))
ax.set_ylabel("95% CI mean minQ", fontsize=14)
ax.set_xticks(list(range(0, n, 1))); ax.set_xticklabels(methods, fontsize=14)
ax.set_xlabel("ND variant", fontsize=14)
ax.legend(); ax.grid()
ax = axes[0][1]
mu = deg_MADstats_df.loc['sample_avg'].values
ga = deg_MADstats_df.loc['95meanCI_gleft'].values
gz = deg_MADstats_df.loc['95meanCI_gright'].values
ba = deg_MADstats_df.loc['95meanCI_bleft'].values
bz = deg_MADstats_df.loc['95meanCI_bright'].values
for i, p in enumerate(zip(ga, gz, ba, bz)):
ax.plot([i, i], [p[0], p[1]], marker='o', color=gcolors[i])
ax.plot([i+0.2, i+0.2], [p[2], p[3]], marker='o', color=bcolors[i], label=str(p[3]))
ax.scatter(i+0.2, mu[i], marker='^')
ax.set_title("Node degree", fontsize=14)
ax.set_yticks(np.arange(0.24, 0.46, 0.02))
# ax.set_ylabel("95% confidence interval mean MAD from exp_phi", fontsize=14)
ax.set_xticks(list(range(0, n, 1))); ax.set_xticklabels(methods, fontsize=14)
ax.legend(); ax.grid()
ax = axes[1][1]
mu = deg_minQstats_df.loc['sample_avg'].values
ga = deg_minQstats_df.loc['95meanCI_gleft'].values
gz = deg_minQstats_df.loc['95meanCI_gright'].values
ba = deg_minQstats_df.loc['95meanCI_bleft'].values
bz = deg_minQstats_df.loc['95meanCI_bright'].values
for i, p in enumerate(zip(ga, gz, ba, bz)):
ax.plot([i, i], [p[0], p[1]], marker='o', color=gcolors[i])
ax.plot([i+0.2, i+0.2], [p[2], p[3]], marker='o', color=bcolors[i], label=str(p[3]))
ax.scatter(i+0.2, mu[i], marker='^')
ax.set_yticks(np.arange(0.0, 0.9, 0.1))
ax.set_xticks(list(range(0, n, 1))); ax.set_xticklabels(methods, fontsize=14)
ax.set_xlabel("ND variant", fontsize=14)
ax.legend(); ax.grid()
# ofn = os.path.join(dirname, "ND_95CIstats_" + edv + ".png")
# print(ofn)
# plt.savefig(ofn)
# MAD values computed previously
ns_deg = np.array([0.48, 0.34, 0.49, 0.35, 0.34, 0.37, 0.45, 0.35, 0.51, 0.41])
ns_cent = np.array([0.44, 0.3, 0.41, 0.29, 0.32, 0.3, 0.43, 0.35, 0.44, 0.36])
ns_locent = np.array([0.33, 0.5, 0.2, 0.19, 0.24, 0.22, 0.3, 0.15, 0.24, 0.16])
ts_deg = np.array([0.46, 0.24, 0.35, 0.29, 0.28, 0.26, 0.3, 0.28, 0.3, 0.3])
ts_deg2 = np.array([0.25, 0.21, 0.4, 0.29, 0.36, 0.35, 0.28, 0.31, 0.22, 0.32])
ts_cent = np.array([0.41, 0.34, 0.37, 0.34, 0.29, 0.26, 0.42, 0.26, 0.43, 0.4])
ndc_cent = np.array([0.394, 0.286, 0.36, 0.327, 0.26, 0.26, 0.304, 0.23, 0.391, 0.252])
nda_cent = np.array([0.274, 0.372, 0.301, 0.287, 0.251, 0.242, 0.302, 0.204, 0.399, 0.313])
nda_deg = np.array([0.33, 0.315, 0.402, 0.318, 0.271, 0.295, 0.355, 0.268, 0.444, 0.299])
print("nda_deg nda_cent")
print(ttest_rel(nda_deg, nda_cent))
print(tts(nda_deg.mean(), nda_deg.std(ddof=1), 10,
nda_cent.mean(), nda_cent.std(ddof=1), 10, equal_var=False))
print("\nts_deg2 nda_cent")
print(ttest_rel(ts_deg2, nda_cent))
print(tts(ts_deg2.mean(), ts_deg2.std(ddof=1), 10,
nda_cent.mean(), nda_cent.std(ddof=1), 10, equal_var=False))
print("\nns_locent nda_cent")
print(ttest_rel(ns_locent, nda_cent))
print(tts(ns_locent.mean(), ns_locent.std(ddof=1), 10,
nda_cent.mean(), nda_cent.std(ddof=1), 10, equal_var=False))
# sample size is too small for CLT which 95%meanCI assumes (should actual do resampling?)
print("MAD between experimental and calculated phi-values")
df = pd.DataFrame({ "MAD_stats": ["sample_avg", "sample_stdev", "range", \
"gMOE", "95meanCI_gleft", "95meanCI_gright",
"bMOE", "95meanCI_bleft", "95meanCI_bright"],
"NS_deg": summary_stats(ns_deg), "NS_cent": summary_stats(ns_cent),
"NS_locent": summary_stats(ns_locent), "TSE_deg": summary_stats(ts_deg),
"TSE_deg2": summary_stats(ts_deg2), "TSE_cent": summary_stats(ts_cent),
"NDc_cent": summary_stats(ndc_cent), "NDa_cent": summary_stats(nda_cent),
"NDa_deg": summary_stats(nda_deg)})
df = df.set_index("MAD_stats")
df
# edv = 'a'
plt.figure(figsize=(8, 6))
methods = df.columns.values; n = len(methods)
plt_colors = plt.get_cmap("tab20")
gcolors = plt_colors(list(range(1, n*2, 2))) # odd
bcolors = plt_colors(list(range(0, n*2, 2))) # even
mu = df.loc['sample_avg'].values
ga = df.loc['95meanCI_gleft'].values; gz = df.loc['95meanCI_gright'].values
# gmoe = df.loc['gMOE'].values
ba = df.loc['95meanCI_bleft'].values; bz = df.loc['95meanCI_bright'].values
bmoe = df.loc['bMOE'].values
for i, p in enumerate(zip(ga, gz, ba, bz)):
plt.plot([i, i], [p[0], p[1]], color=gcolors[i], marker='o')
plt.scatter(i+0.2, mu[i], marker='^')
plt.plot([i+0.2, i+0.2], [p[2], p[3]], color=bcolors[i], marker='o', label=str(p[3]))
plt.title("Calculating $\phi$ values", fontsize=14)
plt.xticks(list(range(0, n, 1)), methods, rotation=45, fontsize=14)
plt.xlabel("Method: PRN type and node statistic combination", fontsize=14)
plt.ylabel("95% confidence interval around mean MAD", fontsize=14)
plt.legend(title="97.5 percentile", loc="upper left", bbox_to_anchor=(1.0, 1.0)); plt.grid()
# ofn = os.path.join(dirname, "NSTSEND_95CIstats_" + edv + ".png")
# print(ofn)
# plt.savefig(ofn)
ND folding model is capturing some salient aspect of protein folding observed from experiments at the residue level of protein chains.
SKLC December 2020