import numpy as np
import pandas as pd
import os
from scipy.stats import norm, pearsonr, shapiro, kstest
from scipy.special import logsumexp
import matplotlib.pyplot as plt
import NDutils as nd # my lib
# import matplotlib
# matplotlib.use('AGG')
# https://matplotlib.org/tutorials/introductory/usage.html?highlight=saving%20figures%20file
fn = os.path.join('P_ND3', 'P_frates.txt')
df = nd.load_frates(fn)
PIDs = df.pid.tolist(); num_pids = len(PIDs)
fold_rates = df.k_f.values
print(PIDs)
print(fold_rates)
print("Range of fold rates:", np.ptp(fold_rates))
# load stats for the three ND variants, dict of dfs
edv = 'a'
dirn = os.path.join("P_ND3", "edv_" + edv)
c_ndstats = nd.get_NDstats(dirn, PIDs, 'c', edv)
x_ndstats = nd.get_NDstats(dirn, PIDs, 'x', edv)
r_ndstats = nd.get_NDstats(dirn, PIDs, 'r', edv)
a_ndstats = nd.get_NDstats(dirn, PIDs, 'a', edv)
y_ndstats = nd.get_NDstats(dirn, PIDs, 'y', edv)
c_ndstats['1imq'].sc_mje # 100 runs per pid
# ND peak energies and peak Qs
c_mje_params = nd.get_params(c_ndstats, 'sc_mje')
c_peakQs, c_peakEs = nd.get_peaks(c_mje_params, PIDs)
print("'c'", c_peakQs, np.mean(c_peakQs), np.std(c_peakQs))
x_mje_params = nd.get_params(x_ndstats, 'sc_mje')
x_peakQs, x_peakEs = nd.get_peaks(x_mje_params, PIDs)
print("'x'", x_peakQs, np.mean(x_peakQs), np.std(x_peakQs))
r_mje_params = nd.get_params(r_ndstats, 'sc_mje')
r_peakQs, r_peakEs = nd.get_peaks(r_mje_params, PIDs)
print("'r'", r_peakQs, np.mean(r_peakQs), np.std(r_peakQs))
a_mje_params = nd.get_params(a_ndstats, 'sc_mje')
a_peakQs, a_peakEs = nd.get_peaks(a_mje_params, PIDs)
print("'a'", np.mean(a_peakQs), np.std(a_peakQs))
y_mje_params = nd.get_params(y_ndstats, 'sc_mje')
y_peakQs, y_peakEs = nd.get_peaks(y_mje_params, PIDs)
print("'y'", np.mean(y_peakQs), np.std(y_peakQs))
TSE_stats = {}
def get_TSEstats():
for pid in PIDs:
fn = os.path.join("P_pts", pid + '_TSstats.out')
df = pd.read_table(fn, sep=' ')[['ts_pid', 'scMJE']]
TSE_stats[pid] = df
return
TSE_mje_params = {}
def get_mje_params():
for pid in PIDs:
mjes = TSE_stats[pid].scMJE.values
TSE_mje_params[pid] = (mjes.mean(), mjes.std())
return
# normaltest requires sample >= 20, using shapiro-wilk
def check_normality(alpha=0.005):
for pid in PIDs:
mjes = TSE_stats[pid].scMJE.values
sw = shapiro(mjes)
print(pid, sw)
if sw[1] < alpha:
print('\t not normal')
return
# load TSE stats
get_TSEstats()
# Number of structures in TSEs
print(PIDs)
num_ts = [len(TSE_stats[pid]) for pid in PIDs]
print(num_ts)
# get TSE mje params (mean, stdev)
get_mje_params()
# shapiro-wilk, if p-value <= 0.01, reject null hypothesis(that sample is normally distributed)
check_normality(0.01)
colors = ['tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple', \
'tab:brown', 'tab:pink', 'tab:gray', 'tab:olive', 'tab:cyan']
def plot_TSE(ax, pid):
mjes = TSE_stats[pid].scMJE.values
ax.hist(mjes, density=True, alpha=0.5)
mu, stdev = TSE_mje_params[pid]
x_axis = np.linspace(mu - 3*stdev, mu + 3*stdev, 1000)
ax.plot(x_axis, norm.pdf(x_axis, mu, stdev), colors[9], lw=2, label="TSE")
ax.set_title(pid + ' ' + str(num_ts[i]))
ax.set_ylabel("Density"); ax.set_xlabel("ND energy (E)")
def plot_ND(ax, pid, q, nd_mje_params:dict , nd_type):
mu, stdev = nd_mje_params[pid, q]
x_axis = np.linspace(mu - 3*stdev, mu + 3*stdev, 1000)
ax.plot(x_axis, norm.pdf(x_axis, mu, stdev),
colors[(int)(q*10-1)], lw=2, label= nd_type + ' ' + str(q))
fig, axes = plt.subplots(3, 4, figsize=(16, 12), constrained_layout=True)
plt.suptitle('Distribution of TSE E values.')
plt.delaxes(axes[2, 2])
plt.delaxes(axes[2, 3])
for i, pid in enumerate(PIDs):
r = i // 4; c = i % 4; ax = axes[r,c]
plot_TSE(ax, pid)
# plt.savefig("TSE_energydist.png")
# Plot a TSE ND energy histogram and Gaussian
pid = '1aps'
tse_Es = TSE_stats[pid].scMJE.values
tse_mu, tse_stdev = TSE_mje_params[pid]
plt.hist(tse_Es, density=True, alpha=0.5)
x_axis = np.linspace(tse_mu - 3*tse_stdev, tse_mu + 3*tse_stdev, 1000)
plt.plot(x_axis, norm.pdf(x_axis, tse_mu, tse_stdev), colors[9], lw=2, label='TSE')
# Plot 'c' ND energy Gaussians
for i, q in enumerate(nd.Qs[:-1]):
mu, stdev = c_mje_params[pid, q]
x_axis = np.linspace(mu - 3*stdev, mu + 3*stdev, 1000)
plt.plot(x_axis, norm.pdf(x_axis, mu, stdev), colors[i], lw=2, label=str(q))
plt.title("Locating a TSE within the 'c' ND energy Gaussians: " + pid);
plt.xlabel("ND energy (E)"); plt.ylabel('Density')
_= plt.legend(loc="upper left", bbox_to_anchor=(1.0,1.0))
# Plot a TSE ND energy histogram and Gaussian
pid = '1aps'
tse_Es = TSE_stats[pid].scMJE.values
tse_mu, tse_stdev = TSE_mje_params[pid]
plt.hist(tse_Es, density=True, alpha=0.5)
x_axis = np.linspace(tse_mu - 3*tse_stdev, tse_mu + 3*tse_stdev, 1000)
plt.plot(x_axis, norm.pdf(x_axis, tse_mu, tse_stdev), colors[9], lw=2, label='TSE')
# Plot 'a' ND energy Gaussians
for i, q in enumerate(nd.Qs[:-1]):
mu, stdev = a_mje_params[pid, q]
x_axis = np.linspace(mu - 3*stdev, mu + 3*stdev, 1000)
plt.plot(x_axis, norm.pdf(x_axis, mu, stdev), colors[i], lw=2, label=str(q))
plt.title("Locating a TSE within the 'a' ND energy Gaussians: " + pid);
plt.xlabel("ND energy (E)"); plt.ylabel('Density')
_= plt.legend(loc="upper left", bbox_to_anchor=(1.0,1.0))
pid = '1aps'
tse_Es = TSE_stats[pid].scMJE.values
for q in nd.Qs:
print(q, kstest(tse_Es, 'norm', args=(a_mje_params[pid, q])))
def locate_TSE_E1(nd_mje_params: dict):
n = len(PIDs)
TSE_Qs = np.zeros(n); max_pvals = np.zeros(n)
for i, pid in enumerate(PIDs):
mjes = TSE_stats[pid].scMJE.values
ks = [ kstest(mjes, 'norm', args=(nd_mje_params[pid, q])) for q in nd.Qs ]
d, pval = zip(* ks)
# print(d, np.min(d))
# print(pval, np.max(pval))
max_pvals[i] = np.max(pval)
n = np.argmax(pval); assert np.argmin(d) == n, "argmin(d) is not equal argmax(pval)"
TSE_Qs[i] = nd.Qs[n]
return TSE_Qs, max_pvals
c_TSE_Qs_E1, c_pvals_E1 = locate_TSE_E1(c_mje_params.copy())
x_TSE_Qs_E1, x_pvals_E1 = locate_TSE_E1(x_mje_params.copy())
r_TSE_Qs_E1, r_pvals_E1 = locate_TSE_E1(r_mje_params.copy())
a_TSE_Qs_E1, a_pvals_E1 = locate_TSE_E1(a_mje_params.copy())
y_TSE_Qs_E1, y_pvals_E1 = locate_TSE_E1(y_mje_params.copy())
print(PIDs)
print('c', c_TSE_Qs_E1, np.mean(c_TSE_Qs_E1), np.std(c_TSE_Qs_E1))
print('x', x_TSE_Qs_E1, np.mean(x_TSE_Qs_E1), np.std(x_TSE_Qs_E1))
print('r', r_TSE_Qs_E1, np.mean(r_TSE_Qs_E1), np.std(r_TSE_Qs_E1))
print('a', a_TSE_Qs_E1, np.mean(a_TSE_Qs_E1), np.std(a_TSE_Qs_E1))
print('y', y_TSE_Qs_E1, np.mean(y_TSE_Qs_E1), np.std(y_TSE_Qs_E1))
# Mean Absolute Difference: how close are the TSE Qs to ND peak Qs
print('c:', nd.mad(c_peakQs, c_TSE_Qs_E1))
print('x:', nd.mad(x_peakQs, x_TSE_Qs_E1))
print('r:', nd.mad(r_peakQs, r_TSE_Qs_E1))
print('a:', nd.mad(a_peakQs, a_TSE_Qs_E1))
print('y:', nd.mad(y_peakQs, y_TSE_Qs_E1))
fig, axes = plt.subplots(4, 3, figsize=(12, 16), constrained_layout=True)
plt.suptitle('Best KS-fit ND energy Gaussian for TSE, Paci dataset.')
plt.delaxes(axes[3, 1])
plt.delaxes(axes[3, 2])
for i, pid in enumerate(PIDs):
r = i//3; c = i%3; ax = axes[r, c]
plot_TSE(ax, pid)
a_q = a_TSE_Qs_E1[i]
plot_ND(ax, pid, a_q, a_mje_params.copy(), 'a')
c_q = c_TSE_Qs_E1[i]
plot_ND(ax, pid, c_q, c_mje_params.copy(), 'c')
x_q = x_TSE_Qs_E1[i]
plot_ND(ax, pid, x_q, x_mje_params.copy(), 'x')
r_q = r_TSE_Qs_E1[i]
plot_ND(ax, pid, r_q, r_mje_params.copy(), 'r')
y_q = y_TSE_Qs_E1[i]
plot_ND(ax, pid, y_q, y_mje_params.copy(), 'y')
ax.legend()
plt.savefig("TSE_energylocation_KS.png")
pid = '1aps'
mjes = TSE_stats[pid].scMJE.values
E_params = [ a_mje_params[pid, q] for q in nd.Qs ]
llhs = np.zeros(10) # log likelihoods, one for each ND energy Gaussian
for q in np.arange(10):
m, s = E_params[q]
dens = [ norm.pdf(x, m, s) for x in mjes ]
llhs[q] = logsumexp(dens)
print(nd.Qs[q], np.round(llhs[q], 4))
# can ignore priors: uniform, each Q has the same prob.
# denom = np.sum(llhs) # can ignore normalizing factor
# post_probs = llhs/denom # np.sum(post_probs) approx. 1.0
# posteriors[pid] = post_probs
q = nd.Qs[np.argmax(llhs)] # MAP q
print(pid, "MAP Q:", q)
# joint prob/density of the sample of i.i.d. observations
def locate_TSE_E2(nd_mje_params: dict):
n = len(PIDs); TSE_Qs = np.zeros(n)
for i, pid in enumerate(PIDs):
mjes = TSE_stats[pid].scMJE.values
E_params = [ nd_mje_params[pid, q] for q in nd.Qs ]
llhs = np.zeros(10) # log likelihoods, one for each ND energy Gaussian
for q in np.arange(10):
m, s = E_params[q]
dens = [ norm.pdf(x, m, s) for x in mjes ]
llhs[q] = logsumexp(dens)
q = nd.Qs[np.argmax(llhs)] # maximum likelihood
TSE_Qs[i] = q
return TSE_Qs
c_TSE_Qs_E2 = locate_TSE_E2(c_mje_params.copy())
x_TSE_Qs_E2 = locate_TSE_E2(x_mje_params.copy())
r_TSE_Qs_E2 = locate_TSE_E2(r_mje_params.copy())
a_TSE_Qs_E2 = locate_TSE_E2(a_mje_params.copy())
y_TSE_Qs_E2 = locate_TSE_E2(y_mje_params.copy())
print(PIDs)
print('c', c_TSE_Qs_E2, np.mean(c_TSE_Qs_E2), np.std(c_TSE_Qs_E2))
print('x', x_TSE_Qs_E2, np.mean(x_TSE_Qs_E2), np.std(x_TSE_Qs_E2))
print('r', r_TSE_Qs_E2, np.mean(r_TSE_Qs_E2), np.std(r_TSE_Qs_E2))
print('a', a_TSE_Qs_E2, np.mean(a_TSE_Qs_E2), np.std(a_TSE_Qs_E2))
print('y', y_TSE_Qs_E2, np.mean(y_TSE_Qs_E2), np.std(y_TSE_Qs_E2))
# Mean Absolute Difference: how close are the estimated Qs for PTS nets to ND peak Qs
print('c:', nd.mad(c_peakQs, c_TSE_Qs_E2))
print('x:', nd.mad(x_peakQs, x_TSE_Qs_E2))
print('r:', nd.mad(r_peakQs, r_TSE_Qs_E2))
print('a:', nd.mad(a_peakQs, a_TSE_Qs_E2))
print('y:', nd.mad(y_peakQs, y_TSE_Qs_E2))
fig, axes = plt.subplots(3, 4, figsize=(16, 12), constrained_layout=True)
plt.suptitle('Most likely ND energy Gaussian for TSE, Paci dataset.')
plt.delaxes(axes[2, 2])
plt.delaxes(axes[2, 3])
for i, pid in enumerate(PIDs):
r = i//4; c = i%4; ax = axes[r, c]
plot_TSE(ax, pid)
c_q = c_TSE_Qs_E2[i]
plot_ND(ax, pid, c_q, c_mje_params.copy(), 'c')
x_q = x_TSE_Qs_E2[i]
plot_ND(ax, pid, x_q, x_mje_params.copy(), 'x')
r_q = r_TSE_Qs_E2[i]
plot_ND(ax, pid, r_q, r_mje_params.copy(), 'r')
a_q = a_TSE_Qs_E2[i]
plot_ND(ax, pid, a_q, a_mje_params.copy(), 'a')
y_q = y_TSE_Qs_E2[i]
plot_ND(ax, pid, y_q, y_mje_params.copy(), 'y')
ax.legend()
plt.savefig("TSE_energylocation_MAP.png")
# plot ND_peak_Qs, TSE_Qs_E1, TSE_Qs_E2
fig, axes = plt.subplots(2, 3, figsize=(12, 8), constrained_layout=True, sharey=True)
fig.suptitle("Location of TSE structures with ND energy.")
plt.delaxes(axes[1][2])
ax = axes[0][0]
ax.scatter(PIDs, a_peakQs, marker='^', s=80, color='tab:red', label='peak')
ax.scatter(PIDs, a_TSE_Qs_E1, marker='s', s=80, edgecolors='black', facecolors='none',
label='KS')
ax.scatter(PIDs, a_TSE_Qs_E2, marker='x', s=80, color='red', label='MAP')
ax.set_title("'a' ND variant")
ax.set_ylabel('Q', fontsize=14)
ax = axes[0][1]
ax.scatter(PIDs, c_peakQs, marker='^', s=80, color='tab:blue', label='peak')
ax.scatter(PIDs, c_TSE_Qs_E1, marker='s', s=80, edgecolors='black', facecolors='none',
label='KS')
ax.scatter(PIDs, c_TSE_Qs_E2, marker='x', s=80, color='red', label='MAP')
ax.set_title("'c' ND variant")
ax = axes[0][2]
ax.scatter(PIDs, x_peakQs, marker='^', s=80, color='tab:orange', label='peak')
ax.scatter(PIDs, x_TSE_Qs_E1, marker='s', s=80, edgecolors='black', facecolors='none',
label='KS')
ax.scatter(PIDs, x_TSE_Qs_E2, marker='x', s=80, color='red', label='MAP')
ax.set_title("'x' ND variant")
ax = axes[1][0]
ax.scatter(PIDs, r_peakQs, marker='^', s=80, color='tab:green', label='peak')
ax.scatter(PIDs, r_TSE_Qs_E1, marker='s', s=80, edgecolors='black', facecolors='none',
label='KS')
ax.scatter(PIDs, r_TSE_Qs_E2, marker='x', s=80, color='red', label='MAP')
ax.set_title("'r' ND variant")
ax.set_ylabel('Q', fontsize=14)
ax = axes[1][1]
ax.scatter(PIDs, y_peakQs, marker='^', s=80, color='tab:purple', label='peak')
ax.scatter(PIDs, y_TSE_Qs_E1, marker='s', s=80, edgecolors='black', facecolors='none',
label='KS')
ax.scatter(PIDs, y_TSE_Qs_E2, marker='x', s=80, color='red', label='MAP')
ax.set_title("'y' ND variant")
for ax in axes.flatten()[:-1]:
ax.set_xticks(PIDs); ax.set_xticklabels(PIDs, rotation=90)
ax.legend(loc="upper center"); ax.grid()
# boxplot ND_peak_Qs, TSE_Qs_E1, TSE_Qs_E2
fig, axes = plt.subplots(1, 3, figsize=(12, 4), constrained_layout=True)
fig.suptitle("Location of TSE structures with ND energy.")
ax = axes[0]
ax.boxplot([a_peakQs, c_peakQs, x_peakQs, r_peakQs, y_peakQs], showmeans=True)
ax.set_title("ND Peak")
ax.set_xlabel("ND variant"); ax.set_xticklabels(['a', 'c', 'x', 'r', 'y'], fontsize=14)
ax.set_ylabel('Q', fontsize=14); ax.set_yticks(nd.Qs); ax.grid()
ax = axes[1]
ax.boxplot([a_TSE_Qs_E1, c_TSE_Qs_E1, x_TSE_Qs_E1, r_TSE_Qs_E1, y_TSE_Qs_E1], showmeans=True)
ax.set_title("Approach 1: KS")
ax.set_xlabel("ND variant"); ax.set_xticklabels(['a', 'c', 'x', 'r', 'y'], fontsize=14)
ax.set_yticks(nd.Qs); ax.grid()
ax = axes[2]
ax.boxplot([a_TSE_Qs_E2, c_TSE_Qs_E2, x_TSE_Qs_E2, r_TSE_Qs_E2, y_TSE_Qs_E2], showmeans=True)
ax.set_title("Approach 2: MAP")
ax.set_xlabel("ND variant"); ax.set_xticklabels(['a', 'c', 'x', 'r', 'y'], fontsize=14)
ax.set_yticks(nd.Qs); ax.grid()
plt.savefig("TSE_energylocation.png")
SKLC September 2020