import os
import numpy as np
import pandas as pd
from scipy.stats import pearsonr, norm, normaltest, shapiro
import matplotlib.pyplot as plt
import NDutils as nd # my lib
fn = os.path.join('P_ND3', 'P_frates.txt')
df = nd.load_frates(fn)
df.head()
dirname = "P_ND3"
PIDs = df.pid.tolist(); num_pids = len(PIDs)
fold_rates = df.k_f.values
print(num_pids, PIDs)
print("Range of fold rates:", np.ptp(fold_rates))
# Get ND energy data
edv = 'a'
dirn = os.path.join(dirname, "edv_" + edv)
c_ndstats = nd.get_NDstats(dirn, PIDs, 'c', edv)
c_mje_params = nd.get_params(c_ndstats, 'sc_mje')
c_peakQs, c_peakEs = nd.get_peaks(c_mje_params, PIDs)
print("'c'", pearsonr(fold_rates, c_peakEs), np.mean(c_peakQs), np.std(c_peakQs),
nd.mjes_range(c_peakEs))
x_ndstats = nd.get_NDstats(dirn, PIDs, 'x', edv)
x_mje_params = nd.get_params(x_ndstats, 'sc_mje')
x_peakQs, x_peakEs = nd.get_peaks(x_mje_params, PIDs)
print("'x'", pearsonr(fold_rates, x_peakEs), np.mean(x_peakQs), np.std(x_peakQs),
nd.mjes_range(x_peakEs))
r_ndstats = nd.get_NDstats(dirn, PIDs, 'r', edv)
r_mje_params = nd.get_params(r_ndstats, 'sc_mje')
r_peakQs, r_peakEs = nd.get_peaks(r_mje_params, PIDs)
print("'r'", pearsonr(fold_rates, r_peakEs), np.mean(r_peakQs), np.std(r_peakQs),
nd.mjes_range(r_peakEs))
a_ndstats = nd.get_NDstats(dirn, PIDs, 'a', edv)
a_mje_params = nd.get_params(a_ndstats, 'sc_mje')
a_peakQs, a_peakEs = nd.get_peaks(a_mje_params, PIDs)
print("'a'", pearsonr(fold_rates, a_peakEs), np.mean(a_peakQs), np.std(a_peakQs),
nd.mjes_range(a_peakEs))
y_ndstats = nd.get_NDstats(dirn, PIDs, 'y', edv)
y_mje_params = nd.get_params(y_ndstats, 'sc_mje')
y_peakQs, y_peakEs = nd.get_peaks(y_mje_params, PIDs)
print("'y'", pearsonr(fold_rates, y_peakEs), np.mean(y_peakQs), np.std(y_peakQs),
nd.mjes_range(y_peakEs))
# Example content of dicts
c_mje_params['2ci2', 0.1]
c_ndstats['2ci2'].loc[0.1].sc_mje
Check sample data normality:
def check_normality(ndstats: dict, col: str, alpha: float, method='shapiro'):
"""Method is either 'shapiro' (default) or 'normaltest'."""
pvals_dict = {}
for pid in PIDs:
if method == 'normaltest': # pvals are two-tailed, want the tail area upto statistic
pvals = [ (normaltest(ndstats[pid].loc[q][[col]].values)[1]/2).item()
for q in nd.Qs ]
else:
pvals = [ shapiro(ndstats[pid][[col]].loc[q].values)[1] for q in nd.Qs ]
pvals_dict[pid] = pvals # np.round(pvals, 3)
df = pd.DataFrame(pvals_dict, index = nd.Qs)
db = df.apply(lambda x: x < alpha)
null_rejects = np.sum(db.values, axis=0) # times null hypothesis rejected
return df, db, null_rejects
# shapiro-wilk, p-value < 0.005, reject normality (True)
a_df, a_db, a_nr = check_normality(a_ndstats.copy(), "sc_mje", 0.005, "shapiro")
print(a_db, '\n')
print(a_nr, np.sum(a_nr))
print("not-normals in Q < 0.6", np.count_nonzero(a_db.loc[0.1:0.5].values), '\n')
print(np.round(a_df, 3))
# shapiro-wilk, p-value < 0.005, reject normality (True)
c_df, c_db, c_nr = check_normality(c_ndstats.copy(), "sc_mje", 0.005, "shapiro")
print(c_db, '\n')
print(c_nr, np.sum(c_nr))
print("not-normals in Q < 0.6", np.count_nonzero(c_db.loc[0.1:0.5].values), '\n')
print(np.round(c_df, 3))
print(c_df.loc[0.2][['2ci2']].item())
x_df, x_db, x_nr = check_normality(x_ndstats.copy(), "sc_mje", 0.005, "shapiro")
print(x_db, '\n')
print(x_nr, np.sum(x_nr))
print("not-normals in Q < 0.6", np.count_nonzero(x_db.loc[0.1:0.5].values), '\n')
print(np.round(x_df, 3))
print(x_df.loc[0.1][['2ptl']].item())
print(x_df.loc[0.4][['2ci2']].item())
print(x_df.loc[0.5][['1ten']].item())
r_df, r_db, r_nr = check_normality(r_ndstats.copy(), "sc_mje", 0.005, "shapiro")
print(r_db, '\n')
print(r_nr, np.sum(r_nr))
print("not-normals in Q < 0.6", np.count_nonzero(r_db.loc[0.1:0.5].values), '\n')
print(np.round(r_df, 3))
print(r_df.loc[0.4][['1imq']].item())
y_df, y_db, y_nr = check_normality(y_ndstats.copy(), "sc_mje", 0.005, "shapiro")
print(y_db, '\n')
print(y_nr, np.sum(y_nr))
print("not-normals in Q < 0.6", np.count_nonzero(y_db.loc[0.1:0.5].values), '\n')
print(np.round(y_df, 3))
fig, axes = plt.subplots(1, 3, figsize=(15, 5), constrained_layout=True, sharey=True)
plt.suptitle("ND energy distributions at $Q$=1.0")
for pid in PIDs:
mjes = c_ndstats[pid].sc_mje.loc[1.0].values
axes[0].hist(mjes, label=pid)
mjes = x_ndstats[pid].sc_mje.loc[1.0].values
axes[1].hist(mjes, label=pid)
mjes = r_ndstats[pid].sc_mje.loc[1.0].values
axes[2].hist(mjes, label=pid)
axes[0].set_title("'c' ND variant")
axes[1].set_title("'x' ND variant")
axes[2].set_title("'r' ND variant")
_ = axes[0].legend(loc='upper left', bbox_to_anchor=(0.3, 1.0), title='PTS pids')
# Q-Q plots for distributions identified as not-normal
# c 2ci2 0.2
# x 2ci2 0.4
# x 2ptl 0.1
# x 1ten 0.5
# r 1imq 0.4
# may want to check out statsmodels.graphics.gofplots.qqplot
# https://www.statsmodels.org/stable/generated/statsmodels.graphics.gofplots.qqplot.html
def qq_data(pid, q, ndstats: dict, mje_params: dict):
pp = np.arange(0, 101, 5)[1:] # percentile points
vals = ndstats[pid].loc[q].sc_mje.values
mu, stdev = mje_params[pid, q]
x = np.percentile(norm.rvs(size=100), pp) # independent variable
y = np.percentile((vals-mu)/stdev, pp)
return x, y, pearsonr(x, y)
fig, axes = plt.subplots(2, 3, figsize=(12, 8), constrained_layout=True)
plt.suptitle('Percentile-percentile plots')
z = np.linspace(-2, 4)
ax = axes[0][0]
x, y, p = qq_data('2ci2', 0.3, c_ndstats.copy(), c_mje_params.copy())
print("'c' 2ci2 0.3 Example of a normal ", np.round(p, 5))
ax.scatter(x, y, alpha=0.5)
ax.set_title(" 'c' 2ci2 Q=0.3 Example of a normal")
ax.set_ylabel("Standardized 'c' ND energy")
ax.plot(z, z, color='red'); ax.grid(True)
ax = axes[0][1]
x, y, p = qq_data('2ci2', 0.2, c_ndstats.copy(), c_mje_params.copy())
print("'c' 2ci2 0.2 ", np.round(p, 5))
ax.scatter(x, y, alpha=0.5)
ax.set_title(" 'c' 2ci2 Q=0.2 ")
ax.set_ylabel("Standardized 'c' ND energy")
ax.plot(z, z, color='red'); ax.grid(True)
ax = axes[0][2]
x, y, p = qq_data('2ci2', 0.4, x_ndstats.copy(), x_mje_params.copy())
print("'x' 2ci2 0.4 ", np.round(p, 5))
ax.scatter(x, y, alpha=0.5)
ax.set_title(" 'x' 2ci2 Q=0.4 ")
ax.set_ylabel("Standardized 'x' ND energy")
ax.plot(z, z, color='red'); ax.grid(True)
ax = axes[1][0]
x, y, p = qq_data('2ptl', 0.1, x_ndstats.copy(), x_mje_params.copy())
print("'x' 2ptl 0.1 ", np.round(p, 5))
ax.scatter(x, y, alpha=0.5)
ax.set_title(" 'x' 2ptl Q=0.1 ")
ax.set_ylabel("Standardized 'x' ND energy")
ax.plot(z, z, color='red'); ax.grid(True)
ax = axes[1][1]
x, y, p = qq_data('1ten', 0.5, x_ndstats.copy(), x_mje_params.copy())
print("'x' 1ten 0.5 ", np.round(p, 5))
ax.scatter(x, y, alpha=0.5)
ax.set_title(" 'x' 1ten Q=0.5 ")
ax.set_ylabel("Standardized 'x' ND energy")
ax.plot(z, z, color='red'); ax.grid(True)
ax = axes[1][2]
x, y, p = qq_data('1imq', 0.4, r_ndstats.copy(), r_mje_params.copy())
print("'r' 1imq 0.4 ", np.round(p, 5))
ax.scatter(x, y, alpha=0.5)
ax.set_title(" 'r' 1imq Q=0.4 ")
ax.set_ylabel("Standardized 'r' ND energy")
ax.plot(z, z, color='red'); ax.grid(True)
# plots may vary with different runs, but correlations are > 0.9.
# closer the dots are to the red line, "more normal".
colors = ['tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple', \
'tab:brown', 'tab:pink', 'tab:gray', 'tab:olive', 'tab:cyan']
# adapted from UCSD Fundamentals of Machine Learning edx course
# plot Gaussians at Qs, for a pid
def plot_gaussians(ax, ndv, pid, mje_params):
for i, q in enumerate(nd.Qs[:-1]):
mu, stdev = mje_params[pid, q]
x_axis = np.linspace(mu - 3*stdev, mu + 3*stdev, 1000)
ax.plot(x_axis, norm.pdf(x_axis, mu, stdev), colors[i], lw=2, label=str(q))
ax.set_title("'" + ndv + "' ND energy Gaussians " + pid, fontsize=12)
ax.set_xlabel("ND energy (E)", fontsize=12)
ax.set_ylabel("Density", fontsize=12)
ax.legend(title='Q')
return
def plot_energy_profile(ax, nd_type, pid, mje_params, nd_peakQs, nd_peakEs):
ax.plot(nd.Qs, [ mje_params[pid, q][0] for q in nd.Qs ], linewidth=1)
p = PIDs.index(pid)
q = nd_peakQs[p]; i = int(q*10-1)
ax.scatter(q, nd_peakEs[p], marker='^', color=colors[i], label='peak')
ax.set_title("'" + nd_type + "' ND energy profile " + pid)
ax.set_xlabel('Q', fontsize=12); ax.set_xticks(nd.Qs)
ax.set_ylabel('ND energy (E)', fontsize=12);
ax.legend(); ax.grid()
return
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10,4), constrained_layout=True)
pid = '2ci2'; ndv = 'a'
plot_gaussians(ax1, ndv, pid, a_mje_params)
plot_energy_profile(ax2, ndv, pid, a_mje_params, a_peakQs, a_peakEs)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10,4), constrained_layout=True)
pid = '2ci2'; ndv = 'y'
plot_gaussians(ax1, ndv, pid, y_mje_params)
plot_energy_profile(ax2, ndv, pid, y_mje_params, y_peakQs, y_peakEs)
fig, axes = plt.subplots(2, 5, figsize=(20, 8), constrained_layout=True)
plt.suptitle("Gaussians of 'c' ND energy by Q")
for i, p in enumerate(PIDs):
r = i // 5; c = i % 5
plot_gaussians(axes[r, c], 'c', p, c_mje_params)
fig, axes = plt.subplots(2, 5, figsize=(20, 8), constrained_layout=True)
plt.suptitle("Gaussians of 'a' ND energy by Q")
for i, p in enumerate(PIDs):
r = i // 5; c = i % 5
plot_gaussians(axes[r, c], 'a', p, a_mje_params)
fig, axes = plt.subplots(2, 5, figsize=(20, 8), constrained_layout=True)
plt.suptitle("Gaussians of 'x' ND energy by Q")
for i, p in enumerate(PIDs):
r = i // 5; c = i % 5
plot_gaussians(axes[r, c], 'x', p, x_mje_params)
fig, axes = plt.subplots(2, 5, figsize=(20, 8), constrained_layout=True)
plt.suptitle("Gaussians of 'r' ND energy by Q")
for i, p in enumerate(PIDs):
r = i // 5; c = i % 5
plot_gaussians(axes[r, c], 'r', p, r_mje_params)
fig, axes = plt.subplots(2, 5, figsize=(20, 8), constrained_layout=True)
plt.suptitle("Gaussians of 'y' ND energy by Q")
for i, p in enumerate(PIDs):
r = i // 5; c = i % 5
plot_gaussians(axes[r, c], 'y', p, y_mje_params)
SKLC August 2020