import os
import numpy as np
import pandas as pd
from scipy.stats import pearsonr
from sklearn import linear_model
import matplotlib.pyplot as plt
import NDutils as nd # my lib
NDVs = ['y', 'r', 'a', 'c', 'x', 'h']
dirname = "Uzunoglu"
fn = os.path.join(dirname, "UZ_foldtype.txt")
DF = pd.read_table(fn, sep=' ')
DF.head() # reserve DF for this purpose!
UZ_PIDs = DF.pid.values
UZ_frates = DF.k_f.values
print(len(UZ_PIDs), np.ptp(UZ_frates))
UZ_SStypes = DF.SStype.values
df = nd.count_HSTs("Uzunoglu", UZ_PIDs, UZ_SStypes)
print(df.dtypes)
df.sample(5)
da = df.query("SStype == 'a'")
print(len(da))
da
# reclassified 1o6x as 'ab', nH > 3
db = df.query("SStype == 'b'")
print(len(db))
db
# reclassified 1g6p, 1m9s 'b'
dab = df.query("SStype == 'ab'")
print(len(dab))
dab
n_dr = nd.frate_corrs_with_N_by_SStype(DF, "Uzunoglu (52)")
n_dr
dirn = os.path.join("UZ_NDH", "hard")
# example of a datafile
pid = "1aps"
fn = os.path.join(dirn, pid + "_bsd_hstats_h.out")
dh = pd.read_table(fn, sep=' ')
dh.head()
maxEs, argmaxEs, maxE_sds = nd.get_ndh_maxEs(dirn, UZ_PIDs)
h_df = DF.assign(maxE=maxEs, maxE_sd=maxE_sds)
h_df.head()
print("maxE_sds:", np.mean(maxE_sds), np.std(maxE_sds, ddof=1),
np.min(maxE_sds), np.max(maxE_sds))
# correlation maxEs with exp. fold rate
print("maxE Pearson corr. with exp. fold rate:", pearsonr(maxEs, UZ_frates))
# check for outliers
_ = plt.scatter(maxEs, UZ_frates)
h_df.query("maxE > 700") # outlier
h_rs, h_ps = nd.analyze_maxE_by_SStype(h_df)
fig, ax1 = plt.subplots()
ax = nd.plot_maxE_by_SStype(h_df, ax1, h_rs)
ax.set_title("Uzunoglo h " + str(np.round(h_rs[0], 3)), fontsize=14)
_ = ax.legend(title="SStype", fontsize=12)
# excluding outlier
h1_df = h_df.query("pid != '2vik'") # ab
len(h1_df)
h1_rs, h1_ps = nd.analyze_maxE_by_SStype(h1_df)
dirn = os.path.join("UZ_ND3", "edv_a")
peakEs, peakQs = nd.get_ndv_peakEs(dirn, UZ_PIDs, "stats", ndv='a')
a_df = DF.assign(peakE = peakEs)
a_rs, a_ps = nd.analyze_peakE_by_SStype(a_df, 'a')
peakEs, peakQs = nd.get_ndv_peakEs(dirn, UZ_PIDs, "stats", ndv='c')
c_df = DF.assign(peakE = peakEs)
c_rs, c_ps = nd.analyze_peakE_by_SStype(c_df, 'c')
peakEs, peakQs = nd.get_ndv_peakEs(dirn, UZ_PIDs, "stats", ndv='x')
x_df = DF.assign(peakE = peakEs)
x_rs, x_ps = nd.analyze_peakE_by_SStype(x_df, 'x')
peakEs, peakQs = nd.get_ndv_peakEs(dirn, UZ_PIDs, "stats", ndv='r')
r_df = DF.assign(peakE = peakEs)
r_rs, r_ps = nd.analyze_peakE_by_SStype(r_df, 'r')
peakEs, peakQs = nd.get_ndv_peakEs(dirn, UZ_PIDs, "stats", ndv='y')
y_df = DF.assign(peakE = peakEs)
y_rs, y_ps = nd.analyze_peakE_by_SStype(y_df, 'y')
# df = eval('a' + "_df")
# np.all(df.peakE.values == a_df.peakE.values) # True
fig, axes = plt.subplots(2, 3, figsize=(12, 7), constrained_layout=True)
plt.suptitle("Uzunoglo (52)", fontsize=14)
for i, ndv in enumerate(NDVs[:-1]):
r = i//3; c = i%3; ax1 = axes[r][c]
df = eval(ndv + "_df"); rs = eval(ndv + "_rs")
nd.plot_peakE_by_SStype(df, ax1, rs)
ax1.set_title(ndv + ' ' + str(np.round(rs[0], 3)) , fontsize=14)
ax1.legend(title="SStype")
ax1 = axes[1][2]
nd.plot_maxE_by_SStype(h_df, ax1, h_rs)
ax1.set_title('h ' + str(np.round(h_rs[0], 3)), fontsize=14)
_ = ax1.legend(title="SStype")
# Z == All
dr = pd.DataFrame({'SStype': ['Z', 'A', 'AB', 'B'],
'y_rs': y_rs, 'y_ps': y_ps, 'r_rs': r_rs, 'r_ps': r_ps,
'a_rs': a_rs, 'a_ps': a_ps, 'c_rs': c_rs, 'c_ps': c_ps,
'x_rs': x_rs, 'x_ps': x_ps, 'h_rs': h_rs, 'h_ps': h_ps,}).set_index('SStype')
UZcorrs_dr = dr.join(n_dr)
# save to file
UZcorrs_dr.to_csv("UZcorrs_dr.txt", sep=' ')
# desire corrs with ND variants to be stronger than with 'h' or 'n',
# since generating peak Es involve more work
df = UZcorrs_dr.filter(regex='_rs').T
df.round(4)
# smaller (more negative) is better
Xs = NDVs[:]; Xs.extend(['N', 'N1/2', 'N2/3'])
fig, ax = plt.subplots()
ax.scatter(Xs, df.A.values, marker='^', edgecolor='green', facecolor='none', label='A')
ax.scatter(Xs, df.AB.values, edgecolor='orange', lw=2, facecolor='none', label='AB')
ax.scatter(Xs, df.B.values, marker='s', edgecolor='magenta', facecolor='none', label='B')
ax.set_xticks(Xs); ax.set_xticklabels(Xs, fontsize=14)
ax.set_xlabel("LE <--- priority ---> SE", fontsize=14)
ax.set_title("Uzunoglo (52): correlation with exp. fold rate by SS type.", fontsize=14)
ax.legend(title="SStype", fontsize=12, loc="upper left", bbox_to_anchor=(1.0, 1.0))
ax.grid()
If the task at hand is simply predicting fold-rate, N is competitive with ND, and simpler than ND, for structurally pure SStype chains, A and B.
ND variants are also better at modelling structurally pure chains:
The problematic class is structurally mixed SStype chains (AB), which seem to thrive in the "Goldilocks region" created by ND variant 'a' which strikes a good balance between LE and SE.
Goal is to design a ND variant that can handle all kinds of chains well.
alpha_df = y_df.query("SStype == 'a'")
alphabeta_df = a_df.query("SStype == 'ab'")
beta_df = x_df.query("SStype == 'b'")
yax_df = alpha_df.append(beta_df).append(alphabeta_df)
print(len(alpha_df), len(alphabeta_df), len(beta_df), len(yax_df))
yax_rs, yax_ps = nd.analyze_peakE_by_SStype(yax_df, ndv='y-a-x')
alpha_df = r_df.query("SStype == 'a'")
alphabeta_df = a_df.query("SStype == 'ab'")
beta_df = x_df.query("SStype == 'b'")
rax_df = alpha_df.append(beta_df).append(alphabeta_df)
print(len(alpha_df), len(alphabeta_df), len(beta_df), len(rax_df))
rax_rs, rax_ps = nd.analyze_peakE_by_SStype(rax_df, ndv='r-a-x')
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 4), constrained_layout=True)
plt.suptitle("Uzunoglo (52): combined peak Es.", fontsize=14)
ax1.set_title("y-a-x " + str(np.round(yax_rs[0], 3)), fontsize=14)
nd.plot_peakE_by_SStype(yax_df, ax1, yax_rs)
ax1.legend(title="SStype")
ax2.set_title("r-a-x " + str(np.round(rax_rs[0], 3)), fontsize=14)
nd.plot_peakE_by_SStype(rax_df, ax2, rax_rs)
_ = ax2.legend(title="SStype")
dirn = os.path.join("UZ_ND3", "notscaled")
peakEs, peakQs = nd.get_ndv_peakEs(dirn, UZ_PIDs, "stats", ndv='x_notscaled')
ux_df = DF.assign(peakE = peakEs)
ux_rs, ux_ps = nd.analyze_peakE_by_SStype(ux_df, ndv='ux')
peakEs, peakQs = nd.get_ndv_peakEs(dirn, UZ_PIDs, "stats", ndv='c_notscaled')
uc_df = DF.assign(peakE = peakEs)
uc_rs, uc_ps = nd.analyze_peakE_by_SStype(uc_df, ndv='uc')
peakEs, peakQs = nd.get_ndv_peakEs(dirn, UZ_PIDs, "stats", ndv='a_notscaled')
ua_df = DF.assign(peakE = peakEs)
ua_rs, ua_ps = nd.analyze_peakE_by_SStype(ua_df, ndv='ua')
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(12, 4), constrained_layout=True)
plt.suptitle("Uzunoglo: unscaled ND variants.", fontsize=14)
ax1.set_title("ux " + str(np.round(ux_rs[0], 3)), fontsize=14)
nd.plot_peakE_by_SStype(ux_df, ax1, ux_rs)
ax1.legend(title="SStype")
ax2.set_title("uc " + str(np.round(uc_rs[0], 3)), fontsize=14)
nd.plot_peakE_by_SStype(uc_df, ax2, uc_rs)
ax2.legend(title="SStype")
ax3.set_title("ua " + str(np.round(ua_rs[0], 3)), fontsize=14)
nd.plot_peakE_by_SStype(ua_df, ax3, ua_rs)
_ = ax3.legend(title="SStype")
unscaled_dr = pd.DataFrame({'SStype': ['Z', 'A', 'AB', 'B'],
'ua_rs': ua_rs, 'ua_ps': ua_ps, 'uc_rs': uc_rs, 'uc_ps': uc_ps,
'ux_rs': ux_rs, 'ux_ps': ux_ps}).set_index('SStype')
UZcorrs_dr = UZcorrs_dr.join(unscaled_dr)
UZcorrs_dr.to_csv("UZcorrs_dr.txt", sep=' ')
CDF = UZcorrs_dr.filter(regex='_rs').T
CDF.round(4)
CDF.A.sort_values()
CDF.AB.sort_values()
CDF.B.sort_values()
# plot to compare these ND variants
Xs = ['y', 'r', 'ua', 'a', 'uc', 'c', 'ux', 'x', 'h', 'n12']
hdf = CDF.copy()
hdf = hdf.reindex([l+"_rs" for l in Xs])
hdf
# smaller (more negative) is better
NDVs = ['y', 'r', 'ua', 'a', 'uc', 'c', 'ux', 'x', 'h', 'N1/2']
fig, ax = plt.subplots()
ax.scatter(NDVs, hdf.A.values, marker='^',s=60, edgecolor='green',facecolor='none', label='A')
ax.scatter(NDVs, hdf.AB.values, edgecolor='orange', lw=2, s=60, facecolor='none', label='AB')
ax.scatter(NDVs, hdf.B.values, marker='s', edgecolor='magenta', facecolor='none', label='B')
ax.axhline(y=-0.5723, color="tab:green", lw=1, ls='--')
ax.annotate("", xy=('ux', -0.5723), xycoords='data', xytext=('x', -0.47),
arrowprops=dict(arrowstyle='-|>', connectionstyle="arc3", color='tab:green'))
ax.annotate("", xy=('uc', -0.5264), xycoords='data', xytext=('c', -0.437),
arrowprops=dict(arrowstyle='-|>', connectionstyle="arc3", color='tab:green'))
ax.annotate("", xy=('ua', -0.2204), xycoords='data', xytext=('a', -0.4121),
arrowprops=dict(arrowstyle='-|>', connectionstyle="arc3", color='tab:green'))
ax.set_xticks(NDVs); ax.set_xticklabels(NDVs, fontsize=14)
ax.set_title("Uzunoglo (52): correlation with exp. fold rate by SStype.", fontsize=14)
ax.legend(title="SStype", fontsize=12, loc="upper left", bbox_to_anchor=(1.0, 1.0))
ax.grid()
This is expected behavior, since scaling reduces $p_e$ for LE, and findings so far indicate that A will benefit but B will suffer from an increase in $p_e$ of LE.
However, all correlations weaken with unscaled 'a' ('ua'). This could be the lingering of effect of restricting node selection to a local neighborhood (crony).
# mixed unscaled and scaled x, with a
alpha_df = ux_df.query("SStype == 'a'")
alphabeta_df = a_df.query("SStype == 'ab'")
beta_df = x_df.query("SStype == 'b'")
uxax_df = alpha_df.append(beta_df).append(alphabeta_df)
print(len(alpha_df), len(alphabeta_df), len(beta_df), len(uxax_df))
uxax_rs, uxax_ps = nd.analyze_peakE_by_SStype(uxax_df, ndv='ux-a-x')
print(nd.mjes_range(uxax_df.peakE.values))
fig, ax1 = plt.subplots()
ax1.set_title("Uzunoglo(52) ux-a-x " + str(np.round(uxax_rs[0], 3)))
nd.plot_peakE_by_SStype(uxax_df, ax1, uxax_rs)
ax1.set_xlabel("Combined peak Es from ND variants")
_ = ax1.legend(title="SStype")
dirname = "Kamagata"
fn = os.path.join(dirname, "K_foldtype.txt")
df = pd.read_table(fn, sep='\t')
# remove outlier '1ael' for ND results
DF = df.query("pid != '1ael'") # reserve DF for this purpose!
DF.head()
K_PIDs = DF.pid.values
print(len(K_PIDs), np.ptp(DF.k_f.values), DF.N.max())
K_SStypes = DF.SStype.values
df = nd.count_HSTs("Kamagata", K_PIDs, K_SStypes)
df.sample(5)
# nH > 3, nS < 10
da = df.query("SStype == 'a'")
print(len(da))
da
# nH < 3, nS > 10
db = df.query("SStype == 'b'")
print(len(db))
db
# nH > 3, nS > 10
dab = df.query("SStype == 'ab'")
print(len(dab))
dab
n_dr = nd.frate_corrs_with_N_by_SStype(DF, "Kamagata(20)")
n_dr
dirn = os.path.join("K_NDH", "hard")
pid = "2abd"
fn = os.path.join(dirn, pid + "_bsd_hstats_h.out")
dh = pd.read_table(fn, sep=' ')
dh.head()
maxEs, argmaxEs, maxE_sds = nd.get_ndh_maxEs(dirn, K_PIDs)
h_df = DF.assign(maxE=maxEs, maxE_sd=maxE_sds)
h_df.head()
print("maxE_sds:", np.mean(maxE_sds), np.std(maxE_sds, ddof=1),
np.min(maxE_sds), np.max(maxE_sds))
# correlation maxEs with exp. fold rate
print("maxE Pearson corr. with exp. fold rate:", pearsonr(maxEs, DF.k_f.values))
# outlier?
_ = plt.scatter(maxEs, DF.k_f.values)
h_rs, h_ps = nd.analyze_maxE_by_SStype(h_df)
fig, ax1 = plt.subplots()
ax1.set_title("Kamagata h " + str(np.round(h_rs[0] ,3)), fontsize=14)
nd.plot_maxE_by_SStype(h_df, ax1, h_rs)
_ = ax1.legend(title="SStype", fontsize=12, loc="upper left", bbox_to_anchor=(1.0, 1.0))
dirn = os.path.join("K_ND3", "edv_a")
peakEs, peakQs = nd.get_ndv_peakEs(dirn, K_PIDs, "stats", ndv='a')
a_df = DF.assign(peakE = peakEs)
a_rs, a_ps = nd.analyze_peakE_by_SStype(a_df, 'a')
peakEs, peakQs = nd.get_ndv_peakEs(dirn, K_PIDs, "stats", ndv='c')
c_df = DF.assign(peakE = peakEs)
c_rs, c_ps = nd.analyze_peakE_by_SStype(c_df, 'c')
peakEs, peakQs = nd.get_ndv_peakEs(dirn, K_PIDs, "stats", ndv='x')
x_df = DF.assign(peakE = peakEs)
x_rs, x_ps = nd.analyze_peakE_by_SStype(x_df, 'x')
peakEs, peakQs = nd.get_ndv_peakEs(dirn, K_PIDs, "stats", ndv='r')
r_df = DF.assign(peakE = peakEs)
r_rs, r_ps = nd.analyze_peakE_by_SStype(r_df, 'r')
peakEs, peakQs = nd.get_ndv_peakEs(dirn, K_PIDs, "stats", ndv='y')
y_df = DF.assign(peakE = peakEs)
y_rs, y_ps = nd.analyze_peakE_by_SStype(y_df, 'y')
dirn = os.path.join("K_ND3", "notscaled")
peakEs, peakQs = nd.get_ndv_peakEs(dirn, K_PIDs, "stats", ndv='a_notscaled')
ua_df = DF.assign(peakE = peakEs)
ua_rs, ua_ps = nd.analyze_peakE_by_SStype(ua_df, 'ua')
peakEs, peakQs = nd.get_ndv_peakEs(dirn, K_PIDs, "stats", ndv='c_notscaled')
uc_df = DF.assign(peakE = peakEs)
uc_rs, uc_ps = nd.analyze_peakE_by_SStype(uc_df, 'uc')
peakEs, peakQs = nd.get_ndv_peakEs(dirn, K_PIDs, "stats", ndv='x_notscaled')
ux_df = DF.assign(peakE = peakEs)
ux_rs, ux_ps = nd.analyze_peakE_by_SStype(ux_df, 'ux')
Kcorrs_dr = pd.DataFrame({'SStype': ['Z', 'A', 'AB', 'B'],
'y_rs': y_rs, 'y_ps': y_ps, 'r_rs': r_rs, 'r_ps': r_ps,
'ua_rs': ua_rs, 'ua_ps': ua_ps, 'a_rs': a_rs, 'a_ps': a_ps,
'uc_rs': uc_rs, 'uc_ps': uc_ps, 'c_rs': c_rs, 'c_ps': c_ps,
'ux_rs': ux_rs, 'ux_ps': ux_ps, 'x_rs': x_rs, 'x_ps': x_ps,
'h_rs': h_rs, 'h_ps': h_ps, }).set_index('SStype')
Kcorrs_dr = Kcorrs_dr.join(n_dr)
Kcorrs_dr.to_csv("KCorrs_dr.txt", sep=' ')
CDF = Kcorrs_dr.filter(regex="_rs").T
CDF.round(4)
CDF.Z.sort_values()
CDF.A.sort_values()
CDF.AB.sort_values()
CDF.B.sort_values()
NDVs = ['y', 'r', 'ua', 'a', 'uc', 'c', 'ux', 'x', 'h', 'N', 'N1/2', 'N2/3']
# smaller (more negative) is better
fig, ax = plt.subplots()
ax.scatter(NDVs, CDF.A.values, marker='^', edgecolor='green', facecolor='none', label='A')
ax.scatter(NDVs, CDF.AB.values, edgecolor='orange', lw=2, s=60, facecolor='none', label='AB')
ax.scatter(NDVs, CDF.B.values, marker='s', edgecolor='magenta', facecolor='none', label='B')
ax.set_xticks(NDVs); ax.set_xticklabels(NDVs, fontsize=14)
ax.set_xlabel("LE <--- priority ---> SE", fontsize=14)
ax.set_title("Kamagata (20): correlation with exp. fold rate by SS type.", fontsize=14)
ax.legend(title="SStype", fontsize=12, loc="upper left", bbox_to_anchor=(1.0, 1.0))
ax.grid()
dp = Kcorrs_dr.filter(regex="_ps").T
db = dp.apply(lambda x: x < 0.05)
db
# combine y-a-ua
alpha_df = y_df.query("SStype == 'a'")
alphabeta_df = a_df.query("SStype == 'ab'")
beta_df = ua_df.query("SStype == 'b'")
yaua_df = alpha_df.append(beta_df).append(alphabeta_df)
print(len(alpha_df), len(alphabeta_df), len(beta_df), len(yaua_df))
yaua_rs, yaua_ps = nd.analyze_peakE_by_SStype(yaua_df, ndv='y-a-ua')
print(nd.mjes_range(yaua_df.peakE.values))
# combine ux-a-ua
alpha_df = ux_df.query("SStype == 'a'")
alphabeta_df = a_df.query("SStype == 'ab'")
beta_df = ua_df.query("SStype == 'b'")
uxaua_df = alpha_df.append(beta_df).append(alphabeta_df)
print(len(alpha_df), len(alphabeta_df), len(beta_df), len(uxaua_df))
uxaua_rs, uxaua_ps = nd.analyze_peakE_by_SStype(uxaua_df, ndv='ux-a-ua')
print(nd.mjes_range(uxaua_df.peakE.values))
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(12, 4), constrained_layout=True)
plt.suptitle("Kamagata(20): combined peak Es", fontsize=14)
ax1.set_title("y-a-ua " + str(np.round(yaua_rs[0], 3)))
nd.plot_peakE_by_SStype(yaua_df, ax1, yaua_rs)
ax1.legend(title="SStype")
ax2.set_title("ux-a-ua " + str(np.round(uxaua_rs[0], 3)))
nd.plot_peakE_by_SStype(uxaua_df, ax2, uxaua_rs)
ax2.legend(title="SStype")
ax3.set_title("a " + str(np.round(a_rs[0], 3)))
nd.plot_peakE_by_SStype(a_df, ax3, a_rs)
_ = ax3.legend(title="SStype")
SKLC January 2021