import os
import numpy as np
import pandas as pd
from scipy.stats import pearsonr
from sklearn.metrics import r2_score
from sklearn import linear_model
import matplotlib.pyplot as plt
import NDutils as nd # my lib
ND variants:
EDS variants:
Variables:
# plot colors for the five ND variants
NDVs = ['c', 'x', 'r', 'a', 'y']
ndv_colors = {'c': 'tab:blue', 'x': 'tab:orange', 'r': 'tab:green',
'a': 'tab:red', 'y': 'tab:purple'}
def analyze_E(nd_type, ndstats: dict, mje_params: dict, peakQs: list, peakEs: list,
y_pred: np.array)->np.array:
"""Inputs: nd_type, ndstats.
Outputs: mje_params, peakQs, peakEs, y_pred whose shape is (N, 1)
Returns: list of values.
"""
mje_params.update(nd.get_params(ndstats, 'sc_mje'))
peakQs[:], peakEs[:] = nd.get_peaks(mje_params, PIDs)
print(nd_type, np.mean(peakQs), np.std(peakQs), nd.mjes_range(peakEs))
x = np.array(peakEs)
reg.fit(x.reshape(-1,1), y_true)
# y_pred[:] = reg.predict(x.reshape(-1,1)) # column-vector
np.copyto(y_pred, reg.predict(x.reshape(-1,1))) # numpy 1.7.0
print(pearsonr(fold_rates, x), r2_score(fold_rates, y_pred), np.ptp(y_pred))
print(nd.mra(y_true, y_pred)) # mse, r2, adj_r2
# follow Table columns in paper
c1, c2, c3, c4, c5 = pearsonr(fold_rates, x), np.mean(peakQs), nd.mjes_range(peakEs), \
r2_score(fold_rates, y_pred), np.ptp(y_pred)
return np.round([c1[0], c1[1], c2, c3, c4, c5], 3)
def plot_E(plt_title: str): # could choose which to plot
fig = plt.figure(figsize=(8, 6))
plt.title(plt_title, fontsize=14)
plt.scatter(c_peakEs, fold_rates, color=ndv_colors['c'], alpha=0.5)
plt.plot(c_peakEs, c_y_pred, color=ndv_colors['c'], label="'c' " + str(c_res[0]))
plt.scatter(x_peakEs, fold_rates, color=ndv_colors['x'], alpha=0.5)
plt.plot(x_peakEs, x_y_pred, color=ndv_colors['x'], label="'x' " + str(x_res[0]))
plt.scatter(r_peakEs, fold_rates, color=ndv_colors['r'], alpha=0.5)
plt.plot(r_peakEs, r_y_pred, color=ndv_colors['r'], label="'r' " + str(r_res[0]))
plt.scatter(a_peakEs, fold_rates, color=ndv_colors['a'], alpha=0.5)
plt.plot(a_peakEs, a_y_pred, color=ndv_colors['a'], label="'a' " + str(a_res[0]))
plt.scatter(y_peakEs, fold_rates, color=ndv_colors['y'], alpha=0.5)
plt.plot(y_peakEs, y_y_pred, color=ndv_colors['y'], label="'y' " + str(y_res[0]))
plt.xlabel("Peak ND energy $E$", fontsize=14)
plt.ylabel("Experimental folding rate.", fontsize=14)
plt.legend(title="ndv corr"); plt.grid()
return plt
def fait_analyze_E():
global c_mje_params, c_peakQs, c_peakEs, c_y_pred, c_res
c_mje_params = {}; c_peakQs, c_peakEs = [],[]; c_y_pred = np.empty((num_pids, 1))
c_res = analyze_E('c', c_ndstats, c_mje_params, c_peakQs, c_peakEs, c_y_pred)
global x_mje_params, x_peakQs, x_peakEs, x_y_pred, x_res
x_mje_params = {}; x_peakQs, x_peakEs = [],[]; x_y_pred = np.empty((num_pids, 1))
x_res = analyze_E('x', x_ndstats, x_mje_params, x_peakQs, x_peakEs, x_y_pred)
global r_mje_params, r_peakQs, r_peakEs, r_y_pred, r_res
r_mje_params = {}; r_peakQs, r_peakEs = [],[]; r_y_pred = np.empty((num_pids, 1))
r_res = analyze_E('r', r_ndstats, r_mje_params, r_peakQs, r_peakEs, r_y_pred)
global a_mje_params, a_peakQs, a_peakEs, a_y_pred, a_res
a_mje_params = {}; a_peakQs, a_peakEs = [],[]; a_y_pred = np.empty((num_pids, 1))
a_res = analyze_E('a', a_ndstats, a_mje_params, a_peakQs, a_peakEs, a_y_pred)
global y_mje_params, y_peakQs, y_peakEs, y_y_pred, y_res
y_mje_params = {}; y_peakQs, y_peakEs = [],[]; y_y_pred = np.empty((num_pids, 1))
y_res = analyze_E('y', y_ndstats, y_mje_params, y_peakQs, y_peakEs, y_y_pred)
vals = np.vstack([c_res, x_res, r_res, a_res, y_res])
df = pd.DataFrame(vals).assign(ndv = ['c', 'x', 'r', 'a', 'y']).set_index('ndv')
df.columns = ['corr', 'pval', 'peakQ', 'E_range', 'r2_score', 'pred_range']
return df
def analyze_CO(nd_type: chr, CO_type:str):
"""Inputs: nd_type, CO_type.
CO_type is one of scn0_CO, scn0_RCO and scn0_lnCo and must be a column in a ndstats df.
Returns: a tuple of two lists (correl coefficients, p-values).
"""
ndstats = eval(nd_type + "_ndstats", globals()) # no error checking!
CO_params = nd.get_params(ndstats, CO_type)
CO_rhos, CO_pvals = nd.corrs(CO_params, fold_rates, PIDs)
return CO_rhos, CO_pvals
def plot_CO(plt_title: str):
fig, axes = plt.subplots(2, 3, figsize=(12, 8), constrained_layout=True)
plt.suptitle("Contact-Order of ND native shortcuts and exp. fold rate, " + plt_title,
fontsize=14)
for i, CO_type in enumerate(["scn0_CO", "scn0_lnCO", "scn0_RCO"]): # match col names
ax = axes[0][i]; axp = axes[1][i]
ax.set_title(CO_type[:4].upper() + CO_type[4:], fontsize=14)
for nd_type in NDVs:
CO_rhos, CO_pvals = analyze_CO(nd_type, CO_type)
ax.plot(nd.Qs, CO_rhos, '.--', color=ndv_colors[nd_type], label=nd_type)
axp.plot(nd.Qs, CO_pvals, '.--', color=ndv_colors[nd_type], label=nd_type)
ax.set_ylabel("Pearson correlation coefficient", fontsize=14)
ax.set_xlabel("Q"); ax.set_xticks(nd.Qs)
ax.legend(); ax.grid()
axp.set_ylabel("p-values", fontsize=14)
# axp.plot([0.1, 1.0], [0.05, 0.05], color='gray', label="pval=0.05")
axp.set_xlabel("Q", fontsize=14); axp.set_xticks(nd.Qs)
axp.legend(); axp.grid()
return plt
def analyze_C(nd_type: chr, C_type: str):
"""Inputs: nd_type, C_type.
C_type is scn0_C and must be a column in a ndstats df.
Returns: a tuple of two lists (correl coefficients, p-values).
"""
ndstats = eval(nd_type + "_ndstats", globals()) # no error checking!
C_params = nd.get_params(ndstats, C_type)
C_rhos, C_pvals = nd.corrs(C_params, fold_rates, PIDs)
return C_rhos, C_pvals
dirname = os.path.join("UZ_ND3")
fn = os.path.join(dirname, 'UZ_frates.txt')
df = nd.load_frates(fn)
PIDs = df.pid.tolist(); num_pids = len(PIDs)
fold_rates = df.k_f.values
print(num_pids)
print("Range of fold rates:", np.ptp(fold_rates))
y_true = fold_rates.reshape(-1, 1)
reg = linear_model.LinearRegression()
# load stats for the five ND variants, dict of df keyed by pid
edv = 'a'
dirn = os.path.join(dirname, "edv_" + edv)
c_ndstats = nd.get_NDstats(dirn, PIDs, 'c', edv)
x_ndstats = nd.get_NDstats(dirn, PIDs, 'x', edv)
r_ndstats = nd.get_NDstats(dirn, PIDs, 'r', edv)
a_ndstats = nd.get_NDstats(dirn, PIDs, 'a', edv)
y_ndstats = nd.get_NDstats(dirn, PIDs, 'y', edv)
# dict keyed by (pid, q) -> avg stdev scle per q
c_scle_params = nd.get_params(c_ndstats, 'scn0_le')
x_scle_params = nd.get_params(x_ndstats, 'scn0_le')
r_scle_params = nd.get_params(r_ndstats, 'scn0_le')
a_scle_params = nd.get_params(a_ndstats, 'scn0_le')
y_scle_params = nd.get_params(y_ndstats, 'scn0_le')
import random
# random sample of 10 pids
sample_pids = random.sample(PIDs, 10)
fig, axes = plt.subplots(2, 5, figsize=(20, 8), constrained_layout=True)
fig.suptitle("ND long-range native shortcuts for 10 chains in Uzunoglo dataset.", fontsize=14)
for i, pid in enumerate(sample_pids):
r = i//5; c = i%5; ax = axes[r][c]
ax.set_title(pid, fontsize=14)
avg_scles = [ c_scle_params[(pid, q)][0] for q in nd.Qs ]
ax.plot(nd.Qs, avg_scles, color=ndv_colors['c'], label='c')
avg_scles = [ x_scle_params[(pid, q)][0] for q in nd.Qs ]
ax.plot(nd.Qs, avg_scles, color=ndv_colors['x'], label='x')
avg_scles = [ r_scle_params[(pid, q)][0] for q in nd.Qs ]
ax.plot(nd.Qs, avg_scles, color=ndv_colors['r'], label='r')
avg_scles = [ a_scle_params[(pid, q)][0] for q in nd.Qs ]
ax.plot(nd.Qs, avg_scles, color=ndv_colors['a'], label='a')
avg_scles = [ y_scle_params[(pid, q)][0] for q in nd.Qs ]
ax.plot(nd.Qs, avg_scles, color=ndv_colors['y'], label='y')
ax.set_ylabel("Mean SCLE", fontsize=14); ax.set_xlabel("Q", fontsize=14)
ax.legend()
# ofn = os.path.join(dirname, "UZ_ndv_SCLEsample_" + edv + ".png")
# plt.savefig(ofn)
agc, agx, cgx = [], [], [] # a > c, a > x, c > x
for q in nd.Qs:
a_avg_scles = np.array([ a_scle_params[(pid, q)][0] for pid in PIDs ])
c_avg_scles = np.array([ c_scle_params[(pid, q)][0] for pid in PIDs ])
x_avg_scles = np.array([ x_scle_params[(pid, q)][0] for pid in PIDs ])
count = np.sum(a_avg_scles > c_avg_scles)
frac = count/num_pids; agc.append(frac)
# print(q, count, frac)
count = np.sum(a_avg_scles > x_avg_scles)
frac = count/num_pids; agx.append(frac)
# print(q, count, frac)
count = np.sum(c_avg_scles > x_avg_scles)
frac = count/num_pids; cgx.append(frac)
# print(q, count, frac); print()
plt.title("Comparison of long-range native shortcuts between ND variants \n (EDS variant '" +
edv + "'), Uzunoglo dataset")
plt.plot(nd.Qs, cgx, '.--', label="c > x")
plt.plot(nd.Qs, agx, '+--', color="tab:gray", label="a > x")
plt.plot(nd.Qs, agc, 'x--', color="tab:red", label="a > c")
plt.xlabel("Q"); plt.xticks(nd.Qs);
plt.ylabel("Proportion of pids"); plt.yticks([0.0] + nd.Qs)
plt.legend(); plt.grid()
# ofn = os.path.join(dirname, "UZ_ndv_SCLE_" + edv + ".png")
# plt.savefig(ofn)
df = fait_analyze_E()
ofn = os.path.join(dirname, "UZ_ndv_fratecorr_" + edv + ".csv")
df.to_csv(ofn)
df
plt = plot_E("EDS variant '"+ edv + "', Uzunoglo dataset.")
ofn = os.path.join(dirname, "UZ_ndv_fratecorr_" + edv + ".png")
plt.savefig(ofn)
fig, axes = plt.subplots(2, 2, figsize=(10, 8), constrained_layout=True)
plt.suptitle("ND energy and exp. fold rate, EDS variant '" + edv + "', Uzunoglo dataset.")
ax = axes[0][0]; axp = axes[1][0]
for nd_type in ['a', 'c', 'x']:
mje_params = eval(nd_type + "_mje_params", globals())
E_rhos, E_pvals = nd.corrs(mje_params, fold_rates, PIDs)
ax.plot(nd.Qs, E_rhos, '.--', color=ndv_colors[nd_type], label=nd_type)
axp.plot(nd.Qs, E_pvals, '.--', color=ndv_colors[nd_type], label=nd_type)
res = eval(nd_type + "_res", globals())
ax.scatter(res[2], res[0], marker='^', s=80, color=ndv_colors[nd_type],
label=nd_type + " peak")
ax.set_ylabel("Pearson correlation coefficient")
ax.set_xlabel("Q"); ax.set_xticks(nd.Qs)
ax.legend(); ax.grid()
axp.set_ylabel("p-values")
axp.set_xlabel("Q"); axp.set_xticks(nd.Qs)
axp.legend(); axp.grid()
ax = axes[0][1]; axp = axes[1][1]
for nd_type in ['r', 'y']:
mje_params = eval(nd_type + "_mje_params", globals())
E_rhos, E_pvals = nd.corrs(mje_params, fold_rates, PIDs)
ax.plot(nd.Qs, E_rhos, '.--', color=ndv_colors[nd_type], label=nd_type)
axp.plot(nd.Qs, E_pvals, '.--', color=ndv_colors[nd_type], label=nd_type)
res = eval(nd_type + "_res", globals())
ax.scatter(res[2], res[0], marker='^', s=80, color=ndv_colors[nd_type],
label=nd_type + " peak")
ax.set_ylabel("Pearson correlation coefficient")
ax.set_xlabel("Q"); ax.set_xticks(nd.Qs)
ax.legend(); ax.grid()
axp.set_ylabel("p-values")
axp.set_xlabel("Q"); axp.set_xticks(nd.Qs)
axp.legend(); axp.grid()
# ofn = os.path.join(dirname, "UZ_ndv_fratecorr1_" + edv + ".png")
# plt.savefig(ofn)
plt = plot_CO("EDS variant '" + edv + "', Uzunoglo dataset")
# ofn = os.path.join(dirname, "UZ_ndv_SCN0CO_" + edv + ".png")
# plt.savefig(ofn)
fig, axes = plt.subplots(2, 2, figsize=(10, 8), constrained_layout=True)
plt.suptitle("Network clustering $C$ of ND native shortcuts and exp. fold rate, " +
"EDS variant '" + edv + "', Uzunoglo dataset")
ax = axes[0][0]; axp = axes[1][0]
for nd_type in ['c', 'x', 'r', 'a']:
C_rhos, C_pvals = analyze_C(nd_type, "scn0_C")
ax.plot(nd.Qs, C_rhos, '.--', color=ndv_colors[nd_type], label=nd_type)
axp.plot(nd.Qs, C_pvals, '.--', color=ndv_colors[nd_type], label=nd_type)
ax.set_ylabel("Pearson correlation coefficient")
ax.set_xlabel("Q"); ax.set_xticks(nd.Qs)
ax.legend(); ax.grid()
axp.set_ylabel("p-values")
axp.set_xlabel("Q"); axp.set_xticks(nd.Qs)
axp.legend(); axp.grid()
# plot 'y' separately because of its dramatically different behavior
ax = axes[0][1]; axp = axes[1][1]; nd_type = 'y'
C_rhos, C_pvals = analyze_C(nd_type, "scn0_C")
ax.plot(nd.Qs, C_rhos, '.--', color=ndv_colors[nd_type], label=nd_type)
axp.plot(nd.Qs, C_pvals, '.--', color=ndv_colors[nd_type], label=nd_type)
ax.set_ylabel("Pearson correlation coefficient")
ax.set_xlabel("Q"); ax.set_xticks(nd.Qs)
ax.legend(); ax.grid()
axp.set_ylabel("p-values")
axp.set_xlabel("Q"); axp.set_xticks(nd.Qs)
axp.legend(); axp.grid()
# ofn = os.path.join(dirname, "UZ_ndv_SCN0Clus_" + edv + ".png")
# plt.savefig(ofn)
dirname = "K_ND3"
fn = os.path.join(dirname, 'K_frates.txt')
df = nd.load_frates(fn)
PIDs = df.pid.tolist(); fold_rates = df.k_f.values
# remove outlier: 1ael [12]
print(PIDs[12], fold_rates[12])
print(PIDs.pop(12))
fold_rates = np.delete(fold_rates, 12)
num_pids = len(PIDs)
print(num_pids)
print("Range of fold rates:", np.ptp(fold_rates))
y_true = fold_rates.reshape(-1, 1)
reg = linear_model.LinearRegression()
# load stats for the five ND variants, dict of df keyed by pid
edv = 'a'
dirn = os.path.join(dirname, "edv_"+edv)
c_ndstats = nd.get_NDstats(dirn, PIDs, 'c', edv)
x_ndstats = nd.get_NDstats(dirn, PIDs, 'x', edv)
r_ndstats = nd.get_NDstats(dirn, PIDs, 'r', edv)
a_ndstats = nd.get_NDstats(dirn, PIDs, 'a', edv)
y_ndstats = nd.get_NDstats(dirn, PIDs, 'y', edv)
df = fait_analyze_E()
# ofn = os.path.join(dirname, "K_ndv_fratecorr_" + edv + ".csv")
# df.to_csv(ofn)
df
plt = plot_E("EDS variant '" + edv + "', Kamagata dataset.")
# ofn = os.path.join(dirname, "K_ndv_fratecorr_" + edv + ".png")
# plt.savefig(ofn)
dirname = "P_ND3"
fn = os.path.join(dirname, 'P_frates.txt')
df = nd.load_frates(fn)
PIDs = df.pid.tolist(); fold_rates = df.k_f.values
num_pids = len(PIDs)
print(num_pids)
print("Range of fold rates:", np.ptp(fold_rates))
y_true = fold_rates.reshape(-1, 1)
reg = linear_model.LinearRegression()
# load stats for the five ND variants, dict of df keyed by pid
edv = 'a'
dirn = os.path.join(dirname, "edv_" + edv)
c_ndstats = nd.get_NDstats(dirn, PIDs, 'c', edv)
x_ndstats = nd.get_NDstats(dirn, PIDs, 'x', edv)
r_ndstats = nd.get_NDstats(dirn, PIDs, 'r', edv)
a_ndstats = nd.get_NDstats(dirn, PIDs, 'a', edv)
y_ndstats = nd.get_NDstats(dirn, PIDs, 'y', edv)
df = fait_analyze_E()
# ofn = os.path.join(dirname, "P_ndv_fratecorr_" + edv + ".csv")
# df.to_csv(ofn)
df
plt = plot_E("EDS variant '" + edv + "', Paci dataset")
# ofn = os.path.join(dirname, "P_ndv_fratecorr_" + edv + ".png")
# plt.savefig(ofn)
print("EDV", edv)
print('a', a_peakQs)
print('c', c_peakQs)
print('x', x_peakQs)
print('r', r_peakQs)
print('y', y_peakQs)
peakQs = np.vstack([a_peakQs, c_peakQs, x_peakQs, r_peakQs, y_peakQs]).T
df = pd.DataFrame(peakQs)
df.columns = ['a', 'c', 'x', 'r', 'y']
df.mean()
ax = df.boxplot(showmeans=True)
ax.set_title("EDS variant " + edv + ", Paci dataset", fontsize=14)
ax.set_ylabel("peak Q", fontsize=14)
ax.set_xlabel("ND variant", fontsize=14)
_ = ax.set_xticklabels(df.columns.values, fontsize=14)
plt = plot_CO("EDS variant '" + edv + "', Paci dataset")
# ofn = os.path.join(dirname, "Paci_ndv_SCN0CO_" + edv + ".png")
# plt.savefig(ofn)
dirname = "SingleDomain_ND3"
PIDs = [ "2f4k", "1bdd", "2abd_0", "1gb1_0", "1mi0_swissmin", "1mhx_swissmin", "2ptl_0",
"1shg", "1srm_1", "2ci2", "1aps_0", "2kjv_0", "2kjw_0", "1qys" ]
fold_rates = np.arange(0.1, 1.5, step=0.1) # dummy fold rates
num_pids = len(PIDs)
y_true = fold_rates.reshape(-1, 1)
reg = linear_model.LinearRegression()
# load stats for the five ND variants, dict of df keyed by pid
edv = 'a'
dirn = os.path.join(dirname, "edv_" + edv)
c_ndstats = nd.get_NDstats(dirn, PIDs, 'c', edv)
x_ndstats = nd.get_NDstats(dirn, PIDs, 'x', edv)
r_ndstats = nd.get_NDstats(dirn, PIDs, 'r', edv)
a_ndstats = nd.get_NDstats(dirn, PIDs, 'a', edv)
y_ndstats = nd.get_NDstats(dirn, PIDs, 'y', edv)
df = fait_analyze_E() # fake exp fold rate, called to get peakQs and peak Es
print("EDV", edv)
print('a', a_peakQs)
print('c', c_peakQs)
print('x', x_peakQs)
print('r', r_peakQs)
print('y', y_peakQs)
peakQs = np.vstack([a_peakQs, c_peakQs, x_peakQs, r_peakQs, y_peakQs]).T
df = pd.DataFrame(peakQs)
df.columns = ['a', 'c', 'x', 'r', 'y']
df.mean()
ax = df.boxplot(showmeans=True)
ax.set_title("EDS variant " + edv, fontsize=14)
ax.set_ylabel("peak Q", fontsize=14)
ax.set_xlabel("ND variant", fontsize=14)
_ = ax.set_xticklabels(df.columns.values, fontsize=14)
plt.plot(PIDs, c_peakEs, label='c')
plt.plot(PIDs, x_peakEs, label='x')
plt.plot(PIDs, r_peakEs, label='r')
plt.plot(PIDs, a_peakEs, label='a')
plt.plot(PIDs, y_peakEs, label='y')
plt.xticks(PIDs, [ p[:4] for p in PIDs ], rotation=90)
_ = plt.legend()
SKLC September 2020