ND folding pathways¶

Dataset: 14 single domain proteins (pids) whose folding pathways were studied previously.
EDS variants: 'abslv'. ND variants: 'craxy'.
$C_{SCN0}$ folding pathways are computed from ND generated networks.
Compute $P_{init}$, probability (fraction of 100 ND runs per pid) that the first fold step at each $Q$, matches the first fold step from the native structure (PRN0).
Of interest is the $Q$ at which $P_{init} \ge 0.5$ first occurs. Do these $Q$s lie within the ND-TS region $Q < 0.5$?
- The results depend on the EDS, ND combination.
The table below reports the number of pids with $P_{init} \ge 50%$ within the ND-TS region. | |'a'|'c'|'x'|'r'|'y'|Sum| |:---:|:---:|:---:|:---:|:---:|:---:|:---:| |'a'|9| 7| 8| 8| 3| 35| |'b'|8| 8| 9| 8| 1| 34| |'s'|8| 8| 6| 8| 4| 34| |'l'|8| 7| 7| 9| 4| 35| |'v'|8| 7| 5| 8| 4| 32| |Sum|41|37|35|41|16|
The 'a' ND variant with 'a' EDS variant produced the best overall result. This combination boasts the largest column-wise sum and the largest row-wise sum at $Q < 0.4$ and at $Q < 0.5$.
The next best combination at $Q < 0.5$ is the 'r' ND variant with 'l' EDS variant. This combination is also best at $Q < 0.6$.
The 'r' ND variant yield results (largest column-wise sum) which are as good as or better than the biased ND variants, at $Q < 0.5$ and $Q < 0.6$.

import pandas as pd
import numpy as np
import os
import pickle # to save Python dict structure with tuple key, not supported by JSON directly.
import matplotlib.pyplot as plt

import pathways as pwy # my lib
# Uses ProcessPoolExecutor to speed up pathway checking.

Canonical set (single domain)¶

Qs = pwy.Qs
PIDs = pwy.PIDs
print(PIDs)

['2f4k', '1bdd', '2abd_0', '1gb1_0', '1mi0_swissmin', '1mhx_swissmin', '2ptl_0', '1shg', '1srm_1', '2ci2', '1aps_0', '2kjv_0', '2kjw_0', '1qys']

# EDV=a; peak Qs computed previously
a_peakQs = np.array([0.1, 0.2, 0.2, 0.2, 0.2, 0.2, 0.3, 0.2, 0.2, 0.3, 0.3, 0.2, 0.2, 0.2])
c_peakQs = np.array([0.1, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3, 0.2, 0.2, 0.3, 0.3, 0.3, 0.2, 0.2])
x_peakQs = np.array([0.1, 0.4, 0.3, 0.2, 0.3, 0.3, 0.3, 0.3, 0.2, 0.3, 0.3, 0.3, 0.3, 0.2])
r_peakQs = np.array([0.1, 0.2, 0.2, 0.2, 0.1, 0.1, 0.2, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.1])
y_peakQs = np.array([0.1, 0.2, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.1])

print('a', np.mean(a_peakQs))
print('c', np.mean(c_peakQs))
print('x', np.mean(x_peakQs))
print('r', np.mean(r_peakQs))
print('y', np.mean(y_peakQs))

a 0.21428571428571433
c 0.23571428571428574
x 0.2714285714285714
r 0.15714285714285717
y 0.14285714285714285

# This takes about 2 minutes with my computer setup. 
# Comment out if loading results from pickle file.
# edv = 'a'
# pwy.main(edv)

# pickle results
#with open("pinit_" + edv + ".pickle", 'wb') as fout:
#    pickle.dump(pwy.props_dict, fout)

# import json
# with open("pinit_" + edv + ".txt", 'w') as fout:
#    json.dump(pwy.props_dict, fout) # doesn't like tuple keys
# https://stackoverflow.com/questions/12337583/saving-dictionary-whose-keys-are-tuples-with-json-python/34204513

# numpy interpolate and from scipy.interpolate import interp1d 
# my interpolate
def find_q(pinit):
    for i, p in enumerate(pinit):
        if p == 0.5:
            return Qs[i]
        elif p > 0.5:
            if i == 0:
                return Qs[i]
            else:
                return Qs[i-1]

# load a pickled result
edv = 'a'; print(edv)
with open("pinit_" + edv + ".pickle", 'rb') as fin:
    props_dict = pickle.load(fin)

a

props_df = pd.DataFrame(props_dict).sort_index(axis=1)
props_df["Q"] = Qs
props_df = props_df.set_index("Q")
props_df.head()

pid = "1gb1_0"
props_df[pid]

# my interpolate
c_pinit_Qs, x_pinit_Qs, r_pinit_Qs, a_pinit_Qs, y_pinit_Qs = [], [], [], [], []
for pid in PIDs:
    df = props_df[pid]
    c_pinit_q = find_q(df['c'].values)    
    x_pinit_q = find_q(df['x'].values)
    r_pinit_q = find_q(df['r'].values)
    a_pinit_q = find_q(df['a'].values)
    y_pinit_q = find_q(df['y'].values)
    print(pid, np.round([a_pinit_q, c_pinit_q, x_pinit_q, r_pinit_q, y_pinit_q], 3))
    c_pinit_Qs.append(c_pinit_q)
    x_pinit_Qs.append(x_pinit_q)
    r_pinit_Qs.append(r_pinit_q)
    a_pinit_Qs.append(a_pinit_q)
    y_pinit_Qs.append(y_pinit_q)

a_pinit_Qs = np.array(a_pinit_Qs)
c_pinit_Qs = np.array(c_pinit_Qs)
x_pinit_Qs = np.array(x_pinit_Qs)
r_pinit_Qs = np.array(r_pinit_Qs)
y_pinit_Qs = np.array(y_pinit_Qs)

2f4k [0.1 0.1 0.1 0.3 0.5]
1bdd [0.3 0.3 0.3 0.5 0.5]
2abd_0 [0.3 0.6 0.5 0.8 0.8]
1gb1_0 [0.1 0.2 0.1 0.8 0.9]
1mi0_swissmin [0.6 0.7 0.7 0.4 0.4]
1mhx_swissmin [0.3 0.6 0.5 0.4 0.5]
2ptl_0 [0.9 0.9 0.9 0.8 0.8]
1shg [0.2 0.2 0.3 0.3 0.7]
1srm_1 [0.7 0.7 0.3 0.7 0.7]
2ci2 [0.5 0.5 0.5 0.4 0.6]
1aps_0 [0.4 0.4 0.4 0.5 0.6]
2kjv_0 [0.3 0.3 0.3 0.4 0.4]
2kjw_0 [0.1 0.1 0.2 0.2 0.6]
1qys [0.5 0.6 0.6 0.2 0.4]

# analyze interpolated results
print("EDV", edv)
print("Mean p_init Q")
print('a', np.mean(a_pinit_Qs), np.std(a_pinit_Qs), np.round(a_pinit_Qs, 3))
print('c', np.mean(c_pinit_Qs), np.std(c_pinit_Qs), np.round(c_pinit_Qs, 3))
print('x', np.mean(x_pinit_Qs), np.std(x_pinit_Qs), np.round(x_pinit_Qs, 3))
print('r', np.mean(r_pinit_Qs), np.std(r_pinit_Qs), np.round(r_pinit_Qs, 3))
print('y', np.mean(y_pinit_Qs), np.std(y_pinit_Qs), np.round(y_pinit_Qs, 3))

print("\nMAD p_init Q from peak Q")
print('a', np.mean(np.abs(a_pinit_Qs - a_peakQs)))
print('c', np.mean(np.abs(c_pinit_Qs - c_peakQs)))
print('x', np.mean(np.abs(x_pinit_Qs - x_peakQs)))
print('r', np.mean(np.abs(r_pinit_Qs - r_peakQs)))
print('y', np.mean(np.abs(y_pinit_Qs - y_peakQs)))
print()
print('a', np.sum(a_pinit_Qs < 0.4), np.sum(a_pinit_Qs < 0.5), np.sum(a_pinit_Qs < 0.6))
print('c', np.sum(c_pinit_Qs < 0.4), np.sum(c_pinit_Qs < 0.5), np.sum(c_pinit_Qs < 0.6))
print('x', np.sum(x_pinit_Qs < 0.4), np.sum(x_pinit_Qs < 0.5), np.sum(x_pinit_Qs < 0.6))
print('r', np.sum(r_pinit_Qs < 0.4), np.sum(r_pinit_Qs < 0.5), np.sum(r_pinit_Qs < 0.6))
print('y', np.sum(y_pinit_Qs < 0.4), np.sum(y_pinit_Qs < 0.5), np.sum(y_pinit_Qs < 0.6))

EDV a
Mean p_init Q
a 0.37857142857142856 0.23046094007002396 [0.1 0.3 0.3 0.1 0.6 0.3 0.9 0.2 0.7 0.5 0.4 0.3 0.1 0.5]
c 0.4428571428571428 0.2411706145162019 [0.1 0.3 0.6 0.2 0.7 0.6 0.9 0.2 0.7 0.5 0.4 0.3 0.1 0.6]
x 0.4071428571428571 0.21864611235734235 [0.1 0.3 0.5 0.1 0.7 0.5 0.9 0.3 0.3 0.5 0.4 0.3 0.2 0.6]
r 0.4785714285714287 0.20763488362498048 [0.3 0.5 0.8 0.8 0.4 0.4 0.8 0.3 0.7 0.4 0.5 0.4 0.2 0.2]
y 0.5999999999999999 0.1558387444947959 [0.5 0.5 0.8 0.9 0.4 0.5 0.8 0.7 0.7 0.6 0.6 0.4 0.6 0.4]

MAD p_init Q from peak Q
a 0.19285714285714287
c 0.22142857142857145
x 0.17857142857142858
r 0.3214285714285715
y 0.4571428571428572

a 8 9 11
c 6 7 8
x 7 8 11
r 4 8 10
y 0 3 6

pinit_df = pd.DataFrame({'a' : a_pinit_Qs, 'c': c_pinit_Qs, 'x' : x_pinit_Qs, 
                         'r': r_pinit_Qs, 'y': y_pinit_Qs})

pinit_df.boxplot(showmeans=True)
plt.title("Q where P_init first approaches 0.5.\nEDS variant '" + edv + "'.")
plt.xlabel('ND variant', fontsize=14)
# plt.ylabel("Q where P_init >= 0.5", fontsize=14) 
plt.ylabel("P_init Q", fontsize=14)
plt.xticks(range(1, 6, 1), ['a', 'c', 'x', 'r', 'y'], fontsize=14)
_ = plt.yticks(np.arange(0, 1.1, 0.1))

plt.savefig("ND_ndv_pinitQ_" + edv + ".png")

fig, axes = plt.subplots(5, 3, figsize=(12, 20), constrained_layout=True)
plt.suptitle("Proportion of ND runs per $Q$ with native-like initial fold step $P_{init}$. " + 
             "(EDS variant '" + edv + "') ", fontsize=14)
plt.delaxes(axes[4][2])

for i, pid in enumerate(PIDs):
    r = i//3; c = i%3; ax = axes[r][c]    
    props_df[pid]['c'].plot(ax=ax, style='--', label='c ' + str(c_pinit_Qs[i]))
    props_df[pid]['x'].plot(ax=ax, style='--', label='x ' + str(x_pinit_Qs[i]))
    props_df[pid]['r'].plot(ax=ax, style='--', label='r ' + str(r_pinit_Qs[i])) 
    props_df[pid]['a'].plot(ax=ax, style='--', label='a ' + str(a_pinit_Qs[i]))    
    props_df[pid]['y'].plot(ax=ax, style='--', label='y ' + str(y_pinit_Qs[i]))
    ax.plot([0.1, 0.55], [0.5, 0.5], color='black')  # upper left quadrant
    ax.plot([0.55, 0.55], [0.5, 1.0], color='black')  # upper left quadrant
    ax.set_title(pid[:4])
    ax.set_xlabel('Q'); ax.set_xticks(Qs)
    # ax.set_ylabel('P_init'); 
    ax.set_yticks(np.arange(0, 1.1, 0.1))
    if pid == "2ptl_0":
        ax.legend(loc="upper left"); ax.grid()
    else:
        ax.legend(loc="lower right"); ax.grid()
    
plt.savefig("ND_firststep_" + edv + ".png")

Summary¶

# Q < 0.4 a c x r y
a_counts = [8, 6, 7, 4, 0]
b_counts = [6, 5, 5, 5, 0]
s_counts = [6, 7, 4, 7, 1]
l_counts = [6, 5, 6, 7, 0]
v_counts = [5, 5, 4, 5, 1]
counts = np.vstack([a_counts, b_counts, s_counts, l_counts, v_counts])
print("row sums ", counts.sum(axis=1))
print("col sums ", counts.sum(axis=0))

row sums  [25 21 25 24 20]
col sums  [31 28 26 28  2]

# Q < 0.5 a c x r y
a_counts = [9, 7, 8, 8, 3]
b_counts = [8, 8, 9, 8, 1]
s_counts = [8, 8, 6, 8, 4]
l_counts = [8, 7, 7, 9, 4]
v_counts = [8, 7, 5, 8, 4]
counts = np.vstack([a_counts, b_counts, s_counts, l_counts, v_counts])
print("row sums ", counts.sum(axis=1))
print("col sums ", counts.sum(axis=0))

row sums  [35 34 34 35 32]
col sums  [41 37 35 41 16]

# Q < 0.6 a c x r y
a_counts = [11, 8, 11, 10, 6]
b_counts = [10, 9, 10, 11, 5]
s_counts = [9, 10, 9, 13, 5]
l_counts = [10, 10, 11, 10, 7]
v_counts = [9, 9, 9, 10, 8]
counts = np.vstack([a_counts, b_counts, s_counts, l_counts, v_counts])
print("row sums ", counts.sum(axis=1))
print("col sums ", counts.sum(axis=0))

row sums  [46 45 46 48 45]
col sums  [49 46 50 54 31]

NDVs = ['a', 'c', 'x', 'r', 'y']
a_MADS = [0.193, 0.221, 0.178, 0.321, 0.457]
b_MADS = [0.221, 0.243, 0.214, 0.286, 0.471]
s_MADS = [0.236, 0.228, 0.271, 0.214, 0.450]
l_MADS = [0.221, 0.221, 0.193, 0.278, 0.428]
v_MADS = [0.271, 0.278, 0.286, 0.314, 0.421]

plt.plot(NDVs, a_MADS, 'o--', label='a')
plt.plot(NDVs, b_MADS, 'o--', label='b')
plt.plot(NDVs, s_MADS, 'o--', label='s')
plt.plot(NDVs, l_MADS, 'o--', label='l')
plt.plot(NDVs, v_MADS, 'o--', label='v')
plt.title("MAD between ND peak $Q$ and $P_{init} Q$", fontsize=14)
plt.xticks(NDVs, fontsize=14)
plt.xlabel("ND variant", fontsize=14)
plt.legend(title="EDV"); plt.grid()
plt.savefig("Pinit_MAD.png")

End of notebook¶

SKLC October 2020

	1aps_0					1bdd					...	2kjw_0					2ptl_0
	a	c	r	x	y	a	c	r	x	y	...	a	c	r	x	y	a	c	r	x	y
Q
0.1	0.07	0.02	0.03	0.02	0.0	0.19	0.14	0.15	0.17	0.09	...	0.47	0.45	0.16	0.32	0.0	0.03	0.02	0.01	0.05	0.00
0.2	0.16	0.08	0.10	0.05	0.0	0.34	0.28	0.30	0.31	0.26	...	0.56	0.58	0.43	0.44	0.0	0.05	0.06	0.13	0.11	0.00
0.3	0.24	0.16	0.20	0.11	0.0	0.39	0.42	0.45	0.50	0.14	...	0.67	0.69	0.54	0.57	0.0	0.14	0.10	0.17	0.29	0.00
0.4	0.48	0.39	0.29	0.32	0.0	0.51	0.53	0.46	0.55	0.25	...	0.84	0.75	0.64	0.77	0.0	0.15	0.16	0.17	0.29	0.00
0.5	0.65	0.76	0.46	0.69	0.0	0.66	0.68	0.45	0.62	0.50	...	0.93	0.93	0.74	0.94	0.0	0.15	0.21	0.28	0.36	0.09

	a	c	r	x	y
Q
0.1	0.48	0.36	0.23	0.39	0.00
0.2	0.63	0.46	0.44	0.51	0.00
0.3	0.54	0.58	0.40	0.46	0.00
0.4	0.51	0.55	0.46	0.60	0.00
0.5	0.49	0.57	0.47	0.60	0.00
0.6	0.56	0.53	0.43	0.65	0.31
0.7	0.63	0.41	0.40	0.67	0.28
0.8	0.61	0.52	0.48	0.52	0.23
0.9	0.92	0.92	0.57	0.96	0.45
1.0	0.99	0.99	0.99	0.99	0.99