Secondary Structure Elements from DSSP.

  • Obtain SSEs for residues of a protein chain from DSSP output.
  • The DSSP output is reduced to three labels: H (HIG), S (BE), and T (otherwise).
  • Consecutive identical labels are collected into a SSE, and given a numeric id.

Front end to query DSSP api

In [1]:
# Adapted from https://www3.cmbi.umcn.nl/xssp/api/examples
# Python 3
"""
The client takes a PDB file (must end with .pdb), sends it to the REST service, which
creates DSSP data. The DSSP data is then output to a file.

Example:
    python pdb_to_dssp.py 1crn.pdb https://www3.cmbi.umcn.nl/xssp/
"""
# https://docs.python.org/3/library/argparse.html?highlight=requests#module-argparse
# https://requests.readthedocs.io/en/master/
import argparse
import json
import requests
import time

def pdb_to_dssp(pdb_file_path, rest_url):
    # Read the pdb file data into a variable
    files = {'file_': open(pdb_file_path, 'rb')}

    # Send a request to the server to create hssp data from the pdb file data.
    # If an error occurs, an exception is raised and the program exits. If the
    # request is successful, the id of the job running on the server is
    # returned.
    url_create = '{}api/create/pdb_file/dssp/'.format(rest_url)
    r = requests.post(url_create, files=files)
    r.raise_for_status()

    job_id = json.loads(r.text)['id']
    print("Job submitted successfully. Id is: '{}'".format(job_id))

    # Loop until the job running on the server has finished, either successfully
    # or due to an error.
    ready = False
    while not ready:
        # Check the status of the running job. If an error occurs an exception
        # is raised and the program exits. If the request is successful, the
        # status is returned.
        url_status = '{}api/status/pdb_file/dssp/{}/'.format(rest_url, job_id)
        r = requests.get(url_status)
        r.raise_for_status()

        status = json.loads(r.text)['status']
        print("Job status is: '{}'".format(status))

        # If the status equals SUCCESS, exit out of the loop by changing the
        # condition ready. This causes the code to drop into the `else` block
        # below.
        #
        # If the status equals either FAILURE or REVOKED, an exception is raised
        # containing the error message. The program exits.
        #
        # Otherwise, wait for five seconds and start at the beginning of the
        # loop again.
        if status == 'SUCCESS':
            ready = True
        elif status in ['FAILURE', 'REVOKED']:
            raise Exception(json.loads(r.text)['message'])
        else:
            time.sleep(5)
    else:
        # Requests the result of the job. If an error occurs an exception is
        # raised and the program exits. If the request is successful, the result
        # is returned.
        url_result = '{}api/result/pdb_file/dssp/{}/'.format(rest_url, job_id)
        r = requests.get(url_result)
        r.raise_for_status()
        result = json.loads(r.text)['result']

        # Return the result to the caller, which prints it to the screen.
        return result

PDB ids in datasets

  • The pdb files have been downloaded into the respective directories prior, processed ("unused" parts are removed, and spaces added between columns where necessary) and given the _pdb.txt extension. This pre-processing may cause DSSP to fail; use original pdb file.
  • Notes specific to idiosyancrasies of my implementation. Please adjust accordingly to taste.
In [2]:
# dirname = "Paci"
# PIDs = ["1imq", "1lmb4", "1bf4", "2ptl", "2ci2", "1aps", "1fmk", "1bk2", "1shf", "1ten"]
In [3]:
# dirname = "Uzunoglo"
# PIDs = ["2pdd", "1enh", "1idy", "1lmb", "1ayi", "1imq", "2abd", "1hrc", "1div", "2ptl", 
#        "1bf4", "2ci2", "1ubq", "1poh", "1urn", "1ris", "1aps", "2acy", "1spr", "1fkb", 
#        "2vik", "1e0l", "1rlq", "1shg", "1shf", "1csp", "1mjc", "1ten", "1afi", "1c9o", 
#        "1e0m", "1fex", "1g6p", "1jo8", "1k9q", "1m9s", "1n88", "1o6x", "1psf", "1rfa", 
#        "1ryk", "1ss1", "1w4e", "1w4j", "1yza", "2bth", "2qjl", "3ait", "1pgb", "1tit"]
# 1div_n, 1fkb, 1ten, 1c9o, 1jo8, 1m9s, 2qjl - need to re-download pdb to generate dssp
# 1div split to 1div_n 1div_c
# 1m9s extract subset of residues, 1c9o extract A chain
# check for broken chain: grep '!' *_dssp.out
# 2pdd dssp output excludes residue 43
In [4]:
# dirname = "Kamagata"
# PIDs = ["1pgb", "2cro", "1cei", "2abd", "1hng", "1gxt", "1bta", "1tit", "1ttg", "1bni",
#        "1adw", "3chy", "1ael", "1joo", "1i1b", "1dwr", "2rn2", "2a5e", "1ra9", "1l63", 
#        "1php"]
# 1gxt 3chy 1i1b - need hetatoms in pdb to generate dssp
# exclude 1hrh missing residues 538...543
# check for broken chain: grep '!' *_dssp.out
In [5]:
# Canonical set 14 pids
dirname = "SingleDomain"
PIDs = [ "2f4k", "1bdd", "2abd_0", "1gb1_0", "1mi0", "1mhx", "2ptl_0", 
        "1shg", "1srm_1", "2ci2", "1aps_0", "2kjv_0", "2kjw_0", "1qys" ]
# 2f4k, 1mi0, 1qys - need to re-download pdb to generate dssp
In [6]:
import os
from shutil import copy2

def get_dssp(dirname, pid, copy=False):
    pdb_fn = os.path.join(dirname, pid + ".pdb")    
    if copy: # only the first time, check if file already exists
        copy2(os.path.join(dirname, pid + "_pdb.txt"), pdb_fn)     
    res = pdb_to_dssp(pdb_fn, "https://www3.cmbi.umcn.nl/xssp/")
    dssp_fn = os.path.join(dirname, pid + "_dssp.out")
    print(dssp_fn)
    with open(dssp_fn, 'w') as fout:  
        fout.write("nid rid Chain AA DSSP\n")
        for line in res[res.find('#'):].splitlines()[1:]: # ignore first line, header
            s = line.lstrip()[:15]
            fout.write(s+'\n')
    return
In [7]:
# Demo with one.
get_dssp("SingleDomain", "1bdd", copy=True)
Job submitted successfully. Id is: '15d24e5b-901a-4cc3-856f-6a8c2fa3c08c'
Job status is: 'SUCCESS'
SingleDomain\1bdd_dssp.out
In [8]:
# Run all
for pid in PIDs:
    print(pid)
    get_dssp(dirname, pid, copy=False)
2f4k
Job submitted successfully. Id is: '52e2c0cb-73d5-457c-847b-b9ba15fb57b0'
Job status is: 'SUCCESS'
SingleDomain\2f4k_dssp.out
1bdd
Job submitted successfully. Id is: '4a68e842-d705-4e7e-a7d8-58375e48f082'
Job status is: 'SUCCESS'
SingleDomain\1bdd_dssp.out
2abd_0
Job submitted successfully. Id is: '831920a5-59fe-44b7-a1e0-2fd945980deb'
Job status is: 'SUCCESS'
SingleDomain\2abd_0_dssp.out
1gb1_0
Job submitted successfully. Id is: '24302b04-c5f4-451a-996d-42f5baf46fca'
Job status is: 'SUCCESS'
SingleDomain\1gb1_0_dssp.out
1mi0
Job submitted successfully. Id is: 'ab01cbef-d3e4-4188-930e-0e912c94b9d1'
Job status is: 'SUCCESS'
SingleDomain\1mi0_dssp.out
1mhx
Job submitted successfully. Id is: 'f95b333b-e649-42cf-8c20-926d8b08729f'
Job status is: 'SUCCESS'
SingleDomain\1mhx_dssp.out
2ptl_0
Job submitted successfully. Id is: '88eecbf9-487f-47d1-b9fa-178a7bb18a8c'
Job status is: 'SUCCESS'
SingleDomain\2ptl_0_dssp.out
1shg
Job submitted successfully. Id is: '99902bbd-0742-4185-8c83-86cacf3fe706'
Job status is: 'SUCCESS'
SingleDomain\1shg_dssp.out
1srm_1
Job submitted successfully. Id is: '72984fd5-1d7c-4960-a639-f713d2a682a5'
Job status is: 'SUCCESS'
SingleDomain\1srm_1_dssp.out
2ci2
Job submitted successfully. Id is: 'ccf1bec8-0b77-4a84-a3e5-ebd7bdbbc1f6'
Job status is: 'SUCCESS'
SingleDomain\2ci2_dssp.out
1aps_0
Job submitted successfully. Id is: 'f5c4b784-99ac-47df-b1d8-5cf5cd0aade5'
Job status is: 'SUCCESS'
SingleDomain\1aps_0_dssp.out
2kjv_0
Job submitted successfully. Id is: '1c6dc06d-0f30-4dbe-867b-fa4fd144e2dd'
Job status is: 'SUCCESS'
SingleDomain\2kjv_0_dssp.out
2kjw_0
Job submitted successfully. Id is: '911a6b51-5db9-4902-948e-95b8ff74af7e'
Job status is: 'SUCCESS'
SingleDomain\2kjw_0_dssp.out
1qys
Job submitted successfully. Id is: 'e2790487-54d9-460b-93c6-4e79a5827c96'
Job status is: 'SUCCESS'
SingleDomain\1qys_dssp.out

C/C++ function to do the conversion.

//ND/FP1
//Converts a pid_dssp.out input text file to a pid_sse.txt output text file.
//input header: nid rid Chain AA DSSP
//output header: nid rid SS SSE AA

void FU::prep_sse(string pid) {  
  string fn = pid + "_dssp.out";
  ifstream fin(fn.c_str());
  if (fin.fail()) {
    cout << fn << " input error." << endl;
    return;
  }
  string ofn = pid + "_sse1.txt";
  ofstream fout(ofn.c_str());
  fout << "nid rid SS SSE AA" << endl;
  char prev_d = '-', d; int nid = 0, sse = 0;  
  string s, ds; vector<string> ss;        
  getline(fin, s);  //header: nid rid Chain AA DSSP
  getline(fin, s);
  while (!fin.eof()) {
    ::tokenize_string(s, ss); //external function to tokenize space separated string
    ds = ss.at(4);
    if (ds == "H" || ds == "I" || ds == "G") //helix
      d = 'H';
    else if (ds == "B" || ds == "E") //strand
      d = 'S';
    else
      d = 'T'; //otherwise
    if ('-' == prev_d) {  //first residue
      prev_d = d;
      fout << nid++ << " " << ss.at(1) << " " << d << " " 
        << sse << " " << ss.at(3) << endl;
    }
    else {
      if (d == prev_d) {  //remain in same sse
        fout << nid++ << " " << ss.at(1) << " " << d << " " 
          << sse << " " << ss.at(3) << endl;
      }
      else {  //switch to new sse
        fout << nid++ << " " << ss.at(1) << " " << d << " " 
          << ++sse << " " << ss.at(3) << endl;
        prev_d = d;
      }
    }
    getline(fin, s);
  }
  fin.close();
  fout.close();
}

Query SSEs

In [9]:
import pandas as pd
import numpy as np
import os
import re
In [10]:
pid = "1aps_0"
fn = os.path.join(dirname, pid + "_sse.txt")
df = pd.read_table(fn, sep='\s+')
df.sample(5)
Out[10]:
nid rid SS SSE
22 22 23 H 3
74 74 75 T 10
82 82 83 S 11
75 75 76 T 10
5 5 6 T 0
In [11]:
# calculate number of H, S, T residues
n = len(df)
h = len(df.query("SS == 'H'"))
s = len(df.query("SS == 'S'"))
t = len(df.query("SS == 'T'"))
print(pid, n)
print("H", h, round(h/n, 2))
print("S", s, round(s/n, 2))
print("T", t, round(t/n, 2))
1aps_0 98
H 23 0.23
S 36 0.37
T 39 0.4
In [12]:
ds = ''.join(df.SS.values)
ds
Out[12]:
'TTTTTTSSSSSSSTTTTTTTTHHHHHHHHHHHHTTSSSSSSSTTTSSSSSSSSTHHHHHHHHHHHTTTTTTTTTTTSSSSSSSSSTTTTTTTSSSSST'
In [13]:
s = df.SSE.values
s
Out[13]:
array([ 0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  1,  1,  1,  2,  2,  2,  2,
        2,  2,  2,  2,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  4,
        4,  5,  5,  5,  5,  5,  5,  5,  6,  6,  6,  7,  7,  7,  7,  7,  7,
        7,  7,  8,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, 10, 10, 10,
       10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11,
       12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 14], dtype=int64)
In [14]:
# number of SSEs, non-Turn SSEs (fold stpes)
gdf = df.groupby('SSE').min()
print("Number of SSEs", len(gdf))
print("Number of non-Turn SSEs", len(gdf.query("SS != 'T'")))
Number of SSEs 15
Number of non-Turn SSEs 7
In [15]:
gdf.query("SS != 'T'").index
Out[15]:
Int64Index([1, 3, 5, 7, 9, 11, 13], dtype='int64', name='SSE')
In [16]:
gdf.query("SS != 'T'").SS
Out[16]:
SSE
1     S
3     H
5     S
7     S
9     H
11    S
13    S
Name: SS, dtype: object
In [17]:
# get the non-Turn SSE strings
p = re.compile(r'[S|H]+')
m = p.findall(ds)
m
Out[17]:
['SSSSSSS',
 'HHHHHHHHHHHH',
 'SSSSSSS',
 'SSSSSSSS',
 'HHHHHHHHHHH',
 'SSSSSSSSS',
 'SSSSS']
In [18]:
# longest Turn SSE
p = re.compile(r'T+')
m = p.findall(ds)
print(m)
longest = max([ len(s) for s in m ])
longest
['TTTTTT', 'TTTTTTTT', 'TT', 'TTT', 'T', 'TTTTTTTTTTT', 'TTTTTTT', 'T']
Out[18]:
11

End of notebook

SKLC August 2020