Source code for godmd.godmd_prep

#!/usr/bin/env python3

"""Module containing the GOdMDPrep class and the command line interface."""
import argparse
from pathlib import Path
from biobb_common.generic.biobb_object import BiobbObject
from biobb_common.configuration import settings
from biobb_common.tools import file_utils as fu
from biobb_common.tools.file_utils import launchlogger
from biobb_godmd.godmd.common import check_input_path, check_output_path


[docs]class GOdMDPrep(BiobbObject):
    """
    | biobb_godmd GOdMDPrep
    | Helper bb to prepare inputs for the `GOdMD tool <http://mmb.irbbarcelona.org/GOdMD/>`_ module.
    | Prepares input files for the GOdMD tool.

    Args:
        input_pdb_orig_path (str): Input PDB file to be used as origin in the conformational transition. File type: input. `Sample file <https://github.com/bioexcel/biobb_godmd/raw/master/biobb_godmd/test/data/godmd/1ake_A.pdb>`_. Accepted formats: pdb (edam:format_1476).
        input_pdb_target_path (str): Input PDB file to be used as target in the conformational transition. File type: input. `Sample file <https://github.com/bioexcel/biobb_godmd/raw/master/biobb_godmd/test/data/godmd/4ake_A.pdb>`_. Accepted formats: pdb (edam:format_1476).
        output_aln_orig_path (str): Output GOdMD alignment file corresponding to the origin structure of the conformational transition. File type: output. `Sample file <https://github.com/bioexcel/biobb_godmd/raw/master/biobb_godmd/test/data/godmd/1ake_A.aln>`_. Accepted formats: aln (edam:format_2330), txt (edam:format_2330).
        output_aln_target_path (str): Output GOdMD alignment file corresponding to the target structure of the conformational transition. File type: output. `Sample file <https://github.com/bioexcel/biobb_godmd/raw/master/biobb_godmd/test/data/godmd/4ake_A.aln>`_. Accepted formats: aln (edam:format_2330), txt (edam:format_2330).
        properties (dict - Python dictionary object containing the tool parameters, not input/output files):
            * **gapopen** (*float*) - (12.0) Standard gap penalty: score taken away when a gap is created.
            * **gapextend** (*float*) - (2.0) Penalty added to the standard gap penalty for each base or residue in the gap.
            * **datafile** (*str*) - ("EPAM250") Scoring matrix file used when comparing sequences.
            * **binary_path** (*str*) - ("water") Binary path.
            * **remove_tmp** (*bool*) - (True) [WF property] Remove temporal files.
            * **restart** (*bool*) - (False) [WF property] Do not execute if output files exist.

    Examples:
        This is a use example of how to use the building block from Python::

            from biobb_godmd.godmd.godmd_prep import godmd_prep
            prop = {
                'gapopen': 10.0,
                'gapextend': 2.0
            }
            godmd_prep( input_pdb_orig_path='/path/to/input_orig.pdb',
                        input_pdb_target_path='/path/to/input_target.pdb',
                        output_aln_orig_path='/path/to/orig.aln',
                        output_aln_target_path='/path/to/target.aln',
                        properties=prop)

    Info:
        * wrapped_software:
            * name: In house
            * license: Apache-2.0
        * ontology:
            * name: EDAM
            * schema: http://edamontology.org/EDAM.owl

    """
    def __init__(self, input_pdb_orig_path: str, input_pdb_target_path: str,
                 output_aln_orig_path: str, output_aln_target_path: str,
                 properties: dict = None, **kwargs) -> None:

        properties = properties or {}

        # Call parent class constructor
        super().__init__(properties)
        self.locals_var_dict = locals().copy()

        self.AA_TRANSLATOR = {'GLY': 'G',
                              'ALA': 'A',
                              'VAL': 'V',
                              'LEU': 'L',
                              'ILE': 'I',
                              'MET': 'M',
                              'PHE': 'F',
                              'TRP': 'W',
                              'PRO': 'P',
                              'SER': 'S',
                              'THR': 'T',
                              'CYS': 'C',
                              'CYX': 'C',
                              'CYM': 'C',
                              'TYR': 'Y',
                              'TYM': 'Y',
                              'ASN': 'N',
                              'GLN': 'Q',
                              'ASP': 'D',
                              'ASH': 'D',
                              'GLU': 'E',
                              'GLH': 'E',
                              'LYS': 'K',
                              'LYN': 'K',
                              'ARG': 'R',
                              'ARN': 'R',
                              'HIS': 'H',
                              'HIE': 'H',
                              'HID': 'H',
                              'HIP': 'H'}

        # Input/Output files
        self.io_dict = {
            'in': {'input_pdb_orig_path': input_pdb_orig_path,
                   'input_pdb_target_path': input_pdb_target_path},
            'out': {'output_aln_orig_path': output_aln_orig_path,
                    'output_aln_target_path': output_aln_target_path}
        }

        # Properties specific for BB
        self.properties = properties
        self.gapopen = properties.get('gapopen', 12.0)
        self.gapextend = properties.get('gapextend', 2.0)
        self.datafile = properties.get('datafile', "EPAM250")
        self.binary_path = properties.get('binary_path', "water")

        # Check the properties
        self.check_properties(properties)
        self.check_arguments()

[docs]    def check_data_params(self, out_log, out_err):
        """ Checks input/output paths correctness """

        # Check input(s)
        self.io_dict["in"]["input_pdb_orig_path"] = check_input_path(self.io_dict["in"]["input_pdb_orig_path"], "input_pdb_orig_path", False, out_log, self.__class__.__name__)
        self.io_dict["in"]["input_pdb_target_path"] = check_input_path(self.io_dict["in"]["input_pdb_target_path"], "input_pdb_target_path", False, out_log, self.__class__.__name__)

        # Check output(s)
        self.io_dict["out"]["output_aln_orig_path"] = check_output_path(self.io_dict["out"]["output_aln_orig_path"], "output_aln_orig_path", False, out_log, self.__class__.__name__)
        self.io_dict["out"]["output_aln_target_path"] = check_output_path(self.io_dict["out"]["output_aln_target_path"], "output_aln_target_path", False, out_log, self.__class__.__name__)

[docs]    def extract_sequence(self, pdb):
        '''
        Parses a PDB file to retrieve the sequence of a certain chain.
        '''
        seq = ''
        resids = []
        with open(pdb) as file:
            for line in file:
                line = line.rstrip('\n')
                record = line[:6].replace(" ", "")
                if record != 'ATOM':
                    continue
                atomname = line[12:16]
                atomname = atomname.replace(" ", "")
                if atomname != 'CA':
                    continue
                alternate = line[16]
                if alternate != ' ':
                    if alternate != 'A':
                        continue  # if alt loc, only A-labeled residues
                resname = line[17:20].replace(" ", "")
                if resname not in self.AA_TRANSLATOR:
                    continue
                resnum = line[22:26].replace(" ", "")
                icode = line[26:27]
                seq += self.AA_TRANSLATOR[resname]
                resids += [(' ', int(resnum), icode)]  # BioPython's standard PDB atom id (' ',resid,insertioncode) (e.g. (' ',40,'B'))

        return seq, resids

[docs]    def retrieve_alignment(self, waterFile, resids1, resids2):
        '''
        Gets the FASTA sequence of two PDB structures and a list of their residues in Biopython residue format.
        Opens a file containing a local sequence alignment (Water program of EMBOSS package) between those two PDB structures.
        Returns the sequence identity and the pairs of residues of the alignment.
        '''
        # Parsing Water alignment file
        is_alignment = 0
        is_qseq = 0
        is_sseq = 0
        qseq = ""
        sseq = ""
        ident = ""
        next_seq = 0
        qstart = ""
        sstart = ""
        with open(waterFile) as file:
            for line in file:
                line = line.rstrip("\n")
                if is_alignment == 0:
                    if not line.startswith(">>#1"):
                        continue
                    else:
                        is_alignment = 1
                else:
                    if line.startswith("; sw_ident:"):
                        ident = line[12:]
                    elif line.startswith("; al_start:"):
                        if next_seq == 0:
                            qstart = line[12:]
                        else:
                            sstart = line[12:]
                    elif line.startswith("; al_display_start:"):
                        if next_seq == 0:
                            is_qseq = 1
                        else:
                            is_sseq = 1
                    else:
                        if is_qseq == 1:
                            if not line.startswith(">"):
                                qseq = qseq + line
                            else:
                                is_qseq = 0
                                next_seq = 1
                        if is_sseq == 1:
                            if line != "":
                                sseq = sseq + line
                            else:
                                is_sseq = 0

        # Get the Sequence Identity of the alignment
        seq_id = float(ident)

        # Get the residues of the alignment.
        # -2 is applied because in the first iteration of next loop all values will always be increased by 1 (local alignment)
        # and we also need to convert the number into a python index to retrieve the proper token in the list
        resid_pairs = []
        idx1 = int(qstart)-2
        idx2 = int(sstart)-2
        for i in range(len(qseq)):  # it could be also len(sseq)
            if qseq[i] != '-':
                idx1 += 1
            if sseq[i] != '-':
                idx2 += 1
            if qseq[i] == '-' or sseq[i] == '-':
                continue
            resid_pairs += [(resids1[idx1], resids2[idx2])]
        # Check contents of the residues of the alignment
        if len(resid_pairs) == 0:  # Alignment file was empty or there was no possible matching residues between the PDBs
            fu.log('Alignment file was empty or there was no possible matching residues between the PDBs' % self.stage_io_dict["unique_dir"], self.out_log)
            return False, False
        else:
            return seq_id, resid_pairs

[docs]    @launchlogger
    def launch(self):
        """Launches the execution of the GOdMDPrep module."""

        # check input/output paths and parameters
        self.check_data_params(self.out_log, self.err_log)

        # Setup Biobb
        if self.check_restart():
            return 0
        self.stage_files()

        # Starting the work...
        # Parsing the input PDB files
        pdb1 = self.io_dict["in"]["input_pdb_orig_path"]
        pdb2 = self.io_dict["in"]["input_pdb_target_path"]

        # Generate sequence of first PDB
        seq1, resids1 = self.extract_sequence(pdb1)

        # Generate sequence of second PDB
        seq2, resids2 = self.extract_sequence(pdb2)

        if len(seq1) < 50:
            fu.log('WARNING: Short sequence (ORIGIN)' % self.stage_io_dict["unique_dir"], self.out_log)
        if len(seq2) < 50:
            fu.log('WARNING: Short sequence (TARGET)' % self.stage_io_dict["unique_dir"], self.out_log)

        # Produce FASTA files
        name1 = "fasta1.fa"
        name2 = "fasta2.fa"
        fasta1Filename = str(Path(self.stage_io_dict["unique_dir"]).joinpath(name1))
        fasta1File = open(fasta1Filename, "w")
        fasta1File.write(">"+pdb1+"\n"+seq1)
        fasta1File.close()
        fasta2Filename = str(Path(self.stage_io_dict["unique_dir"]).joinpath(name2))
        fasta2File = open(fasta2Filename, "w")
        fasta2File.write(">"+pdb2+"\n"+seq2)
        fasta2File.close()

        waterFilename = str(Path(self.stage_io_dict["unique_dir"]).joinpath("water_align.out"))

        # water -auto -outfile=water_align.out -asequence=1ake.chains.nolig.pdb.fa
        # -bsequence=4ake.chains.pdb.fa -gapopen=12 -gapextend=2
        # -datafile=EPAM250 -aformat=markx10

        # Command line
        self.cmd = [self.binary_path,
                    '-auto',
                    '-outfile', waterFilename,
                    '-asequence', fasta1Filename,
                    '-bsequence', fasta2Filename,
                    '-gapopen', str(self.gapopen),
                    '-gapextend', str(self.gapextend),
                    '-datafile', self.datafile,
                    '-aformat', "markx10"
                    ]

        # Run Biobb block
        self.run_biobb()

        # Starting post-alignment process: generating .aln files

        # Retrieve sequence identity of the pair, based on local sequence alignment
        # previously computed, and generate the pairs of residues from both structres
        # to drive the superimposition
        seq_id, resid_pairs = self.retrieve_alignment(waterFilename, resids1, resids2)

        aln1Filename = self.io_dict["out"]["output_aln_orig_path"]
        aln1File = open(aln1Filename, 'w')
        for pair in resid_pairs:
            aln1File.write("%s%s\n" % (pair[0][1], pair[0][2].replace(" ", "")))
        aln1File.close()

        aln2Filename = self.io_dict["out"]["output_aln_target_path"]
        aln2File = open(aln2Filename, 'w')
        for pair in resid_pairs:
            aln2File.write("%s%s\n" % (pair[1][1], pair[1][2].replace(" ", "")))
        aln2File.close()

        # Copy files to host
        self.copy_to_host()

        # remove temporary folder(s)
        self.tmp_files.extend([
            self.stage_io_dict.get("unique_dir")
        ])
        self.remove_tmp_files()

        self.check_arguments(output_files_created=True, raise_exception=False)

        return self.return_code


[docs]def godmd_prep(input_pdb_orig_path: str, input_pdb_target_path: str,
               output_aln_orig_path: str, output_aln_target_path: str,
               properties: dict = None, **kwargs) -> int:
    """Create :class:`GOdMDPrep <godmd.godmd_prep.GOdMDPrep>`godmd.godmd_prep.GOdMDPrep class and
    execute :meth:`launch() <godmd.godmd_prep.GOdMDPrep.launch>` method"""

    return GOdMDPrep(input_pdb_orig_path=input_pdb_orig_path,
                     input_pdb_target_path=input_pdb_target_path,
                     output_aln_orig_path=output_aln_orig_path,
                     output_aln_target_path=output_aln_target_path,
                     properties=properties).launch()


[docs]def main():
    parser = argparse.ArgumentParser(description='Prepares input files for the GOdMD tool.', formatter_class=lambda prog: argparse.RawTextHelpFormatter(prog, width=99999))
    parser.add_argument('--config', required=False, help='Configuration file')

    # Specific args
    required_args = parser.add_argument_group('required arguments')
    required_args.add_argument('--input_pdb_orig_path', required=True, help='Input PDB file to be used as origin in the conformational transition. Accepted formats: pdb.')
    required_args.add_argument('--input_pdb_target_path', required=True, help='Input PDB file to be used as target in the conformational transition. Accepted formats: pdb.')
    required_args.add_argument('--output_aln_orig_path', required=True, help='Output GOdMD alignment file corresponding to the origin structure of the conformational transition. Accepted formats: aln, txt.')
    required_args.add_argument('--output_aln_target_path', required=True, help='Output GOdMD alignment file corresponding to the target structure of the conformational transition. Accepted formats: aln, txt.')

    args = parser.parse_args()
    # config = args.config if args.config else None
    args.config = args.config or "{}"
    # properties = settings.ConfReader(config=config).get_prop_dic()
    properties = settings.ConfReader(config=args.config).get_prop_dic()

    # Specific call
    godmd_prep(input_pdb_orig_path=args.input_pdb_orig_path,
               input_pdb_target_path=args.input_pdb_target_path,
               output_aln_orig_path=args.output_aln_orig_path,
               output_aln_target_path=args.output_aln_target_path,
               properties=properties)


if __name__ == '__main__':
    main()