Source code for pyprot.structure.gor

from math import log, sqrt

import matplotlib.pyplot as pyplot
import numpy as np

from pyprot.base.aminoacid import AminoAcid


[docs]class GOR3:
    """
    Implements the GOR III secondary structure prediction algorithm.
    Objects of this class must be trained with known examples of sequences and their structure
    before being able to predict the structures of new sequences.
    """

[docs]    def __init__(self):
        self.structures = "HETC"
        self.roc = {(s, r): 0 for s in self.structures for r in ("TP", "TN", "FP", "FN")}
        self.minScore, self.maxScore = 0, 0
        self.scores = {(classifier, realS): [] for classifier in self.structures for realS in self.structures}
        self.correctPred = 0
        self.totalPred = 0
        self.neighbourOffset = 8

        self.trainings = 0  # Number of trainings (one per AA)
        self.strucCount = {s: 0 for s in self.structures}
        self.pairCount = {(s, a): 0 for s in self.structures for a in AminoAcid.getAllNames()}
        self.tripletCount = {}
        for s in self.structures:
            for a in AminoAcid.getAllNames():
                for na in AminoAcid.getAllNames():  # Neighbour AA
                    self.tripletCount[(s, a, na)] = 0

[docs]    def train(self, sequence, structure):
        """
        Trains the system with a known example of a sequence and its structure.
        """
        self.trainings += len(sequence)

        for index in range(len(sequence)):
            curAminoacid = sequence[index]
            curStructure = structure[index]
            self.strucCount[curStructure] += 1
            self.pairCount[(curStructure, curAminoacid)] += 1

            for neiAminoacid in self.neighbourValues(sequence, index):
                self.tripletCount[(curStructure, curAminoacid, neiAminoacid)] += 1

[docs]    def predict(self, sequence, realStructure=None):
        """
        Returns the predicted structure of 'sequence', based on received training.
        """
        structure = []  # Result: predicted structure

        # Predict structures for each aminoacid in sequence
        for index in range(len(sequence)):
            curAminoacid = sequence[index]

            # First possible structure
            predStructure = self.structures[0]
            predScore = self.__getScore(sequence, index, predStructure)

            # Other structures
            for curStructure in self.structures[1:]:
                curScore = self.__getScore(sequence, index, curStructure)

                # Remember structure that gives best score
                if curScore > predScore:
                    predStructure = curStructure
                    predScore = curScore

            structure.append(predStructure)
        structure = "".join(structure)

        if not realStructure is None:
            self.__quality(sequence, structure, realStructure)

        return structure

    def __getScore(self, sequence, index, struct):
        """
        Returns I(deltaS, R) as defined by the GOR III algorithm.
        """
        aminoacid = sequence[index]
        scoreTerms = []

        score = self.pairCount[(struct, aminoacid)]
        score /= sum(self.pairCount[(otherStruct, aminoacid)] for otherStruct in self.getStructures(struct))
        scoreTerms.append(score)

        score = (self.trainings - self.strucCount[struct]) / self.strucCount[struct]
        scoreTerms.append(score)

        for neiAminoacid in self.neighbourValues(sequence, index):
            score = self.tripletCount[(struct, aminoacid, neiAminoacid)]
            score /= sum(
                self.tripletCount[(otherStruct, aminoacid, neiAminoacid)] for otherStruct in self.getStructures(struct))
            scoreTerms.append(score)

            score = sum(self.pairCount[(otherStruct, aminoacid)] for otherStruct in self.getStructures(struct))
            score /= self.pairCount[(struct, aminoacid)]
            scoreTerms.append(score)

        return sum(log(s) for s in scoreTerms)

[docs]    def neighbourOffsets(self):
        for offset in range(-self.neighbourOffset, self.neighbourOffset + 1):
            if offset != 0:
                yield offset

[docs]    def neighbourValues(self, sequence, index):
        for offset in self.neighbourOffsets():
            neiIndex = index + offset
            if neiIndex >= 0 and neiIndex < len(sequence):
                yield sequence[neiIndex]

[docs]    def getStructures(self, exclude=None):
        for s in self.structures:
            if s != exclude:
                yield s

    def __quality(self, sequence, structure, reality):
        self.totalPred += len(structure)  # Total predictions
        self.correctPred += sum([1 if s == r else 0 for s, r in zip(structure, reality)])  # Correct predictions
        for index in range(len(sequence)):
            scores = [self.__getScore(sequence, index, classifier) for classifier in self.structures]
            maxScore = max(scores)
            for classifier, score in zip(self.structures, scores):
                realS = reality[index]
                self.scores[(classifier, realS)].append(score)
                self.minScore = score if score < self.minScore else self.minScore
                self.maxScore = score if score > self.maxScore else self.maxScore
                if score == maxScore:  # Positive
                    if classifier == realS:  # True
                        self.roc[(classifier, "TP")] += 1
                    else:  # False
                        self.roc[(classifier, "FP")] += 1
                else:  # Negative
                    if classifier != realS:  # True
                        self.roc[(classifier, "TN")] += 1
                    else:  # False
                        self.roc[(classifier, "FN")] += 1

[docs]    def getQuality(self):
        tp = sum([self.roc[(c, "TP")] for c in self.structures])
        tn = sum([self.roc[(c, "TN")] for c in self.structures])
        fp = sum([self.roc[(c, "FP")] for c in self.structures])
        fn = sum([self.roc[(c, "FN")] for c in self.structures])
        mcc = (tp * tn - fp * fn) / sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
        q3 = self.correctPred / self.totalPred
        return q3, mcc

[docs]    def plotROC(self):
        for classifier in self.structures:
            x, y = [], []
            for threshold in np.arange(self.minScore, self.maxScore, 0.1):
                tp, tn, fp, fn = 0, 0, 0, 0
                for realS in self.structures:
                    for score in self.scores[(classifier, realS)]:
                        if score >= threshold:  # Positive
                            if classifier == realS:  # True
                                tp += 1
                            else:  # False
                                fp += 1
                        else:  # Negative
                            if classifier != realS:  # True
                                tn += 1
                            else:  # False
                                fn += 1
                tpr = tp / (tp + fn)
                fpr = fp / (tn + fp)
                x.append(fpr)
                y.append(tpr)
            print("----- Receiver Operating Characteristic Curve -----")
            print("Classifier:", classifier)
            x.sort()
            y.sort()
            auc = np.trapz(y, x)  # Area Under Curve
            print("AUC:", auc)
            pyplot.title("Classifier " + classifier)
            pyplot.xlabel('False Positive Rate')
            pyplot.ylabel('True Positive Rate')
            pyplot.plot([0, 1], [0, 1])
            pyplot.plot(x, y)
            pyplot.show()
Navigation

Source code for pyprot.structure.gor

Quick search

Navigation