Source code for pyprot.align.align

from copy import deepcopy

from pyprot.base.sequence import Sequence


[docs]class Aligned:
    """
    Represents two aligned Sequences with some metadata about the alignment.
    Alignments can be two-sequences alignments or multiple sequence alignments.
    """

[docs]    def __init__(self, seqA, seqB, seqAStart, seqBStart, alignType, alignScore, scoreMatrix, isMultiple=False):
        assert (len(seqA) == len(seqB))

        # seqA is always a Sequence, seqB can be a list of ints (if MSA)
        self.seqA = seqA if isinstance(seqA, Sequence) else seqB
        self.seqB = seqB if isinstance(seqA, Sequence) else seqA
        self.seqInter = []  # Sequence separators (.: )

        self.seqAStart = seqAStart if isinstance(seqA, Sequence) else seqBStart
        self.seqBStart = seqBStart if isinstance(seqA, Sequence) else seqAStart

        self.size = len(self.seqA)

        self.alignType = alignType  # Options used for alignemnt
        self.alignScore = round(alignScore, 2)  # Score of alignment

        self.isMultiple = isMultiple  # Is this a M.S.A ?
        self.pssmDescription = None
        if self.isMultiple:
            self.pssmDescription = scoreMatrix.getDescription()

        self.condensed = False  # When true, aligned Sequences are not displayed
        self.chunkSize = 80  # Used for display
        self.sizeIndic = 20  # Same

        self.identity = 0  # Identity (same AAs)
        self.gaps = 0  # Gap count (either AA is a gap)
        self.seqAGaps = 0
        self.seqBGaps = 0
        self.similarity = 0  # Similarity (positive score between AAs)

        if self.isMultiple:
            for aa in self.seqA:
                if aa.isGap():
                    self.gaps += 1
                    self.seqAGaps += 1
        else:
            for aa1, aa2 in zip(self.seqA, self.seqB):
                if aa1.isGap() or aa2.isGap():
                    if aa1.isGap():
                        self.seqAGaps += 1
                    else:
                        self.seqBGaps += 1
                    self.seqInter.append(" ")
                    self.gaps += 1
                elif aa1 == aa2:
                    self.seqInter.append(":")
                    self.identity += 1
                elif scoreMatrix.getScore(aa1, aa2) >= 0:
                    self.seqInter.append(".")
                    self.similarity += 1
                else:
                    self.seqInter.append(" ")
        self.seqAEnd = self.seqAStart + len(self.seqA) - self.seqAGaps
        self.seqBEnd = self.seqBStart + len(self.seqB) - self.seqBGaps

[docs]    def getBottomSequence(self):
        return self.seqB

    def __repr__(self):
        """
        Object representation.
        """
        res = []
        if self.isMultiple:
            res.append("---------- Multi-Seq. Alignment ----------")
        else:
            res.append("---------- Alignment ----------")
        res.append("Size       : " + str(self.size))
        res.append("Type       : " + self.alignType)
        res.append("Score      : " + str(self.alignScore))
        if not self.isMultiple:
            res.append("Identity   : " + str(self.identity) \
                       + " ({0:.2f}%)".format(100 * (self.identity / len(self.seqA))))
            res.append("Similarity : " + str(self.similarity) \
                       + " ({0:.2f}%)".format(100 * (self.similarity / len(self.seqA))))
        res.append("Gaps       : " + (str(self.gaps) if self.gaps > 0 else "None"))
        res.append("")
        if self.isMultiple:
            res.append("PSSM : " + self.pssmDescription)
            res.append("Aligned seq. : " + self.seqA.getDescription())
            res.append("\t" + str(self.seqAGaps) + " Gaps, " + str(self.size - self.seqAGaps) +
                       " AAs (positions " + str(self.seqAStart) + " to " + str(self.seqAEnd) + ")")
        else:
            res.append("Upper seq. : " + self.seqA.getDescription())
            res.append("\t" + str(self.seqAGaps) + " Gaps, " + str(self.size - self.seqAGaps) +
                       " AAs (positions " + str(self.seqAStart) + " to " + str(self.seqAEnd) + ")")
            res.append("Lower seq. : " + self.seqB.getDescription())
            res.append("\t" + str(self.seqBGaps) + " Gaps, " + str(self.size - self.seqBGaps) +
                       " AAs (positions " + str(self.seqBStart) + " to " + str(self.seqBEnd) + ")")
        res.append("")

        if self.condensed:
            return "\n".join(res)

        listA, listB, listI = [], [], []
        maxALen, maxBLen = 0, 0
        seqAPos, seqBPos = self.seqAStart, self.seqBStart
        seqANextPos, seqBNextPos = seqAPos, seqBPos

        for index in range(self.size):
            a = self.seqA[index]
            if not a.isGap():
                seqANextPos += 1
            listA.append(str(a))
            maxALen = max(len(str(a)), maxALen)

            if not self.isMultiple:
                listI.append(self.seqInter[index])
                b = self.seqB[index]
                if not b.isGap():
                    seqBNextPos += 1
                listB.append(str(b))
                maxBLen = max(len(str(b)), maxBLen)

            if index != 0 and (index % self.chunkSize == 0 or index == len(self.seqA) - 1):
                res.append(str(seqAPos))
                for i in range(maxALen - 1, -1, -1):
                    res.append("".join([s[-i - 1] if len(s) > i else " " for s in listA]))

                if not self.isMultiple:
                    res.append("".join(listI))
                    for i in range(maxBLen):
                        res.append("".join([s[i] if len(s) > i else " " for s in listB]))
                        res.append(str(seqBPos))
                res.append("\n")
                listA, listB, listI = [], [], []
                maxALen, maxBLen = 0, 0
                seqAPos, seqBPos = seqANextPos, seqBNextPos

        return "\n".join(res)


[docs]class Align:
    """
    Represents an alignment matrix, used to determine the alignment score between two Sequences
    """

[docs]    def __init__(self, scoreMatrix):
        """
        Creates an AlignMatrix object that uses scoreMatrix as a scoring system between amino acids.
        """
        self._scoreMatrix = scoreMatrix  # scoring matrix to use

        self._colSeq = None  # copies of Sequences to align
        self._rowSeq = None

        self._alignMatrix = None  # Used for alignment scores between Sequences
        self._rowGapMatrix = None  # used for affine gap penalty
        self._colGapMatrix = None
        self._originMatrix = None  # Used for backtracking purposes

        self._alignMode = ""  # Align mode (global, semiglobal, local)
        self._isSuboptimal = False  # Is the alignment subobtimal ?
        self._isMultiple = False  # Is the alignment done against multiple Sequences at once ?
        self._resultCount = None  # The number of expected results
        self._subOptimalDepth = None

        self._maxAlignScore = 0  # Maximum score
        self._currentAlignScore = 0  # Score for this alignment

        self._maxScoreRows = []  # Index of maximum values from _alignMatrix
        self._maxScoreCols = []
        self._currentAlignPath = []
        self._bestAlignPath = []  # Sequence of origins for alignment with best score
        self._allAlignPaths = []  # all Sequences of origins for alignments with best scores

        self._iniGapPenalty = 0  # Initial gap penalty
        self._extGapPenalty = 0  # Extended gap penalty

        self._alignedRowSeq = None  # Aligned Sequences (result of alignment)
        self._alignedColSeq = None

[docs]    def globalAlign(self, seqA, seqB, iniGapPenalty=1, extGapPenalty=None, semiGlobal=False, resultCount=-1):
        """
        Returns all best global alignment between 'seqA' and 'seqB' with the provided
        initial and extended gap penalties.
        If 'semiGlobal' is True, allows alignment to discard the start and end of either Sequence.
        (this allows for better alignments when Sequences overlap only partially)
        """
        if semiGlobal:
            self._alignMode = "semiglobal"
        else:
            self._alignMode = "global"
        self._resultCount = resultCount

        # Initialize all data structures
        self.__initialize(seqA, seqB, iniGapPenalty, extGapPenalty)

        # Align in global mode, from the bottom right
        if self._alignMode == "global":
            self._currentAlignScore = self._alignMatrix[-1][-1]
            yield from self.__align(len(self._rowSeq), len(self._colSeq))
        else:
            self._currentAlignScore = self._maxAlignScore
            for row, col in zip(self._maxScoreRows, self._maxScoreCols):
                yield from self.__align(row, col)

[docs]    def localAlign(self, seqA, seqB, iniGapPenalty=1, extGapPenalty=None, resultCount=1, subOptimalDepth=0):
        """
        Returns n best local alignment between 'seqA' and 'seqB' with the provided
        initial and extended gap penalties, where n equals 'resultCount' (-1 for all).
        If 'subOptimalDepth' equals m, looks for m best suboptimal alignments as well.
        """
        self._alignMode = "local"

        # Initialize all data structures
        self.__initialize(seqA, seqB, iniGapPenalty, extGapPenalty)

        # Number of expected results for each max. score
        self._resultCount = resultCount

        # Align in local mode, from the maximum value (optimal)
        self._currentAlignScore = self._maxAlignScore
        for row, col in zip(self._maxScoreRows, self._maxScoreCols):
            yield from self.__align(row, col)  # Results

        # If suboptimal alignments are requested
        self._subOptimalDepth = 0
        while subOptimalDepth != 0:
            self._isSuboptimal = True
            subOptimalDepth -= 1
            self._subOptimalDepth += 1

            self.__clearBestPath()  # Reevalute scores with best alignment reset to 0

            # Number of expected results for each max. score
            self._resultCount = resultCount

            # Align in local mode, from the new maximum value (sub-optimal)
            self._currentAlignScore = self._maxAlignScore
            for row, col in zip(self._maxScoreRows, self._maxScoreCols):
                yield from self.__align(row, col)

[docs]    def multiAlign(self, Sequence, resultCount=1, subOptimalDepth=3):
        """
        Returns n best local M.S.A. of 'Sequence' against the ScoreMatrix (which must be 
        position-specific and provide gap penalties) where n equals 'resultCount' (-1 for all).
        If 'subOptimalDepth' equals m, looks for m best suboptimal alignments as well.
        """
        self._isMultiple = True
        colSeq = [i for i in range(len(self._scoreMatrix))]
        yield from self.localAlign(colSeq, Sequence, 0, None, resultCount, subOptimalDepth)

    def __initialize(self, seqA, seqB, iniGapPenalty, extGapPenalty):
        """
        Sets all initial values for required data structures.
        """
        # Sequences
        if len(seqA) == 0 or len(seqB) == 0:
            raise ValueError("Sequences to align cannot be empty")
        self._colSeq = seqA
        self._rowSeq = seqB

        # Alignments
        self._isSuboptimal = False
        self._subOptimalDepth = 0

        self._alignedRowSeq = Sequence()
        if self._isMultiple:
            # In MSA (align with PSSM) there is no column Sequence
            self._alignedColSeq = []
        else:
            self._alignedColSeq = Sequence()

        self._maxAlignScore = 0  # Maximum score found while aligning
        self._maxScoreRows = []  # Rows of maximum score
        self._maxScoreCols = []  # Columns of maximum score

        self._currentAlignPath = []  # indexes of the current alignment
        self._bestAlignPath = []  # indexes of the alignment with the best score

        # Gap penalties
        self._iniGapPenalty = iniGapPenalty
        self._extGapPenalty = iniGapPenalty if extGapPenalty is None else extGapPenalty

        # Matrices
        self._alignMatrix = [[0 for i in range(len(self._colSeq) + 1)] \
                             for j in range(len(self._rowSeq) + 1)]
        self._rowGapMatrix = deepcopy(self._alignMatrix)
        self._colGapMatrix = deepcopy(self._alignMatrix)

        self._originMatrix = [["" for i in range(len(self._colSeq) + 1)] \
                              for j in range(len(self._rowSeq) + 1)]

        # Global alignment : first line and colunm have initial scores and origins
        if self._alignMode == "global":
            self.__initAlignValues()

        # Fill all matrices
        for row in range(1, len(self._rowSeq) + 1):
            for col in range(1, len(self._colSeq) + 1):
                self.__fill(row, col)

        # Find best scores
        self.__findBestScore()

    def __initAlignValues(self):
        for i in range(1, max(len(self._colSeq), len(self._rowSeq)) + 1):
            val = self._iniGapPenalty + self._extGapPenalty * (i - 1)
            if i <= len(self._colSeq):  # First row
                self._alignMatrix[0][i] = val
                self._rowGapMatrix[0][i] = val
                self._colGapMatrix[0][i] = val
                self._originMatrix[0][i] = "L"  # Origin is left

            if i <= len(self._rowSeq):  # First column
                self._alignMatrix[i][0] = val
                self._rowGapMatrix[i][0] = val
                self._colGapMatrix[i][0] = val
                self._originMatrix[i][0] = "T"  # Origin is top

    def __fill(self, row, col):
        """
        Calculates the score for matrix values at row 'row' and column 'col'.
        """
        # Row gap matrix
        if self._isMultiple:
            self._rowGapMatrix[row][col] = self._scoreMatrix.getGapPenalty(col)
        else:
            initialGapScore = self._alignMatrix[row - 1][col] + self._iniGapPenalty
            extendedGapScore = self._rowGapMatrix[row - 1][col] + self._extGapPenalty
            self._rowGapMatrix[row][col] = max(initialGapScore, extendedGapScore)

        # Column gap matrix
        if self._isMultiple:
            self._colGapMatrix[row][col] = self._scoreMatrix.getGapPenalty(col - 1)
        else:
            initialGapScore = self._alignMatrix[row][col - 1] + self._iniGapPenalty
            extendedGapScore = self._colGapMatrix[row][col - 1] + self._extGapPenalty
            self._colGapMatrix[row][col] = max(initialGapScore, extendedGapScore)

        # Align matrix
        deleteScore = self._rowGapMatrix[row][col]
        insertScore = self._colGapMatrix[row][col]
        matchScore = self._alignMatrix[row - 1][col - 1] + \
                     self._scoreMatrix.getScore(self._rowSeq[row - 1], self._colSeq[col - 1])

        maxScore = 0
        if self._alignMode == "local":  # Local alignment
            self._alignMatrix[row][col] = maxScore = max(insertScore, deleteScore, matchScore, 0)
        else:  # Global alignment
            self._alignMatrix[row][col] = maxScore = max(insertScore, deleteScore, matchScore)

        # Origin matrix
        if insertScore == maxScore:
            self._originMatrix[row][col] += "L"
        if matchScore == maxScore:
            self._originMatrix[row][col] += "D"
        if deleteScore == maxScore:
            self._originMatrix[row][col] += "T"

    def __clearBestPath(self):
        """
        Resets all scores from the first best alignment and then reevaluates the affected scores.
        A new maximum score will be found from these, allowing for new alignment lookups.
        """
        self._allAlignPaths.extend(self._bestAlignPath)

        # Clear scores from best align path
        for row, col in self._bestAlignPath:
            self._alignMatrix[row][col] = 0  # Reset scores
            self._colGapMatrix[row][col] = 0
            self._rowGapMatrix[row][col] = 0
            self._originMatrix[row][col] = ""

        # Reevaluate scores for further rows and columns (affected values)
        row, col = self._bestAlignPath[-1]
        for furtherRow in range(row, len(self._rowSeq) + 1):
            for furtherCol in range(col, len(self._colSeq) + 1):
                if (furtherRow, furtherCol) not in self._allAlignPaths:
                    self._originMatrix[furtherRow][furtherCol] = ""
                    self.__fill(furtherRow, furtherCol)

        self._bestAlignPath = []
        self.__findBestScore()

    def __findBestScore(self):
        self._maxAlignScore = -1
        self._maxScoreCols = []
        self._maxScoreRows = []
        for row in range(1, len(self._rowSeq) + 1):
            for col in range(1, len(self._colSeq) + 1):
                score = self._alignMatrix[row][col]
                if score > self._maxAlignScore:  # New max alignment
                    self._maxAlignScore = score
                    self._maxScoreRows = [row]
                    self._maxScoreCols = [col]
                elif score == self._maxAlignScore:  # Same max alignment
                    self._maxScoreRows.append(row)
                    self._maxScoreCols.append(col)

    def __align(self, i, j):
        """
        Yields all best alignments starting from row i and column j.
        """
        if self._resultCount == 0:
            return

        # Local alignments are complete when we reach a null score
        # Global alignments are complete when we reach the beginning of the matrix
        # Semiglobal alignments are complete when we reach an edge of the matrix
        if (self._alignMode == "local" and self._alignMatrix[i][j] == 0) \
                or (self._alignMode == "global" and i == 0 and j == 0) \
                or (self._alignMode == "semiglobal" and (i == 0 or j == 0)):

            alignDescription = self._alignMode + \
                               ("-suboptimal" + "(" + str(self._subOptimalDepth) + ")") * self._isSuboptimal

            # Create result (Sequence obj. for MSA, Aligned obj. otherwise)
            if self._isMultiple:
                result = Aligned(self._alignedColSeq,
                                 Sequence(self._alignedRowSeq, self._rowSeq.getDescription()), j, i,
                                 alignDescription, self._currentAlignScore, self._scoreMatrix, True)
            else:
                result = Aligned(Sequence(self._alignedColSeq, self._colSeq.getDescription()),
                                 Sequence(self._alignedRowSeq, self._rowSeq.getDescription()), j, i,
                                 alignDescription, self._currentAlignScore, self._scoreMatrix, False)

            # Remember first best alignment for subobtimal lookup
            if self._bestAlignPath == []:
                self._bestAlignPath = deepcopy(self._currentAlignPath)

            self._resultCount -= 1
            yield result

        else:
            for origin in self._originMatrix[i][j]:
                self._currentAlignPath.append((i, j))

                if origin == "T":  # top
                    self._alignedColSeq.insert(0, "-")
                    self._alignedRowSeq.insert(0, self._rowSeq[i - 1])
                    yield from self.__align(i - 1, j)

                elif origin == "D":  # diagonal
                    self._alignedColSeq.insert(0, self._colSeq[j - 1])
                    self._alignedRowSeq.insert(0, self._rowSeq[i - 1])
                    yield from self.__align(i - 1, j - 1)

                elif origin == "L":  # left
                    self._alignedColSeq.insert(0, self._colSeq[j - 1])
                    self._alignedRowSeq.insert(0, "-")
                    yield from self.__align(i, j - 1)

                else:
                    raise ValueError("Origin must be T (top), D (diagonal) or L (left)")

                self._currentAlignPath.pop()
                del self._alignedColSeq[0]
                del self._alignedRowSeq[0]

    # Representation
    def __repr__(self):
        """
        Representation of the alignment matrix.
        """
        if self._colSeq is None:
            return "No alignment done yet"

        result = []
        sepSize = 5
        rep = {"": "", "L": "_   ", "D": " \\  ", "T": "   |", "LD": "_\\  ", "DT": " \\ |", "LT": "_  |",
               "LDT": "_\\ |"}
        # Amino Acid column Sequence
        result.append(" " * 2 * sepSize)
        for aa in self._colSeq:
            result[-1] += '{a!s:<{w}}'.format(a=aa, w=sepSize)

        # First line of values (no amino acid)
        result.append(" ")
        for origin in self._originMatrix[0]:
            result[-1] += '{o:>{w}}'.format(o=rep[origin], w=sepSize)

        result.append(" " * sepSize)
        for value in self._alignMatrix[0]:
            result[-1] += '{v:<{w}}'.format(v=value, w=sepSize)

        # Amino Acid line Sequence and values
        for values, origins, aa in zip(self._alignMatrix[1:], self._originMatrix[1:], self._rowSeq):
            result.append(" ")
            for origin in origins:
                result[-1] += '{o:>{w}}'.format(o=rep[origin], w=sepSize)
            result.append("")
            result[-1] += '{a!s:<{w}}'.format(a=aa, w=sepSize)
            for value in values:
                result[-1] += '{v:<{w}}'.format(v=value, w=sepSize)

        return "\n".join(result)
Navigation

Source code for pyprot.align.align

Quick search

Navigation