Source code for pyprot.align.blosum

from math import log, ceil

from pyprot.base.aminoacid import AminoAcid
from pyprot.data.fasta import getSequencesFromFasta
from pyprot.align.score import ScoreMatrix


[docs]def belongs(outSequence, group, minMatches):
    """
    Returns True if there's at least 'minMatches' matches between outSequence and any sequence 
    from 'group'; False otherwise.
    Size of all sequences is assumed to be equal.
    """
    for inSequence in group:
        matches = 0
        for i in range(len(outSequence)):
            if outSequence[i] == inSequence[i]:
                matches += 1
            if matches >= minMatches:  # As soon as a match is found
                return True
    return False


[docs]def makeGroupsFromFasta(path, requiredIdentityPercent):
    """
    Loads Sequences from file at 'path' and separate them in groups with an identity of 
    at least 'requiredIdentityPercent'.
    Returns a list representing the groups as lists of Protein objects.
    """
    sequences = [seq for seq in getSequencesFromFasta(path)]
    groups = [[sequences[0]]]  # First sequence is assigned to first group

    seqSize = len(sequences[0])  # Size of the sequences

    # Number of matches required to achieve requiredIdentityPercent
    minMatches = ceil((requiredIdentityPercent / 100) * seqSize)

    # For each outSequence not yet in a group
    for outSequence in sequences[1:]:
        groupFound = False  # has a group been found ?
        groupIndex = 0  # index of the group we're looking in

        # Look for a group where outSequence belongs
        while (not groupFound) and groupIndex < len(groups):
            if belongs(outSequence, groups[groupIndex], minMatches):
                groups[groupIndex].append(outSequence)
                groupFound = True
            else:
                groupIndex += 1  # Move on to next group

        # If no group works, create a new one
        if not groupFound:
            groups.append([outSequence])

    return groups


[docs]def valueDictsFromGroups(groups):
    """
    Transforms each group from 'groups' into a list of dictionaries, one for each Protein column.
    The dictionaries map each AminoAcid found in that column to their count.
    Returns the list of dictionaries and a list of the size (in Sequences) of their groups.
    """
    seqSize = len(groups[0][0])
    groupCount = len(groups)

    groupValues = []
    groupSizes = []

    for group in groups:  # For each group
        groupAAs = []
        groupSize = len(group)  # Size of group (n sequences)

        for col in range(seqSize):  # For each column
            groupCol = {}

            for seq in group:  # For each sequence in group
                try:
                    groupCol[seq[col]] += 1  # Increment count
                except:
                    groupCol[seq[col]] = 1

            groupAAs.append(groupCol)
        groupValues.append(groupAAs)
        groupSizes.append(groupSize)
    return groupValues, groupSizes


[docs]def getFrequencies(groupValues, groupSizes):
    """
    Evaluates the frequencies of AminoAcids within columns of groups in 'groupValues'.
    Frequencies are weighted according to group sizes in 'groupSizes'.
    Returns two dictionaries and a number:
    -'freqPairs' maps pairs of AminoAcids to their frequencies
    -'freqSingle' maps single AminoAcids to their frequencies
    -'freqSum' is the sum of all frequencies
    """
    seqSize = len(groupValues[0])  # Size of the Sequences
    groupCount = len(groupSizes)  # Number of groups

    freqPairs = {}  # frequencies of amino acid pairs (fAB)

    # Frequencies of single amino acids (fA)
    freqSingle = {AminoAcid(aa): 0 for aa in AminoAcid.getAllNames()}

    freqSum = 0  # Sum of frequencies  sum(fAB)

    for col in range(seqSize):  # Each column
        for groupAIndex in range(groupCount - 1):  # Each groupA
            groupA = groupValues[groupAIndex]
            groupASize = groupSizes[groupAIndex]

            for groupBIndex in range(groupAIndex + 1, groupCount):  # Each further groupB
                groupB = groupValues[groupBIndex]
                groupBSize = groupSizes[groupBIndex]

                for aaA, aaACount in groupA[col].items():  # Each AA from groupA
                    aaAFreq = aaACount / groupASize  # Its frequency within groupA

                    for aaB, aaBCount in groupB[col].items():  # Each AA from groupB
                        aaBFreq = aaBCount / groupBSize  # Its frequency within groupB

                        aaPairFreq = aaAFreq * aaBFreq  # Pair frequency
                        freqSum += aaPairFreq  # Sum of all frequencies
                        freqSingle[aaA] += aaPairFreq / 2
                        freqSingle[aaB] += aaPairFreq / 2

                        # Index is unique to this pair
                        pairIndex = (aaA, aaB) if aaA > aaB else (aaB, aaA)
                        try:
                            freqPairs[pairIndex] += aaPairFreq
                        except:
                            freqPairs[pairIndex] = aaPairFreq

    return freqPairs, freqSingle, freqSum


[docs]def sumFrequenciesToProb(freqPairsList, freqSingleList, freqSumList):
    """
    Sums all frequencies in provided lists, and transforms them to probabilities according
    to the sum of frequencies found in 'freqSumList'.
    """
    fSum = sum(freqSumList)  # Absolute sum of all frequencies
    probPairs, probSingle = {}, {}  # Probabilities for pairs and single AAs

    for freqPairs, freqSingle in zip(freqPairsList, freqSingleList):
        for key, value in freqPairs.items():
            # Sum all frequencies for matching AA pairs, divided by fSum
            try:
                probPairs[key] += value / fSum
            except:
                probPairs[key] = value / fSum
        for key, value in freqSingle.items():
            # Sum all frequencies for matching AAs, divided by fSum
            try:
                probSingle[key] += value / fSum
            except:
                probSingle[key] = value / fSum

    return probPairs, probSingle


[docs]def blosumFromProbabilities(probPairs, probSingle, requiredIdentityPercent):
    """
    Fills and returns a Score according to the BLOSUM algorithm, from the probabilities
    of AA pairs and singletons provided in 'probPairs' and 'probSingle'.
    """
    # Create empty Score, ignoring AAs B,Z,J,U,O
    scoreMatrix = ScoreMatrix("", "BLOSUM{}".format(requiredIdentityPercent), "BZJUO")

    for key, qAB in probPairs.items():
        # qAB is the evolutionary probability of the AA pair (A,B)
        aaA, aaB = key  # Both amino acids
        pA, pB = probSingle[aaA], probSingle[aaB]  # Their single probabilities

        # eAB is the random probability of the AA pair (A, B) given their single probabilities
        eAB = pA * pA if aaA == aaB else 2 * pA * pB

        # The BLOSUM score for this pair is the log-odds-ratio of evolutionary and random prob.
        sAB = int(round(2 * log(qAB / eAB, 2)))
        scoreMatrix.setScore(aaA, aaB, sAB)  # Fill the matrix

    return scoreMatrix


[docs]def blosumFromFasta(requiredIdentityPercent, *filepaths):
    """
    Creates and returns a ScoreMatrix for all sequences in the provided 'filepaths',
    using the BLOSUM approach with an identity of at least 'requiredIdentityPercent'.
    Each file is grouped independently and only then their weighted probabilities are merged.
    :param requiredIdentityPercent: 
    :param filepaths: 
    """
    # Results for each different files are first stored in lists
    freqPairsList, freqSingleList, freqSumList = [], [], []

    for path in filepaths:  # for each .fasta file
        groups = makeGroupsFromFasta(path, requiredIdentityPercent)  # groups
        groupValues, groupSizes = valueDictsFromGroups(groups)  # groups as dicts
        freqPairs, freqSingle, freqSum = getFrequencies(groupValues, groupSizes)  # frequencies

        freqPairsList.append(freqPairs)  # Append results
        freqSingleList.append(freqSingle)
        freqSumList.append(freqSum)

    # Merge (sum) results together
    probPairs, probSingle = sumFrequenciesToProb(freqPairsList, freqSingleList, freqSumList)
    # Create the BLOSUM matrix
    blosum = blosumFromProbabilities(probPairs, probSingle, requiredIdentityPercent)
    print(blosum)
Navigation

Source code for pyprot.align.blosum

Quick search

Navigation