Source code for pyprot.align.blosum

from math import log, ceil

from pyprot.base.aminoacid import AminoAcid
from pyprot.data.fasta import getSequencesFromFasta
from pyprot.align.score import ScoreMatrix


def belongs(outSequence, group, minMatches):
    """
    Returns True if there are at least 'minMatches' matches between 'outSequence'
    and any sequence from 'group'; False otherwise.
    All sequences are assumed to have equal length.
    """
    for inSequence in group:
        matches = 0
        for i in range(len(outSequence)):
            if outSequence[i] == inSequence[i]:
                matches += 1
                if matches >= minMatches:  # Return as soon as enough matches are found
                    return True
    return False
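
# Usage sketch (hypothetical values, not part of the original module): plain strings
# stand in for aligned sequences of equal length.
example_group = ["MKVLA", "MKVLG"]
example_candidate = "MKILA"
print(belongs(example_candidate, example_group, 4))  # True: 4 of 5 positions match "MKVLA"
print(belongs(example_candidate, example_group, 5))  # False: no sequence matches at all 5 positions
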
def makeGroupsFromFasta(path, requiredIdentityPercent):
    """
    Loads sequences from the file at 'path' and separates them into groups
    with an identity of at least 'requiredIdentityPercent'.
    Returns a list representing the groups as lists of Protein objects.
    """
    sequences = [seq for seq in getSequencesFromFasta(path)]
    groups = [[sequences[0]]]    # The first sequence is assigned to the first group
    seqSize = len(sequences[0])  # Size of the sequences
    # Number of matches required to achieve requiredIdentityPercent
    minMatches = ceil((requiredIdentityPercent / 100) * seqSize)

    # For each outSequence not yet in a group
    for outSequence in sequences[1:]:
        groupFound = False  # Has a group been found?
        groupIndex = 0      # Index of the group we're looking in

        # Look for a group where outSequence belongs
        while (not groupFound) and groupIndex < len(groups):
            if belongs(outSequence, groups[groupIndex], minMatches):
                groups[groupIndex].append(outSequence)
                groupFound = True
            else:
                groupIndex += 1  # Move on to the next group

        # If no existing group matches, create a new one
        if not groupFound:
            groups.append([outSequence])

    return groups
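
# Usage sketch (hypothetical file path, not part of the original module): for 62% identity
# over aligned sequences of length 120, ceil(0.62 * 120) = 75 positions must match for a
# sequence to join an existing group.
example_groups = makeGroupsFromFasta("aligned_block.fasta", 62)
print(len(example_groups), "groups of sizes", [len(g) for g in example_groups])
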
def valueDictsFromGroups(groups):
    """
    Transforms each group from 'groups' into a list of dictionaries, one per sequence
    column. The dictionaries map each AminoAcid found in that column to its count.
    Returns the list of dictionaries and a list of the sizes (in sequences) of the groups.
    """
    seqSize = len(groups[0][0])  # Size of the sequences
    groupValues = []
    groupSizes = []

    for group in groups:  # For each group
        groupAAs = []
        groupSize = len(group)  # Size of the group (number of sequences)
        for col in range(seqSize):  # For each column
            groupCol = {}
            for seq in group:  # For each sequence in the group
                try:
                    groupCol[seq[col]] += 1  # Increment the count
                except KeyError:
                    groupCol[seq[col]] = 1   # First occurrence in this column
            groupAAs.append(groupCol)
        groupValues.append(groupAAs)
        groupSizes.append(groupSize)

    return groupValues, groupSizes
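
# Usage sketch (not part of the original module): plain strings stand in for Protein
# sequences, so the per-column dictionaries are keyed by characters here rather than AminoAcid.
example_groups = [["MKV", "MKA"], ["MRV"]]
example_values, example_sizes = valueDictsFromGroups(example_groups)
print(example_sizes)          # [2, 1]
print(example_values[0][2])   # Column 2 of group 0: {'V': 1, 'A': 1}
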
def getFrequencies(groupValues, groupSizes):
    """
    Evaluates the frequencies of AminoAcids within the columns of the groups in
    'groupValues'. Frequencies are weighted according to the group sizes in 'groupSizes'.
    Returns two dictionaries and a number:
    - 'freqPairs' maps pairs of AminoAcids to their frequencies
    - 'freqSingle' maps single AminoAcids to their frequencies
    - 'freqSum' is the sum of all frequencies
    """
    seqSize = len(groupValues[0])  # Size of the sequences
    groupCount = len(groupSizes)   # Number of groups
    freqPairs = {}                 # Frequencies of amino acid pairs (fAB)
    # Frequencies of single amino acids (fA)
    freqSingle = {AminoAcid(aa): 0 for aa in AminoAcid.getAllNames()}
    freqSum = 0  # Sum of all frequencies, sum(fAB)

    for col in range(seqSize):  # Each column
        for groupAIndex in range(groupCount - 1):  # Each groupA
            groupA = groupValues[groupAIndex]
            groupASize = groupSizes[groupAIndex]

            for groupBIndex in range(groupAIndex + 1, groupCount):  # Each further groupB
                groupB = groupValues[groupBIndex]
                groupBSize = groupSizes[groupBIndex]

                for aaA, aaACount in groupA[col].items():  # Each AA from groupA
                    aaAFreq = aaACount / groupASize  # Its frequency within groupA

                    for aaB, aaBCount in groupB[col].items():  # Each AA from groupB
                        aaBFreq = aaBCount / groupBSize  # Its frequency within groupB

                        aaPairFreq = aaAFreq * aaBFreq  # Pair frequency
                        freqSum += aaPairFreq           # Sum of all frequencies
                        freqSingle[aaA] += aaPairFreq / 2
                        freqSingle[aaB] += aaPairFreq / 2

                        # Index is unique to this (unordered) pair
                        pairIndex = (aaA, aaB) if aaA > aaB else (aaB, aaA)
                        try:
                            freqPairs[pairIndex] += aaPairFreq
                        except KeyError:
                            freqPairs[pairIndex] = aaPairFreq

    return freqPairs, freqSingle, freqSum
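
# Worked example of the weighting performed above (hypothetical counts, not part of the
# original module): one column, group A of size 2 holding {A: 2}, group B of size 2
# holding {A: 1, V: 1}.
fA_inA = 2 / 2                 # Frequency of A within group A -> 1.0
fA_inB, fV_inB = 1 / 2, 1 / 2  # Frequencies of A and V within group B -> 0.5 each
print(fA_inA * fA_inB)         # 0.5 added to freqPairs[(A, A)] and to freqSum
print(fA_inA * fV_inB)         # 0.5 added to freqPairs[(V, A)]; each AA gets half of it in freqSingle
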
def sumFrequenciesToProb(freqPairsList, freqSingleList, freqSumList):
    """
    Sums all frequencies in the provided lists and transforms them into probabilities
    by normalizing with the total of the frequency sums found in 'freqSumList'.
    """
    fSum = sum(freqSumList)         # Absolute sum of all frequencies
    probPairs, probSingle = {}, {}  # Probabilities for pairs and single AAs

    for freqPairs, freqSingle in zip(freqPairsList, freqSingleList):
        # Sum all frequencies for matching AA pairs, divided by fSum
        for key, value in freqPairs.items():
            try:
                probPairs[key] += value / fSum
            except KeyError:
                probPairs[key] = value / fSum
        # Sum all frequencies for matching single AAs, divided by fSum
        for key, value in freqSingle.items():
            try:
                probSingle[key] += value / fSum
            except KeyError:
                probSingle[key] = value / fSum

    return probPairs, probSingle
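
# Usage sketch (hypothetical frequencies, not part of the original module): string keys
# stand in for the AminoAcid and AminoAcid-pair keys produced by getFrequencies.
pairsA, singleA, sumA = {("A", "A"): 1.0}, {"A": 1.0}, 1.0
pairsB, singleB, sumB = {("A", "A"): 0.5, ("V", "A"): 0.5}, {"A": 0.75, "V": 0.25}, 1.0
pp, ps = sumFrequenciesToProb([pairsA, pairsB], [singleA, singleB], [sumA, sumB])
print(pp)  # {('A', 'A'): 0.75, ('V', 'A'): 0.25}
print(ps)  # {'A': 0.875, 'V': 0.125}
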
def blosumFromProbabilities(probPairs, probSingle, requiredIdentityPercent):
    """
    Fills and returns a ScoreMatrix according to the BLOSUM algorithm, from the
    probabilities of AA pairs and singletons provided in 'probPairs' and 'probSingle'.
    """
    # Create an empty ScoreMatrix, ignoring the AAs B, Z, J, U and O
    scoreMatrix = ScoreMatrix("", "BLOSUM{}".format(requiredIdentityPercent), "BZJUO")

    for key, qAB in probPairs.items():
        # qAB is the observed (evolutionary) probability of the AA pair (A, B)
        aaA, aaB = key  # Both amino acids
        pA, pB = probSingle[aaA], probSingle[aaB]  # Their single probabilities
        # eAB is the expected (random) probability of the pair (A, B) given the single probabilities
        eAB = pA * pA if aaA == aaB else 2 * pA * pB
        # The BLOSUM score for this pair is the rounded, doubled log-odds ratio
        # of the observed and expected probabilities
        sAB = int(round(2 * log(qAB / eAB, 2)))
        scoreMatrix.setScore(aaA, aaB, sAB)  # Fill the matrix

    return scoreMatrix
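
# Numerical check of the log-odds formula used above (hypothetical probabilities, not part
# of the original module): if q_AA = 0.06 and p_A = 0.2, then e_AA = 0.2 * 0.2 = 0.04 and
# the score is round(2 * log2(0.06 / 0.04)) = round(1.17) = 1.
print(int(round(2 * log(0.06 / 0.04, 2))))  # 1
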
def blosumFromFasta(requiredIdentityPercent, *filepaths):
    """
    Creates and returns a ScoreMatrix for all sequences in the provided 'filepaths',
    using the BLOSUM approach with an identity of at least 'requiredIdentityPercent'.
    Each file is grouped independently; only then are the weighted probabilities merged.

    :param requiredIdentityPercent: minimum identity percentage used for grouping
    :param filepaths: paths to the .fasta files to process
    """
    # Results for each file are first stored in lists
    freqPairsList, freqSingleList, freqSumList = [], [], []

    for path in filepaths:  # For each .fasta file
        groups = makeGroupsFromFasta(path, requiredIdentityPercent)               # Groups
        groupValues, groupSizes = valueDictsFromGroups(groups)                    # Groups as dicts
        freqPairs, freqSingle, freqSum = getFrequencies(groupValues, groupSizes)  # Frequencies
        freqPairsList.append(freqPairs)  # Append the results
        freqSingleList.append(freqSingle)
        freqSumList.append(freqSum)

    # Merge (sum) the results together
    probPairs, probSingle = sumFrequenciesToProb(freqPairsList, freqSingleList, freqSumList)
    # Create and return the BLOSUM matrix
    return blosumFromProbabilities(probPairs, probSingle, requiredIdentityPercent)
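
# Usage sketch (hypothetical file paths, not part of the original module): build a
# BLOSUM62-style matrix from two files of aligned sequence blocks and print it.
blosum62 = blosumFromFasta(62, "block1.fasta", "block2.fasta")
print(blosum62)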