1 Comments

DNA序列比对,Needleman-Wunsch算法的python实现

发布于:2018-04-25  |   作者:admin  |   已聚集:人围观

       Needleman-Wunsch算法也算是DNA序列比对中的经典算法了,由于写一个OTU聚类软件的需要,仔细研究了一下NW算法,然后用python编程实现。再次,就不解释Needleman-Wunsch算法的具体内容(网上能搜索到一大堆,好吧,我承认其实是因为我很难说清楚)。下面的代码除了比对之外,还计算了两条序列的遗传距离(genetic distance),直接贴代码吧!

       由于python的速度确实是瓶颈,而且需要比对的序列规模实在太大,因此,我又在c语言上实现了一次。代码下次再贴出来。

 

def NWDistance(seedSequence, candidateSequence):
    s = -1  # a mismatch would deduce 1 point.
    m = 1  # plus 1 point for one match.
    g = -3  # deduce 2 point for one gap.
    seedSequence = seedSequence.strip()
    candidateSequence = candidateSequence.strip()
    if len(seedSequence) == 0:
        print "Error, seed sequence length equal zero."
        sys.exit(1)
    elif len(candidateSequence) == 0:
        print "Error, candidate sequence length equal zero."
        sys.exit(1)
    sLen = len(seedSequence)
    cLen = len(candidateSequence)
    table = []
    for m in range(0, len(seedSequence) + 1):
        table.append([m * g])
    table[0] = []
    for n in range(0, len(candidateSequence) + 1):
        table[0].append(n * g)
    for i in range(sLen):
        for j in range(cLen):
            table[i + 1].append(
                max(
                    table[i][j] + (m if seedSequence[i] == candidateSequence[j] else s),
                    table[i][j + 1] + g,
                    table[i + 1][j] + g,
                )
            )
    #
    i = sLen - 1
    j = cLen - 1
    NewSeed = seedSequence[i]
    NewCandidate = candidateSequence[j]
    if len(seedSequence) <= 1 or len(candidateSequence) <= 1:
        print "Error, too short!"
        sys.exit(1)
    while True:
        if i == 0 and j == 0:
            break
        if seedSequence[i] == candidateSequence[j]:
            if table[i - 1][j - 1] + 1 > table[i - 1][j] - 2 and table[i - 1][j - 1] + 1 > table[i][j - 1] - 2:
                i = i - 1
                j = j - 1
                NewSeed = u"%s%s" % (seedSequence[i], NewSeed)
                NewCandidate = u"%s%s" % (candidateSequence[j], NewCandidate)
            else:
                if table[i][j + 1] > table[i + 1][j]:
                    i = i - 1
                    NewSeed = u"%s%s" % (seedSequence[i], NewSeed)
                    NewCandidate = u"%s%s" % ('-', NewCandidate)
                else:
                    j = j - 1
                    NewSeed = u"%s%s" % ('-', NewSeed)
                    NewCandidate = u"%s%s" % (candidateSequence[j], NewCandidate)
        else:
            if table[i - 1][j - 1] + 1 > table[i - 1][j] - 2 and table[i - 1][j - 1] + 1 > table[i][j - 1] - 2:
                i = i - 1
                j = j - 1
                NewSeed = u"%s%s" % (seedSequence[i], NewSeed)
                NewCandidate = u"%s%s" % (candidateSequence[j], NewCandidate)
            else:
                if table[i][j + 1] > table[i + 1][j]:
                    i = i - 1
                    NewSeed = u"%s%s" % (seedSequence[i], NewSeed)
                    NewCandidate = u"%s%s" % ('-', NewCandidate)
                else:
                    j = j - 1
                    NewSeed = u"%s%s" % ('-', NewSeed)
                    NewCandidate = u"%s%s" % (candidateSequence[j], NewCandidate)
    print NewSeed
    print NewCandidate
    # distance
    mismath = 0
    math = 0
    gap = 0
    charZipList = zip(NewSeed, NewCandidate)
    # delete the head gap
    for n in range(len(charZipList)):
        if "-" in charZipList[n]:
            del charZipList[0]
        else:
            break
    # delete the tail gap
    while True:
        lastTuple = charZipList.pop()
        if "-" in lastTuple:
            continue
        else:
            charZipList.append(lastTuple)
            break
    #
    for n in range(len(charZipList)):
        charTuple = charZipList[n]
        if charTuple[0] == charTuple[1]:
            math += 1
        elif "-" in charTuple:
            gapLoc = charTuple.index("-")
            if charZipList[n + 1][gapLoc] == "-":
                continue
            else:
                gap += 1
        else:
            mismath += 1
    distance = round(1.0 - float(math) / float(mismath + math + gap), 4)
    return distance

标签:python(18)DNA比对(1)
    输入验证码:
点击我更换验证码