Last active
December 6, 2018 08:32
-
-
Save AlexanderFillbrunn/436cac1312cef603199c0c8f5466eaab to your computer and use it in GitHub Desktop.
Remove insertions and deletions in a String given a reference
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
This script repairs insertions and deletions in a sequence by replacing them | |
with the correct value from a reference sequence, but keeps replacements. | |
The algorithm calculates a Levenshtein matrix first and then retraces the path | |
of least distances from the bottom right to the top left. Depending on the | |
movement along that path, it indentifies the different types of modifications | |
and fixes them if they are insertions or deletions. | |
Examples: | |
======================== | |
Reference: TAGCAGACAT | |
Sequence: TAGXCAGACAT | |
Result: TAGCAGACAT | |
Here the result is the reference sequence, because the input sequence | |
only has an additional "X" at the 4th position. | |
======================== | |
Reference: TAGCAGACAT | |
Sequence: TAGXXGACAT | |
Result: TAGXXGACAT | |
Here the result is the unchanged input sequence, because it only contains two | |
replacements that are kept. | |
======================== | |
Reference: TAGCAGACAT | |
Sequence: TAXCAGCAT | |
Result: TAXCAGACAT | |
Here the "X" at the 3rd position is kept, as it is a replacement, but the | |
missing "A" at position 7 is reinserted. | |
""" | |
import numpy as np | |
from sys import argv | |
_, ref, mod = argv | |
# Initialize matrix | |
mat = np.zeros((len(mod) + 1, len(ref) + 1)) | |
mat[:, 0] = np.arange(0, mat.shape[0]) | |
mat[0, :] = np.arange(0, mat.shape[1]) | |
# Calculate matrix | |
for i in range(1, mat.shape[0]): | |
for j in range(1, mat.shape[1]): | |
top = mat[i - 1, j] + 1 | |
left = mat[i, j - 1] + 1 | |
diag = mat[i - 1, j - 1] + int(mod[i - 1] != ref[j - 1]) | |
mat[i, j] = min(top, left, diag) | |
# Here we prepend the letters back to front | |
result = "" | |
# Retrace the path from the bottom right to top left | |
cur = (mat.shape[0] - 1, mat.shape[1] - 1) | |
while cur != (0,0): | |
top = mat[cur[0] - 1, cur[1]] | |
left = mat[cur[0], cur[1] - 1] | |
diag = mat[cur[0] - 1,cur[1] - 1] | |
# Move along the path with the smallest distances, | |
# preferring the diagonal | |
if diag <= top and diag <= left: | |
next = (cur[0] - 1, cur[1] - 1) | |
elif top <= left: | |
next = (cur[0] - 1, cur[1]) | |
else: | |
next = (cur[0], cur[1] - 1) | |
# Diagonal move means either modification or match, | |
# so we just append the original character | |
if cur[0] != next[0] and cur[1] != next[1]: | |
result = mod[cur[0] - 1] + result | |
# Horizontal move is a deletion that has to be fixed by inserting | |
# the character from the reference | |
elif cur[0] == next[0]: | |
result = ref[cur[1] - 1] + result | |
# Vertical move is an insertion that is not appended to result | |
# Advance the pointer along the path | |
cur = next | |
print(result) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment