Created
April 29, 2015 03:57
-
-
Save nickstenning/bf09f4538878b97ebe6f to your computer and use it in GitHub Desktop.
Unicode character counting
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!DOCTYPE html>
<html lang="en">
<!-- Declare the encoding first so every later byte, including the title, is decoded as UTF-8. -->
<meta charset="utf-8">
<title>Unicode normalisation and counting</title>
<style>
  /* Table layout: neighbouring cell borders merge into a single line. */
  table {
    border-spacing: 0;
    border-collapse: collapse;
  }

  /* Padding shared by header and body cells. */
  td, th {
    padding: 5px;
  }

  /* Shade body cells that sit in an even-numbered column (2nd, 4th, ...). */
  td:nth-child(even) {
    background: rgba(0, 0, 0, 0.2);
  }

  /* Rows carrying the "count" class get a rule drawn above them. */
  tr.count {
    border-top: 1px solid #aaaaaa;
  }

  /*
   * Rows carrying the "mismatch" class: paint the 3rd and 4th cells (the two
   * length columns) red. These selectors are more specific than the
   * even-column shading above, so they win for the 4th cell.
   */
  tr.mismatch td:nth-child(3) {
    background: #cc0000;
  }
  tr.mismatch td:nth-child(4) {
    background: #bb0000;
  }
</style>
<!--
  Each body row is: code points | rendered string | JS length | Python length | note.
  The rendered strings are written as numeric character references so that the
  exact code-point sequence named in the first column survives editors and
  tools that apply Unicode normalisation to literal text.
  The third cell of each row is left empty in the markup.
-->
<table>
  <caption>String length as counted by JavaScript and by Python</caption>
  <thead>
    <tr>
      <th scope="col">unicode</th>
      <th scope="col">repr</th>
      <th scope="col">len (js)</th>
      <th scope="col">len (py)</th>
      <th scope="col">what?</th>
    </tr>
  </thead>
  <tbody>
    <tr title="U+1D400 MATHEMATICAL BOLD CAPITAL A">
      <td>U+1D400</td>
      <td>&#x1D400;</td>
      <td></td>
      <td>1</td>
      <td>astral codepoint</td>
    </tr>
    <tr title="U+1F600 GRINNING FACE">
      <td>U+1F600</td>
      <td>&#x1F600;</td>
      <td></td>
      <td>1</td>
      <td>astral codepoint</td>
    </tr>
    <tr title="U+00F1 LATIN SMALL LETTER N WITH TILDE">
      <td>U+00F1</td>
      <td>&#x00F1;</td>
      <td></td>
      <td>1</td>
      <td>combining</td>
    </tr>
    <tr title="U+006E LATIN SMALL LETTER N + U+0303 COMBINING TILDE">
      <td>U+006E U+0303</td>
      <td>&#x006E;&#x0303;</td>
      <td></td>
      <td>2</td>
      <td>combining</td>
    </tr>
    <tr title="U+0061 LATIN SMALL LETTER A + U+0328 COMBINING OGONEK + U+0301 COMBINING ACUTE ACCENT">
      <td>U+0061 U+0328 U+0301</td>
      <td>&#x0061;&#x0328;&#x0301;</td>
      <td></td>
      <td>3</td>
      <td>combining</td>
    </tr>
    <tr title="U+1F600 GRINNING FACE + U+0362 COMBINING DOUBLE RIGHTWARDS ARROW BELOW">
      <td>U+1F600 U+0362</td>
      <td>&#x1F600;&#x0362;</td>
      <td></td>
      <td>2</td>
      <td>astral codepoint + combining</td>
    </tr>
  </tbody>
</table>
<script>
  // For every body row: write the JavaScript length of the "repr" cell into
  // the "len (js)" cell, and add the "mismatch" class to the row when that
  // value differs from the hard-coded "len (py)" cell.
  var rows = document.querySelectorAll('tbody tr');
  Array.prototype.forEach.call(rows, function (tr) {
    var rep = tr.children[1];
    var lenjs = tr.children[2];
    var lenpy = tr.children[3];

    // Use textContent rather than innerText: innerText reflects the rendered
    // text (it is affected by CSS and forces layout), whereas the value being
    // measured here is the raw string stored in the DOM.
    // String#length counts UTF-16 code units.
    var jsLength = String(rep.textContent.length);
    lenjs.textContent = jsLength;

    // Trim the expected value so incidental whitespace inside the cell markup
    // does not produce a false mismatch.
    if (jsLength !== lenpy.textContent.trim()) {
      tr.classList.add('mismatch');
    }
  });
</script>
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment