Skip to content

Instantly share code, notes, and snippets.

@NothingCtrl
Created November 21, 2023 04:50
Show Gist options
  • Save NothingCtrl/da8943dd4ac888e413cae31efecf5ef4 to your computer and use it in GitHub Desktop.
Save NothingCtrl/da8943dd4ac888e413cae31efecf5ef4 to your computer and use it in GitHub Desktop.
(Python) Remove Vietnamese accents in text
import re
import unicodedata
# Precomposed (NFC) accented letters and, position for position, the plain
# ASCII letter each one is replaced with. The two strings must stay the same
# length; str.maketrans raises ValueError at import time if they drift apart.
_VI_ACCENTED = 'ÀÁÂÃÈÉÊÌÍÒÓÔÕÙÚÝàáâãèéêìíòóôõùúýĂăĐđĨĩŨũƠơƯưẠạẢảẤấẦầẨẩẪẫẬậẮắẰằẲẳẴẵẶặẸẹẺẻẼẽẾếỀềỂểỄễỆệỈỉỊịỌọỎỏỐốỒồỔổỖỗỘộỚớỜờỞởỠỡỢợỤụỦủỨứỪừỬửỮữỰựỲỳỴỵỶỷỸỹ'
_VI_PLAIN = 'AAAAEEEIIOOOOUUYaaaaeeeiioooouuyAaDdIiUuOoUuAaAaAaAaAaAaAaAaAaAaAaAaEeEeEeEeEeEeEeEeIiIiOoOoOoOoOoOoOoOoOoOoOoOoUuUuUuUuUuUuUuYyYyYyYy'

# Built once so every call is a single pass over the input instead of a
# linear scan of the 134-character table per input character.
_VI_ACCENT_TABLE = str.maketrans(_VI_ACCENTED, _VI_PLAIN)

# Matches any character outside U+0000-U+024F and U+1E00-U+1EFF
# (Basic Latin, Latin-1 Supplement, Latin Extended-A/B, Latin Extended
# Additional).
_NON_LATIN_RE = re.compile(
    r'[^\x00-\x7F\x80-\xFF\u0100-\u017F\u0180-\u024F\u1E00-\u1EFF]'
)


def remove_vietnamese_accents(input_str: str) -> str:
    """Return *input_str* with Vietnamese accented letters replaced by ASCII.

    The input is first normalized to NFC so that letters written with
    combining marks are composed into the single code points the mapping
    table knows about. Each mapped letter is then replaced by its unaccented
    counterpart (for example ``Đ`` becomes ``D`` and ``ệ`` becomes ``e``);
    unmapped characters pass through unchanged.

    Afterwards every character outside the Latin ranges U+0000-U+024F and
    U+1E00-U+1EFF is removed. If that removal would leave nothing (the text
    contains no character from those ranges), the unfiltered, accent-mapped
    text is returned instead, so wholly non-Latin input is not turned into
    an empty string.

    Args:
        input_str: Text to process; may be in any Unicode normalization form.

    Returns:
        The accent-free text, or ``''`` for empty input.
    """
    composed = unicodedata.normalize('NFC', input_str)
    unaccented = composed.translate(_VI_ACCENT_TABLE)
    latin_only = _NON_LATIN_RE.sub('', unaccented)
    # Fall back to the unfiltered text when filtering removed every character.
    return latin_only or unaccented
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment