Skip to content

Instantly share code, notes, and snippets.

@advpetc
Last active July 13, 2017 11:23
Show Gist options
  • Select an option

  • Save advpetc/e9f057e4fbb3986baedefd83254ee8a7 to your computer and use it in GitHub Desktop.

Select an option

Save advpetc/e9f057e4fbb3986baedefd83254ee8a7 to your computer and use it in GitHub Desktop.
Eliminating CJK characters (including CJK punctuation) the efficient way
from nltk.tokenize.util import is_cjk
# Inclusive (low, high) code-point bounds U+FF00..U+FFEF, i.e. the Unicode
# "Halfwidth and Fullwidth Forms" block, which holds the fullwidth
# punctuation marks (fullwidth parentheses, comma, ...). eli_cjk strips
# any character whose ord() falls inside these bounds.
nltk_punc_range = (0xFF00, 0xFFEF)
def eli_cjk(text):
    """Return ``text`` with CJK characters and fullwidth punctuation removed.

    A character is dropped when ``is_cjk`` reports it as CJK, or when its
    code point lies inside the inclusive ``nltk_punc_range`` bounds.
    Instead of copying character by character, the surviving runs between
    dropped characters are sliced out of ``text`` and joined once at the
    end. When nothing is dropped, ``text`` itself is returned unchanged.

    >>> eli_cjk("yes(中文),好玩".decode('utf8'))
    u'yes'
    >>> eli_cjk("how are you".decode('utf8'))
    u'how are you'
    >>> eli_cjk("how are 你。怎么yes样。".decode('utf8'))
    u'how are yes'
    >>> eli_cjk("請先安裝/更新您手機上的Google文字轉語音https://play.google.com/store/apps/details?id=com.google.android.tts".decode('utf8'))
    u'/Googlehttps://play.google.com/store/apps/details?id=com.google.android.tts'
    >>> eli_cjk("“Circular 37” means the Notice on Relevant Issues Concerning Foreign Exchange "
    ... "Administration for Domestic Residents to Engage in Overseas Investment and Financing and "
    ... "Round Trip Investment via Overseas Special Purpose Companies (关于境内居民通过境外特殊目的公司境"
    ... "外投融资及返程投资外汇管理有关问题的通知) issued by SAFE on July 4, 2014, as amended from time to time.".decode('utf8'))
    u'\u201cCircular 37\u201d means the Notice on Relevant Issues Concerning Foreign Exchange Administration for Domestic Residents to Engage in Overseas Investment and Financing and \
Round Trip Investment via Overseas Special Purpose Companies () issued by SAFE on July 4, 2014, as amended from time to time.'
    """
    low = nltk_punc_range[0]
    high = nltk_punc_range[1]
    kept_runs = []
    run_start = 0  # index of the first character of the current kept run
    for pos, char in enumerate(text):
        # Guard clause: characters that are neither CJK nor in the
        # punctuation range simply extend the current run.
        if not is_cjk(char) and not (low <= ord(char) <= high):
            continue
        # A dropped character closes the run that precedes it (if any).
        if run_start < pos:
            kept_runs.append(text[run_start:pos])
        run_start = pos + 1
    # run_start only moves off 0 when something was dropped.
    if not run_start:
        return text
    # Keep whatever follows the last dropped character.
    if run_start < len(text):
        kept_runs.append(text[run_start:])
    return "".join(kept_runs)
# Script entry point: when invoked with exactly the two arguments `-v -t`
# (so len(sys.argv) == 3), run the doctests embedded in this module.
if __name__ == "__main__":
import doctest, sys
if len(sys.argv) == 3:
if sys.argv[1] == '-v' and sys.argv[2] == '-t':
doctest.testmod()
# NOTE(review): indentation was lost in this copy, so it is unclear whether
# this `else` pairs with the argument-count check or with the `-v`/`-t`
# check -- confirm against the original file before relying on either.
else:
# NOTE(review): `run` is neither defined nor imported anywhere in the visible
# source; unless it is provided elsewhere, reaching this call raises NameError.
run() # params ain't defined
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment