Last active
July 13, 2017 11:23
-
-
Save advpetc/e9f057e4fbb3986baedefd83254ee8a7 to your computer and use it in GitHub Desktop.
Eliminate CJK characters (including CJK punctuation) the efficient way
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from nltk.tokenize.util import is_cjk | |
# Inclusive code point bounds (0xFF00, 0xFFEF): the Unicode "Halfwidth and
# Fullwidth Forms" block, which holds fullwidth punctuation such as the
# fullwidth parentheses and comma.
nltk_punc_range = (65280, 65519)
def eli_cjk(text):
    """
    Strip every CJK character and fullwidth punctuation mark from ``text``.

    A character is dropped when ``is_cjk`` accepts it or when its code point
    lies inside ``nltk_punc_range`` (both ends inclusive).  All remaining
    characters are kept in their original order.  When no character is
    dropped, the input object itself is returned.

    >>> eli_cjk("yes(中文),好玩".decode('utf8'))
    u'yes'
    >>> eli_cjk("how are you".decode('utf8'))
    u'how are you'
    >>> eli_cjk("how are 你。怎么yes样。".decode('utf8'))
    u'how are yes'
    >>> eli_cjk("請先安裝/更新您手機上的Google文字轉語音https://play.google.com/store/apps/details?id=com.google.android.tts".decode('utf8'))
    u'/Googlehttps://play.google.com/store/apps/details?id=com.google.android.tts'
    >>> eli_cjk("“Circular 37” means the Notice on Relevant Issues Concerning Foreign Exchange "
    ... "Administration for Domestic Residents to Engage in Overseas Investment and Financing and "
    ... "Round Trip Investment via Overseas Special Purpose Companies (关于境内居民通过境外特殊目的公司境"
    ... "外投融资及返程投资外汇管理有关问题的通知) issued by SAFE on July 4, 2014, as amended from time to time.".decode('utf8'))
    u'\u201cCircular 37\u201d means the Notice on Relevant Issues Concerning Foreign Exchange Administration for Domestic Residents to Engage in Overseas Investment and Financing and \
Round Trip Investment via Overseas Special Purpose Companies () issued by SAFE on July 4, 2014, as amended from time to time.'
    """
    low, high = nltk_punc_range
    survivors = [
        char for char in text
        if not (is_cjk(char) or low <= ord(char) <= high)
    ]
    # Nothing was filtered out: hand the caller's object back untouched.
    if len(survivors) == len(text):
        return text
    return "".join(survivors)
if __name__ == "__main__":
    # Script entry point.  The only supported invocation is ``-v -t``, which
    # runs this module's doctests.  The original fallback called ``run()``,
    # a name that is not defined anywhere in this file, so every other
    # invocation died with a NameError; report the usage and exit instead.
    import doctest
    import sys

    if sys.argv[1:] == ['-v', '-t']:
        doctest.testmod()
    else:
        sys.stderr.write("usage: %s -v -t\n" % sys.argv[0])
        sys.exit(2)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment