Last active
June 27, 2018 08:25
-
-
Save messyidea/6baf736b2d2dd05ec590c1f3e69bb995 to your computer and use it in GitHub Desktop.
Revisions
-
messyidea renamed this gist
Jun 27, 2018. 1 changed file with 0 additions and 0 deletions. There are no files selected for viewing.
File renamed without changes. -
messyidea created this gist
Jun 27, 2018. There are no files selected for viewing.
# -*- coding: utf-8 -*-
"""Convert the text files under a directory tree to a target encoding.

The current encoding of each file is guessed with chardet.  Files whose guess
is not confident enough are left untouched; the others are rewritten in place
in the target encoding with their line endings normalised to LF.
"""
import os
import sys
import codecs
import threading
import json
import time
import hashlib
import shutil

from chardet.universaldetector import UniversalDetector


class ConvertHandler(object):
    """Detect the encoding of files and rewrite them in another encoding."""

    def __init__(self):
        # A detection is trusted only when chardet's confidence is strictly
        # greater than this value.
        self.confidence = 0.95
        # Maximum number of lines fed to the detector for a single file.
        self.max_detect_lines = 600

    def _detect(self, file_name, cnt):
        """Guess the encoding of ``file_name`` from at most ``cnt`` lines.

        Args:
            file_name: Path of the file to inspect.
            cnt: Maximum number of lines to feed to the detector.  The counter
                is decremented per line and compared with zero, so a value
                that is already zero or negative never stops the scan early.

        Returns:
            A ``(encoding, is_confident)`` tuple.  ``encoding`` is the
            upper-cased name reported by chardet, ``""`` when the file is
            missing or empty, or ``None`` when chardet has no guess.
            ``is_confident`` is True when chardet's confidence is strictly
            greater than ``self.confidence``.
        """
        if (not file_name
                or not os.path.exists(file_name)
                or os.path.getsize(file_name) == 0):
            return "", False

        detector = UniversalDetector()
        # The context manager closes the file even if feeding raises.
        with open(file_name, 'rb') as fp:
            for line in fp:
                # cut MS-Windows CR code
                detector.feed(line.replace(b'\r', b''))
                cnt -= 1
                if detector.done or cnt == 0:
                    break
        detector.close()

        encoding = detector.result['encoding']
        if encoding:
            encoding = encoding.upper()
        # Treat a missing confidence as "no confidence" rather than comparing
        # None with a float.
        confidence = detector.result['confidence'] or 0.0
        return encoding, confidence > self.confidence

    def _convert(self, file_name, encoding, to_encoding):
        """Rewrite ``file_name`` in place from ``encoding`` to ``to_encoding``.

        CRLF and lone CR line endings are replaced by LF.  Nothing is written
        when there is no source encoding, when both names refer to the same
        codec, or when a codec is unknown or the content cannot be decoded or
        encoded; the failure cases print a short message.

        Args:
            file_name: Path of the file to convert.
            encoding: Name of the encoding the file is currently stored in.
            to_encoding: Name of the encoding to write the file in.
        """
        if not encoding or encoding == to_encoding:
            return

        try:
            # Compare canonical codec names so that aliases and case
            # differences ("utf-8", "UTF8", "UTF-8") count as the same codec.
            if codecs.lookup(encoding).name == codecs.lookup(to_encoding).name:
                return
            with open(file_name, 'rb') as src:
                raw = src.read()
            text = raw.decode(encoding, 'strict')
            text = text.replace('\r\n', '\n').replace('\r', '\n')
            contents = text.encode(to_encoding)
        except LookupError:
            print("LookupError")
            return
        except UnicodeDecodeError:
            print("UnicodeDecodeError")
            return
        except UnicodeEncodeError:
            print(file_name + ":UnicodeEncodeError")
            return

        print('{2}: {0} -> {1}'.format(encoding, to_encoding, file_name))
        with open(file_name, 'wb') as f:
            f.write(contents)

    def convert_file(self, file_name, to_encoding):
        """Convert ``file_name`` to ``to_encoding`` if detection is confident.

        Args:
            file_name: Path of the file to convert.
            to_encoding: Name of the encoding to write the file in.
        """
        encoding, confidence = self._detect(file_name, self.max_detect_lines)
        if confidence:
            self._convert(file_name, encoding, to_encoding)

    def convert_dir(self, dir, to_encoding='UTF-8'):
        """Convert every file found under ``dir``, walking it recursively.

        Files whose name ends with ``ConvertToUTF8.py`` are skipped.

        Args:
            dir: Root directory to walk.
            to_encoding: Name of the encoding to write the files in.
        """
        for fpathe, _dirs, fs in os.walk(dir):
            # Is it necessary to skip certain hidden directories?
            for f in fs:
                path_str = os.path.join(fpathe, f)
                if path_str.endswith("ConvertToUTF8.py"):
                    continue
                self.convert_file(path_str, to_encoding)


if __name__ == '__main__':
    handler = ConvertHandler()
    handler.convert_dir(".", "UTF-8")