messyidea · June 27, 2018 08:25 · Jun 27, 2018 · Jun 27, 2018
diff --git a/gistfile1.txt → ConvertToUtf8.py b/gistfile1.txt → ConvertToUtf8.py
diff --git a/gistfile1.txt b/gistfile1.txt
@@ -0,0 +1,88 @@
+# -*- coding: utf-8 -*-
+import os
+import sys
+import codecs
+import threading
+import json
+import time
+import hashlib
+import shutil
+from chardet.universaldetector import UniversalDetector
+
+class ConvertHandler(object):
+	"""Detect the encoding of files with chardet and re-encode them in place.
+
+	Both attributes are plain instance values and may be changed after
+	construction to tune detection.
+	"""
+
+	def __init__(self):
+		# A detection result is trusted only when chardet's confidence is
+		# strictly greater than this value.
+		self.confidence = 0.95
+		# Line budget handed to _detect() for each file by convert_file().
+		self.max_detect_lines = 600
+
+	def _detect(self, file_name, cnt):
+		"""Guess the encoding of ``file_name`` from at most ``cnt`` lines.
+
+		``cnt`` is decremented before it is compared with zero, so a value
+		of 0 or less never stops the loop early (only ``detector.done`` or
+		end of file does).
+
+		Returns a ``(encoding, confident)`` tuple. ``encoding`` is the
+		upper-cased name reported by the detector, ``""`` when the file is
+		missing or empty, or whatever falsy value the detector reported when
+		it had no guess. ``confident`` is True only when the reported
+		confidence exceeds ``self.confidence``.
+		"""
+		if not file_name or not os.path.exists(file_name) or os.path.getsize(file_name) == 0:
+			return "", False
+		detector = UniversalDetector()
+		# The context manager closes the file even if feeding the detector raises.
+		with open(file_name, 'rb') as fp:
+			for line in fp:
+				# cut MS-Windows CR code
+				detector.feed(line.replace(b'\r', b''))
+				cnt -= 1
+				if detector.done or cnt == 0:
+					break
+		detector.close()
+		result = detector.result
+		encoding = result['encoding']
+		if encoding:
+			encoding = encoding.upper()
+
+		# Use the configurable threshold instead of a second hard-coded 0.95.
+		return encoding, result['confidence'] > self.confidence
+
+	def _convert(self, file_name, encoding, to_encoding):
+		"""Rewrite ``file_name`` from ``encoding`` to ``to_encoding`` in place.
+
+		CRLF and lone CR line endings are normalised to LF. The file is left
+		untouched when both names refer to the same codec, when either codec
+		is unknown, or when the content cannot be decoded or encoded; the
+		failure cases print a short message instead of raising.
+		"""
+		if encoding == to_encoding:
+			# print("same encoding, ignore")
+			return
+
+		try:
+			# Compare canonical codec names so that spellings such as
+			# 'UTF-8', 'utf-8' and 'utf8' are recognised as the same encoding.
+			if codecs.lookup(encoding).name == codecs.lookup(to_encoding).name:
+				return
+		except LookupError:
+			print("LookupError")
+			return
+
+		try:
+			with open(file_name, 'rb') as fp:
+				raw = fp.read()
+			# Decode the whole payload in one strict step; any invalid byte
+			# sequence aborts the conversion before the file is modified.
+			text = raw.decode(encoding, 'strict')
+			text = text.replace('\r\n', '\n').replace('\r', '\n')
+			contents = text.encode(to_encoding)
+		except LookupError:
+			# Raised for names that resolve to a codec which is not a text encoding.
+			print("LookupError")
+			return
+		except UnicodeDecodeError:
+			print("UnicodeDecodeError")
+			return
+		except UnicodeEncodeError:
+			print(file_name + ":UnicodeEncodeError")
+			return
+
+		print('{2}: {0} -> {1}'.format(encoding, to_encoding, file_name))
+
+		with open(file_name, 'wb') as f:
+			f.write(contents)
+
+	def convert_file(self, file_name, to_encoding='UTF-8'):
+		"""Convert one file to ``to_encoding`` if its encoding is detected confidently."""
+		encoding, confidence = self._detect(file_name, self.max_detect_lines)
+		if confidence:
+			self._convert(file_name, encoding, to_encoding)
+
+	def convert_dir(self, dir, to_encoding='UTF-8'):
+		"""Walk ``dir`` recursively and convert every file found to ``to_encoding``.
+
+		The converter script itself is skipped, both by file name (matched
+		case-insensitively) and by comparing against this module's own path.
+		"""
+		own_file = globals().get('__file__')
+		own_path = os.path.realpath(own_file) if own_file else None
+		for root, _dirs, files in os.walk(dir):
+			# TODO: decide whether some hidden directories should be skipped.
+			for name in files:
+				path_str = os.path.join(root, name)
+				# Case-insensitive so both "ConvertToUTF8.py" and "ConvertToUtf8.py" match.
+				if path_str.lower().endswith("converttoutf8.py"):
+					continue
+				if own_path is not None and os.path.realpath(path_str) == own_path:
+					continue
+				self.convert_file(path_str, to_encoding)
+
+
+
+if __name__ == '__main__':
+	# Script entry point: convert every file below the current working
+	# directory to UTF-8.
+	ConvertHandler().convert_dir(".", "UTF-8")