messyidea · June 27, 2018 09:11 · Jun 27, 2018
diff --git a/ConvertTool.py b/ConvertTool.py
@@ -0,0 +1,103 @@
+# -*- coding: utf-8 -*-
+import os
+import sys
+import codecs
+import threading
+import json
+import time
+import hashlib
+import shutil
+import argparse
+from chardet.universaldetector import UniversalDetector
+
+class ConvertHandler(object):
+	"""Detect the encoding of files with chardet and rewrite them in place.
+
+	During a conversion all line endings are normalised to LF.
+
+	Args:
+		confidence: detection confidence that must be exceeded before a file
+			is converted.
+		verbose: print per-file progress messages. ``None`` (the default)
+			defers to ``args.verbose`` of the module-level ``args`` object
+			published by the command-line entry point, or False if no such
+			object exists (e.g. when this class is imported).
+		handle_all: walk sub-directories in ``convert_dir``. ``None`` defers
+			to ``args.handle_all`` in the same way.
+	"""
+
+	def __init__(self, confidence=0.95, verbose=None, handle_all=None):
+		self.confidence = confidence
+		# Upper bound on the number of lines fed to the detector per file.
+		self.max_detect_lines = 600
+		self.verbose = verbose
+		self.handle_all = handle_all
+
+	def _option(self, name):
+		"""Return boolean option *name*, falling back to the module-level ``args``."""
+		value = getattr(self, name)
+		if value is None:
+			# ``args`` is only defined when the file runs as a script; looking it
+			# up through globals() avoids a NameError when the class is imported.
+			value = getattr(globals().get('args'), name, False)
+		return bool(value)
+
+	def _detect(self, file_name, cnt):
+		"""Guess the encoding of *file_name* from at most *cnt* lines.
+
+		A non-positive *cnt* places no limit on the number of lines read.
+
+		Returns:
+			A tuple ``(encoding, trusted)``. ``encoding`` is the upper-cased
+			name reported by the detector (``""`` for a missing or empty
+			file, ``None`` when the detector reports nothing). ``trusted`` is
+			True only when an encoding was reported and its confidence
+			exceeds ``self.confidence``.
+		"""
+		if not file_name or not os.path.exists(file_name) or os.path.getsize(file_name) == 0:
+			return "", False
+		detector = UniversalDetector()
+		with open(file_name, 'rb') as fp:
+			for line_no, line in enumerate(fp, 1):
+				# cut MS-Windows CR code
+				detector.feed(line.replace(b'\r', b''))
+				if detector.done or line_no == cnt:
+					break
+		detector.close()
+		encoding = detector.result['encoding']
+		if encoding:
+			encoding = encoding.upper()
+		confidence = detector.result['confidence']
+
+		if self._option('verbose'):
+			print('{2}: Detected {0} with {1} confidence'.format(encoding, confidence, file_name))
+
+		# Without an encoding there is nothing to convert from; a missing
+		# confidence is treated as zero so the comparison cannot raise.
+		return encoding, bool(encoding) and (confidence or 0.0) > self.confidence
+
+	def _convert(self, file_name, encoding, to_encoding):
+		"""Rewrite *file_name* from *encoding* to *to_encoding* in place.
+
+		Nothing is written when *encoding* is empty, when both names refer to
+		the same codec, or when decoding/encoding fails; failures are reported
+		on stdout as ``<file>:<ErrorName>``.
+		"""
+		if not encoding or encoding == to_encoding:
+			return
+
+		try:
+			# Compare canonical codec names so that aliases such as 'utf8',
+			# 'utf-8' and 'UTF-8' are recognised as the same encoding.
+			if codecs.lookup(encoding).name == codecs.lookup(to_encoding).name:
+				return
+			with open(file_name, 'rb') as fp:
+				contents = fp.read().decode(encoding, 'strict')
+			contents = contents.replace('\r\n', '\n').replace('\r', '\n')
+			contents = contents.encode(to_encoding)
+		except LookupError:
+			print(file_name + ":LookupError")
+			return
+		except UnicodeDecodeError:
+			print(file_name + ":UnicodeDecodeError")
+			return
+		except UnicodeEncodeError:
+			print(file_name + ":UnicodeEncodeError")
+			return
+
+		if self._option('verbose'):
+			print('{2}: {0} -> {1}'.format(encoding, to_encoding, file_name))
+
+		# Only reached once the whole file was re-encoded successfully.
+		with open(file_name, 'wb') as f:
+			f.write(contents)
+
+	def convert_file(self, file_name, to_encoding):
+		"""Convert *file_name* to *to_encoding* if its encoding is detected confidently."""
+		encoding, trusted = self._detect(file_name, self.max_detect_lines)
+		if trusted:
+			self._convert(file_name, encoding, to_encoding)
+
+	def convert_dir(self, dir, to_encoding='UTF-8'):
+		"""Convert every file under *dir* to *to_encoding*.
+
+		Only the files directly inside *dir* are processed unless the
+		``handle_all`` option is enabled, in which case the whole tree is walked.
+		"""
+		for fpathe, dirs, fs in os.walk(dir):
+			# Open question from the original author: should hidden
+			# directories be skipped?
+			for f in fs:
+				# Never rewrite this tool's own source file. Matching the exact
+				# file name avoids skipping e.g. "MyConvertTool.py".
+				if f == "ConvertTool.py":
+					continue
+				self.convert_file(os.path.join(fpathe, f), to_encoding)
+
+			if not self._option('handle_all'):
+				# os.walk yields the top directory first; stop after it.
+				break
+
+
+
+if __name__ == '__main__':
+	# Command-line entry point: parse the options, then convert the given path.
+	cli = argparse.ArgumentParser(description="Convert your file to some other encodings")
+	# One (flags, keyword-arguments) pair per option, registered in display order.
+	option_table = (
+		(('--verbose', '-v'), dict(action='store_true', help='verbose mode')),
+		(('--path', '-p'), dict(action="store", dest="path", required=True, type=str, help="path")),
+		(('--to', '-t'), dict(action="store", dest="to_encoding", default='UTF-8', type=str, help="to encoding")),
+		(('--all', '-r'), dict(action='store_true', dest="handle_all", help='handler all file in the subdir')),
+		(('--confidence', '-c'), dict(action="store", dest="confidence", default=0.95, type=float, help="confidence")),
+	)
+	for flags, settings in option_table:
+		cli.add_argument(*flags, **settings)
+
+	# ``args`` must stay a module-level name: ConvertHandler looks up the
+	# verbose / handle_all options on it.
+	args = cli.parse_args()
+
+	ConvertHandler(args.confidence).convert_dir(args.path, args.to_encoding)
No results found