Created
June 27, 2018 09:11
-
-
Save messyidea/9d2dd3ac644f9ed9347143389ecd1628 to your computer and use it in GitHub Desktop.
Revisions
-
messyidea created this gist
Jun 27, 2018 .There are no files selected for viewing
# -*- coding: utf-8 -*-
"""Detect the text encoding of files and convert them in place.

Detection uses chardet's ``UniversalDetector``; files whose detection
confidence does not exceed the configured threshold are left untouched.
Converted files also have their line endings normalized to ``\\n``.
"""
import os
import sys
import codecs
import threading
import json
import time
import hashlib
import shutil
import argparse

try:
    from chardet.universaldetector import UniversalDetector
except ImportError:
    # Keep the module importable (e.g. for ``--help``) without chardet;
    # ``ConvertHandler._detect`` raises a clear error when it is needed.
    UniversalDetector = None


class ConvertHandler(object):
    """Detect file encodings and rewrite files in a target encoding.

    Args:
        confidence: Detection confidence that must be exceeded before a
            file is converted.
        verbose: When true, print detection and conversion details.
        handle_all: When true, ``convert_dir`` descends into
            sub-directories; otherwise only the top directory is handled.
    """

    def __init__(self, confidence=0.95, verbose=False, handle_all=False):
        self.confidence = confidence
        self.verbose = verbose
        self.handle_all = handle_all
        # Upper bound on the number of lines fed to the detector per file.
        self.max_detect_lines = 600

    @staticmethod
    def _same_encoding(first, second):
        """Return True when both names refer to the same codec.

        Names are compared through ``codecs.lookup`` so that aliases and
        case differences (``utf-8`` vs ``UTF-8``) are treated as equal.
        Unknown or invalid names fall back to plain equality.
        """
        try:
            return codecs.lookup(first).name == codecs.lookup(second).name
        except (LookupError, TypeError):
            return first == second

    def _detect(self, file_name, cnt):
        """Detect the encoding of ``file_name``.

        Args:
            file_name: Path of the file to inspect.
            cnt: Maximum number of lines to feed to the detector.

        Returns:
            A ``(encoding, confident)`` tuple. ``encoding`` is the
            upper-cased detected name (``""`` for a missing or empty
            file, ``None`` if nothing was detected); ``confident`` is
            True only when an encoding was found and its confidence
            exceeds ``self.confidence``.

        Raises:
            ImportError: If chardet is not installed.
        """
        if not file_name or not os.path.exists(file_name) or os.path.getsize(file_name) == 0:
            return "", False
        if UniversalDetector is None:
            raise ImportError("chardet is required for encoding detection")

        detector = UniversalDetector()
        with open(file_name, 'rb') as fp:
            for line in fp:
                # cut MS-Windows CR code
                detector.feed(line.replace(b'\r', b''))
                cnt -= 1
                if detector.done or cnt == 0:
                    break
        detector.close()

        encoding = detector.result['encoding']
        if encoding:
            encoding = encoding.upper()
        confidence = detector.result['confidence']
        if self.verbose:
            print('{2}: Detected {0} with {1} confidence'.format(encoding, confidence, file_name))
        confident = bool(encoding) and confidence is not None and confidence > self.confidence
        return encoding, confident

    def _convert(self, file_name, encoding, to_encoding):
        """Rewrite ``file_name`` from ``encoding`` to ``to_encoding`` in place.

        Line endings are normalized to ``\\n``. If the two encodings are
        the same codec the file is left untouched. Lookup, decode and
        encode failures are reported on stdout and leave the file as is.
        """
        if self._same_encoding(encoding, to_encoding):
            return
        try:
            with codecs.open(file_name, 'rb', encoding, errors='strict') as fp:
                contents = fp.read()
            contents = contents.replace('\r\n', '\n').replace('\r', '\n')
            contents = contents.encode(to_encoding)
        except LookupError:
            print(file_name + ":LookupError")
            return
        except UnicodeDecodeError:
            print(file_name + ":UnicodeDecodeError")
            return
        except UnicodeEncodeError:
            print(file_name + ":UnicodeEncodeError")
            return

        if self.verbose:
            print('{2}: {0} -> {1}'.format(encoding, to_encoding, file_name))
        # Only reached when decoding and encoding both succeeded, so a
        # failed conversion never truncates the original file.
        with open(file_name, 'wb') as f:
            f.write(contents)

    def convert_file(self, file_name, to_encoding):
        """Convert a single file if its encoding is detected confidently."""
        encoding, confident = self._detect(file_name, self.max_detect_lines)
        if confident:
            self._convert(file_name, encoding, to_encoding)

    def convert_dir(self, dir, to_encoding='UTF-8'):
        """Convert the files under ``dir`` to ``to_encoding``.

        Only the top-level directory is processed unless ``handle_all``
        is set. A path that points at a regular file is converted
        directly. Files whose path ends with ``ConvertTool.py`` are
        skipped.
        """
        if os.path.isfile(dir):
            self.convert_file(dir, to_encoding)
            return
        for root, _dirs, files in os.walk(dir):
            # TODO: is it necessary to skip certain hidden directories?
            for name in files:
                path_str = os.path.join(root, name)
                if path_str.endswith("ConvertTool.py"):
                    continue
                self.convert_file(path_str, to_encoding)
            if not self.handle_all:
                # os.walk yields the top directory first; stop before
                # descending into sub-directories.
                break


def main(argv=None):
    """Parse command-line arguments and run the conversion."""
    parser = argparse.ArgumentParser(description="Convert your file to some other encodings")
    parser.add_argument('--verbose', '-v', action='store_true', help='verbose mode')
    parser.add_argument('--path', '-p', action="store", dest="path", required=True, type=str, help="path")
    parser.add_argument('--to', '-t', action="store", dest="to_encoding", default='UTF-8', type=str, help="to encoding")
    parser.add_argument('--all', '-r', action='store_true', dest="handle_all", help='handler all file in the subdir')
    parser.add_argument('--confidence', '-c', action="store", dest="confidence", default=0.95, type=float, help="confidence")
    args = parser.parse_args(argv)
    handler = ConvertHandler(args.confidence, verbose=args.verbose, handle_all=args.handle_all)
    handler.convert_dir(args.path, args.to_encoding)


if __name__ == '__main__':
    main()