Skip to content

Instantly share code, notes, and snippets.

@messyidea
Created June 27, 2018 09:11
Show Gist options
  • Select an option

  • Save messyidea/9d2dd3ac644f9ed9347143389ecd1628 to your computer and use it in GitHub Desktop.

Select an option

Save messyidea/9d2dd3ac644f9ed9347143389ecd1628 to your computer and use it in GitHub Desktop.

Revisions

  1. messyidea created this gist Jun 27, 2018.
    103 changes: 103 additions & 0 deletions ConvertTool.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,103 @@
    # -*- coding: utf-8 -*-
    import os
    import sys
    import codecs
    import threading
    import json
    import time
    import hashlib
    import shutil
    import argparse
    from chardet.universaldetector import UniversalDetector

    class ConvertHandler(object):
        """Detect the encoding of text files and rewrite them in another encoding.

        Detection is delegated to chardet's ``UniversalDetector``.  A file is
        only rewritten when an encoding was detected and the reported
        confidence is strictly greater than the configured threshold.
        Rewritten files also get their CRLF / CR line endings replaced by LF.

        Two switches, ``verbose`` and ``handle_all``, are read from a
        module-level ``args`` object (the argparse namespace created by the
        script entry point).  When that object does not exist, for example
        when this class is imported and used as a library, both switches are
        treated as ``False`` instead of raising ``NameError``.
        """

        def __init__(self, confidence=0.95):
            """Remember the detection threshold.

            :param confidence: detector confidence that must be exceeded
                before a file is converted.
            """
            self.confidence = confidence
            # Upper bound on the number of lines fed to the detector per file.
            self.max_detect_lines = 600

        @staticmethod
        def _option(name):
            """Return switch *name* from the module-level ``args``, else False."""
            return bool(getattr(globals().get('args'), name, False))

        @staticmethod
        def _same_codec(first, second):
            """Tell whether two encoding names refer to the same codec.

            Identical strings always match.  Otherwise the canonical codec
            names are compared, so differences in case or alias spelling
            (``UTF-8`` / ``utf-8`` / ``utf8``) do not count as a difference.
            Names that cannot be resolved never match.
            """
            if first == second:
                return True
            try:
                return codecs.lookup(first).name == codecs.lookup(second).name
            except (LookupError, TypeError):
                return False

        def _detect(self, file_name, cnt):
            """Guess the encoding of *file_name*.

            :param file_name: path of the file to inspect.
            :param cnt: maximum number of lines fed to the detector; zero or
                a negative number means "no limit".
            :returns: ``(encoding, trusted)``.  ``encoding`` is the
                upper-cased detected name, ``None`` when nothing was
                detected, or ``""`` when the file is missing or empty.
                ``trusted`` is True only when an encoding was detected with
                a confidence greater than ``self.confidence``.
            """
            if not file_name or not os.path.exists(file_name) or os.path.getsize(file_name) == 0:
                return "", False

            detector = UniversalDetector()
            # ``with`` guarantees the handle is released even if feeding fails.
            with open(file_name, 'rb') as fp:
                for fed, line in enumerate(fp, 1):
                    # cut MS-Windows CR code
                    detector.feed(line.replace(b'\r', b''))
                    if detector.done or fed == cnt:
                        break
            detector.close()

            encoding = detector.result['encoding']
            if encoding:
                encoding = encoding.upper()
            # Fall back to 0.0 so the comparison below cannot fail on None.
            confidence = detector.result.get('confidence') or 0.0

            if self._option('verbose'):
                print('{2}: Detected {0} with {1} confidence'.format(encoding, confidence, file_name))

            # Without a detected encoding there is nothing to convert from.
            return encoding, bool(encoding) and confidence > self.confidence

        def _convert(self, file_name, encoding, to_encoding):
            """Rewrite *file_name* in place from *encoding* to *to_encoding*.

            Nothing is written when both names denote the same codec, or when
            the content cannot be decoded / encoded or a codec is unknown; in
            the failure cases ``<file_name>:<ErrorName>`` is printed instead.

            :param file_name: path of the file to rewrite.
            :param encoding: encoding the file is currently stored in.
            :param to_encoding: encoding the file should be stored in.
            """
            if self._same_codec(encoding, to_encoding):
                # same encoding, ignore
                return

            try:
                # Binary mode keeps the original line endings so that they can
                # be normalised explicitly below.
                with codecs.open(file_name, 'rb', encoding, errors='strict') as fp:
                    contents = fp.read()
                contents = contents.replace('\r\n', '\n').replace('\r', '\n')
                contents = contents.encode(to_encoding)
            except LookupError:
                print(file_name + ":LookupError")
                return
            except UnicodeDecodeError:
                print(file_name + ":UnicodeDecodeError")
                return
            except UnicodeEncodeError:
                print(file_name + ":UnicodeEncodeError")
                return

            if self._option('verbose'):
                print('{2}: {0} -> {1}'.format(encoding, to_encoding, file_name))

            # The file is only opened for writing once the new content is
            # complete, so a failed conversion leaves it untouched.
            with open(file_name, 'wb') as f:
                f.write(contents)

        def convert_file(self, file_name, to_encoding):
            """Convert one file if its encoding was detected with enough confidence.

            :param file_name: path of the file to convert.
            :param to_encoding: target encoding.
            """
            encoding, trusted = self._detect(file_name, self.max_detect_lines)
            if trusted:
                self._convert(file_name, encoding, to_encoding)

        def convert_dir(self, dir, to_encoding='UTF-8'):
            """Convert the files found under *dir*.

            Only the files directly inside *dir* are handled unless the
            ``handle_all`` switch is set, in which case every sub-directory
            is processed as well.  Paths ending in ``ConvertTool.py`` are
            skipped.  If *dir* is a regular file, that file is converted.

            :param dir: directory (or single file) to process.
            :param to_encoding: target encoding, ``UTF-8`` by default.
            """
            if os.path.isfile(dir):
                # os.walk() yields nothing for a file, so handle it directly.
                self.convert_file(dir, to_encoding)
                return

            recursive = self._option('handle_all')
            for fpathe, _dirs, fs in os.walk(dir):
                # Is it necessary to skip some hidden directories?
                for f in fs:
                    path_str = os.path.join(fpathe, f)
                    if path_str.endswith("ConvertTool.py"):
                        continue
                    self.convert_file(path_str, to_encoding)

                # os.walk() is top-down, so the first iteration is *dir* itself.
                if not recursive:
                    return



    if __name__ == '__main__':
        # Command-line options as (flags, keyword arguments), listed in the
        # order they should appear in the generated help text.
        option_table = [
            (('--verbose', '-v'),
             dict(action='store_true', help='verbose mode')),
            (('--path', '-p'),
             dict(action="store", dest="path", required=True, type=str, help="path")),
            (('--to', '-t'),
             dict(action="store", dest="to_encoding", default='UTF-8', type=str, help="to encoding")),
            (('--all', '-r'),
             dict(action='store_true', dest="handle_all", help='handler all file in the subdir')),
            (('--confidence', '-c'),
             dict(action="store", dest="confidence", default=0.95, type=float, help="confidence")),
        ]

        parser = argparse.ArgumentParser(description="Convert your file to some other encodings")
        for flags, settings in option_table:
            parser.add_argument(*flags, **settings)

        # ``args`` must stay a module-level name: ConvertHandler looks up the
        # ``verbose`` and ``handle_all`` switches on it.
        args = parser.parse_args()

        handler = ConvertHandler(args.confidence)
        handler.convert_dir(args.path, args.to_encoding)