Skip to content

Instantly share code, notes, and snippets.

@messyidea
Last active June 27, 2018 08:25
Show Gist options
  • Save messyidea/6baf736b2d2dd05ec590c1f3e69bb995 to your computer and use it in GitHub Desktop.
Save messyidea/6baf736b2d2dd05ec590c1f3e69bb995 to your computer and use it in GitHub Desktop.

Revisions

  1. messyidea renamed this gist Jun 27, 2018. 1 changed file with 0 additions and 0 deletions.
    File renamed without changes.
  2. messyidea created this gist Jun 27, 2018.
    88 changes: 88 additions & 0 deletions gistfile1.txt
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,88 @@
    # -*- coding: utf-8 -*-
    import os
    import sys
    import codecs
    import threading
    import json
    import time
    import hashlib
    import shutil
    from chardet.universaldetector import UniversalDetector

    class ConvertHandler(object):
        """Detect the encoding of text files and rewrite them in another encoding.

        Detection is delegated to chardet's ``UniversalDetector``; a file is only
        rewritten when the detector's confidence exceeds ``self.confidence``.
        Conversion also normalizes line endings to ``\\n``.
        """

        def __init__(self):
            # Detection confidence that must be exceeded before converting.
            self.confidence = 0.95
            # Upper bound on the number of lines fed to the detector per file.
            self.max_detect_lines = 600

        @staticmethod
        def _same_encoding(first, second):
            """Return True when both names resolve to the same codec.

            Comparing canonical codec names makes the check insensitive to case
            and aliases (e.g. ``UTF-8`` vs ``utf8``).
            """
            try:
                return codecs.lookup(first).name == codecs.lookup(second).name
            except LookupError:
                # Unknown codec name: compare the raw spellings and let the
                # caller report the lookup failure when it tries to use them.
                return first.upper() == second.upper()

        def _detect(self, file_name, cnt):
            """Guess the encoding of ``file_name`` from at most ``cnt`` lines.

            Args:
                file_name: Path of the file to inspect.
                cnt: Maximum number of lines to feed to the detector. A
                    non-positive value never reaches zero, so the whole file
                    is read.

            Returns:
                A ``(encoding, trusted)`` tuple. ``encoding`` is the upper-cased
                name reported by the detector (or ``None`` if it reported none);
                ``trusted`` is True when the reported confidence exceeds
                ``self.confidence``. Missing, empty or non-regular files yield
                ``("", False)``.
            """
            if (not file_name or not os.path.isfile(file_name)
                    or os.path.getsize(file_name) == 0):
                return "", False

            detector = UniversalDetector()
            # The context manager closes the file even if the detector raises.
            with open(file_name, 'rb') as fp:
                for line in fp:
                    # cut MS-Windows CR code
                    detector.feed(line.replace(b'\r', b''))
                    cnt -= 1
                    if detector.done or cnt == 0:
                        break
            detector.close()

            result = detector.result
            encoding = result['encoding']
            if encoding:
                encoding = encoding.upper()

            return encoding, result['confidence'] > self.confidence

        def _convert(self, file_name, encoding, to_encoding):
            """Rewrite ``file_name`` from ``encoding`` to ``to_encoding`` in place.

            Nothing is written when the source encoding is empty, when both
            names denote the same codec, or when decoding/encoding fails (a
            short message is printed in the failure cases). Line endings are
            normalized to ``\\n`` as part of the conversion.

            Args:
                file_name: Path of the file to rewrite.
                encoding: Name of the encoding the file is currently in.
                to_encoding: Name of the encoding to write.
            """
            if not encoding or self._same_encoding(encoding, to_encoding):
                # print("same encoding, ignore")
                return

            try:
                with open(file_name, 'rb') as fp:
                    raw = fp.read()
                contents = raw.decode(encoding, 'strict')
                contents = contents.replace('\r\n', '\n').replace('\r', '\n')
                contents = contents.encode(to_encoding)
            except LookupError:
                print("LookupError")
                return
            except UnicodeDecodeError:
                print("UnicodeDecodeError")
                return
            except UnicodeEncodeError:
                print(file_name + ":UnicodeEncodeError")
                return

            print('{2}: {0} -> {1}'.format(encoding, to_encoding, file_name))

            # Only reached once the full converted payload is in memory, so a
            # failed conversion never truncates the original file.
            with open(file_name, 'wb') as f:
                f.write(contents)

        def convert_file(self, file_name, to_encoding):
            """Detect the encoding of ``file_name`` and convert it if trusted.

            Args:
                file_name: Path of the file to convert.
                to_encoding: Name of the encoding to write.
            """
            encoding, confidence = self._detect(file_name, self.max_detect_lines)
            if confidence:
                self._convert(file_name, encoding, to_encoding)

        def convert_dir(self, dir, to_encoding='UTF-8', skip_hidden=False):
            """Convert every file found under ``dir``, recursively.

            Files whose path ends with ``ConvertToUTF8.py`` are left alone.

            Args:
                dir: Root directory to walk.
                to_encoding: Name of the encoding to write.
                skip_hidden: When True, do not descend into directories whose
                    name starts with a dot (for example ``.git``). Defaults to
                    False, which walks everything.
            """
            for root, dirs, files in os.walk(dir):
                if skip_hidden:
                    # Pruning in place stops os.walk from descending further.
                    dirs[:] = [d for d in dirs if not d.startswith('.')]
                for name in files:
                    path_str = os.path.join(root, name)
                    if path_str.endswith("ConvertToUTF8.py"):
                        continue
                    self.convert_file(path_str, to_encoding)



    if __name__ == '__main__':
        # Script entry point: convert every file under the current working
        # directory to UTF-8.
        ConvertHandler().convert_dir(".", to_encoding="UTF-8")