#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""List the rules of ``dvorakjp-edited.txt`` that are new relative to
``RomajiRule_Default_original.txt``.

Both inputs are tab-separated, UTF-8 text files whose first column is the
rule key.  Every rule of the edited file whose key is absent from the base
file is written to standard output (UTF-8, tab-separated), ordered by the
length of its second column and then by code point.

Runs unchanged on Python 2.7 and Python 3.
"""

import io
import re
import sys
from functools import cmp_to_key

BASE_RULE_FILE = 'RomajiRule_Default_original.txt'
EDITED_RULE_FILE = 'dvorakjp-edited.txt'

# Keys containing one of these characters are emitted commented out.
PUNCTUATION_RE = re.compile(r'[,\.;]')

# Strip only ASCII whitespace so that a field ending in a non-ASCII space
# (for example U+3000) is preserved; a bare ``strip()`` on text would eat it.
_ASCII_WHITESPACE = ' \t\n\r\x0b\x0c'


def compare(s1, s2):
    """Three-way compare two strings by code point.

    Returns -1, 0 or 1.  When one string is a prefix of the other, the
    shorter one sorts first.
    """
    for a, b in zip(s1, s2):
        if a != b:
            return -1 if ord(a) < ord(b) else 1
    return (len(s1) > len(s2)) - (len(s1) < len(s2))


def strcmp(x, y):
    """Three-way compare two rules (``[key, kana, ...]``) on their kana.

    Shorter kana sorts first; kana of equal length is ordered by
    :func:`compare`.  The result is negative, zero or positive.
    """
    n = len(x[1]) - len(y[1])
    return n if n else compare(x[1], y[1])


def read_known_keys(lines):
    """Return the set of rule keys found in *lines* of the base rule file.

    Lines starting with ``#`` and lines with fewer than two tab-separated
    fields (which includes blank lines) are ignored.
    """
    keys = set()
    for raw in lines:
        line = raw.strip(_ASCII_WHITESPACE)
        if line.startswith('#'):
            continue
        fields = line.split('\t')
        if len(fields) < 2:
            continue
        keys.add(fields[0])
    return keys


def find_new_rules(lines, known_keys):
    """Return ``[key, kana]`` for each line whose key is not in *known_keys*.

    ``kana`` is every field after the key, concatenated.  Blank lines are
    skipped: they would otherwise yield an entry with an empty key and
    print as a lone tab.
    """
    rules = []
    for raw in lines:
        fields = raw.strip(_ASCII_WHITESPACE).split('\t')
        key = fields[0]
        if not key or key in known_keys:
            continue
        rules.append([key, ''.join(fields[1:])])
    return rules


def sort_rules(rules):
    """Return *rules* as a new list ordered by :func:`strcmp` (stable)."""
    return sorted(rules, key=cmp_to_key(strcmp))


def format_rule(rule):
    """Render one ``[key, kana]`` rule as a tab-separated output line.

    * If the kana has more than one character, a third column is added:
      ``0,`` repeated ``len(kana) - 1`` times followed by ``len(key)``.
    * If the key contains ``,``, ``.`` or ``;`` the key is prefixed with
      ``#`` and a trailing ``# 無効`` ("disabled") column is appended.

    The input list is not modified.
    """
    key, kana = rule[0], rule[1]
    fields = [key, kana]
    n = len(kana)
    if n > 1:
        # The key length is taken before any '#' prefix is added below.
        fields.append(('0,' * (n - 1) + '%d') % len(key))
    if PUNCTUATION_RE.search(key):
        fields[0] = '#' + key
        fields.append(u'# 無効')
    return '\t'.join(fields)


def main():
    """Read both rule files and print the new rules to standard output."""
    with io.open(BASE_RULE_FILE, encoding='utf-8') as base:
        known_keys = read_known_keys(base)
    with io.open(EDITED_RULE_FILE, encoding='utf-8') as edited:
        rules = find_new_rules(edited, known_keys)

    # Emit UTF-8 bytes regardless of the locale: Python 3 exposes the byte
    # stream as ``sys.stdout.buffer``; Python 2's ``sys.stdout`` takes bytes.
    out = getattr(sys.stdout, 'buffer', sys.stdout)
    for rule in sort_rules(rules):
        out.write(format_rule(rule).encode('utf-8') + b'\n')


if __name__ == '__main__':
    main()