import re
import itertools
import sys

# Boundaries recognised inside a camel-case identifier:
#   * lower -> upper           ("fooBar"     -> "foo" | "Bar")
#   * upper -> upper + lower   ("HTTPServer" -> "HTTP" | "Server")
#   * end of string
_CAMEL_CASE_RE = re.compile(
    r'.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)')

# Any single character that is not a letter, digit or underscore.
_NON_WORD_RE = re.compile(r'\W')


def snake_case_split(ident):
    """
    Split a snake-case identifier into its words.

    Returns a list holding the words followed by the original identifier.
    When the identifier has at most one non-empty word (for example "foo",
    "_foo_" or ""), the result is just ``[ident]``.
    """
    # Build a real list: on Python 3 ``filter`` returns a lazy iterator,
    # which supports neither ``len()`` nor ``append()``.
    splits = [word for word in ident.split('_') if word]
    if len(splits) <= 1:
        return [ident]
    splits.append(ident)
    return splits


def camel_case_split(ident):
    """
    Split a camel-case identifier into its words.

    Returns a list holding the words followed by the original identifier.
    When no camel-case boundary is found, the result is just ``[ident]``.
    """
    splits = [m.group(0) for m in _CAMEL_CASE_RE.finditer(ident)]
    if len(splits) <= 1:
        return [ident]
    splits.append(ident)
    return splits


def tokenize_code(fileName):
    """
    Open a file and return its expanded, tokenized version.

    Every line is split on non-alphanumeric characters; each resulting
    token is then expanded by camel-case splitting followed by snake-case
    splitting. Lines that contain no tokens are skipped.

    Returns a list with one list of token strings per non-empty line.
    """
    lines = []
    with open(fileName) as f:
        for line in f:
            # Materialize the tokens so the emptiness check below works;
            # a ``filter`` object is always truthy on Python 3.
            toks = [tok for tok in _NON_WORD_RE.split(line) if tok]
            if not toks:
                continue
            # Camel-case expansion first, then snake-case expansion of each
            # piece (including the original identifier kept by the first
            # pass).
            toks = [part for tok in toks for part in camel_case_split(tok)]
            toks = [part for tok in toks for part in snake_case_split(tok)]
            lines.append(toks)
    return lines


if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.exit('usage: %s FILE' % sys.argv[0])
    print('\n'.join(' '.join(line) for line in tokenize_code(sys.argv[1])))