"""Character-level n-grams of a string (split by characters, not by words).

When run as a script, all command-line arguments are joined with single
spaces and used as the input string.  The one exception: if at least three
arguments are given and the first two are integers in strictly increasing
order, those two are taken as the minimum and maximum segment length and only
the remaining arguments form the input string.

The default minimum and maximum lengths are 2 and 60.
"""


def ngram(inp='', mn=2, mx=60):
    """Return every segment of ``inp`` that is ``mn`` to ``mx`` characters long.

    Segments are listed by start position and, for each start position, by
    increasing length.  Duplicates are kept.

    Args:
        inp: The string to cut into segments.
        mn: Minimum segment length (inclusive).  Negative values are treated
            as 0, because no segment can have a negative length.
        mx: Maximum segment length (inclusive).

    Returns:
        A list of substrings of ``inp``.  The list is empty when ``inp`` is
        shorter than ``mn`` or when ``mx`` is smaller than ``mn``.

    A single list comprehension is used on purpose: the original author
    measured it as faster than the equivalent nested ``for`` loops that
    ``append`` to a list.
    """
    # A negative minimum would turn into negative slice indices below and
    # silently produce wrong segments, so clamp it.
    mn = max(mn, 0)
    size = len(inp)
    return [
        inp[start:stop]
        # Last start position that still leaves room for ``mn`` characters.
        for start in range(size + 1 - mn)
        # Stop indices give lengths mn..mx, capped at the end of the string.
        for stop in range(start + mn, min(start + mx, size) + 1)
    ]


def main():
    """Command-line entry point: print the n-grams of the given arguments.

    Prints the list of segments, the number of segments, and the number of
    distinct segments.  Does nothing when no arguments are given.
    """
    import sys

    # Work on a copy so the process-wide ``sys.argv`` is never modified.
    args = sys.argv[1:]
    if not args:
        return

    mn, mx = 2, 60
    # Two leading numbers only count as lengths if text follows them.
    if len(args) >= 3:
        try:
            first, second = int(args[0]), int(args[1])
        except ValueError:
            # Not numbers: every argument is part of the input text.
            pass
        else:
            # Only a strictly increasing pair is read as (min, max);
            # anything else is treated as ordinary input text.
            if first < second:
                mn, mx = first, second
                args = args[2:]

    res = ngram(inp=" ".join(args), mn=mn, mx=mx)
    print(res)
    print('{} values.'.format(len(res)))
    print('{} unique values.'.format(len(set(res))))


if __name__ == '__main__':
    main()