Created
September 12, 2020 13:56
-
-
Save jcrousse/cfef485c101639ea27834a58e79a529c to your computer and use it in GitHub Desktop.
Revisions
-
jcrousse created this gist
Sep 12, 2020 .There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,40 @@ from prefixspan import PrefixSpan from data_sources.data_generator import ExamplesGenerator, get_multiple_patterns VOCAB_SIZE = 1000 SEQ_LEN = 250 multiple_patterns = get_multiple_patterns(10) NUM_EXAMPLES = 200 MIN_FREQ = 25 MIN_LEN = 5 MIN_DIST = 3 data_generator = ExamplesGenerator(seq_len=SEQ_LEN, vocab_size=VOCAB_SIZE, seed=111, multiple_patterns=multiple_patterns) data_sequences = [next(data_generator()) for _ in range(NUM_EXAMPLES)] positive_sequences = [s[0] for s in data_sequences if s[1] == 1] negative_sequences = [s[0] for s in data_sequences if s[1] == 0] positive_seq = PrefixSpan(positive_sequences).frequent(MIN_FREQ) long_seq = [s for s in positive_seq if len(s[1]) >= MIN_LEN] seq_by_freq = sorted(long_seq, key=lambda x: x[0], reverse=True) def distance_from_seqs(s, s_list: list): """return distance (in terms of number of different tokens) between the sequence s and the list of sequence s_list""" if not s_list: s_list = [[]] dist_per_seq = [len(set(s) - set(s2)) for s2 in s_list] return min(dist_per_seq) most_freq_seq = [] for s in seq_by_freq: if distance_from_seqs(s[1], most_freq_seq) >= MIN_DIST: most_freq_seq.append(s[1]) print(most_freq_seq[0:10])