Skip to content

Instantly share code, notes, and snippets.

@jcrousse
Created September 12, 2020 13:56
Show Gist options
  • Save jcrousse/cfef485c101639ea27834a58e79a529c to your computer and use it in GitHub Desktop.
Save jcrousse/cfef485c101639ea27834a58e79a529c to your computer and use it in GitHub Desktop.

Revisions

  1. jcrousse created this gist Sep 12, 2020.
    40 changes: 40 additions & 0 deletions prefixspan
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,40 @@
    from prefixspan import PrefixSpan

    from data_sources.data_generator import ExamplesGenerator, get_multiple_patterns

    VOCAB_SIZE = 1000
    SEQ_LEN = 250
    multiple_patterns = get_multiple_patterns(10)

    NUM_EXAMPLES = 200
    MIN_FREQ = 25
    MIN_LEN = 5
    MIN_DIST = 3

    data_generator = ExamplesGenerator(seq_len=SEQ_LEN, vocab_size=VOCAB_SIZE, seed=111,
    multiple_patterns=multiple_patterns)

    data_sequences = [next(data_generator()) for _ in range(NUM_EXAMPLES)]
    positive_sequences = [s[0] for s in data_sequences if s[1] == 1]
    negative_sequences = [s[0] for s in data_sequences if s[1] == 0]

    positive_seq = PrefixSpan(positive_sequences).frequent(MIN_FREQ)
    long_seq = [s for s in positive_seq if len(s[1]) >= MIN_LEN]
    seq_by_freq = sorted(long_seq, key=lambda x: x[0], reverse=True)


    def distance_from_seqs(s, s_list: list):
    """return distance (in terms of number of different tokens) between the sequence s
    and the list of sequence s_list"""
    if not s_list:
    s_list = [[]]
    dist_per_seq = [len(set(s) - set(s2)) for s2 in s_list]
    return min(dist_per_seq)


    most_freq_seq = []
    for s in seq_by_freq:
    if distance_from_seqs(s[1], most_freq_seq) >= MIN_DIST:
    most_freq_seq.append(s[1])

    print(most_freq_seq[0:10])