@pabloem
Created August 23, 2021 18:38
README.md
## Tail At Scale

Code to demonstrate latency distributions in a software service
using different lookup algorithms.


**To set up the environment on Linux/Unix**:

    ```
    virtualenv venv
    . venv/bin/activate
    pip install -r requirements.txt
    ```

**To run:**

In one window:

    ```
    python search.py
    ```

In another window:

    ```
    python measure.py
    ```

Inspect `out.csv` to view and analyze the recorded latencies.
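
To summarize the distributions, here is a minimal sketch using only the standard library (it assumes `out.csv` was produced by `measure.py`, with one latency column per algorithm):

```
import csv
import statistics

# Collect the per-request latencies (in milliseconds) per algorithm.
latencies = {'binary': [], 'hash': [], 'list': []}
with open('out.csv') as f:
    for row in csv.DictReader(f):
        for name, values in latencies.items():
            values.append(float(row[name]))

# Report the median and an approximate 99th percentile per algorithm.
for name, values in latencies.items():
    p50 = statistics.median(values)
    p99 = statistics.quantiles(values, n=100)[98]
    print(f'{name}: p50={p50:.2f} ms  p99={p99:.2f} ms')
```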
measure.py
import argparse
import csv
import logging
import random

import requests


HOST = '127.0.0.1'
PORT = '5000'
PATHS = [
    'binary',
    'hash',
    'list'
]


def measure_distribution(word_list):
    """Query all three endpoints for each word, yielding latencies in ms."""
    req_count = 0
    for word in word_list:
        word_results = {}
        # Hit the three endpoints in a random order to avoid ordering bias.
        for path in random.sample(PATHS, 3):
            full_path = f'http://{HOST}:{PORT}/{path}/{word}'
            req_count += 1
            response = requests.get(full_path)
            # response.elapsed measures the time from sending the request
            # until the response headers arrive.
            word_results[path] = response.elapsed.total_seconds() * 1000

        if req_count and req_count % 1000 == 0:
            logging.info('Ran %s requests', req_count)

        yield word_results


def save_distribution(word_list, fname):
    """Write one CSV row per word, with a latency column per algorithm."""
    with open(fname, 'w') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=PATHS)
        writer.writeheader()

        for sample_row in measure_distribution(word_list):
            writer.writerow(sample_row)


def read_words(fname):
    """Load one word per line, keeping only the part before any '/'."""
    result = []
    logging.info('Loading directory from %s', fname)
    with open(fname, 'r') as f:
        for word in f:
            word = word.split('/')[0]
            word = word.strip()
            result.append(word)

    random.shuffle(result)
    return result


parser = argparse.ArgumentParser(description='Client to gather latency data')
parser.add_argument('--input', help='File with list of words',
                    default='zdict_corto.dic')
parser.add_argument('--output', help='CSV file to write latency data',
                    default='out.csv')


def run(args):
    logging.getLogger().setLevel('INFO')
    word_list = read_words(args.input)
    logging.info('Saving distribution over %d words', len(word_list))
    save_distribution(word_list, args.output)


if __name__ == '__main__':
    args = parser.parse_args()
    run(args)
search.py
import argparse
import json
import logging

from flask import Flask


parser = argparse.ArgumentParser(description='Server to demonstrate latency')
parser.add_argument('--dictionary', help='File with list of words',
                    default='zdict_largo.dic')


# The same word list is held twice: as a sorted list (for binary and
# linear search) and as a set (for hash lookup).
LIST_DIRECTORY = []
SET_DIRECTORY = set()


def _binary_search(directory, start, end, word):
    """Recursive binary search over directory[start:end] (end exclusive)."""
    spot = (start + end) // 2
    if directory[spot] == word:
        return True
    elif spot == start:
        return False
    elif directory[spot] > word:
        return _binary_search(directory, start, spot, word)
    else:
        assert directory[spot] < word
        return _binary_search(directory, spot, end, word)
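
# Illustrative alternative, not part of the original gist: the same
# membership test using the standard library's bisect module
# (binary_contains is a hypothetical helper name).
import bisect

def binary_contains(directory, word):
    # bisect_left returns the leftmost index where word could be
    # inserted while keeping directory sorted; the word is present
    # iff that slot exists and already holds it.
    i = bisect.bisect_left(directory, word)
    return i < len(directory) and directory[i] == word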


def _list_search(directory: list, word):
    # Linear scan: O(n) membership test.
    return word in directory


def _hash_search(directory: set, word):
    # Hash lookup: O(1) average-case membership test.
    return word in directory


def _load_dicts(fname):
    logging.info('Loading directory from %s', fname)
    with open(fname, 'r') as f:
        for word in f:
            # Keep only the word itself, dropping anything after '/'.
            word = word.split('/')[0]
            word = word.strip()
            LIST_DIRECTORY.append(word)
            SET_DIRECTORY.add(word)

    # Binary search requires the list to be sorted.
    LIST_DIRECTORY.sort()
    logging.info('Directory has %d/%d words. Sample:\n%s',
                 len(LIST_DIRECTORY), len(SET_DIRECTORY), LIST_DIRECTORY[0:10])


app = Flask(__name__)


@app.route("/binary/<word>")
def serve_binary(word):
    return json.dumps({
        'word': word,
        'found': _binary_search(LIST_DIRECTORY, 0, len(LIST_DIRECTORY), word)})


@app.route("/list/<word>")
def serve_list(word):
    return json.dumps({
        'word': word,
        'found': _list_search(LIST_DIRECTORY, word)})


@app.route("/hash/<word>")
def serve_hash(word):
    return json.dumps({
        'word': word,
        'found': _hash_search(SET_DIRECTORY, word)})


if __name__ == '__main__':
    logging.getLogger().setLevel('WARNING')
    logging.getLogger('werkzeug').setLevel('ERROR')
    args = parser.parse_args()
    _load_dicts(args.dictionary)
    app.run()
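
Once the server is running, the three endpoints can also be checked by hand. A minimal sketch using `requests` (the word `hola` is an arbitrary example):

```
import requests

# Query each lookup endpoint for the same word and print the JSON
# body along with the client-observed latency in milliseconds.
for path in ('binary', 'hash', 'list'):
    response = requests.get(f'http://127.0.0.1:5000/{path}/hola')
    print(path, response.json(), response.elapsed.total_seconds() * 1000)
```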

zdict_corto.dic: 49,569 lines (not shown; too large to display)
zdict_largo.dic: 466,550 lines (not shown; too large to display)