@dchentech
Created May 13, 2015 05:24
local_benchmark.py
# -*- coding: utf-8 -*-

from etl_utils import process_notifier
from collections import Counter, defaultdict
import ujson
null = None


def char_count(func):
    # Count tokens per line; toggle the d1 variants below to switch between
    # word and char counting, with or without the serialize/deserialize
    # round trip through func.
    result = defaultdict(int)
    root = "/home/primary_user/tmp/"
    f1 = root + "redmine10050_final_merge_range.json"  # 139.5 MB
    # f1 = root + "en_exam_20150429.json"  # 1.1 GB
    # f1 = root + "redmine9523_final_report_four_weeks_before_range.json/16GB.txt"
    for line in process_notifier(file(f1)):
        if isinstance(line, str):
            line = line.decode("UTF-8")
        d1 = line
        d1 = Counter(line.split(" "))
        # d1 = dict(Counter(list(line)))
        # d1 = func(d1)  # benchmark serialize and deserialize
        for k2, v2 in d1.iteritems():
            result[k2] += v2


def python_func(d1):
    return eval(repr(d1))
    # return eval((d1))


def ujson_func(d1):
    return ujson.loads(ujson.dumps(d1))
    # return ujson.loads(d1)


    print "** python_func"
    char_count(python_func)

    print "** ujson_func"
    char_count(ujson_func)


# 139.5 MB input file
# Python 3.3 MB/s
# ujson 3.7 MB/s

# 1.2 GB input file
# Python 1.5 MB/s

    """
    ####### decode
    (ENV)[primary_user@BJ-NAMENODE-145 benchmark_mr_internal_data_interchange]$ python local_benchmark.py
    ** python_func
    {pid: 19344, file: "/home/primary_user/tmp/redmine10050_final_merge_range.json", size: 139.5 MB}100% |##################################################################| 7.5 MB/s
    ** ujson_func
    {pid: 19344, file: "/home/primary_user/tmp/redmine10050_final_merge_range.json", size: 139.5 MB}100% |#################################################################| 70.1 MB/s
    ####### word + decode + encode
    (ENV)[primary_user@BJ-NAMENODE-145 benchmark_mr_internal_data_interchange]$ python local_benchmark.py
    ** python_func
    {pid: 23318, file: "/home/primary_user/tmp/redmine10050_final_merge_range.json", size: 139.5 MB}100% |##################################################################| 9.6 MB/s
    **
    {pid: 23318, file: "/home/primary_user/tmp/redmine10050_final_merge_range.json", size: 139.5 MB}100% |#################################################################| 19.1 MB/s
    ####### char + decode + encode
    (ENV)[primary_user@BJ-NAMENODE-145 benchmark_mr_internal_data_interchange]$ python local_benchmark.py
    ** python_func
    {pid: 25309, file: "/home/primary_user/tmp/redmine10050_final_merge_range.json", size: 139.5 MB}100% |##################################################################| 3.5 MB/s
    ** ujson_func
    {pid: 25309, file: "/home/primary_user/tmp/redmine10050_final_merge_range.json", size: 139.5 MB}100% |##################################################################| 3.7 MB/s
    ####### char
    {pid:5204, file:/home/primary_user/tmp/redmine10050_final_merge_range.json, size:139.5 MB}100% |########################################################################| 3.8 MB/s
    ####### word
    {pid:5846, file:/home/primary_user/tmp/redmine10050_final_merge_range.json, size:139.5 MB}100% |#######################################################################| 21.8 MB/s
    """
mapreduce_benchmark.py
# -*- coding: utf-8 -*-

from __future__ import print_function

from luiti import luigi, TargetUtils, MRUtils, defaultdict, TaskDayHadoop
luigi.plug_packages("ujson", "jsonpickle")
from collections import Counter

class LargeFile(luigi.ExternalTask):

    def output(self):
        return TargetUtils.hdfs("/primary/BI_report/redmine10050_afenti_experience_report/2015-05-03/redmine10050_final_merge_range.json")  # 133 MB
        # return TargetUtils.hdfs("/primary/question_result/en_exam/en_exam_20150429.json")  # 1.1 GB
        # return TargetUtils.hdfs("/primary/BI_report/afenti_stimulate_the_paying_customers_201504_report/english/2015-05-04/redmine9523_final_report_four_weeks_before_range.json")  # 16 GB


class WordCountTemplate(TaskDayHadoop):

    root_dir = "/primary/experiments/benchmark_mr_internal_data_interchange"
    n_reduce_tasks = 30

    def requires(self):
        self.serialize    # preload
        self.deserialize  # preload
        return LargeFile()

    def mapper(self, line):
        if isinstance(line, str):
            line = line.decode("UTF-8")
        d1 = Counter(list(line))  # char count; very CPU heavy!
        # d1 = Counter(line.split(" "))  # word count
        d2 = dict(d1)  # plain dict so it serializes cleanly as JSON
        _ = len(d2)  # key by dict size to spread output across reducers
        yield _, d2

    def reducer(self, _, words_counters):
        result = defaultdict(int)
        for words_counter in words_counters:
            for word, count in words_counter.iteritems():
                result[word] += count
        result["serialize"] = str(self.serialize)
        result["deserialize"] = str(self.deserialize)
        yield "", MRUtils.str_dump(result)


    """
    ### Already warmed by run reading the input file.
    mapreduce.task.io.sort.mb == 512MB
    dfs.block.size == 128 MB
    n_reduce_tasks = 30
    CharCount + json 133 MB input file
    Python 48s
    JSON 50s
    CharCount + json 1.1 GB input file
    Python 2m:0s
    JSON 1m:30s
    CharCount + json 16 GB input file
    Python 3m:17s
    JSON 7m:13s
    ********************************
    CharCount + ujson 133 MB input file
    Python 52s
    JSON 51s
    CharCount + ujson 1.1 GB input file
    Python 2m:2s
    JSON 1m:18s
    CharCount + ujson 16 GB input file
    Python 3m:13s
    JSON 2m:42s
    ********************************
    WordCount 133 MB input file
    Python 30s
    JSON 30s
    WordCount 1.1 GB input file
    Python 1m:32s
    JSON 48s
    WordCount 16 GB input file
    Python 2m:22s
    JSON 1m:21s
    """

class JsonBenchmarkDay(WordCountTemplate):

    data_interchange_format = "json"


class PythonBenchmarkDay(WordCountTemplate):

    data_interchange_format = "python"