@dchentech
Created May 13, 2015 05:24
local_benchmark.py
# -*- coding: utf-8 -*-

from etl_utils import process_notifier
from collections import Counter, defaultdict
import ujson
null = None


def char_count(func):
    # Count tokens per line; toggle the d1 variants below to switch between
    # word and char counting, with or without the serialize/deserialize
    # round trip through func.
    result = defaultdict(int)
    root = "/home/primary_user/tmp/"
    f1 = root + "redmine10050_final_merge_range.json"  # 139.5 MB
    # f1 = root + "en_exam_20150429.json"  # 1.1 GB
    # f1 = root + "redmine9523_final_report_four_weeks_before_range.json/16GB.txt"
    for line in process_notifier(file(f1)):
        if isinstance(line, str):
            line = line.decode("UTF-8")
        d1 = line
        d1 = Counter(line.split(" "))
        # d1 = dict(Counter(list(line)))
        # d1 = func(d1)  # benchmark serialize and deserialize
        for k2, v2 in d1.iteritems():
            result[k2] += v2


def python_func(d1):
    return eval(repr(d1))
    # return eval((d1))


def ujson_func(d1):
    return ujson.loads(ujson.dumps(d1))
    # return ujson.loads(d1)


    print "** python_func"
    char_count(python_func)

    print "** ujson_func"
    char_count(ujson_func)


# 139.5 MB input file
# Python 3.3 MB/s
# ujson 3.7 MB/s

# 1.2 GB input file
# Python 1.5 MB/s

    """
    ####### decode
    (ENV)[primary_user@BJ-NAMENODE-145 benchmark_mr_internal_data_interchange]$ python local_benchmark.py
    ** python_func
    {pid: 19344, file: "/home/primary_user/tmp/redmine10050_final_merge_range.json", size: 139.5 MB}100% |##################################################################| 7.5 MB/s
    ** ujson_func
    {pid: 19344, file: "/home/primary_user/tmp/redmine10050_final_merge_range.json", size: 139.5 MB}100% |#################################################################| 70.1 MB/s
    ####### word + decode + encode
    (ENV)[primary_user@BJ-NAMENODE-145 benchmark_mr_internal_data_interchange]$ python local_benchmark.py
    ** python_func
    {pid: 23318, file: "/home/primary_user/tmp/redmine10050_final_merge_range.json", size: 139.5 MB}100% |##################################################################| 9.6 MB/s
    **
    {pid: 23318, file: "/home/primary_user/tmp/redmine10050_final_merge_range.json", size: 139.5 MB}100% |#################################################################| 19.1 MB/s
    ####### char + decode + encode
    (ENV)[primary_user@BJ-NAMENODE-145 benchmark_mr_internal_data_interchange]$ python local_benchmark.py
    ** python_func
    {pid: 25309, file: "/home/primary_user/tmp/redmine10050_final_merge_range.json", size: 139.5 MB}100% |##################################################################| 3.5 MB/s
    ** ujson_func
    {pid: 25309, file: "/home/primary_user/tmp/redmine10050_final_merge_range.json", size: 139.5 MB}100% |##################################################################| 3.7 MB/s
    ####### char
    {pid:5204, file:/home/primary_user/tmp/redmine10050_final_merge_range.json, size:139.5 MB}100% |########################################################################| 3.8 MB/s
    ####### word
    {pid:5846, file:/home/primary_user/tmp/redmine10050_final_merge_range.json, size:139.5 MB}100% |#######################################################################| 21.8 MB/s
    """
mapreduce_benchmark.py
# -*- coding: utf-8 -*-

from __future__ import print_function

from luiti import luigi, TargetUtils, MRUtils, defaultdict, TaskDayHadoop
luigi.plug_packages("ujson", "jsonpickle")
from collections import Counter

class LargeFile(luigi.ExternalTask):

    def output(self):
        return TargetUtils.hdfs("/primary/BI_report/redmine10050_afenti_experience_report/2015-05-03/redmine10050_final_merge_range.json")  # 133 MB
        # return TargetUtils.hdfs("/primary/question_result/en_exam/en_exam_20150429.json")  # 1.1 GB
        # return TargetUtils.hdfs("/primary/BI_report/afenti_stimulate_the_paying_customers_201504_report/english/2015-05-04/redmine9523_final_report_four_weeks_before_range.json")  # 16 GB


class WordCountTemplate(TaskDayHadoop):

    root_dir = "/primary/experiments/benchmark_mr_internal_data_interchange"
    n_reduce_tasks = 30

    def requires(self):
        self.serialize    # preload
        self.deserialize  # preload
        return LargeFile()

    def mapper(self, line):
        if isinstance(line, str):
            line = line.decode("UTF-8")
        d1 = Counter(list(line))  # char count; very CPU heavy!
        # d1 = Counter(line.split(" "))  # word count
        d2 = dict(d1)  # plain dict so it serializes cleanly as JSON
        _ = len(d2)  # key by dict size to spread output across reducers
        yield _, d2

    def reducer(self, _, words_counters):
        result = defaultdict(int)
        for words_counter in words_counters:
            for word, count in words_counter.iteritems():
                result[word] += count
        result["serialize"] = str(self.serialize)
        result["deserialize"] = str(self.deserialize)
        yield "", MRUtils.str_dump(result)


    """
    ### Already warmed by run reading the input file.
    mapreduce.task.io.sort.mb == 512MB
    dfs.block.size == 128 MB
    n_reduce_tasks = 30
    CharCount + json 133 MB input file
    Python 48s
    JSON 50s
    CharCount + json 1.1 GB input file
    Python 2m:0s
    JSON 1m:30s
    CharCount + json 16 GB input file
    Python 3m:17s
    JSON 7m:13s
    ********************************
    CharCount + ujson 133 MB input file
    Python 52s
    JSON 51s
    CharCount + ujson 1.1 GB input file
    Python 2m:2s
    JSON 1m:18s
    CharCount + ujson 16 GB input file
    Python 3m:13s
    JSON 2m:42s
    ********************************
    WordCount 133 MB input file
    Python 30s
    JSON 30s
    WordCount 1.1 GB input file
    Python 1m:32s
    JSON 48s
    WordCount 16 GB input file
    Python 2m:22s
    JSON 1m:21s
    """

class JsonBenchmarkDay(WordCountTemplate):

    data_interchange_format = "json"


class PythonBenchmarkDay(WordCountTemplate):

    data_interchange_format = "python"