Created
May 13, 2015 05:24
-
-
Save dchentech/72f34141a6e9c56695d5 to your computer and use it in GitHub Desktop.
Revisions
-
dchentech created this gist
May 13, 2015. There are no files selected for viewing
# -*-coding:utf-8-*-

# Local (non-Hadoop) benchmark script: reads one large input file line by
# line, counts the tokens of every line and sums the counts, optionally
# round-tripping each per-line counter through a serializer under test.
# Written for Python 2 (`iteritems`, `str.decode`).

from etl_utils import process_notifier
from collections import Counter, defaultdict
import ujson

# Presumably lets `eval` resolve JSON's `null` literal when JSON text is
# evaluated as a Python expression (see the variant noted in python_func).
null = None


def char_count(func):
    """Sum per-line token counts over the benchmark input file.

    Each line is decoded to unicode when it arrives as a byte string, split
    on single spaces and counted; the per-line counts are added into one
    running total.

    NOTE(review): despite the name, the active variant counts
    space-separated words; the per-character variant is kept below as a
    comment.

    :param func: serialize/deserialize round-trip to benchmark.
        NOTE(review): the call that applies it is commented out, so as
        written both driver runs at the bottom measure the same work.
    :return: ``defaultdict(int)`` mapping token -> total count. The original
        built this and discarded it; returning it is harmless to callers
        that ignore the value.
    """
    result = defaultdict(int)

    root = "/home/primary_user/tmp/"
    f1 = root + "redmine10050_final_merge_range.json"  # 139.5 MB
    # f1 = root + "en_exam_20150429.json"  # 1.1 GB
    # f1 = root + "redmine9523_final_report_four_weeks_before_range.json/16GB.txt"

    # `with` guarantees the handle is closed, including when a line fails to
    # decode; the original `file(f1)` handle was never closed explicitly.
    with open(f1) as source:
        for line in process_notifier(source):
            if isinstance(line, str):
                line = line.decode("UTF-8")

            d1 = Counter(line.split(" "))
            # Per-character variant:
            # d1 = dict(Counter(list(line)))

            # Enable to benchmark serialize and deserialize:
            # d1 = func(d1)

            for k2, v2 in d1.iteritems():
                result[k2] += v2

    return result


def python_func(d1):
    """Round-trip ``d1`` through ``repr`` and ``eval``."""
    # NOTE(review): `eval` is tolerable here only because the input is the
    # benchmark's own data; never apply this to untrusted text.
    # Decode-only variant: return eval((d1))
    return eval(repr(d1))


def ujson_func(d1):
    """Round-trip ``d1`` through ``ujson.dumps`` and ``ujson.loads``."""
    # Decode-only variant: return ujson.loads(d1)
    return ujson.loads(ujson.dumps(d1))


# Parenthesised single-argument form: prints the same text under the
# Python 2 print statement and also parses as a Python 3 call.
print("** python_func")
char_count(python_func)

print("** ujson_func")
char_count(ujson_func)

# 139.5 MB
# Python 3.3 MB/s
# ujson 3.7 MB/s

# 1.2 GB
# Python 1.5 MB/s

"""
####### decode
(ENV)[primary_user@BJ-NAMENODE-145 benchmark_mr_internal_data_interchange]$ python local_benchmark.py
** python_func
{pid: 19344, file: "/home/primary_user/tmp/redmine10050_final_merge_range.json", size: 139.5 MB}100% |##################################################################| 7.5 MB/s
** ujson_func
{pid: 19344, file: "/home/primary_user/tmp/redmine10050_final_merge_range.json", size: 139.5 MB}100% |#################################################################| 70.1 MB/s

####### word + decode + encode
(ENV)[primary_user@BJ-NAMENODE-145 benchmark_mr_internal_data_interchange]$ python local_benchmark.py
** python_func
{pid: 23318, file: "/home/primary_user/tmp/redmine10050_final_merge_range.json", size: 139.5 MB}100% |##################################################################| 9.6 MB/s
**
{pid: 23318, file: "/home/primary_user/tmp/redmine10050_final_merge_range.json", size: 139.5 MB}100% |#################################################################| 19.1 MB/s

####### char + decode + encode
(ENV)[primary_user@BJ-NAMENODE-145 benchmark_mr_internal_data_interchange]$ python local_benchmark.py
** python_func
{pid: 25309, file: "/home/primary_user/tmp/redmine10050_final_merge_range.json", size: 139.5 MB}100% |##################################################################| 3.5 MB/s
** ujson_func
{pid: 25309, file: "/home/primary_user/tmp/redmine10050_final_merge_range.json", size: 139.5 MB}100% |##################################################################| 3.7 MB/s

####### char
{pid:5204, file:/home/primary_user/tmp/redmine10050_final_merge_range.json, size:139.5 MB}100% |########################################################################| 3.8 MB/s

####### word
{pid:5846, file:/home/primary_user/tmp/redmine10050_final_merge_range.json, size:139.5 MB}100% |#######################################################################| 21.8 MB/s
"""
# -*-coding:utf-8-*-

from __future__ import print_function

from luiti import luigi, TargetUtils, MRUtils, defaultdict, TaskDayHadoop
luigi.plug_packages("ujson", "jsonpickle")

from collections import Counter


class LargeFile(luigi.ExternalTask):
    """External task whose output is the HDFS file used as benchmark input."""

    def output(self):
        # Alternative inputs used for the larger runs:
        #   "/primary/question_result/en_exam/en_exam_20150429.json"  # 1.1 GB
        #   "/primary/BI_report/afenti_stimulate_the_paying_customers_201504_report/english/2015-05-04/redmine9523_final_report_four_weeks_before_range.json"  # 16 GB
        input_path = "/primary/BI_report/redmine10050_afenti_experience_report/2015-05-03/redmine10050_final_merge_range.json"  # 133 MB
        return TargetUtils.hdfs(input_path)


class WordCountTemplate(TaskDayHadoop):
    """Counting job shared by the benchmark variants defined below.

    Subclasses set ``data_interchange_format``; presumably luiti reads it to
    pick the ``serialize`` / ``deserialize`` pair -- confirm against luiti.
    """

    root_dir = "/primary/experiments/benchmark_mr_internal_data_interchange"
    n_reduce_tasks = 30

    def requires(self):
        # Touch both attributes up front ("preload" in the original).
        self.serialize
        self.deserialize
        return LargeFile()

    def mapper(self, line):
        """Yield ``(distinct symbol count, {symbol: count})`` for one line."""
        text = line.decode("UTF-8") if isinstance(line, str) else line

        # Per-character counting; flagged as very CPU heavy by the author.
        # Word-level alternative: Counter(text.split(" "))
        char_counts = Counter(list(text))

        # Plain dict so the value is in JSON format.
        payload = dict(char_counts)

        # Keyed by the number of distinct symbols ("more partitioning").
        yield len(payload), payload

    def reducer(self, _, words_counters):
        """Add up the counters of one key and yield the dumped totals."""
        totals = defaultdict(int)
        for partial in words_counters:
            for token, occurrences in partial.iteritems():
                totals[token] += occurrences

        # Record which serializer pair produced this output.
        totals["serialize"] = str(self.serialize)
        totals["deserialize"] = str(self.deserialize)

        yield "", MRUtils.str_dump(totals)

"""
### Already warmed by run reading the input file.
mapreduce.task.io.sort.mb == 512MB
dfs.block.size == 128 MB
n_reduce_tasks = 30

CharCount + json
133 MB input file
Python 48s
JSON 50s

CharCount + json
1.1 GB input file
Python 2m:0s
JSON 1m:30s

CharCount + json
16 GB input file
Python 3m:17s
JSON 7m:13s

********************************

CharCount + ujson
133 MB input file
Python 52s
JSON 51s

CharCount + ujson
1.1 GB input file
Python 2m:2s
JSON 1m:18s

CharCount + ujson
16 GB input file
Python 3m:13s
JSON 2m:42s

********************************

WordCount
133 MB input file
Python 30s
JSON 30s

WordCount
1.1 GB input file
Python 1m:32s
JSON 48s

WordCount
16 GB input file
Python 2m:22s
JSON 1m:21s
"""


class JsonBenchmarkDay(WordCountTemplate):
    """Benchmark variant with the "json" interchange format."""

    data_interchange_format = "json"


class PythonBenchmarkDay(WordCountTemplate):
    """Benchmark variant with the "python" interchange format."""

    data_interchange_format = "python"