Last active
August 29, 2015 13:56
-
-
Save airbob/9246248 to your computer and use it in GitHub Desktop.
Revisions
-
airbob revised this gist
Feb 27, 2014 . 1 changed file with 92 additions and 0 deletions.There are no files selected for viewing
#!/usr/bin/python
"""Sum per-user tweet statistics by month, day of the week and hour.

This is a modified version of week.py.
I raised this question in http://v2ex.com/t/102160
and, thanks to the v2ex fellows, the bottleneck turned out to be the 3 nested
for loops (which is quite a silly mistake).
With this version of the script, execution time has been reduced tremendously
to ~10-20 mins, which fits my need for now.
"""
import os
import sys
import csv
import re
import string
import time
import datetime

# The input rows carry only month and day; the data set is from 2012.
_YEAR = 2012
# Order matches datetime.date.weekday() (Monday == 0).
_WEEKDAYS = ("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")
# Output grid: one row per (month, weekday, hour) cell for every user.
_MONTHS = range(5, 13)
_HOURS = range(24)
_ROW_FORMAT = "%s\t%02d\t%s\t%02d\t%s\t%s\t%s\n"
_NO_DATA = (0, 0, 0)


def _add_record(totals, fields, line_no):
    """Add one split input row to *totals*, keyed by (month, weekday index, hour)."""
    if len(fields) < 7:
        raise ValueError("line %d: expected 7 fields, got %d"
                         % (line_no, len(fields)))
    month, day, hour, tw, qs, result = [int(value) for value in fields[1:7]]
    # date.weekday() is locale-independent, unlike strftime('%a').
    key = (month, datetime.date(_YEAR, month, day).weekday(), hour)
    sums = totals.setdefault(key, [0, 0, 0])
    sums[0] += tw
    sums[1] += qs
    sums[2] += result


def _format_user(user, totals):
    """Return the full month/weekday/hour grid for *user* as one string."""
    rows = []
    for month in _MONTHS:
        for day_index, day_name in enumerate(_WEEKDAYS):
            for hour in _HOURS:
                tw, qs, result = totals.get((month, day_index, hour), _NO_DATA)
                rows.append(_ROW_FORMAT % (user, month, day_name, hour,
                                           tw, qs, result))
    return "".join(rows)


def main(inputfilename="./test_refine/test.txt", outputfilename=None):
    """Aggregate the stats file and write one grid of sums per user.

    The first line of the input is copied to the output unchanged.  Every
    following non-blank line is split on whitespace into at least seven
    fields: user, month, day of month, hour, totalTW, totalQS, result.
    Consecutive lines with the same user form one group; for each group the
    three counters are summed per (month, day of week, hour) and a row is
    written for every cell of the grid months 5-12 x Mon-Sun x hours 0-23
    (cells without data are written as 0 0 0).  Records whose month or hour
    falls outside that grid are not written.

    inputfilename  -- path of the stats file to read.
    outputfilename -- path to write; defaults to "test_week/" plus the base
                      name of the input file (the directory must exist).

    Raises ValueError for a line with fewer than seven fields, a non-integer
    field or an invalid calendar date.
    """
    start_time = time.time()
    if outputfilename is None:
        outputfilename = "test_week/" + os.path.basename(inputfilename)
    print(outputfilename)

    # Stream the input instead of readlines(): the files hold ~28M lines.
    with open(inputfilename, 'r') as open_file, \
            open(outputfilename, 'w') as to_file:
        to_file.write(open_file.readline())

        current_user = None
        totals = {}
        for line_no, line in enumerate(open_file, 1):
            fields = line.split()
            if not fields:
                continue
            if fields[0] != current_user:
                # A new user starts here: flush the finished group first so
                # this row is counted only for the user it belongs to.
                if current_user is not None:
                    to_file.write(_format_user(current_user, totals))
                current_user = fields[0]
                totals = {}
                print(line_no)
            _add_record(totals, fields, line_no)
        # The last group has no following user to trigger the flush.
        if current_user is not None:
            to_file.write(_format_user(current_user, totals))

    print("%s seconds" % (time.time() - start_time))


if __name__ == "__main__":
    main()
airbob revised this gist
Feb 27, 2014 . 1 changed file with 2 additions and 2 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters Original file line number Diff line number Diff line change @@ -17,14 +17,14 @@ In total there are 28,000,000 lines in the file, and I have 6 files of this kind. Write a script to process the input data: <br> for each user, sum up the data (totalTW, totalQS, result) within the same month, same day of the week, and same hour. <br> Let's say there are lines like this (the year is 2012): <br> | userID | month |date |hour |totalTW|totalQs|result | | :-----------: |:-------:|:-----:|:-----:|:-----:|:-----:|:-----:| | 21535110 |05 |01 |02 |3 |2 | 1 | | 21535110 |05 |08 |02 |2 |1 | 0 | then these 2 data points should be summed, since they both fall on a Tuesday in May and the hour is 02: <br> | userID | month |day |hour |totalTW|totalQs|result | | :-----------: |:-------:|:-----:|:-----:|:-----:|:-----:|:-----:| -
airbob revised this gist
Feb 27, 2014 . 1 changed file with 1 addition and 1 deletion.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters Original file line number Diff line number Diff line change @@ -33,5 +33,5 @@ then these 2 data points should be summed to <br> ## Problem The week.py script I added in this gist works; the problem is that it seems too slow. <br> I ran it on a lab server for ~20 hours and it has so far processed 2,300,000 lines (only about 10%!) <br> Is there any way to optimize this script? <br> -
airbob revised this gist
Feb 27, 2014 . 1 changed file with 4 additions and 3 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters Original file line number Diff line number Diff line change @@ -1,4 +1,4 @@ ### Background Below is the input file format (*.txt): @@ -13,7 +13,7 @@ Below is the input file format (*.txt): In total there are 28,000,000 lines in the file, and I have 6 files of this kind. ### Objective Write a script to process the input data: <br> for each user, sum up the data (totalTW, totalQS, result) within the same month, same day of the week, and same hour. <br> Let's say: @@ -25,12 +25,13 @@ there are lines like this: <br> | 21535110 |05 |08 |02 |2 |1 | 0 | then these 2 data points should be summed to <br> | userID | month |day |hour |totalTW|totalQs|result | | :-----------: |:-------:|:-----:|:-----:|:-----:|:-----:|:-----:| | 21535110 |05 |Tue |02 |5 |3 | 1 | ## Problem The week.py script I added in this gist works; the problem is that it seems too slow. <br> I ran it on a lab server for ~20 hours and it has so far processed 2,300,000 lines <br> Is there any way to optimize this script? <br> -
airbob revised this gist
Feb 27, 2014 . 2 changed files with 11 additions and 6 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,3 +1,4 @@ == background == Below is input File format(*.txt): @@ -12,20 +13,24 @@ Below is input File format(*.txt): totally there are 28,000,000 lines in the file, and I have 6 this kind of files. ==object== write script to process the input data, to: <br> for each user, sum up the data (totalTW, totalQS, result) within same month, same day of the week, same hour. <br> lets say: there are lines like this: <br> | userID | month |date |hour |totalTW|totalQs|result | | :-----------: |:-------:|:-----:|:-----:|:-----:|:-----:|:-----:| | 21535110 |05 |01 |02 |3 |2 | 1 | | 21535110 |05 |08 |02 |2 |1 | 0 | then this 2 data points should sum to <br> | userID | month |day |hour |totalTW|totalQs|result | | :-----------: |:-------:|:-----:|:-----:|:-----:|:-----:|:-----:| | 21535110 |05 |Tue |02 |5 |3 | 1 | == Problem == the week.py script I added in this gist is working, the problem is, it seems too slow. <br> I used lab server to run it for ~20 hours and it is currently processing at 2,300,000 <br> Is there any way to optimize this script? <br> File renamed without changes. -
airbob renamed this gist
Feb 27, 2014 . 1 changed file with 0 additions and 0 deletions.There are no files selected for viewing
File renamed without changes. -
airbob revised this gist
Feb 27, 2014 . 1 changed file with 2 additions and 0 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,4 +1,6 @@ Below is input File format(*.txt): | userID | month |date |hour |totalTW|totalQs|result | | :-----------: |:-------:|:-----:|:-----:|:-----:|:-----:|:-----:| | 21535110 |05 |01 |02 |3 |2 | 1 | -
airbob renamed this gist
Feb 27, 2014 . 1 changed file with 0 additions and 0 deletions.There are no files selected for viewing
File renamed without changes. -
airbob revised this gist
Feb 27, 2014 . 3 changed files with 29 additions and 11 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,11 +0,0 @@ File renamed without changes.This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,29 @@ Below is input File format(*.txt): | userID | month |date |hour |totalTW|totalQs|result | | :-----------: |:-------:|:-----:|:-----:|:-----:|:-----:|:-----:| | 21535110 |05 |01 |02 |3 |2 | 1 | | 21535110 |05 |01 |03 |3 |2 | 1 | | 21535110 |05 |01 |06 |1 |0 | 0 | | 21535110 |05 |02 |02 |1 |0 | 0 | | 21535110 |05 |03 |05 |3 |2 | 0 | | 21535112 |05 |01 |05 |1 |1 | 1 | totally there are 28,000,000 lines in the file, and I have 6 this kind of files. **object** write script to process the input data, to: 1. for each user, sum up the data (totalTW, totalQS, result) within same month, same day of the week, same hour. lets say: there is line like this: | userID | month |date |hour |totalTW|totalQs|result | | :-----------: |:-------:|:-----:|:-----:|:-----:|:-----:|:-----:| | 21535110 |05 |01 |02 |3 |2 | 1 | | 21535110 |05 |08 |02 |2 |1 | 0 | then this 2 data points should sum to | userID | month |day |hour |totalTW|totalQs|result | | :-----------: |:-------:|:-----:|:-----:|:-----:|:-----:|:-----:| | 21535110 |05 |Tue |02 |5 |3 | 1 | -
airbob created this gist
Feb 27, 2014 .There are no files selected for viewing
#!/usr/bin/python
"""Weekday of each month.

For every user in a stats file, sum the counters of all records that fall in
the same month, on the same day of the week and in the same hour.

File format (*.txt), as shown in the README of this gist (the script itself
splits each line on whitespace and reads the first seven fields):

| userID        | month   |date   |hour   |totalTW|totalQs|how    |
| :-----------: |:-------:|:-----:|:-----:|:-----:|:-----:|:-----:|
| 21535110      |05       |01     |02     |3      |2      | 1     |
| 21535110      |05       |01     |03     |3      |2      | 1     |
| 21535110      |05       |01     |06     |1      |0      | 0     |
| 21535110      |05       |02     |02     |1      |0      | 0     |
| 21535110      |05       |03     |05     |3      |2      | 0     |
| 21535112      |05       |01     |05     |1      |1      | 1     |
"""
import os
import sys
import csv
import re
import string
import time
import datetime

# The records carry only month and day of month; the data set is from 2012.
_DATA_YEAR = 2012
# Index in this tuple equals datetime.date.weekday() (Monday == 0).
_DAY_NAMES = ("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")
_OUTPUT_LINE = "%s\t%02d\t%s\t%02d\t%s\t%s\t%s\n"


def _iter_user_groups(numbered_lines):
    """Yield (first_line_index, user, records) per run of same-user lines."""
    first_index = None
    user = None
    records = []
    for index, line in numbered_lines:
        fields = line.split()
        if not fields:
            continue
        if len(fields) < 7:
            raise ValueError("line %d: expected 7 fields, got %d"
                             % (index, len(fields)))
        if fields[0] != user:
            if records:
                yield first_index, user, records
            first_index, user, records = index, fields[0], []
        records.append(fields)
    if records:
        yield first_index, user, records


def _sum_by_slot(records):
    """Return {(month, weekday index, hour): [totalTW, totalQS, result]}."""
    sums = {}
    for fields in records:
        month, day, hour, tw, qs, result = [int(text) for text in fields[1:7]]
        weekday_index = datetime.date(_DATA_YEAR, month, day).weekday()
        slot = sums.setdefault((month, weekday_index, hour), [0, 0, 0])
        slot[0] += tw
        slot[1] += qs
        slot[2] += result
    return sums


def _render_user(user, sums):
    """Return the output lines for every month/weekday/hour cell of *user*."""
    lines = []
    for month in range(5, 13):
        for weekday_index, day in enumerate(_DAY_NAMES):
            for hour in range(0, 24):
                tw, qs, result = sums.get((month, weekday_index, hour),
                                          (0, 0, 0))
                lines.append(_OUTPUT_LINE % (user, month, day, hour,
                                             tw, qs, result))
    return lines


def main(inputfilename="input.txt", outputfilename="output.txt"):
    """Read the stats file and write the per-user weekday/hour sums.

    The first input line is copied to the output unchanged.  The remaining
    non-blank lines are grouped by consecutive user id; each record is
    parsed once and added to its (month, day of week, hour) cell, then one
    line is written for every cell of months 5-12 x Mon-Sun x hours 0-23,
    with 0 0 0 for cells that received no record.  Records outside that
    grid are not written.

    inputfilename  -- path of the stats file to read.
    outputfilename -- path of the file to write.

    Raises ValueError for a line with fewer than seven fields, a non-integer
    field or an invalid calendar date.
    """
    print(outputfilename)
    # Both files are closed on every exit path, including errors.
    with open(inputfilename, 'r') as open_file, \
            open(outputfilename, 'w') as to_file:
        to_file.write(open_file.readline())
        print("going to while loop")
        groups = _iter_user_groups(enumerate(open_file, 1))
        for first_index, user, records in groups:
            print(first_index)
            to_file.writelines(_render_user(user, _sum_by_slot(records)))


if __name__ == "__main__":
    main()