airbob · August 29, 2015 13:56 · Feb 27, 2014 · Feb 27, 2014 · Feb 27, 2014 · Feb 27, 2014
diff --git a/week-modified.py b/week-modified.py
@@ -0,0 +1,92 @@
+#!/usr/bin/python
+import os
+import sys
+import csv
+import re
+import string 
+import time
+import datetime
+'''
+This is a modified version of week.py.
+I raised this question at http://v2ex.com/t/102160 and,
+thanks to the v2ex fellows, found that the bottleneck was mainly the 3 nested for loops (which was quite a silly mistake).
+With this version of the script, execution time has been reduced tremendously to ~10-20 minutes, which fits my needs for now.
+'''
+
+def main():
+    start_time = time.time()
+    weekday = ["Mon","Tue","Wed","Thu","Fri","Sat","Sun"]
+    inputfilename = "./test_refine/test.txt"
+    outputfilename = "test_week/" + inputfilename[14:]
+    print outputfilename 
+    open_file = open(inputfilename, 'r')
+    contents = open_file.readlines()
+    to_file = open(outputfilename, 'w')
+    i = 0
+    totalLines = len(contents)
+    totalLines = int(totalLines)
+    while i < totalLines: 
+        outputCONTENT = ""
+        print i
+        if ( i == totalLines-1):
+            print time.time()-start_time , "seconds"
+            return
+        if (i>0):
+            lineString = contents[i]
+            user = lineString.split()[0]
+            j = i
+            nextFlag = 1 
+            while (nextFlag == 1 and ( j < totalLines )):
+                tempString = contents[j]
+                user2 = tempString.split()[0]
+                if (user != user2):
+                    nextFlag = 0 
+                j = j + 1
+            markIndex = j   
+            ## do the main check 
+            totalTW = {} 
+            totalQS = {}
+            totalResult =  {} 
+            for z in range(i,markIndex):
+                tempString = contents[z]
+                tweetmonth = tempString.split()[1]
+                tweetday = tempString.split()[2]
+                tweethour = tempString.split()[3]
+                tweetTW = tempString.split()[4]
+                tweetQS = tempString.split()[5]
+                tweetResult = tempString.split()[6]
+                tweetdate = "%s-%s-%s"%("2012",tweetmonth,tweetday) 
+                dayOfWeek =  datetime.datetime.strptime(tweetdate, "%Y-%m-%d").strftime('%a')
+                key = "%s%s%s" % ( tweetmonth, dayOfWeek, tweethour)
+                if key in totalTW:
+                    totalTW[key]+=int(tweetTW)
+                    totalQS[key]+=int(tweetQS)
+                    totalResult[key]+=int(tweetResult)
+                else:
+                    totalTW[key]=int(tweetTW)
+                    totalQS[key]=int(tweetQS)
+                    totalResult[key]=int(tweetResult)
+            for month in range(5,13):
+                for day in weekday:
+                    for hour in range(0,24):
+                        key = "%02d%s%02d" % ( month, day, hour)
+                        if key in totalTW:
+                            lineoutput =  "%s\t%02d\t%s\t%02d\t%s\t%s\t%s\n" %(user,month,day,hour,totalTW[key],totalQS[key],totalResult[key])
+                            outputCONTENT = outputCONTENT + lineoutput
+                        else:
+                            lineoutput =  "%s\t%02d\t%s\t%02d\t%s\t%s\t%s\n" %(user,month,day,hour,0,0,0)
+                            outputCONTENT = outputCONTENT + lineoutput
+
+            ## to_file.write(lineoutput)
+            i = markIndex-1 
+        else:
+            ## to_file.write(contents[0]) 
+            outputCONTENT = outputCONTENT + contents[0] 
+            i = i + 1 
+
+        to_file.write(outputCONTENT) 
+    to_file.close()
+    open_file.close()
+
+if __name__ == "__main__":
+    main()
diff --git a/readme.md b/readme.md
@@ -17,14 +17,14 @@ totally there are 28,000,000 lines in the file, and I have 6 this kind of files.
 write script to process the input data, to: <br>
 for each user, sum up the data (totalTW, totalQS, result) within same month, same day of the week, same hour. <br>
 lets say: 
-there are lines like this: <br>
+there are lines like this (year is 2012): <br>
 
 | userID        | month   |date   |hour   |totalTW|totalQs|result |
 | :-----------: |:-------:|:-----:|:-----:|:-----:|:-----:|:-----:|
 | 21535110      |05       |01     |02     |3      |2      | 1     |
 | 21535110      |05       |08     |02     |2      |1      | 0     |
 
-then this 2 data points should sum to <br>
+then these 2 data points should be summed, since they both fall on a Tuesday in May and the hour is 02 <br>
 
 | userID        | month   |day   |hour   |totalTW|totalQs|result |
 | :-----------: |:-------:|:-----:|:-----:|:-----:|:-----:|:-----:|

diff --git a/readme.md b/readme.md
@@ -33,5 +33,5 @@ then this 2 data points should sum to <br>
 
 ## Problem 
 the week.py script I added in this gist is working, the problem is, it seems too slow.  <br>
-I used lab server to run it for ~20 hours and it is currently processing at 2,300,000 <br>
+I ran it on a lab server for ~20 hours and it has only reached line 2,300,000 (only about 10% of the file!) <br>
 Is there any way to optimize this script?  <br>
diff --git a/readme.md b/readme.md
@@ -1,4 +1,4 @@
-== background == 
+### background
 Below is input File format(*.txt):
 
 
@@ -13,7 +13,7 @@ Below is input File format(*.txt):
 
 totally there are 28,000,000 lines in the file, and I have 6 this kind of files.
 
-==object==
+### object
 write script to process the input data, to: <br>
 for each user, sum up the data (totalTW, totalQS, result) within same month, same day of the week, same hour. <br>
 lets say: 
@@ -25,12 +25,13 @@ there are lines like this: <br>
 | 21535110      |05       |08     |02     |2      |1      | 0     |
 
 then this 2 data points should sum to <br>
+
 | userID        | month   |day   |hour   |totalTW|totalQs|result |
 | :-----------: |:-------:|:-----:|:-----:|:-----:|:-----:|:-----:|
 | 21535110      |05       |Tue    |02     |5      |3      | 1     |
 
 
-== Problem ==
+## Problem 
 the week.py script I added in this gist is working, the problem is, it seems too slow.  <br>
 I used lab server to run it for ~20 hours and it is currently processing at 2,300,000 <br>
 Is there any way to optimize this script?  <br>
diff --git a/readme.md b/readme.md
@@ -1,3 +1,4 @@
+== background == 
 Below is input File format(*.txt):
 
 
@@ -12,20 +13,24 @@ Below is input File format(*.txt):
 
 totally there are 28,000,000 lines in the file, and I have 6 this kind of files.
 
-**object** 
-write script to process the input data, to:
-1. for each user, sum up the data (totalTW, totalQS, result) within same month, same day of the week, same hour.
+==object==
+write script to process the input data, to: <br>
+for each user, sum up the data (totalTW, totalQS, result) within same month, same day of the week, same hour. <br>
 lets say: 
-there is line like this:
+there are lines like this: <br>
+
 | userID        | month   |date   |hour   |totalTW|totalQs|result |
 | :-----------: |:-------:|:-----:|:-----:|:-----:|:-----:|:-----:|
 | 21535110      |05       |01     |02     |3      |2      | 1     |
 | 21535110      |05       |08     |02     |2      |1      | 0     |
 
-then this 2 data points should sum to 
+then this 2 data points should sum to <br>
 | userID        | month   |day   |hour   |totalTW|totalQs|result |
 | :-----------: |:-------:|:-----:|:-----:|:-----:|:-----:|:-----:|
 | 21535110      |05       |Tue    |02     |5      |3      | 1     |
 
 
-
+== Problem ==
+the week.py script I added in this gist is working, the problem is, it seems too slow.  <br>
+I used lab server to run it for ~20 hours and it is currently processing at 2,300,000 <br>
+Is there any way to optimize this script?  <br>
diff --git a/gistfile1.py → week.py b/gistfile1.py → week.py
diff --git a/input → readme.md b/input → readme.md
diff --git a/input b/input
@@ -1,4 +1,6 @@
 Below is input File format(*.txt):
+
+
 | userID        | month   |date   |hour   |totalTW|totalQs|result |
 | :-----------: |:-------:|:-----:|:-----:|:-----:|:-----:|:-----:|
 | 21535110      |05       |01     |02     |3      |2      | 1     |

diff --git a/input.txt → input b/input.txt → input
diff --git a/gistfile1.md b/gistfile1.md
@@ -1,11 +0,0 @@
-File format(*.txt):
-
-
-| userID        | month   |date   |hour   |totalTW|totalQs|how    |
-| :-----------: |:-------:|:-----:|:-----:|:-----:|:-----:|:-----:|
-| 21535110      |05       |01     |02     |3      |2      | 1     |
-| 21535110      |05       |01     |03     |3      |2      | 1     |
-| 21535110      |05       |01     |06     |1      |0      | 0     |
-| 21535110      |05       |02     |02     |1      |0      | 0     |
-| 21535110      |05       |03     |05     |3      |2      | 0     |
-| 21535112      |05       |01     |05     |1      |1      | 1     |

diff --git a/gistfile2.py → gistfile1.py b/gistfile2.py → gistfile1.py
diff --git a/input.txt b/input.txt
@@ -0,0 +1,29 @@
+Below is input File format(*.txt):
+| userID        | month   |date   |hour   |totalTW|totalQs|result |
+| :-----------: |:-------:|:-----:|:-----:|:-----:|:-----:|:-----:|
+| 21535110      |05       |01     |02     |3      |2      | 1     |
+| 21535110      |05       |01     |03     |3      |2      | 1     |
+| 21535110      |05       |01     |06     |1      |0      | 0     |
+| 21535110      |05       |02     |02     |1      |0      | 0     |
+| 21535110      |05       |03     |05     |3      |2      | 0     |
+| 21535112      |05       |01     |05     |1      |1      | 1     |
+
+totally there are 28,000,000 lines in the file, and I have 6 this kind of files.
+
+**object** 
+write script to process the input data, to:
+1. for each user, sum up the data (totalTW, totalQS, result) within same month, same day of the week, same hour.
+lets say: 
+there is line like this:
+| userID        | month   |date   |hour   |totalTW|totalQs|result |
+| :-----------: |:-------:|:-----:|:-----:|:-----:|:-----:|:-----:|
+| 21535110      |05       |01     |02     |3      |2      | 1     |
+| 21535110      |05       |08     |02     |2      |1      | 0     |
+
+then this 2 data points should sum to 
+| userID        | month   |day   |hour   |totalTW|totalQs|result |
+| :-----------: |:-------:|:-----:|:-----:|:-----:|:-----:|:-----:|
+| 21535110      |05       |Tue    |02     |5      |3      | 1     |
+
+
+
diff --git a/gistfile1.md b/gistfile1.md
@@ -0,0 +1,11 @@
+File format(*.txt):
+
+
+| userID        | month   |date   |hour   |totalTW|totalQs|how    |
+| :-----------: |:-------:|:-----:|:-----:|:-----:|:-----:|:-----:|
+| 21535110      |05       |01     |02     |3      |2      | 1     |
+| 21535110      |05       |01     |03     |3      |2      | 1     |
+| 21535110      |05       |01     |06     |1      |0      | 0     |
+| 21535110      |05       |02     |02     |1      |0      | 0     |
+| 21535110      |05       |03     |05     |3      |2      | 0     |
+| 21535112      |05       |01     |05     |1      |1      | 1     |
diff --git a/gistfile2.py b/gistfile2.py
@@ -0,0 +1,78 @@
+#!/usr/bin/python
+import os
+import sys
+import csv
+import re
+import string 
+import time
+import datetime
+'''
+weekday of each month
+'''
+
+def main():
+    """Sum each user's stats per (month, weekday, hour) into output.txt.
+
+    Reads input.txt fully into memory. Line 0 is copied to the output
+    unchanged. The remaining lines are handled in runs that share the same
+    first whitespace-separated field (the user). For every month 5-12,
+    weekday and hour combination the run is re-scanned and re-parsed, and
+    one tab-separated row with the three summed counters is emitted (zeros
+    when nothing matched).
+    """
+    weekday = ["Mon","Tue","Wed","Thu","Fri","Sat","Sun"]
+    ## read the whole input file into memory (no filtering happens here)
+    inputfilename = "input.txt"
+    outputfilename = "output.txt"
+    print outputfilename 
+    open_file = open(inputfilename, 'r')
+    contents = open_file.readlines()
+    to_file = open(outputfilename, 'w')
+    i = 0
+    totalLines = len(contents)
+    totalLines = int(totalLines)
+    print "going to while loop"
+    while i < totalLines: 
+        outputCONTENT = ""
+        print i
+        ## Stop once i sits on the last line.
+        ## NOTE(review): this return skips the close() calls at the bottom,
+        ## and a final user owning only that last line is never emitted.
+        if ( i == totalLines-1):
+            return
+        if (i>0):
+            lineString = contents[i]
+            user = lineString.split()[0]
+            ## Scan forward from i until a line with a different user shows up.
+            j = i
+            nextFlag = 1 
+            while (nextFlag == 1 and ( j < totalLines )):
+                tempString = contents[j]
+                user2 = tempString.split()[0]
+                if (user != user2):
+                    nextFlag = 0 
+                j = j + 1
+            ## NOTE(review): j is incremented even on the mismatching line, so
+            ## when a different user was found markIndex is one past that
+            ## user's first line and range(i, markIndex) below also counts
+            ## that line -- looks like an off-by-one; confirm.
+            markIndex = j   
+            ## One output row per month (5..12) x weekday x hour slot.
+            for month in range(5,13):
+                for day in weekday:
+                    for hour in range ( 0, 24):
+                        ## print "%s-%s-%s-%s" % (user,month, day , hour)
+                        totalTW = 0 
+                        totalQS = 0 
+                        totalResult =  0 
+                        ## Every slot re-scans and re-parses the whole run,
+                        ## including the strptime call for each line.
+                        for z in range(i,markIndex):
+                            tempString = contents[z]
+                            tweetmonth = tempString.split()[1]
+                            tweetday = tempString.split()[2]
+                            tweethour = tempString.split()[3]
+                            tweetTW = tempString.split()[4]
+                            tweetQS = tempString.split()[5]
+                            tweetResult = tempString.split()[6]
+                            ## the input has no year column; 2012 is hard-coded
+                            tweetdate = "%s-%s-%s"%("2012",tweetmonth,tweetday) 
+                            dayOfWeek =  datetime.datetime.strptime(tweetdate, "%Y-%m-%d").strftime('%a')
+                            ## "day in dayOfWeek" is a substring test between the
+                            ## list entry and the '%a' abbreviation
+                            if ( day in dayOfWeek and hour == int(tweethour) and month ==int(tweetmonth) ):
+                                totalTW += int(tweetTW)
+                                totalQS += int(tweetQS)
+                                totalResult += int(tweetResult) 
+                        lineoutput =  "%s\t%02d\t%s\t%02d\t%s\t%s\t%s\n" %(user,month,day,hour,totalTW,totalQS,totalResult)
+                        ## to_file.write(lineoutput)
+                        outputCONTENT = outputCONTENT + lineoutput
+            ## Continue from the line just before markIndex: the next user's
+            ## first line, or the last line of the file when the scan hit EOF.
+            i = markIndex-1 
+        else:
+            ## i == 0: pass the first line through unchanged
+            ## to_file.write(contents[0]) 
+            outputCONTENT = outputCONTENT + contents[0] 
+            i = i + 1 
+
+        to_file.write(outputCONTENT) 
+    to_file.close()
+    open_file.close()
+
+if __name__ == "__main__":
+    main()
No results found