Skip to content

Instantly share code, notes, and snippets.

@airbob
Last active August 29, 2015 13:56
Show Gist options
  • Select an option

  • Save airbob/9246248 to your computer and use it in GitHub Desktop.

Select an option

Save airbob/9246248 to your computer and use it in GitHub Desktop.

Revisions

  1. airbob revised this gist Feb 27, 2014. 1 changed file with 92 additions and 0 deletions.
    92 changes: 92 additions & 0 deletions week-modified.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,92 @@
    #!/usr/bin/python
    import os
    import sys
    import csv
    import re
    import string
    import time
    import datetime
    '''
    This is a modified version of week.py.
    I raised this question at http://v2ex.com/t/102160 and,
    thanks to the v2ex fellows, learned that the bottleneck was mainly the 3 nested for loops (quite a silly mistake).
    With this version of the script, execution time has been reduced tremendously to ~10-20 mins, which fits my needs for now.
    '''

    def _add_row(totals, fields, year):
        """Add one parsed input row to the per-user running totals.

        ``fields`` is the whitespace-split row:
        user, month, day-of-month, hour, totalTW, totalQS, result.
        ``totals`` maps ``(month, weekday_index, hour)`` to a three-item list
        ``[totalTW, totalQS, result]``; weekday_index is 0 for Monday.

        Raises IndexError if the row has fewer than seven fields and
        ValueError if a field is not an integer or the date does not exist.
        """
        month = int(fields[1])
        hour = int(fields[3])
        # date.weekday() is locale-independent, unlike strftime('%a').
        day_index = datetime.date(year, month, int(fields[2])).weekday()
        sums = totals.setdefault((month, day_index, hour), [0, 0, 0])
        sums[0] += int(fields[4])
        sums[1] += int(fields[5])
        sums[2] += int(fields[6])


    def _format_user_grid(user, totals, weekday):
        """Return the output text for one user.

        One tab-separated line is produced for every month 5..12, every
        weekday name in ``weekday`` and every hour 0..23; combinations with no
        data are reported as zeros. Totals for months outside 5..12 are not
        emitted.
        """
        rows = []
        for month in range(5, 13):
            for day_index, day_name in enumerate(weekday):
                for hour in range(0, 24):
                    tw, qs, result = totals.get((month, day_index, hour), (0, 0, 0))
                    rows.append("%s\t%02d\t%s\t%02d\t%s\t%s\t%s\n"
                                % (user, month, day_name, hour, tw, qs, result))
        return "".join(rows)


    def main(inputfilename="./test_refine/test.txt", outputfilename=None, year=2012):
        """Sum each user's statistics per month, day of week and hour.

        The input file starts with a header line, followed by
        whitespace-separated rows:
        user, month, day-of-month, hour, totalTW, totalQS, result.
        Rows belonging to one user are expected to be contiguous; every run of
        consecutive rows with the same user id is reported as one group.

        The header is copied to the output unchanged, followed by a fixed
        8 months x 7 weekdays x 24 hours grid of lines per user.

        inputfilename  -- path of the file to read.
        outputfilename -- path of the file to write; defaults to
                          "test_week/" plus the base name of inputfilename.
        year           -- calendar year used to turn month/day into a weekday.

        Prints the output file name and the elapsed time in seconds.
        """
        start_time = time.time()
        weekday = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
        if outputfilename is None:
            # Use the base name so the result does not depend on the length
            # of the input directory name.
            outputfilename = "test_week/" + os.path.basename(inputfilename)
        print(outputfilename)

        # Stream the input line by line; both files are closed on every exit
        # path, including errors.
        with open(inputfilename, 'r') as open_file:
            with open(outputfilename, 'w') as to_file:
                # The first line is the header and is passed through as is.
                to_file.write(open_file.readline())

                user = None
                totals = {}
                for line in open_file:
                    fields = line.split()
                    if not fields:
                        # Ignore blank lines (e.g. a trailing newline).
                        continue
                    if fields[0] != user:
                        # A new user starts here: flush the previous one.
                        if user is not None:
                            to_file.write(_format_user_grid(user, totals, weekday))
                        user = fields[0]
                        totals = {}
                    _add_row(totals, fields, year)

                # Flush the last user, which has no following row to trigger it.
                if user is not None:
                    to_file.write(_format_user_grid(user, totals, weekday))

        print("%s seconds" % (time.time() - start_time))

    if __name__ == "__main__":
    main()
  2. airbob revised this gist Feb 27, 2014. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions readme.md
    Original file line number Diff line number Diff line change
    @@ -17,14 +17,14 @@ totally there are 28,000,000 lines in the file, and I have 6 this kind of files.
    write script to process the input data, to: <br>
    for each user, sum up the data (totalTW, totalQS, result) within same month, same day of the week, same hour. <br>
    lets say:
    there are lines like this: <br>
    there are lines like this(year is 2012): <br>

    | userID | month |date |hour |totalTW|totalQs|result |
    | :-----------: |:-------:|:-----:|:-----:|:-----:|:-----:|:-----:|
    | 21535110 |05 |01 |02 |3 |2 | 1 |
    | 21535110 |05 |08 |02 |2 |1 | 0 |

    then this 2 data points should sum to <br>
    then these 2 data points should be summed, since both fall on a Tuesday in May at hour 02 <br>

    | userID | month |day |hour |totalTW|totalQs|result |
    | :-----------: |:-------:|:-----:|:-----:|:-----:|:-----:|:-----:|
  3. airbob revised this gist Feb 27, 2014. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion readme.md
    Original file line number Diff line number Diff line change
    @@ -33,5 +33,5 @@ then this 2 data points should sum to <br>

    ## Problem
    the week.py script I added in this gist is working, the problem is, it seems too slow. <br>
    I used lab server to run it for ~20 hours and it is currently processing at 2,300,000 <br>
    I ran it on a lab server for ~20 hours and it has only reached line 2,300,000 so far (only about 10%!) <br>
    Is there any way to optimize this script? <br>
  4. airbob revised this gist Feb 27, 2014. 1 changed file with 4 additions and 3 deletions.
    7 changes: 4 additions & 3 deletions readme.md
    Original file line number Diff line number Diff line change
    @@ -1,4 +1,4 @@
    == background ==
    ### background
    Below is input File format(*.txt):


    @@ -13,7 +13,7 @@ Below is input File format(*.txt):

    In total there are 28,000,000 lines in the file, and I have 6 files of this kind.

    ==object==
    ### object
    write script to process the input data, to: <br>
    for each user, sum up the data (totalTW, totalQS, result) within same month, same day of the week, same hour. <br>
    lets say:
    @@ -25,12 +25,13 @@ there are lines like this: <br>
    | 21535110 |05 |08 |02 |2 |1 | 0 |

    then this 2 data points should sum to <br>

    | userID | month |day |hour |totalTW|totalQs|result |
    | :-----------: |:-------:|:-----:|:-----:|:-----:|:-----:|:-----:|
    | 21535110 |05 |Tue |02 |5 |3 | 1 |


    == Problem ==
    ## Problem
    the week.py script I added in this gist is working, the problem is, it seems too slow. <br>
    I used lab server to run it for ~20 hours and it is currently processing at 2,300,000 <br>
    Is there any way to optimize this script? <br>
  5. airbob revised this gist Feb 27, 2014. 2 changed files with 11 additions and 6 deletions.
    17 changes: 11 additions & 6 deletions readme.md
    Original file line number Diff line number Diff line change
    @@ -1,3 +1,4 @@
    == background ==
    Below is input File format(*.txt):


    @@ -12,20 +13,24 @@ Below is input File format(*.txt):

    In total there are 28,000,000 lines in the file, and I have 6 files of this kind.

    **object**
    write script to process the input data, to:
    1. for each user, sum up the data (totalTW, totalQS, result) within same month, same day of the week, same hour.
    ==object==
    write script to process the input data, to: <br>
    for each user, sum up the data (totalTW, totalQS, result) within same month, same day of the week, same hour. <br>
    lets say:
    there is line like this:
    there are lines like this: <br>

    | userID | month |date |hour |totalTW|totalQs|result |
    | :-----------: |:-------:|:-----:|:-----:|:-----:|:-----:|:-----:|
    | 21535110 |05 |01 |02 |3 |2 | 1 |
    | 21535110 |05 |08 |02 |2 |1 | 0 |

    then this 2 data points should sum to
    then this 2 data points should sum to <br>
    | userID | month |day |hour |totalTW|totalQs|result |
    | :-----------: |:-------:|:-----:|:-----:|:-----:|:-----:|:-----:|
    | 21535110 |05 |Tue |02 |5 |3 | 1 |



    == Problem ==
    the week.py script I added in this gist is working, the problem is, it seems too slow. <br>
    I used lab server to run it for ~20 hours and it is currently processing at 2,300,000 <br>
    Is there any way to optimize this script? <br>
    File renamed without changes.
  6. airbob renamed this gist Feb 27, 2014. 1 changed file with 0 additions and 0 deletions.
    File renamed without changes.
  7. airbob revised this gist Feb 27, 2014. 1 changed file with 2 additions and 0 deletions.
    2 changes: 2 additions & 0 deletions input
    Original file line number Diff line number Diff line change
    @@ -1,4 +1,6 @@
    Below is input File format(*.txt):


    | userID | month |date |hour |totalTW|totalQs|result |
    | :-----------: |:-------:|:-----:|:-----:|:-----:|:-----:|:-----:|
    | 21535110 |05 |01 |02 |3 |2 | 1 |
  8. airbob renamed this gist Feb 27, 2014. 1 changed file with 0 additions and 0 deletions.
    File renamed without changes.
  9. airbob revised this gist Feb 27, 2014. 3 changed files with 29 additions and 11 deletions.
    11 changes: 0 additions & 11 deletions gistfile1.md
    Original file line number Diff line number Diff line change
    @@ -1,11 +0,0 @@
    File format(*.txt):


    | userID | month |date |hour |totalTW|totalQs|how |
    | :-----------: |:-------:|:-----:|:-----:|:-----:|:-----:|:-----:|
    | 21535110 |05 |01 |02 |3 |2 | 1 |
    | 21535110 |05 |01 |03 |3 |2 | 1 |
    | 21535110 |05 |01 |06 |1 |0 | 0 |
    | 21535110 |05 |02 |02 |1 |0 | 0 |
    | 21535110 |05 |03 |05 |3 |2 | 0 |
    | 21535112 |05 |01 |05 |1 |1 | 1 |
    File renamed without changes.
    29 changes: 29 additions & 0 deletions input.txt
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,29 @@
    Below is input File format(*.txt):
    | userID | month |date |hour |totalTW|totalQs|result |
    | :-----------: |:-------:|:-----:|:-----:|:-----:|:-----:|:-----:|
    | 21535110 |05 |01 |02 |3 |2 | 1 |
    | 21535110 |05 |01 |03 |3 |2 | 1 |
    | 21535110 |05 |01 |06 |1 |0 | 0 |
    | 21535110 |05 |02 |02 |1 |0 | 0 |
    | 21535110 |05 |03 |05 |3 |2 | 0 |
    | 21535112 |05 |01 |05 |1 |1 | 1 |

    In total there are 28,000,000 lines in the file, and I have 6 files of this kind.

    **object**
    write script to process the input data, to:
    1. for each user, sum up the data (totalTW, totalQS, result) within same month, same day of the week, same hour.
    lets say:
    there is line like this:
    | userID | month |date |hour |totalTW|totalQs|result |
    | :-----------: |:-------:|:-----:|:-----:|:-----:|:-----:|:-----:|
    | 21535110 |05 |01 |02 |3 |2 | 1 |
    | 21535110 |05 |08 |02 |2 |1 | 0 |

    then this 2 data points should sum to
    | userID | month |day |hour |totalTW|totalQs|result |
    | :-----------: |:-------:|:-----:|:-----:|:-----:|:-----:|:-----:|
    | 21535110 |05 |Tue |02 |5 |3 | 1 |



  10. airbob created this gist Feb 27, 2014.
    11 changes: 11 additions & 0 deletions gistfile1.md
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,11 @@
    File format(*.txt):


    | userID | month |date |hour |totalTW|totalQs|how |
    | :-----------: |:-------:|:-----:|:-----:|:-----:|:-----:|:-----:|
    | 21535110 |05 |01 |02 |3 |2 | 1 |
    | 21535110 |05 |01 |03 |3 |2 | 1 |
    | 21535110 |05 |01 |06 |1 |0 | 0 |
    | 21535110 |05 |02 |02 |1 |0 | 0 |
    | 21535110 |05 |03 |05 |3 |2 | 0 |
    | 21535112 |05 |01 |05 |1 |1 | 1 |
    78 changes: 78 additions & 0 deletions gistfile2.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,78 @@
    #!/usr/bin/python
    import os
    import sys
    import csv
    import re
    import string
    import time
    import datetime
    '''
    weekday of each month
    '''

    def main(inputfilename="input.txt", outputfilename="output.txt"):
        """Sum each user's statistics per month, day of week and hour.

        The input file starts with a header line, followed by
        whitespace-separated rows:
        user, month, day-of-month, hour, totalTW, totalQS, result.
        Dates are interpreted in the year 2012. Consecutive rows with the
        same user id form one group.

        The header is copied to the output unchanged. For every user one
        tab-separated line is then written for each month 5..12, each weekday
        Mon..Sun and each hour 0..23, carrying the summed totalTW, totalQS and
        result (zeros when the user has no data for that slot). Rows whose
        month is outside 5..12 do not appear in the output.

        inputfilename  -- path of the file to read.
        outputfilename -- path of the file to write.

        Raises IndexError for rows with fewer than seven fields and
        ValueError for non-integer fields or dates that do not exist.
        """
        weekday = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
        print(outputfilename)

        with open(inputfilename, 'r') as open_file:
            with open(outputfilename, 'w') as to_file:

                def write_user(user_id, sums):
                    # Emit the complete month x weekday x hour grid for one user.
                    lines = []
                    for month in range(5, 13):
                        for day_index, day in enumerate(weekday):
                            for hour in range(0, 24):
                                tw, qs, res = sums.get((month, day_index, hour), (0, 0, 0))
                                lines.append("%s\t%02d\t%s\t%02d\t%s\t%s\t%s\n"
                                             % (user_id, month, day, hour, tw, qs, res))
                    to_file.write("".join(lines))

                # The first line is the header and is passed through as is.
                to_file.write(open_file.readline())

                current_user = None
                sums = {}
                for line in open_file:
                    parts = line.split()
                    if not parts:
                        # Ignore blank lines (e.g. a trailing newline).
                        continue
                    if parts[0] != current_user:
                        # User changed: write out the finished group first.
                        if current_user is not None:
                            write_user(current_user, sums)
                        current_user = parts[0]
                        sums = {}
                    # Each row is parsed exactly once; the weekday index
                    # (0 = Monday) lines up with the weekday name list.
                    month = int(parts[1])
                    day_index = datetime.date(2012, month, int(parts[2])).weekday()
                    key = (month, day_index, int(parts[3]))
                    tw, qs, res = sums.get(key, (0, 0, 0))
                    sums[key] = (tw + int(parts[4]),
                                 qs + int(parts[5]),
                                 res + int(parts[6]))

                # The last group has no following row to trigger its output.
                if current_user is not None:
                    write_user(current_user, sums)

    if __name__ == "__main__":
    main()