#!/usr/bin/python
"""Aggregate per-user tweet counters into month / weekday / hour buckets.

This is a modified version of week.py.

I raised the original performance question at http://v2ex.com/t/102160 and,
thanks to the v2ex fellows, found that the bottleneck was mainly the three
nested ``for`` loops (quite a dummy mistake).  With this version of the
script the execution time has been reduced tremendously, to roughly 10-20
minutes, which fits my needs for now.

Input format (as read by this script): the first line is a header that is
copied verbatim to the output; every following non-blank line holds at least
seven whitespace-separated fields::

    user  month  day  hour  TW  QS  Result

``TW``, ``QS`` and ``Result`` are integer counters.  Lines of the same user
are expected to be contiguous.  Month and hour must be zero-padded
("05", "00") to land in a bucket; the year is fixed to 2012.

The code sticks to syntax that runs on both Python 2.7 and Python 3
(single-argument ``print(...)``, ``%`` formatting).
"""

import os
import sys
import csv
import re
import string
import time
import datetime
import itertools

# English day abbreviations, indexed by datetime.weekday() (Monday == 0).
_WEEKDAYS = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
# All dates in the data set are interpreted as belonging to this year.
_YEAR = "2012"
# Months that get an output row (May through December).
_MONTHS = range(5, 13)


def _aggregate_user(rows):
    """Sum the three counters of one user's rows per month/weekday/hour key.

    Args:
        rows: iterable of already split lines (lists of string fields).

    Returns:
        dict mapping "<month><weekday><hour>" to ``[tw, qs, result]`` sums.

    Raises:
        ValueError: if a row has fewer than seven fields, an invalid date or
            a non-integer counter.
    """
    totals = {}
    for fields in rows:
        # Split once per line; extra trailing fields are ignored.
        month, day, hour, tw, qs, result = fields[1:7]
        parsed = datetime.datetime.strptime(
            "%s-%s-%s" % (_YEAR, month, day), "%Y-%m-%d")
        # weekday() keeps the day name independent of the process locale,
        # unlike strftime('%a').
        key = "%s%s%s" % (month, _WEEKDAYS[parsed.weekday()], hour)
        bucket = totals.setdefault(key, [0, 0, 0])
        bucket[0] += int(tw)
        bucket[1] += int(qs)
        bucket[2] += int(result)
    return totals


def _format_user_rows(user, totals):
    """Return one output line per month/weekday/hour slot, zero-filled."""
    lines = []
    for month in _MONTHS:
        for day in _WEEKDAYS:
            for hour in range(24):
                key = "%02d%s%02d" % (month, day, hour)
                tw, qs, result = totals.get(key, (0, 0, 0))
                lines.append("%s\t%02d\t%s\t%02d\t%s\t%s\t%s\n"
                             % (user, month, day, hour, tw, qs, result))
    return lines


def main(inputfilename="./test_refine/test.txt", outputfilename=None):
    """Read the input file and write the per-user weekly aggregation.

    Args:
        inputfilename: path of the file to read.
        outputfilename: path of the file to write.  Defaults to
            ``"test_week/"`` plus the base name of ``inputfilename``.

    Raises:
        IOError / OSError: if either file cannot be opened.
        ValueError: if a data line is malformed (see ``_aggregate_user``).
    """
    start_time = time.time()
    if outputfilename is None:
        outputfilename = "test_week/" + os.path.basename(inputfilename)
    print(outputfilename)

    with open(inputfilename, 'r') as open_file:
        contents = open_file.readlines()

    with open(outputfilename, 'w') as to_file:
        if contents:
            print(0)  # progress: index of the line being processed
            to_file.write(contents[0])  # header is copied verbatim

        # Keep the original line index for progress output and drop blank
        # lines, which carry no user field.
        data_rows = [
            (index, fields)
            for index, fields in enumerate(
                (line.split() for line in contents[1:]), 1)
            if fields
        ]

        # groupby() yields runs of consecutive rows sharing the same user,
        # so each run contains exactly that user's lines and the final run
        # is processed like every other one.
        for user, group in itertools.groupby(data_rows,
                                             key=lambda row: row[1][0]):
            group = list(group)
            print(group[0][0])  # progress: first line index of this user
            totals = _aggregate_user(fields for _, fields in group)
            to_file.writelines(_format_user_rows(user, totals))

    print("%s seconds" % (time.time() - start_time))


if __name__ == "__main__":
    main()