|
#!/usr/bin/python |
|
import os |
|
import sys |
|
import csv |
|
import re |
|
import string |
|
import time |
|
import datetime |
|
'''
This is a modified version of week.py.
I raised this question at http://v2ex.com/t/102160 and,
thanks to the v2ex fellows, learned that the bottleneck was mainly due to the 3 for loops (which was quite a silly mistake).
With this version of the script, execution time has been reduced tremendously, to ~10-20 minutes, which fits my needs for now.
'''
|
|
|
# Weekday labels indexed by datetime.date.weekday() (0 == Monday).
_WEEKDAY_NAMES = ("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")

# Months that appear in the report (May through December).
_REPORT_MONTHS = range(5, 13)


def _bucket_totals(rows, year):
    """Sum the three counters of *rows* per (month, weekday name, hour)."""
    totals = {}
    for fields in rows:
        month = int(fields[1])
        day = int(fields[2])
        hour = int(fields[3])
        # date.weekday() is locale independent (unlike strftime('%a')) and
        # its Monday == 0 convention lines up with _WEEKDAY_NAMES.
        weekday = _WEEKDAY_NAMES[datetime.date(year, month, day).weekday()]
        bucket = totals.setdefault((month, weekday, hour), [0, 0, 0])
        bucket[0] += int(fields[4])
        bucket[1] += int(fields[5])
        bucket[2] += int(fields[6])
    return totals


def _render_user(user, totals):
    """Return the full month x weekday x hour grid of one user as text."""
    rows = []
    for month in _REPORT_MONTHS:
        for weekday in _WEEKDAY_NAMES:
            for hour in range(24):
                tw, qs, result = totals.get((month, weekday, hour), (0, 0, 0))
                rows.append("%s\t%02d\t%s\t%02d\t%s\t%s\t%s\n"
                            % (user, month, weekday, hour, tw, qs, result))
    return "".join(rows)


def main(inputfilename="./test_refine/test.txt", outputfilename=None, year=2012):
    """Aggregate per-user rows into month / weekday / hour buckets.

    The input is a whitespace-separated text file whose first line is a
    header (copied to the output verbatim) and whose remaining lines hold

        user  month  day  hour  TW  QS  Result

    where the last three columns are integers.  Consecutive lines sharing
    the same user form one group; a user that shows up again later in the
    file starts a new group.  For every group the output receives one
    tab-separated row for each combination of month 5-12, weekday Mon-Sun
    and hour 0-23, carrying the summed TW / QS / Result values, or zeros
    when the group has no data for that slot.  Rows whose month or hour
    lies outside those ranges are read but never reported.

    Args:
        inputfilename: path of the file to read.
        outputfilename: path of the file to write; defaults to
            "test_week/" plus the base name of *inputfilename*.
        year: calendar year used to turn month/day into a weekday.

    Raises:
        IndexError: a non-blank data line has fewer than seven fields.
        ValueError: a numeric field is not an integer or the date is invalid.
        IOError/OSError: a file cannot be opened.

    Compared with the earlier revision this version no longer counts the
    first line of the next user in the current user's totals, no longer
    drops a trailing single-line user, accepts month/hour values without
    zero padding, skips blank lines, and always closes both files.
    """
    start_time = time.time()
    if outputfilename is None:
        outputfilename = "test_week/" + os.path.basename(inputfilename)
    print(outputfilename)

    with open(inputfilename, 'r') as open_file, open(outputfilename, 'w') as to_file:
        # The header line is passed through untouched.
        to_file.write(open_file.readline())

        user = None
        rows = []
        for index, line in enumerate(open_file, 1):
            fields = line.split()
            if not fields:
                continue  # tolerate blank lines
            if fields[0] != user:
                # A new user begins: flush the finished group first.
                if rows:
                    to_file.write(_render_user(user, _bucket_totals(rows, year)))
                print(index)  # progress: line index where this group starts
                user = fields[0]
                rows = []
            rows.append(fields)
        # Flush the last group, which has no following user to trigger it.
        if rows:
            to_file.write(_render_user(user, _bucket_totals(rows, year)))

    print("%s seconds" % (time.time() - start_time))
|
|
|
# Script entry point: run the aggregation only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    main()