V2EX  ›  Python

An optimization question about a Python script that reads and analyzes large files

airbob · 2014-02-27 16:53:36 +08:00 · 8391 views
My boss wants a script to analyze several files. Each file has about 28,000,000 lines (700 MB - 1 GB), and there are six similar files.
The task: for each user ID, sum the data that falls in the same month, same weekday, and same hour, then write the results out.
For example, the data at 02:00 on May 1 and 02:00 on May 8 should be added together; if a time slot has no data, output 0.
(The files cover May through December 2012.)

The script is written, and on small test files it runs fine and produces correct results. The problem is the large files: it has been running for about 20 hours and is only about a tenth of the way through, so at this rate it would take four or five days to finish.

I suspect two possible bottlenecks: one is file read/write I/O, the other is that my for loops (iterating over May-December, Monday-Sunday, hours 00-23) spend too long processing. I haven't found a way to optimize it yet.

The file format and the current script are in the gist below. Could someone take a look?

Background

Below is the input file format (*.txt):

    userID month date hour totalTW totalQs result
    21535110 05 01 02 3 2 1
    21535110 05 01 03 3 2 1
    21535110 05 01 06 1 0 0
    21535110 05 02 02 1 0 0
    21535110 05 03 05 3 2 0
    21535112 05 01 05 1 1 1

In total there are 28,000,000 lines in the file, and I have 6 files of this kind.

Objective

Write a script to process the input data:
for each user, sum up the data (totalTW, totalQs, result) within the same month, same day of the week, and same hour.
Let's say there are lines like this (the year is 2012):

    userID month date hour totalTW totalQs result
    21535110 05 01 02 3 2 1
    21535110 05 08 02 2 1 0

then these 2 data points should be summed, since both belong to a Tuesday in May and the hour is 02:

    userID month day hour totalTW totalQs result
    21535110 05 Tue 02 5 3 1
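A quick check with the standard library confirms both dates land in the same bucket:

```python
import datetime

# 2012-05-01 and 2012-05-08 both fall on a Tuesday, so rows from those
# dates with hour 02 collapse into the (month=05, day=Tue, hour=02) slot.
days = [datetime.datetime.strptime(d, "%Y-%m-%d").strftime("%a")
        for d in ("2012-05-01", "2012-05-08")]
print(days)  # ['Tue', 'Tue']
```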

    Problem

The week.py script I added to this gist works; the problem is that it seems too slow.
I ran it on a lab server for ~20 hours and it is currently at line 2,300,000 (only about 10%!).
Is there any way to optimize this script?

#!/usr/bin/python
import time
import datetime

'''
This is a modified version of week.py.
I raised this question in http://v2ex.com/t/102160 and,
thanks to the v2ex fellows, the bottleneck was mainly due to the 3 for loops (quite a dummy mistake).
With this version of the script, execution time has been reduced tremendously to ~10-20 mins, which fits my need for now.
'''

def main():
    start_time = time.time()
    weekday = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
    inputfilename = "./test_refine/test.txt"
    outputfilename = "test_week/" + inputfilename[14:]
    print outputfilename
    open_file = open(inputfilename, 'r')
    contents = open_file.readlines()
    to_file = open(outputfilename, 'w')
    i = 0
    totalLines = len(contents)
    while i < totalLines:
        outputCONTENT = ""
        print i
        if i == totalLines - 1:
            print time.time() - start_time, "seconds"
            return
        if i > 0:
            user = contents[i].split()[0]
            # find the range [i, markIndex) of consecutive lines for this user
            j = i
            nextFlag = 1
            while nextFlag == 1 and j < totalLines:
                user2 = contents[j].split()[0]
                if user != user2:
                    nextFlag = 0
                j = j + 1
            markIndex = j
            ## do the main check: a single pass over this user's lines,
            ## accumulating sums into dicts keyed by (month, weekday, hour)
            totalTW = {}
            totalQS = {}
            totalResult = {}
            for z in range(i, markIndex):
                fields = contents[z].split()
                tweetmonth, tweetday, tweethour = fields[1], fields[2], fields[3]
                tweetTW, tweetQS, tweetResult = fields[4], fields[5], fields[6]
                tweetdate = "%s-%s-%s" % ("2012", tweetmonth, tweetday)
                dayOfWeek = datetime.datetime.strptime(tweetdate, "%Y-%m-%d").strftime('%a')
                key = "%s%s%s" % (tweetmonth, dayOfWeek, tweethour)
                if key in totalTW:
                    totalTW[key] += int(tweetTW)
                    totalQS[key] += int(tweetQS)
                    totalResult[key] += int(tweetResult)
                else:
                    totalTW[key] = int(tweetTW)
                    totalQS[key] = int(tweetQS)
                    totalResult[key] = int(tweetResult)
            # emit one line per (month, weekday, hour) slot, zeros if no data
            for month in range(5, 13):
                for day in weekday:
                    for hour in range(0, 24):
                        key = "%02d%s%02d" % (month, day, hour)
                        if key in totalTW:
                            lineoutput = "%s\t%02d\t%s\t%02d\t%s\t%s\t%s\n" % (user, month, day, hour, totalTW[key], totalQS[key], totalResult[key])
                        else:
                            lineoutput = "%s\t%02d\t%s\t%02d\t%s\t%s\t%s\n" % (user, month, day, hour, 0, 0, 0)
                        outputCONTENT = outputCONTENT + lineoutput
            i = markIndex - 1
        else:
            # first line is the header; copy it through unchanged
            outputCONTENT = outputCONTENT + contents[0]
        i = i + 1
        to_file.write(outputCONTENT)
    to_file.close()
    open_file.close()

if __name__ == "__main__":
    main()
#!/usr/bin/python
import time
import datetime

'''
weekday of each month
'''

def main():
    weekday = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
    ## read stats file and filter not existing users
    inputfilename = "input.txt"
    outputfilename = "output.txt"
    print outputfilename
    open_file = open(inputfilename, 'r')
    contents = open_file.readlines()
    to_file = open(outputfilename, 'w')
    i = 0
    totalLines = len(contents)
    print "going to while loop"
    while i < totalLines:
        outputCONTENT = ""
        print i
        if i == totalLines - 1:
            return
        if i > 0:
            user = contents[i].split()[0]
            # find the range [i, markIndex) of consecutive lines for this user
            j = i
            nextFlag = 1
            while nextFlag == 1 and j < totalLines:
                user2 = contents[j].split()[0]
                if user != user2:
                    nextFlag = 0
                j = j + 1
            markIndex = j
            # for every (month, weekday, hour) slot, rescan ALL of this
            # user's lines -- this nested rescan is the bottleneck
            for month in range(5, 13):
                for day in weekday:
                    for hour in range(0, 24):
                        totalTW = 0
                        totalQS = 0
                        totalResult = 0
                        for z in range(i, markIndex):
                            fields = contents[z].split()
                            tweetmonth, tweetday, tweethour = fields[1], fields[2], fields[3]
                            tweetTW, tweetQS, tweetResult = fields[4], fields[5], fields[6]
                            tweetdate = "%s-%s-%s" % ("2012", tweetmonth, tweetday)
                            dayOfWeek = datetime.datetime.strptime(tweetdate, "%Y-%m-%d").strftime('%a')
                            if day in dayOfWeek and hour == int(tweethour) and month == int(tweetmonth):
                                totalTW += int(tweetTW)
                                totalQS += int(tweetQS)
                                totalResult += int(tweetResult)
                        lineoutput = "%s\t%02d\t%s\t%02d\t%s\t%s\t%s\n" % (user, month, day, hour, totalTW, totalQS, totalResult)
                        outputCONTENT = outputCONTENT + lineoutput
            i = markIndex - 1
        else:
            # first line is the header; copy it through unchanged
            outputCONTENT = outputCONTENT + contents[0]
        i = i + 1
        to_file.write(outputCONTENT)
    to_file.close()
    open_file.close()

if __name__ == "__main__":
    main()



Thanks!
15 replies
1 · wangfengmadking · 2014-02-27 17:11:55 +08:00
For large files like this, it's best to use:

with open("file_name", 'r') as input:
    for line in input:
        # process
2 · JerryKwan · 2014-02-27 17:14:42 +08:00
@airbob
For large files, it's best not to use readlines.
3 · exch4nge · 2014-02-27 17:19:35 +08:00
Question: does every user need an output line even for time slots with no records?

Your loop runs (number of months) * 7 * 24 * (this user's record count) iterations in total; of course that's slow.
You should loop over the user's records instead: parse each record, work out which month / weekday / hour it belongs to, and add the record's values onto the running totals.
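The single-pass accumulation suggested above can be sketched like this, using a dict keyed by (user, month, weekday, hour); the sample rows are hypothetical but follow the input format:

```python
import datetime
from collections import defaultdict

# hypothetical sample rows: userID, month, date, hour, totalTW, totalQs, result
rows = [
    ("21535110", "05", "01", "02", 3, 2, 1),
    ("21535110", "05", "08", "02", 2, 1, 0),
]

totals = defaultdict(lambda: [0, 0, 0])
for user, month, date, hour, tw, qs, result in rows:
    # one strptime-equivalent per record, one dict update -- no nested loops
    day = datetime.date(2012, int(month), int(date)).strftime("%a")
    key = (user, month, day, hour)
    totals[key][0] += tw
    totals[key][1] += qs
    totals[key][2] += result

print(dict(totals))  # {('21535110', '05', 'Tue', '02'): [5, 3, 1]}
```

This visits each record exactly once; the 8 * 7 * 24 output grid is then filled from the dict at the end.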
4 · airbob (OP) · 2014-02-27 17:28:16 +08:00
@wangfengmadking @JerryKwan OK, I'll try with open("file_name", 'r') as input: and time it.
5 · airbob (OP) · 2014-02-27 17:34:11 +08:00
@exch4nge Yes, the requirement is to log even when there is no record, so every user has the same number of data points.

You're right, good point! There's indeed no need for the 3 for loops. I can define a 2D array (3 x 8*7*24) initialized to zero to hold the records, then make a single pass and add each matching record in.

Thanks a lot!
6 · family · 2014-02-27 17:37:33 +08:00
First split the file into sub-files that are convenient to compute over -- say, 24 files split by time -- then do the statistics.
7 · exch4nge · 2014-02-27 17:53:35 +08:00
@airbob A few suggestions:

- Python supports assignments like a, b = ['a', 'b'].
- You can allocate an array for the records, but then you need a string-to-index mapping in between. Using a dict saves a lot of trouble.
- Strings in Python are immutable, so every outputCONTENT = outputCONTENT + contents[0] seems to grow memory use. In your case it's fine since memory is sufficient, but with larger data you'd need to watch out for memory.
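On the last point, collecting the pieces in a list and joining once avoids the repeated copying that string += causes (a minimal illustration, not the thread's script):

```python
# Each += on a string copies the whole accumulated string (O(n^2) overall);
# appending to a list and joining once at the end is linear.
pieces = []
for i in range(5):
    pieces.append("line %d\n" % i)
output = "".join(pieces)
print(output)
```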
8 · airbob (OP) · 2014-02-27 20:25:03 +08:00
@exch4nge @family Thanks everyone, it's fixed. A run now takes 10-20 minutes, and all 6 files are done. The updated week-modified.py is on the gist~
9 · likuku · 2014-02-28 00:53:25 +08:00
Yes, a dict (or nested dicts) is very convenient for keeping state; lookups and updates are easy too.
10 · lixm · 2014-02-28 09:46:36 +08:00
Import it into MySQL and query it directly with SQL.
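The same idea works without a server via the sqlite3 module in the standard library; the table layout and sample rows below are made up for illustration:

```python
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute(
    "CREATE TABLE stats (user TEXT, month TEXT, day TEXT, hour TEXT, "
    "tw INT, qs INT, result INT)"
)
conn.executemany(
    "INSERT INTO stats VALUES (?, ?, ?, ?, ?, ?, ?)",
    [("21535110", "05", "Tue", "02", 3, 2, 1),
     ("21535110", "05", "Tue", "02", 2, 1, 0)],
)
# let the database do the grouping and summing
rows = conn.execute(
    "SELECT user, month, day, hour, SUM(tw), SUM(qs), SUM(result) "
    "FROM stats GROUP BY user, month, day, hour"
).fetchall()
print(rows)  # [('21535110', '05', 'Tue', '02', 5, 3, 1)]
```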
11 · oio · 2014-02-28 10:16:01 +08:00
Let me give it a try too.... doesn't work on out-of-order data....

https://gist.github.com/onia/9263792
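itertools.groupby merges only consecutive items with the same key, which is presumably why this approach needs the input already sorted by userID. A minimal sketch with hypothetical sample lines:

```python
import itertools

lines = [
    "21535110 05 01 02 3 2 1",
    "21535110 05 08 02 2 1 0",
    "21535112 05 01 05 1 1 1",
]

# group consecutive lines by the first field (userID);
# out-of-order data would split one user into several groups
groups = [(user, len(list(records)))
          for user, records in itertools.groupby(lines, key=lambda l: l.split()[0])]
print(groups)  # [('21535110', 2), ('21535112', 1)]
```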
12 · exch4nge · 2014-02-28 16:14:22 +08:00
@oio Learned about csv, groupby, and other fancy stuff.
13 · anguskwan · 2014-02-28 17:10:24 +08:00
Import it into MySQL ^ ^
14 · oio · 2014-03-01 14:33:56 +08:00
@exch4nge I always find it hard to keep every line under 79 characters per PEP 8. Any tips....
15 · exch4nge · 2014-03-01 22:57:19 +08:00
@oio 2-space indentation + sensible line breaks?