#!/usr/bin/python
"""Micro-benchmarks comparing ways to parse whitespace-separated integers.

Each ``test_*`` function reads the file ``1m.raw`` and converts every line
into a list of integers, using a different technique.  The module-level
``with Timer(...)`` statements run each benchmark and print its throughput.
"""

from printf import printf
import time
import re
import numpy as np
import itertools


class Timer():
    """Context manager that prints the throughput of the enclosed block.

    The rate is computed as ``1e6 / elapsed_seconds``, i.e. it assumes the
    timed block processed one million lines (presumably the size of
    ``1m.raw`` -- confirm against the data file).
    """

    def __init__(self, arg):
        """Store *arg*, the label printed in front of the measured rate."""
        self.arg = arg

    def __enter__(self):
        """Record the wall-clock start time."""
        self.start = time.time()

    def __exit__(self, *args):
        """Print ``<label>: <rate> lines/sec`` for the elapsed time."""
        printf("%s: %f lines/sec\n", self.arg,
               1e6 / (time.time() - self.start))


def test_split():
    """Parse each line with ``str.split`` and ``int(i)``.

    Everything from the first ``#`` onward is dropped before splitting.
    The parsed values overwrite the front of a six-element zero list, so
    lines with fewer than six numbers are zero-padded.  Every 100000th
    line is printed.
    """
    # The context manager closes the file even if parsing raises.
    with open('1m.raw', 'r') as f:
        for n, line in enumerate(f):
            out = [0] * 6
            tmp = [int(i) for i in line.partition('#')[0].split()]
            out[0:len(tmp)] = tmp
            if n % 100000 == 0:
                printf("line %d = %s\n", n, str(out))


def test_split2():
    """Same as ``test_split`` but converts with an explicit base, ``int(i, 10)``."""
    with open('1m.raw', 'r') as f:
        for n, line in enumerate(f):
            out = [0] * 6
            tmp = [int(i, 10) for i in line.partition('#')[0].split()]
            out[0:len(tmp)] = tmp
            if n % 100000 == 0:
                printf("line %d = %s\n", n, str(out))


def test_regex():
    """Parse each line with ``re.findall`` on the text before the first ``#``.

    Only digit runs followed by at least one whitespace character are
    captured.  Results are zero-padded to six entries and every 100000th
    line is printed.
    """
    with open('1m.raw', 'r') as f:
        for n, line in enumerate(f):
            out = [0] * 6
            # Raw string: same pattern text, without invalid-escape warnings.
            tmp = [int(x) for x in
                   re.findall(r'(\d+)\s+', line.partition('#')[0])]
            out[0:len(tmp)] = tmp
            if n % 100000 == 0:
                printf("line %d = %s\n", n, str(out))


def test_bigregex():
    """Parse each line with one precompiled regex holding six optional groups.

    Groups that did not participate in the match are ``None`` and are
    mapped to 0.  Every 100000th line is printed.
    """
    # Every group is optional, so the pattern matches any line and
    # ``match`` never returns None here.
    regex = re.compile(r'^(?:\s*)' + r'(?:(\d+)\s+)?' * 6)
    with open('1m.raw', 'r') as f:
        for n, line in enumerate(f):
            # Call the compiled pattern directly rather than going back
            # through the module-level re.match on every line.
            out = [int(x or 0) for x in regex.match(line).groups()]
            if n % 100000 == 0:
                printf("line %d = %s\n", n, str(out))


def test_numpy():
    """Load the whole file with ``numpy.genfromtxt`` as six ``i2`` fields.

    The parsed array is discarded; only the time taken matters.
    """
    with open('1m.raw', 'r') as f:
        np.genfromtxt(f, dtype=np.dtype('i2,i2,i2,i2,i2,i2'))


with Timer("numpy"):
    test_numpy()  # 106k/sec

with Timer("regex"):
    test_regex()  # 121k/sec

with Timer("split"):
    test_split()  # 219k/sec

with Timer("split2"):
    test_split2()  # 328k/sec

with Timer("bigregex"):
    test_bigregex()  # 130k/sec

# The "int" operation takes quite a while -- int(x,10) is twice as fast
# Perl does about 500k/sec