nilmdb/test/speed-readascii.py

#!/usr/bin/python

from printf import printf
import time
import re
import numpy as np
import itertools

class Timer(object):
    """Context manager that reports the throughput of the enclosed block.

    On exit it prints "<arg>: <rate> lines/sec", where the rate is
    `lines` divided by the wall-clock seconds spent inside the block.

    arg   -- label printed in front of the rate
    lines -- number of lines the timed block is assumed to process;
             defaults to 1e6
    """

    def __init__(self, arg, lines=1e6):
        self.arg = arg
        self.lines = lines

    def __enter__(self):
        self.start = time.time()
        # Returning self permits "with Timer(...) as t:"; the plain
        # "with Timer(...):" form is unaffected.
        return self

    def __exit__(self, *args):
        elapsed = time.time() - self.start
        # time.time() can have coarse resolution, so a very short block may
        # measure as zero seconds; report an infinite rate instead of
        # raising ZeroDivisionError from inside __exit__.
        if elapsed > 0:
            rate = self.lines / elapsed
        else:
            rate = float("inf")
        printf("%s: %f lines/sec\n", self.arg, rate)

def test_split():
    """Parse every line of 1m.raw using str.split() and int(x).

    Anything from the first '#' onward is discarded.  The remaining
    whitespace-separated fields are converted with int() and written over
    the front of a six-element list of zeros, so rows with fewer than six
    values are zero-padded.  Every 100000th row is printed.
    """
    # Use a context manager so the file is closed even if parsing raises.
    with open('1m.raw', 'r') as infile:
        for n, line in enumerate(infile):
            out = [0]*6
            tmp = [int(i) for i in line.partition('#')[0].split()]
            out[0:len(tmp)] = tmp
            if n % 100000 == 0:
                printf("line %d = %s\n", n, str(out))

def test_split2():
    """Parse every line of 1m.raw using str.split() and int(x, 10).

    Identical to the plain split variant except that int() is given an
    explicit base of 10.  Anything from the first '#' onward is discarded,
    rows with fewer than six values are zero-padded, and every 100000th
    row is printed.
    """
    # Use a context manager so the file is closed even if parsing raises.
    with open('1m.raw', 'r') as infile:
        for n, line in enumerate(infile):
            out = [0]*6
            # The explicit base is the one thing this variant changes.
            tmp = [int(i, 10) for i in line.partition('#')[0].split()]
            out[0:len(tmp)] = tmp
            if n % 100000 == 0:
                printf("line %d = %s\n", n, str(out))

def test_regex():
    """Parse every line of 1m.raw using re.findall() and int(x).

    Anything from the first '#' onward is discarded.  Each run of digits
    that is followed by whitespace is converted with int() and written
    over the front of a six-element list of zeros, so rows with fewer than
    six values are zero-padded.  Every 100000th row is printed.
    """
    # Use a context manager so the file is closed even if parsing raises.
    with open('1m.raw', 'r') as infile:
        for n, line in enumerate(infile):
            out = [0]*6
            # Raw string: same pattern, but no invalid-escape warnings.
            # Note the pattern requires whitespace after each number.
            tmp = [int(x)
                   for x in re.findall(r'(\d+)\s+', line.partition('#')[0])]
            out[0:len(tmp)] = tmp
            if n % 100000 == 0:
                printf("line %d = %s\n", n, str(out))

def test_bigregex():
    """Parse every line of 1m.raw using one precompiled six-group regex.

    The pattern skips leading whitespace and then has six optional
    "digits followed by whitespace" groups.  Groups that did not
    participate in the match come back as None and are turned into 0.
    Every 100000th row is printed.
    """
    # Raw strings: same pattern, but no invalid-escape warnings.
    regex = re.compile(r'^(?:\s*)' + r'(?:(\d+)\s+)?' * 6)
    # Use a context manager so the file is closed even if parsing raises.
    with open('1m.raw', 'r') as infile:
        for n, line in enumerate(infile):
            # Call the compiled pattern directly rather than routing it
            # back through re.match() on every line.  Every part of the
            # pattern is optional, so match() always succeeds.
            out = [int(x or 0) for x in regex.match(line).groups()]
            if n % 100000 == 0:
                printf("line %d = %s\n", n, str(out))

def test_numpy():
    """Load all of 1m.raw in one call with numpy.genfromtxt().

    Rows are read into a structured dtype of six 16-bit integer fields.
    The resulting array is discarded; only the time taken to parse
    matters here.
    """
    # Use a context manager so the file is closed even if parsing raises.
    with open('1m.raw', 'r') as infile:
        np.genfromtxt(infile, dtype=np.dtype('i2,i2,i2,i2,i2,i2'))

# Run every parsing strategy once under a Timer, in a fixed order.  The
# trailing comments record previously observed throughput.
for _label, _bench in (
        ("numpy", test_numpy),        # 106k/sec
        ("regex", test_regex),        # 121k/sec
        ("split", test_split),        # 219k/sec
        ("split2", test_split2),      # 328k/sec
        ("bigregex", test_bigregex),  # 130k/sec
):
    with Timer(_label):
        _bench()

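# The int() conversion takes quite a while -- int(x, 10), with an explicit
# base, is about twice as fast as int(x) (compare "split" with "split2").
# For comparison, Perl does about 500k lines/sec.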