Tests for Unicode compliance

Update commandline test helpers to better handle Unicode
We replace cStringIO with StringIO subclass that forces UTF-8 encoding, and explicitly convert commandlines to UTF-8 before shlex. These changes will only affect tests, not normal commandline operation.
2013-01-03 17:03:52 -05:00 · 2013-01-03 17:03:52 -05:00 · 2013-01-03 17:02:38 -05:00 · 2013-01-03 17:02:38 -05:00 · 2013-01-03 17:02:38 -05:00 · 2013-01-03 17:02:38 -05:00
9 changed files with 195 additions and 47 deletions
--- a/nilmdb/bulkdata.py
+++ b/nilmdb/bulkdata.py
@@ -29,9 +29,18 @@ class BulkData(object):
    def close(self):
        self.getnode.cache_remove_all()

-    def create(self, path, layout_name):
+    def _encode_filename(self, path):
+        # Encode unicode paths to UTF-8, regardless of sys.getfilesystemencoding(),
+        # so that all code points are representable; the user is never directly
+        # exposed to filenames.  Path manipulations are then done on the UTF-8
+        # directly.  Non-unicode input is returned unchanged.
+        if isinstance(path, unicode):
+            return path.encode('utf-8')
+        return path
+
+    def create(self, unicodepath, layout_name):
        """
-        path: path to the data (e.g. '/newton/prep').
+        unicodepath: path to the data (e.g. u'/newton/prep').
        Paths must contain at least two elements, e.g.:
           /newton/prep
           /newton/raw
@@ -40,6 +49,8 @@ class BulkData(object):

        layout_name: string for nilmdb.layout.get_named(), e.g. 'float32_8'
        """
+        path = self._encode_filename(unicodepath)
+
        if path[0] != '/':
            raise ValueError("paths must start with /")
        [ group, node ] = path.rsplit("/", 1)
@@ -92,14 +103,15 @@ class BulkData(object):
            raise ValueError("error creating table at that path: " + e.strerror)

        # Open and cache it
-        self.getnode(path)
+        self.getnode(unicodepath)

        # Success
        return

-    def destroy(self, path):
+    def destroy(self, unicodepath):
        """Fully remove all data at a particular path.  No way to undo
        it!  The group/path structure is removed, too."""
+        path = self._encode_filename(unicodepath)

        # Get OS path
        elements = path.lstrip('/').split('/')
@@ -125,9 +137,10 @@ class BulkData(object):
    # Cache open tables
    @nilmdb.utils.lru_cache(size = table_cache_size,
                            onremove = lambda x: x.close())
-    def getnode(self, path):
+    def getnode(self, unicodepath):
        """Return a Table object corresponding to the given database
        path, which must exist."""
+        path = self._encode_filename(unicodepath)
        elements = path.lstrip('/').split('/')
        ospath = os.path.join(self.root, *elements)
        return Table(ospath)
--- a/nilmdb/cmdline/extract.py
+++ b/nilmdb/cmdline/extract.py
@@ -1,4 +1,5 @@
 from __future__ import absolute_import
+from __future__ import print_function
 from nilmdb.utils.printf import *
 import nilmdb.client
 import sys
@@ -50,7 +51,7 @@ def cmd_extract(self):
            # Strip timestamp (first element).  Doesn't make sense
            # if we are only returning a count.
            dataline = ' '.join(dataline.split(' ')[1:])
-        print dataline
+        print(dataline)
        printed = True
    if not printed:
        if self.args.annotate:
--- a/nilmdb/httpclient.py
+++ b/nilmdb/httpclient.py
@@ -10,7 +10,6 @@ import re
 import os
 import simplejson as json
 import urlparse
-import urllib
 import pycurl
 import cStringIO

@@ -59,7 +58,8 @@ class HTTPClient(object):
    def _setup_url(self, url = "", params = ""):
        url = urlparse.urljoin(self.baseurl, url)
        if params:
-            url = urlparse.urljoin(url, "?" + urllib.urlencode(params, True))
+            url = urlparse.urljoin(
+                url, "?" + nilmdb.utils.urllib.urlencode(params, True))
        self.curl.setopt(pycurl.URL, url)
        self.url = url

--- a/nilmdb/server.py
+++ b/nilmdb/server.py
@@ -11,6 +11,7 @@ import sys
 import time
 import os
 import simplejson as json
+import functools

 try:
    import cherrypy
@@ -39,7 +40,6 @@ def workaround_cp_bug_1200(func): # pragma: no cover (just a workaround)
    # Even if chunked responses are disabled, you may still miss miss
    # LookupError, or UnicodeError exceptions due to CherryPy bug
    # #1200.  This throws them as generic Exceptions insteads.
-    import functools
    import traceback
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
@@ -51,6 +51,20 @@ def workaround_cp_bug_1200(func): # pragma: no cover (just a workaround)
                            traceback.format_exc())
    return wrapper

+def exception_to_httperror(response = "400 Bad Request"):
+    """Return a decorator that catches any Exception raised by the wrapped
+    function and raises cherrypy.HTTPError(response, "<type>: <message>")."""
+    def decorator(func):
+        @functools.wraps(func)
+        def wrapper(*args, **kwargs):
+            try:
+                return func(*args, **kwargs)
+            except Exception as e:
+                message = sprintf("%s: %s", type(e).__name__, str(e))
+                raise cherrypy.HTTPError(response, message)
+        return wrapper
+    return decorator
+
 # CherryPy apps
 class Root(NilmApp):
    """Root application for NILM database"""
@@ -104,26 +118,20 @@ class Stream(NilmApp):
    # /stream/create?path=/newton/prep&layout=PrepData
    @cherrypy.expose
    @cherrypy.tools.json_out()
+    @exception_to_httperror()
    def create(self, path, layout):
        """Create a new stream in the database.  Provide path
        and one of the nilmdb.layout.layouts keys.
        """
-        try:
-            return self.db.stream_create(path, layout)
-        except Exception as e:
-            message = sprintf("%s: %s", type(e).__name__, e.message)
-            raise cherrypy.HTTPError("400 Bad Request", message)
+        return self.db.stream_create(path, layout)

    # /stream/destroy?path=/newton/prep
    @cherrypy.expose
    @cherrypy.tools.json_out()
+    @exception_to_httperror()
    def destroy(self, path):
        """Delete a stream and its associated data."""
-        try:
-            return self.db.stream_destroy(path)
-        except Exception as e:
-            message = sprintf("%s: %s", type(e).__name__, e.message)
-            raise cherrypy.HTTPError("400 Bad Request", message)
+        return self.db.stream_destroy(path)

    # /stream/get_metadata?path=/newton/prep
    # /stream/get_metadata?path=/newton/prep&key=foo&key=bar
@@ -152,30 +160,24 @@ class Stream(NilmApp):
    # /stream/set_metadata?path=/newton/prep&data=<json>
    @cherrypy.expose
    @cherrypy.tools.json_out()
+    @exception_to_httperror()
    def set_metadata(self, path, data):
        """Set metadata for the named stream, replacing any
        existing metadata.  Data should be a json-encoded
        dictionary"""
-        try:
-            data_dict = json.loads(data)
-            self.db.stream_set_metadata(path, data_dict)
-        except Exception as e:
-            message = sprintf("%s: %s", type(e).__name__, e.message)
-            raise cherrypy.HTTPError("400 Bad Request", message)
+        data_dict = json.loads(data)
+        self.db.stream_set_metadata(path, data_dict)
        return "ok"

    # /stream/update_metadata?path=/newton/prep&data=<json>
    @cherrypy.expose
    @cherrypy.tools.json_out()
+    @exception_to_httperror()
    def update_metadata(self, path, data):
        """Update metadata for the named stream.  Data
        should be a json-encoded dictionary"""
-        try:
-            data_dict = json.loads(data)
-            self.db.stream_update_metadata(path, data_dict)
-        except Exception as e:
-            message = sprintf("%s: %s", type(e).__name__, e.message)
-            raise cherrypy.HTTPError("400 Bad Request", message)
+        data_dict = json.loads(data)
+        self.db.stream_update_metadata(path, data_dict)
        return "ok"

    # /stream/insert?path=/newton/prep
--- a/nilmdb/utils/init.py
+++ b/nilmdb/utils/init.py
@@ -6,3 +6,4 @@ from .serializer import Serializer
 from .lrucache import lru_cache
 from .diskusage import du
 from .mustclose import must_close
+from .urllib import urlencode
--- a/nilmdb/utils/timer.py
+++ b/nilmdb/utils/timer.py
@@ -5,6 +5,7 @@
 #   with nilmdb.Timer("flush"):
 #       foo.flush()

+from __future__ import print_function
 import contextlib
 import time

@@ -18,4 +19,4 @@ def Timer(name = None, tosyslog = False):
        import syslog
        syslog.syslog(msg)
    else:
-        print msg
+        print(msg)
--- a/nilmdb/utils/urllib.py
+++ b/nilmdb/utils/urllib.py
@@ -0,0 +1,68 @@
+from __future__ import absolute_import
+from urllib import quote_plus, _is_unicode
+
+# urllib.urlencode insists on encoding Unicode as ASCII.  This is an
+# exact copy of that function, except we encode it as UTF-8 instead.
+
+def urlencode(query, doseq=0):
+    """Encode a sequence of two-element tuples or dictionary into a URL query string.
+
+    If any values in the query arg are sequences and doseq is true, each
+    sequence element is converted to a separate parameter.
+
+    If the query arg is a sequence of two-element tuples, the order of the
+    parameters in the output will match the order of parameters in the
+    input.
+    """
+
+    if hasattr(query,"items"):
+        # mapping objects
+        query = query.items()
+    else:
+        # it's a bother at times that strings and string-like objects are
+        # sequences...
+        try:
+            # non-sequence items should not work with len()
+            # non-empty strings will fail this
+            if len(query) and not isinstance(query[0], tuple):
+                raise TypeError
+            # zero-length sequences of all types will get here and succeed,
+            # but that's a minor nit - since the original implementation
+            # allowed empty dicts that type of behavior probably should be
+            # preserved for consistency
+        except TypeError:
+            import sys  # not imported at module level in this file
+            raise TypeError, "not a valid non-string sequence or mapping object", sys.exc_info()[2]
+
+    l = []
+    if not doseq:
+        # preserve old behavior
+        for k, v in query:
+            k = quote_plus(str(k))
+            v = quote_plus(str(v))
+            l.append(k + '=' + v)
+    else:
+        for k, v in query:
+            k = quote_plus(str(k))
+            if isinstance(v, str):
+                v = quote_plus(v)
+                l.append(k + '=' + v)
+            elif _is_unicode(v):
+                # Unlike the stdlib version, encode Unicode values as
+                # UTF-8 rather than ASCII, so that non-ASCII text is
+                # percent-encoded instead of being replaced or rejected.
+                v = quote_plus(v.encode("utf-8","strict"))
+                l.append(k + '=' + v)
+            else:
+                try:
+                    # is this a sufficient test for sequence-ness?
+                    len(v)
+                except TypeError:
+                    # not a sequence
+                    v = quote_plus(str(v))
+                    l.append(k + '=' + v)
+                else:
+                    # loop over the sequence
+                    for elt in v:
+                        l.append(k + '=' + quote_plus(str(elt)))
+    return '&'.join(l)
--- a/tests/test_client.py
+++ b/tests/test_client.py
@@ -1,3 +1,5 @@
+# -*- coding: utf-8 -*-
+
 import nilmdb
 from nilmdb.utils.printf import *
 from nilmdb.client import ClientError, ServerError
@@ -82,6 +84,8 @@ class TestClient(object):
        # Bad layout type
        with assert_raises(ClientError):
            client.stream_create("/newton/prep", "NoSuchLayout")
+
+        # Create three streams
        client.stream_create("/newton/prep", "PrepData")
        client.stream_create("/newton/raw", "RawData")
        client.stream_create("/newton/zzz/rawnotch", "RawNotchedData")
@@ -277,3 +281,40 @@ class TestClient(object):
                              "end": "123" }, retjson=False)
        if "transfer-encoding: chunked" not in client.http._headers.lower():
            warnings.warn("Non-chunked HTTP response for /stream/extract")
+
+    def test_client_7_unicode(self):
+        # Basic Unicode tests
+        client = nilmdb.Client(url = "http://localhost:12380/")
+
+        # Delete streams that exist
+        for stream in client.stream_list():
+            client.stream_destroy(stream[0])
+
+        # Database is empty
+        eq_(client.stream_list(), [])
+
+        # Create Unicode stream, match it
+        raw = [ u"/düsseldorf/raw", u"uint16_6" ]
+        prep = [ u"/düsseldorf/prep", u"uint16_6" ]
+        client.stream_create(*raw)
+        eq_(client.stream_list(), [raw])
+        eq_(client.stream_list(layout=raw[1]), [raw])
+        eq_(client.stream_list(path=raw[0]), [raw])
+        client.stream_create(*prep)
+        eq_(client.stream_list(), [prep, raw])
+
+        # Set / get metadata with Unicode keys and values
+        eq_(client.stream_get_metadata(raw[0]), {})
+        eq_(client.stream_get_metadata(prep[0]), {})
+        meta1 = { u"alpha": u"α",
+                  u"β": u"beta" }
+        meta2 = { u"alpha": u"α" }
+        meta3 = { u"β": u"beta" }
+        client.stream_set_metadata(prep[0], meta1)
+        client.stream_update_metadata(prep[0], {})
+        client.stream_update_metadata(raw[0], meta2)
+        client.stream_update_metadata(raw[0], meta3)
+        eq_(client.stream_get_metadata(prep[0]), meta1)
+        eq_(client.stream_get_metadata(raw[0]), meta1)
+        eq_(client.stream_get_metadata(raw[0], [ "alpha" ]), meta2)
+        eq_(client.stream_get_metadata(raw[0], [ "alpha", "β" ]), meta1)
--- a/tests/test_cmdline.py
+++ b/tests/test_cmdline.py
@@ -1,3 +1,5 @@
+# -*- coding: utf-8 -*-
+
 import nilmdb
 from nilmdb.utils.printf import *
 import nilmdb.cmdline
@@ -13,7 +15,7 @@ import threading
 import urllib2
 from urllib2 import urlopen, HTTPError
 import Queue
-import cStringIO
+import StringIO
 import shlex

 from test_helpers import *
@@ -45,13 +47,18 @@ def setup_module():
 def teardown_module():
    server_stop()

+# Add an encoding property to StringIO so Python will convert Unicode
+# properly when writing or reading.
+class UTF8StringIO(StringIO.StringIO):
+    encoding = 'utf-8'
+
 class TestCmdline(object):

    def run(self, arg_string, infile=None, outfile=None):
        """Run a cmdline client with the specified argument string,
        passing the given input.  Returns a tuple with the output and
        exit code"""
-        #print "TZ=UTC ./nilmtool.py " + arg_string
+        # printf("TZ=UTC ./nilmtool.py %s\n", arg_string)
        class stdio_wrapper:
            def __init__(self, stdin, stdout, stderr):
                self.io = (stdin, stdout, stderr)
@@ -62,15 +69,18 @@ class TestCmdline(object):
                ( sys.stdin, sys.stdout, sys.stderr ) = self.saved
        # Empty input if none provided
        if infile is None:
-            infile = cStringIO.StringIO("")
+            infile = UTF8StringIO("")
        # Capture stderr
-        errfile = cStringIO.StringIO()
+        errfile = UTF8StringIO()
        if outfile is None:
            # If no output file, capture stdout with stderr
            outfile = errfile
        with stdio_wrapper(infile, outfile, errfile) as s:
            try:
-                nilmdb.cmdline.Cmdline(shlex.split(arg_string)).run()
+                # shlex doesn't support Unicode very well.  Encode the
+                # string as UTF-8 explicitly before splitting.
+                args = shlex.split(arg_string.encode('utf-8'))
+                nilmdb.cmdline.Cmdline(args).run()
                sys.exit(0)
            except SystemExit as e:
                exitcode = e.code
@@ -298,16 +308,9 @@ class TestCmdline(object):
        eq_(cmd.parse_time("hi there 20120405 1400-0400 testing! 123"), test)
        eq_(cmd.parse_time("20120405 1800 UTC"), test)
        eq_(cmd.parse_time("20120405 1400-0400 UTC"), test)
-        with assert_raises(ValueError):
-            print cmd.parse_time("20120405 1400-9999")
-        with assert_raises(ValueError):
-            print cmd.parse_time("hello")
-        with assert_raises(ValueError):
-            print cmd.parse_time("-")
-        with assert_raises(ValueError):
-            print cmd.parse_time("")
-        with assert_raises(ValueError):
-            print cmd.parse_time("14:00")
+        for badtime in [ "20120405 1400-9999", "hello", "-", "", "14:00" ]:
+            with assert_raises(ValueError):
+                x = cmd.parse_time(badtime)
        eq_(cmd.parse_time("snapshot-20120405-140000.raw.gz"), test)
        eq_(cmd.parse_time("prep-20120405T1400"), test)

@@ -519,3 +522,21 @@ class TestCmdline(object):
            # Make sure it was created empty
            self.ok("list --detail --path " + path)
            self.contain("(no intervals)")
+
+    def test_cmdline_11_unicode(self):
+        # Unicode paths.
+        self.ok("destroy /newton/asdf/qwer")
+        self.ok("destroy /newton/prep")
+        self.ok("destroy /newton/raw")
+        self.ok("destroy /newton/zzz")
+
+        self.ok(u"create /düsseldorf/raw uint16_6")
+        self.ok("list --detail")
+        self.contain(u"/düsseldorf/raw uint16_6")
+        self.contain("(no intervals)")
+
+        # Unicode metadata
+        self.ok(u"metadata /düsseldorf/raw --set α=beta 'γ=δ'")
+        self.ok(u"metadata /düsseldorf/raw --update 'α=β ε τ α'")
+        self.ok(u"metadata /düsseldorf/raw")
+        self.match(u"α=β ε τ α\nγ=δ\n")
Author	SHA1	Message	Date
Jim Paris	c083d63c96	Tests for Unicode compliance	2013-01-03 17:03:52 -05:00
Jim Paris	0221e3ea21	Update commandline test helpers to better handle Unicode We replace cStringIO with StringIO subclass that forces UTF-8 encoding, and explicitly convert commandlines to UTF-8 before shlex. These changes will only affect tests, not normal commandline operation.	2013-01-03 17:03:52 -05:00
Jim Paris	f5fd2b064e	Replace urllib.encode() with a version that encodes Unicode as UTF-8 instead	2013-01-03 17:02:38 -05:00
Jim Paris	06e91a6a98	Always use function version of print()	2013-01-03 17:02:38 -05:00
Jim Paris	41b3f3c018	Always use UTF-8 for filenames in nilmdb.bulkdata	2013-01-03 17:02:38 -05:00
Jim Paris	842076fef4	Cleanup server error handling with decorator	2013-01-03 17:02:38 -05:00