Geoyi · January 26, 2018 12:27 · Jun 4, 2017 · Jun 4, 2017
diff --git a/np_to_tfrecords.py b/np_to_tfrecords.py
@@ -6,8 +6,8 @@
 def np_to_tfrecords(X, Y, file_path_prefix, verbose=True):
     """
     Converts a Numpy array (or two Numpy arrays) into a tfrecord file.
-    For supervised learning, fed training inputs to X and training labels to Y.
-    For unsupervised learning, only fed training inputs to X, and fed None to Y.
+    For supervised learning, feed training inputs to X and training labels to Y.
+    For unsupervised learning, only feed training inputs to X, and feed None to Y.
     The length of the first dimensions of X and Y should be the number of samples.
     
     Parameters

diff --git a/np_to_tfrecords.py b/np_to_tfrecords.py
@@ -0,0 +1,110 @@
+import numpy as np
+import tensorflow as tf
+
+__author__ = "Sangwoong Yoon"
+
+def np_to_tfrecords(X, Y, file_path_prefix, verbose=True):
+    """
+    Converts a Numpy array (or two Numpy arrays) into a tfrecord file.
+    For supervised learning, fed training inputs to X and training labels to Y.
+    For unsupervised learning, only fed training inputs to X, and fed None to Y.
+    The length of the first dimensions of X and Y should be the number of samples.
+    
+    Parameters
+    ----------
+    X : numpy.ndarray of rank 2
+        Numpy array for training inputs. Its dtype should be float32, float64, or int64.
+        If X has a higher rank, it should be reshaped before being fed to this function.
+    Y : numpy.ndarray of rank 2 or None
+        Numpy array for training labels. Its dtype should be float32, float64, or int64.
+        None if there is no label array.
+    file_path_prefix : str
+        The path and name of the resulting tfrecord file to be generated, without '.tfrecords'
+    verbose : bool
+        If true, progress is reported.
+    
+    Raises
+    ------
+    ValueError
+        If the dtype of X or Y is not float32, float64, or int64.
+    
+    """
+    def _dtype_feature(ndarray):
+        """Return the tf.train.Feature constructor matching the dtype of ndarray."""
+        assert isinstance(ndarray, np.ndarray)
+        dtype_ = ndarray.dtype
+        if dtype_ == np.float64 or dtype_ == np.float32:
+            return lambda array: tf.train.Feature(float_list=tf.train.FloatList(value=array))
+        elif dtype_ == np.int64:
+            return lambda array: tf.train.Feature(int64_list=tf.train.Int64List(value=array))
+        else:  
+            raise ValueError("The input should be numpy ndarray. \
+                               Instaed got {}".format(ndarray.dtype))
+
+    assert isinstance(X, np.ndarray)
+    assert len(X.shape) == 2  # If X has a higher rank, 
+                               # it should be reshaped before being fed to this function.
+    assert isinstance(Y, np.ndarray) or Y is None
+
+    # load appropriate tf.train.Feature class depending on dtype
+    dtype_feature_x = _dtype_feature(X)
+    if Y is not None:
+        assert X.shape[0] == Y.shape[0]
+        assert len(Y.shape) == 2
+        dtype_feature_y = _dtype_feature(Y)            
+
+    # Generate tfrecord writer
+    result_tf_file = file_path_prefix + '.tfrecords'
+    writer = tf.python_io.TFRecordWriter(result_tf_file)
+    if verbose:
+        print "Serializing {:d} examples into {}".format(X.shape[0], result_tf_file)
+
+    # iterate over each sample,
+    # and serialize it as ProtoBuf.
+    for idx in range(X.shape[0]):
+        x = X[idx]
+        if Y is not None:
+            y = Y[idx]
+
+        d_feature = {}
+        d_feature['X'] = dtype_feature_x(x)
+        if Y is not None:
+            d_feature['Y'] = dtype_feature_y(y)
+
+        features = tf.train.Features(feature=d_feature)
+        example = tf.train.Example(features=features)
+        serialized = example.SerializeToString()
+        writer.write(serialized)
+
+    if verbose:
+        print "Writing {} done!".format(result_tf_file)
+
+
+#################################    
+##      Test and Use Cases     ##
+#################################
+
+# 1-1. Saving a dataset with input and label (supervised learning)
+xx = np.random.randn(10,5)
+yy = np.random.randn(10,1)
+np_to_tfrecords(xx, yy, 'test1', verbose=True)
+
+# 1-2. Check if the data is stored correctly
+# open the saved file and check the first entries
+for serialized_example in tf.python_io.tf_record_iterator('test1.tfrecords'):
+    example = tf.train.Example()
+    example.ParseFromString(serialized_example)
+    x_1 = np.array(example.features.feature['X'].float_list.value)
+    y_1 = np.array(example.features.feature['Y'].float_list.value)
+    break
+
+# The numbers may differ slightly: randn yields float64, but a FloatList stores 32-bit floats.
+print xx[0]
+print x_1
+print yy[0]
+print y_1
+
+
+# 2. Saving a dataset with only inputs (unsupervised learning)
+xx = np.random.randn(100,100)
+np_to_tfrecords(xx, None, 'test2', verbose=True)
No results found