import numpy as np
import tensorflow as tf


__author__ = "Sangwoong Yoon"


def np_to_tfrecords(X, Y, file_path_prefix, verbose=True):
    """
    Converts a Numpy array (or two Numpy arrays) into a tfrecord file.

    For supervised learning, feed training inputs to X and training labels
    to Y. For unsupervised learning, only feed training inputs to X, and
    feed None to Y. The length of the first dimensions of X and Y should be
    the number of samples.

    Every row becomes one ``tf.train.Example`` holding the feature ``'X'``
    (and ``'Y'`` when labels are given). float32/float64 rows are stored in
    a ``FloatList`` and int64 rows in an ``Int64List``; values read back
    from a ``FloatList`` may therefore differ slightly from float64 input.

    Parameters
    ----------
    X : numpy.ndarray of rank 2
        Numpy array for training inputs. Its dtype should be float32,
        float64, or int64. If X has a higher rank, it should be reshaped
        before being fed to this function.
    Y : numpy.ndarray of rank 2 or None
        Numpy array for training labels. Its dtype should be float32,
        float64, or int64. None if there is no label array.
    file_path_prefix : str
        The path and name of the resulting tfrecord file to be generated,
        without '.tfrecords'
    verbose : bool
        If true, progress is reported.

    Raises
    ------
    ValueError
        If the dtype of X or Y is not float32, float64, or int64.
    AssertionError
        If X or Y is not a rank-2 numpy array (Y may be None), or if X and
        Y do not have the same number of rows.
    """

    def _dtype_feature(ndarray):
        """Return a function wrapping one row in the tf.train.Feature
        that matches the dtype of ``ndarray``."""
        assert isinstance(ndarray, np.ndarray), "expected a numpy.ndarray"
        dtype_ = ndarray.dtype
        if dtype_ == np.float64 or dtype_ == np.float32:
            return lambda array: tf.train.Feature(
                float_list=tf.train.FloatList(value=array))
        if dtype_ == np.int64:
            return lambda array: tf.train.Feature(
                int64_list=tf.train.Int64List(value=array))
        raise ValueError(
            "Unsupported dtype {}: expected float32, float64, or "
            "int64.".format(dtype_))

    # Validate everything before the output file is created so that bad
    # input never leaves a stray file behind.
    assert isinstance(X, np.ndarray), "X must be a numpy.ndarray"
    # If X has a higher rank, it should be reshaped before being fed to
    # this function.
    assert len(X.shape) == 2, "X must have rank 2"
    assert isinstance(Y, np.ndarray) or Y is None, \
        "Y must be a numpy.ndarray or None"

    # Load the appropriate tf.train.Feature builder depending on dtype.
    dtype_feature_x = _dtype_feature(X)
    dtype_feature_y = None
    if Y is not None:
        assert X.shape[0] == Y.shape[0], \
            "X and Y must have the same number of samples"
        assert len(Y.shape) == 2, "Y must have rank 2"
        dtype_feature_y = _dtype_feature(Y)

    # Generate tfrecord writer.
    # NOTE(review): tf.python_io is the TensorFlow 1.x namespace; confirm
    # the TensorFlow version this file is expected to run against.
    result_tf_file = file_path_prefix + '.tfrecords'
    writer = tf.python_io.TFRecordWriter(result_tf_file)
    if verbose:
        print("Serializing {:d} examples into {}".format(
            X.shape[0], result_tf_file))

    try:
        # Iterate over each sample and serialize it as ProtoBuf.
        for idx in range(X.shape[0]):
            d_feature = {'X': dtype_feature_x(X[idx])}
            if dtype_feature_y is not None:
                d_feature['Y'] = dtype_feature_y(Y[idx])

            features = tf.train.Features(feature=d_feature)
            example = tf.train.Example(features=features)
            writer.write(example.SerializeToString())
    finally:
        # Close explicitly so the records are flushed to disk even when
        # serialization fails part-way, instead of relying on garbage
        # collection of the writer.
        writer.close()

    if verbose:
        print("Writing {} done!".format(result_tf_file))


#################################
##      Test and Use Cases     ##
#################################
# NOTE: this demo runs at module level (also on import) and writes
# 'test1.tfrecords' and 'test2.tfrecords' into the working directory.

# 1-1. Saving a dataset with input and label (supervised learning)
xx = np.random.randn(10, 5)
yy = np.random.randn(10, 1)
np_to_tfrecords(xx, yy, 'test1', verbose=True)

# 1-2. Check if the data is stored correctly:
# open the saved file and check the first entries.
for serialized_example in tf.python_io.tf_record_iterator('test1.tfrecords'):
    example = tf.train.Example()
    example.ParseFromString(serialized_example)
    x_1 = np.array(example.features.feature['X'].float_list.value)
    y_1 = np.array(example.features.feature['Y'].float_list.value)
    break

# The numbers may be slightly different because of the floating point error.
print(xx[0])
print(x_1)
print(yy[0])
print(y_1)

# 2. Saving a dataset with only inputs (unsupervised learning)
xx = np.random.randn(100, 100)
np_to_tfrecords(xx, None, 'test2', verbose=True)