Sunday, November 22, 2020

User-facing tools should have user-understandable errors

Here's some simple TensorFlow code to create a CNN and train it:
import tensorflow as tf
import tensorflow.keras as keras

def create_model(optimizer="sgd"):
    deep_model = keras.models.Sequential([
        keras.layers.Conv2D(64, 7, activation="relu", padding="same", 
                            input_shape=[1, 28, 28], name="input"),
        keras.layers.MaxPooling2D(1, name="firstPool"),
        keras.layers.Conv2D(128, 3, activation="relu", padding="same", 
                            name="first_conv_1"),
        keras.layers.Conv2D(128, 3, activation="relu", padding="same", 
                            name="first_conv_2"),

        keras.layers.MaxPooling2D(1, name="secondPool"),
        keras.layers.Conv2D(256, 3, activation="relu", padding="same", 
                            name="second_conv_1"),
        keras.layers.Conv2D(256, 3, activation="relu", padding="same", 
                            name="second_conv_2"),

        keras.layers.MaxPooling2D(1, name="thirdPool"),

        keras.layers.Flatten(name="flatten"),
        keras.layers.Dense(128, activation="relu", name="pre-bottleneck"),

        keras.layers.Dropout(0.5, name="bottleneckDropout"),
        keras.layers.Dense(64, activation="relu", name="bottleneck"),

        keras.layers.Dropout(0.5, name="outputDropout"),
        keras.layers.Dense(10, activation="softmax", name="output"),
    ])
    
    deep_model.compile(loss="sparse_categorical_crossentropy",
                      optimizer=optimizer,
                      metrics=["accuracy"])

    return deep_model

def fit_model(model, X_train, y_train, X_valid, y_valid, epochs):
    history_conv = model.fit(X_train, y_train, validation_data=[X_valid, y_valid],
                             epochs=epochs, verbose=0)
    return history_conv

def plot_history(history, name):
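    # c10 is a helper module of mine for plotting training curves
    # (definition not shown here).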
    c10.plot_training(history, name, show=True)
    
# X_train/y_train and X_valid/y_valid are MNIST-style grayscale
# arrays of shape (n, 28, 28), loaded elsewhere.
model = create_model()
history = fit_model(model, X_train, y_train, X_valid, y_valid, epochs=10)
plot_history(history, "naive_deep_mnist")

When you run it, it produces this enormous error:
  ---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-26-95a7830ebd3c> in <module>
     41 
     42 model = create_model()
---> 43 history = fit_model(model, X_train, y_train, X_valid, y_valid, epochs=10)
     44 plot_history(history, "naive_deep_mnist")

<ipython-input-26-95a7830ebd3c> in fit_model(model, X_train, y_train, X_valid, y_valid, epochs)
     34 
     35 def fit_model(model, X_train, y_train, X_valid, y_valid, epochs):
---> 36     history_conv = model.fit(X_train, y_train, validation_data=[X_valid, y_valid], epochs=epochs, verbose=0)
     37     return history_conv
     38 

/usr/local/lib/python3.8/dist-packages/tensorflow/python/keras/engine/training.py in _method_wrapper(self, *args, **kwargs)
    106   def _method_wrapper(self, *args, **kwargs):
    107     if not self._in_multi_worker_mode():  # pylint: disable=protected-access
--> 108       return method(self, *args, **kwargs)
    109 
    110     # Running inside `run_distribute_coordinator` already.

/usr/local/lib/python3.8/dist-packages/tensorflow/python/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing)
   1096                 batch_size=batch_size):
   1097               callbacks.on_train_batch_begin(step)
-> 1098               tmp_logs = train_function(iterator)
   1099               if data_handler.should_sync:
   1100                 context.async_wait()

/usr/local/lib/python3.8/dist-packages/tensorflow/python/eager/def_function.py in __call__(self, *args, **kwds)
    778       else:
    779         compiler = "nonXla"
--> 780         result = self._call(*args, **kwds)
    781 
    782       new_tracing_count = self._get_tracing_count()

/usr/local/lib/python3.8/dist-packages/tensorflow/python/eager/def_function.py in _call(self, *args, **kwds)
    821       # This is the first call of __call__, so we have to initialize.
    822       initializers = []
--> 823       self._initialize(args, kwds, add_initializers_to=initializers)
    824     finally:
    825       # At this point we know that the initialization is complete (or less

/usr/local/lib/python3.8/dist-packages/tensorflow/python/eager/def_function.py in _initialize(self, args, kwds, add_initializers_to)
    694     self._graph_deleter = FunctionDeleter(self._lifted_initializer_graph)
    695     self._concrete_stateful_fn = (
--> 696         self._stateful_fn._get_concrete_function_internal_garbage_collected(  # pylint: disable=protected-access
    697             *args, **kwds))
    698 

/usr/local/lib/python3.8/dist-packages/tensorflow/python/eager/function.py in _get_concrete_function_internal_garbage_collected(self, *args, **kwargs)
   2853       args, kwargs = None, None
   2854     with self._lock:
-> 2855       graph_function, _, _ = self._maybe_define_function(args, kwargs)
   2856     return graph_function
   2857 

/usr/local/lib/python3.8/dist-packages/tensorflow/python/eager/function.py in _maybe_define_function(self, args, kwargs)
   3211 
   3212       self._function_cache.missed.add(call_context_key)
-> 3213       graph_function = self._create_graph_function(args, kwargs)
   3214       self._function_cache.primary[cache_key] = graph_function
   3215       return graph_function, args, kwargs

/usr/local/lib/python3.8/dist-packages/tensorflow/python/eager/function.py in _create_graph_function(self, args, kwargs, override_flat_arg_shapes)
   3063     arg_names = base_arg_names + missing_arg_names
   3064     graph_function = ConcreteFunction(
-> 3065         func_graph_module.func_graph_from_py_func(
   3066             self._name,
   3067             self._python_function,

/usr/local/lib/python3.8/dist-packages/tensorflow/python/framework/func_graph.py in func_graph_from_py_func(name, python_func, args, kwargs, signature, func_graph, autograph, autograph_options, add_control_dependencies, arg_names, op_return_value, collections, capture_by_value, override_flat_arg_shapes)
    984         _, original_func = tf_decorator.unwrap(python_func)
    985 
--> 986       func_outputs = python_func(*func_args, **func_kwargs)
    987 
    988       # invariant: `func_outputs` contains only Tensors, CompositeTensors,

/usr/local/lib/python3.8/dist-packages/tensorflow/python/eager/def_function.py in wrapped_fn(*args, **kwds)
    598         # __wrapped__ allows AutoGraph to swap in a converted function. We give
    599         # the function a weak reference to itself to avoid a reference cycle.
--> 600         return weak_wrapped_fn().__wrapped__(*args, **kwds)
    601     weak_wrapped_fn = weakref.ref(wrapped_fn)
    602 

/usr/local/lib/python3.8/dist-packages/tensorflow/python/framework/func_graph.py in wrapper(*args, **kwargs)
    971           except Exception as e:  # pylint:disable=broad-except
    972             if hasattr(e, "ag_error_metadata"):
--> 973               raise e.ag_error_metadata.to_exception(e)
    974             else:
    975               raise

ValueError: in user code:

    /usr/local/lib/python3.8/dist-packages/tensorflow/python/keras/engine/training.py:806 train_function  *
        return step_function(self, iterator)
    /usr/local/lib/python3.8/dist-packages/tensorflow/python/keras/engine/training.py:796 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    /usr/local/lib/python3.8/dist-packages/tensorflow/python/distribute/distribute_lib.py:1211 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /usr/local/lib/python3.8/dist-packages/tensorflow/python/distribute/distribute_lib.py:2585 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /usr/local/lib/python3.8/dist-packages/tensorflow/python/distribute/distribute_lib.py:2945 _call_for_each_replica
        return fn(*args, **kwargs)
    /usr/local/lib/python3.8/dist-packages/tensorflow/python/keras/engine/training.py:789 run_step  **
        outputs = model.train_step(data)
    /usr/local/lib/python3.8/dist-packages/tensorflow/python/keras/engine/training.py:747 train_step
        y_pred = self(x, training=True)
    /usr/local/lib/python3.8/dist-packages/tensorflow/python/keras/engine/base_layer.py:975 __call__
        input_spec.assert_input_compatibility(self.input_spec, inputs,
    /usr/local/lib/python3.8/dist-packages/tensorflow/python/keras/engine/input_spec.py:191 assert_input_compatibility
        raise ValueError('Input ' + str(input_index) + ' of layer ' +

    ValueError: Input 0 of layer sequential_17 is incompatible with the layer: : expected min_ndim=4, found ndim=3. Full shape received: [None, 28, 28]


Don't worry if you don't understand the error. In fact, that's the whole point. The error is the entire stack trace of the Python execution, with no human-readable pointer to the failing component.



Lesson: Tools are meant to be helpful to humans. When the failure message is longer and harder to understand than the entire source, the tool is unhelpful.

The problems are twofold:
  • Instead of referring to a layer name, the stack trace points to 'sequential_17'. All the layers have names, and none of them is called sequential_17. (It turns out sequential_17 is Keras's autogenerated name for the model as a whole.)
  • Clearly the input layer is to blame: it's the only layer with a 28x28 input. But why is the input expected to have 4 dimensions? I suspect the input data needs to be reshaped, but it isn't clear to what.
The right thing here would be for model.compile(...) and model.fit(...) to run a validation pass over all the layers, checking their input parameters for common mistakes. If a check fails, the error should name the layer at fault. The pass should also verify that each layer's output shape conforms to the next layer's expected input shape.
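For illustration, here is a minimal sketch of what such a pass could look like (my own code, not a Keras API). It walks model.layers, propagates the expected shape through each layer's compute_output_shape, and names the first layer that rejects it. This won't catch every mismatch (channel counts can slip through), but even this much attributes the shape error to a named layer:

def validate_shapes(model, data_shape):
    """Propagate data_shape (batch dimension included, e.g. (None, 28, 28))
    through each layer, and name the first layer that rejects it."""
    shape = data_shape
    for layer in model.layers:
        try:
            shape = layer.compute_output_shape(shape)
        except Exception as err:
            print(f"Layer '{layer.name}' rejects input shape {shape}: {err}")
            return False
    print(f"All layers compatible; final output shape {shape}")
    return True

validate_shapes(model, (None, 28, 28))  # should blame the layer named 'input'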

This error came after many minutes spent getting the model specified correctly. If you want a real headache of an error, run this code without 'name=' on one of the Dropout layers.


User-facing tools should have user-understandable errors.




In case you came here because you're facing the same problem with a Conv2D layer, the answer is relatively simple. For grayscale images, you still need the input_shape to be (28, 28, 1). This is because convolutions operate on channels, and grayscale images have a single channel. When you do that, you also need to reshape your input from (size, 28, 28) to (size, 28, 28, 1).
# X_train.shape == (55000, 28, 28)
X_train = X_train.reshape(55000, 28, 28, 1)
# X_valid.shape == (5000, 28, 28)
X_valid = X_valid.reshape(5000, 28, 28, 1)
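Equivalently, NumPy can add the channel axis without hard-coding the array sizes (a minor aside; either form works):

import numpy as np
X_train = X_train[..., np.newaxis]  # (55000, 28, 28) -> (55000, 28, 28, 1)
X_valid = X_valid[..., np.newaxis]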


# Creating the model

deep_model = keras.models.Sequential([
    keras.layers.Conv2D(64, 7, activation="relu", padding="same",
                        input_shape=(28, 28, 1), name="input"),
# ... and other layers.

# Training
deep_model.fit(x=X_train, ...
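
For reference, here is a minimal end-to-end reconstruction of the fix (my own sketch; the 55000/5000 split is assumed from the shapes above):

import tensorflow.keras as keras

# Load MNIST and split off a validation set.
(X_full, y_full), _ = keras.datasets.mnist.load_data()  # X_full: (60000, 28, 28)
X_full = (X_full / 255.0).astype("float32")
X_train, X_valid = X_full[:55000], X_full[55000:]
y_train, y_valid = y_full[:55000], y_full[55000:]

# Add the channel axis that Conv2D expects.
X_train = X_train.reshape(-1, 28, 28, 1)
X_valid = X_valid.reshape(-1, 28, 28, 1)

# create_model() as above, but with input_shape=(28, 28, 1) in the first Conv2D.
model = create_model()
history = model.fit(X_train, y_train,
                    validation_data=(X_valid, y_valid), epochs=10)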