diff --git a/vision/conv_mnist/conv_mnist.jl b/vision/conv_mnist/conv_mnist.jl index 67319143..668dd1ad 100644 --- a/vision/conv_mnist/conv_mnist.jl +++ b/vision/conv_mnist/conv_mnist.jl @@ -1,6 +1,7 @@ -# Classification of MNIST dataset using a convnet, a variant of the original LeNet +# Classification of MNIST dataset using a convolutional network, +# which is a variant of the original LeNet from 1998. -using MLDatasets, Flux, CUDA, BSON # this will install everything if necc. +using MLDatasets, Flux, BSON, CUDA # this will install everything if necessary #===== DATA =====# @@ -26,7 +27,8 @@ loader() # returns a DataLoader, with first element a tuple like this: x1, y1 = first(loader()); # (28×28×1×64 Array{Float32, 3}, 10×64 OneHotMatrix(::Vector{UInt32})) -# If you are using a GPU, these should be CuArray{Float32, 3} etc. +# If you are using a GPU, these should be CuArray{Float32, 3} etc. +# If not, the `gpu` function does nothing (except complain the first time). #===== MODEL =====# @@ -44,6 +46,8 @@ lenet = Chain( Dense(84 => 10), ) |> gpu +# Notice that most of the parameters are in the final Dense layers. 
+ y1hat = lenet(x1) # try it out softmax(y1hat) @@ -63,7 +67,7 @@ hcat(Flux.onecold(y1hat, 0:9), Flux.onecold(y1, 0:9)) using Statistics: mean # standard library function loss_and_accuracy(model, data::MNIST=test_data) - (x,y) = only(loader(data; batchsize=0)) # batchsize=0 means one big batch + (x,y) = only(loader(data; batchsize=length(data))) # make one big batch ŷ = model(x) loss = Flux.logitcrossentropy(ŷ, y) # did not include softmax in the model acc = round(100 * mean(Flux.onecold(ŷ) .== Flux.onecold(y)); digits=2) @@ -91,6 +95,7 @@ opt_rule = OptimiserChain(WeightDecay(settings.lambda), Adam(settings.eta)) opt_state = Flux.setup(opt_rule, lenet); for epoch in 1:settings.epochs + # @time will show a much longer time for the first epoch, due to compilation @time for (x,y) in loader(batchsize=settings.batchsize) grads = Flux.gradient(m -> Flux.logitcrossentropy(m(x), y), lenet) Flux.update!(opt_state, lenet, grads[1]) @@ -101,7 +106,7 @@ for epoch in 1:settings.epochs loss, acc, _ = loss_and_accuracy(lenet) test_loss, test_acc, _ = loss_and_accuracy(lenet, test_data) @info "logging:" epoch acc test_acc - nt = (; epoch, loss, acc, test_loss, test_acc) + nt = (; epoch, loss, acc, test_loss, test_acc) # make a NamedTuple push!(train_log, nt) end if epoch % 5 == 0 @@ -118,16 +123,16 @@ hcat(Flux.onecold(y1hat, 0:9), Flux.onecold(y1, 0:9)) #===== INSPECTION =====# -using ImageInTerminal, ImageCore +using ImageCore, ImageInTerminal -xtest, ytest = only(loader(test_data, batchsize=0)) +xtest, ytest = only(loader(test_data, batchsize=length(test_data))); # There are many ways to look at images, you won't need ImageInTerminal if working in a notebook # ImageCore.Gray is a special type, whick interprets numbers between 0.0 and 1.0 as shades: -xtest[:,:,1,5] .|> Gray |> transpose # should display a 4 +xtest[:,:,1,5] .|> Gray |> transpose |> cpu -Flux.onecold(ytest, 0:9)[5] # it's coded as being a 4 +Flux.onecold(ytest, 0:9)[5] # true label, should match! 
# Let's look for the image whose classification is least certain. # First, in each column of probabilities, ask for the largest one. @@ -137,33 +142,18 @@ ptest = softmax(lenet(xtest)) max_p = maximum(ptest; dims=1) _, i = findmin(vec(max_p)) -xtest[:,:,1,i] .|> Gray |> transpose +xtest[:,:,1,i] .|> Gray |> transpose |> cpu Flux.onecold(ytest, 0:9)[i] # true classification +ptest[:,i] # probabilities of all outcomes Flux.onecold(ptest[:,i], 0:9) # uncertain prediction -# Next, let's look for the most confident, yet wrong, prediction. -# Often this will look quite ambiguous to you too. - -iwrong = findall(Flux.onecold(lenet(xtest)) .!= Flux.onecold(ytest)) - -max_p = maximum(ptest[:,iwrong]; dims=1) -_, k = findmax(vec(max_p)) # now max not min -i = iwrong[k] - -xtest[:,:,1,i] .|> Gray |> transpose - -Flux.onecold(ytest, 0:9)[i] # true classification -Flux.onecold(ptest[:,i], 0:9) # prediction - #===== SIZES =====# -# Maybe... at first I had this above, but it makes things long. - # A layer like Conv((5, 5), 1=>6) takes 5x5 patches of an image, and matches them to each # of 6 different 5x5 filters, placed at every possible position. 
These filters are here: -Conv((5, 5), 1=>6).weights |> summary # 5×5×1×6 Array{Float32, 4} +Conv((5, 5), 1=>6).weight |> summary # 5×5×1×6 Array{Float32, 4} # This layer can accept any size of image; let's trace the sizes with the actual input: @@ -172,19 +162,19 @@ Conv((5, 5), 1=>6).weights |> summary # 5×5×1×6 Array{Float32, 4} julia> x1 |> size (28, 28, 1, 64) -julia> conv_layers[1](x1) |> size +julia> lenet[1](x1) |> size # after Conv((5, 5), 1=>6, relu), (24, 24, 6, 64) -julia> conv_layers[1:2](x1) |> size +julia> lenet[1:2](x1) |> size # after MaxPool((2, 2)) (12, 12, 6, 64) -julia> conv_layers[1:3](x1) |> size +julia> lenet[1:3](x1) |> size # after Conv((5, 5), 6 => 16, relu) (8, 8, 16, 64) -julia> conv_layers(x1) |> size +julia> lenet[1:4](x1) |> size # after MaxPool((2, 2)) (4, 4, 16, 64) -julia> conv_layers(x1) |> Flux.flatten |> size +julia> lenet[1:5](x1) |> size # after Flux.flatten (256, 64) =# @@ -193,4 +183,3 @@ julia> conv_layers(x1) |> Flux.flatten |> size # This 256 must match the Dense(256 => 120). (See Flux.outputsize for ways to automate this.) #===== THE END =====# -