Examples/BeeTrackingTool/main.swift (135 additions, 20 deletions)
@@ -9,7 +9,7 @@ import TensorFlow

struct BeeTrackingTool: ParsableCommand {
  static var configuration = CommandConfiguration(
-    subcommands: [TrainRAE.self, InferTrackRAE.self, InferTrackRawPixels.self])
+    subcommands: [TrainRAE.self, InferTrackRAE.self, InferTrackRawPixels.self, NaiveRae.self])
}

/// The dimension of the hidden layer in the appearance model.
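For reference, ArgumentParser derives kebab-case command and option names from the type and property names, so the NaiveRae subcommand registered above (and defined in the next hunk) should be reachable as naive-rae. A hedged sketch of a programmatic invocation, with placeholder values for every option:

// Hypothetical invocation; option names follow ArgumentParser's default
// derivation from the NaiveRae properties added later in this diff.
BeeTrackingTool.main([
  "naive-rae",
  "--load-weights", "./rae_weights.npy",  // placeholder path
  "--k-latent-dimension", "10",           // placeholder dimension
  "--output-file", "naive_rae.json",      // placeholder output
  "--truncate", "500",                    // placeholder frame count
  "--verbose",
])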
@@ -164,35 +164,150 @@ struct InferTrackRAE: ParsableCommand {

/// Infers a track on a VOT video, using the raw pixel tracker.
struct InferTrackRawPixels: ParsableCommand {
-  @Option(help: "Base directory of the VOT dataset")
-  var votBaseDirectory: String
-
-  @Option(help: "Name of the VOT video to use")
-  var videoName: String
-
-  @Option(help: "How many frames to track")
-  var frameCount: Int = 50
-
-  @Flag(help: "Print progress information")
-  var verbose: Bool = false
-
-  func run() {
-    let video = VOTVideo(votBaseDirectory: votBaseDirectory, videoName: videoName)!
-    let videoSlice = video[0..<min(video.frames.count, frameCount)]
-
-    let startPose = videoSlice.track[0].center
-    let startPatch = videoSlice.frames[0].patch(at: videoSlice.track[0])
-
-    var tracker = makeRawPixelTracker(frames: videoSlice.frames, target: startPatch)
-
-    if verbose { tracker.optimizer.verbosity = .SUMMARY }
-
-    let prediction = tracker.infer(knownStart: Tuple1(startPose))
-
-    let boxes = tracker.frameVariableIDs.map { frameVariableIDs -> OrientedBoundingBox in
-      let poseID = frameVariableIDs.head
-      return OrientedBoundingBox(
-        center: prediction[poseID], rows: video.track[0].rows, cols: video.track[0].cols)
-    }
-
-    print(boxes.count)
+  func run() {
+    func rawPixelTracker(_ frames: [Tensor<Float>], _ start: OrientedBoundingBox) -> [OrientedBoundingBox] {
+      var tracker = makeRawPixelTracker(frames: frames, target: frames[0].patch(at: start))
+      tracker.optimizer.precision = 1e0
+      let prediction = tracker.infer(knownStart: Tuple1(start.center))
+      return tracker.frameVariableIDs.map { varIds in
+        let poseId = varIds.head
+        return OrientedBoundingBox(center: prediction[poseId], rows: start.rows, cols: start.cols)
+      }
+    }
+
+    var dataset = OISTBeeVideo()!
+    // Only do inference on the interesting tracks.
+    dataset.tracks = [3, 5, 6, 7].map { dataset.tracks[$0] }
+    let trackerEvaluationDataset = TrackerEvaluationDataset(dataset)
+    let eval = trackerEvaluationDataset.evaluate(
+      rawPixelTracker, sequenceCount: dataset.tracks.count, deltaAnchor: 100, outputFile: "rawpixel.json")
+    print(eval.trackerMetrics.accuracy)
+    print(eval.trackerMetrics.robustness)
+  }
+}
+
+/// Tracking with a Naive Bayes model on RAE appearance features.
+struct NaiveRae: ParsableCommand {
+  @Option(help: "Where to load the RAE weights")
+  var loadWeights: String
+
+  @Option(help: "The dimension of the latent code in the RAE appearance model")
+  var kLatentDimension: Int
+
+  @Option(help: "The dimension of the hidden code in the RAE appearance model")
+  var kHiddenDimension = 100
+
+  @Flag
+  var verbose: Bool = false
+
+  @Option
+  var outputFile: String
+
+  @Option
+  var truncate: Int
+
+  /// Runs the naive Bayes RAE tracker on `dataset_` and prints the evaluation metrics.
+  func naiveRaeTrack(dataset dataset_: OISTBeeVideo) {
+    var dataset = dataset_
+    dataset.labels = dataset.labels.map {
+      $0.filter({ $0.label == .Body })
+    }
+    // Make batch and do RAE
+    let (batch, _) = dataset.makeBatch(appearanceModelSize: (40, 70), batchSize: 200)
+    var statistics = FrameStatistics(batch)
+    statistics.mean = Tensor(62.26806976644069)
+    statistics.standardDeviation = Tensor(37.44683834503672)
+
+    let backgroundBatch = dataset.makeBackgroundBatch(
+      patchSize: (40, 70), appearanceModelSize: (40, 70),
+      statistics: statistics,
+      batchSize: 300
+    )
+
+    let (imageHeight, imageWidth, imageChannels) =
+      (batch.shape[1], batch.shape[2], batch.shape[3])
+
+    if verbose { print("Loading RAE model, \(batch.shape)...") }
+
+    let np = Python.import("numpy")
+
+    var rae = DenseRAE(
+      imageHeight: imageHeight, imageWidth: imageWidth, imageChannels: imageChannels,
+      hiddenDimension: kHiddenDimension, latentDimension: kLatentDimension
+    )
+    rae.load(weights: np.load(loadWeights, allow_pickle: true))
+
+    if verbose { print("Fitting Naive Bayes model") }
+
+    var (foregroundModel, backgroundModel) = (
+      MultivariateGaussian(
+        dims: TensorShape([kLatentDimension]),
+        regularizer: 1e-3
+      ), GaussianNB(
+        dims: TensorShape([kLatentDimension]),
+        regularizer: 1e-3
+      )
+    )
+
+    let batchPositive = rae.encode(batch)
+    foregroundModel.fit(batchPositive)
+
+    let batchNegative = rae.encode(backgroundBatch)
+    backgroundModel.fit(batchNegative)
+
+    if verbose {
+      print("Foreground: \(foregroundModel)")
+      print("Background: \(backgroundModel)")
+    }
+
+    func tracker(_ frames: [Tensor<Float>], _ start: OrientedBoundingBox) -> [OrientedBoundingBox] {
+      var tracker = makeNaiveBayesAETracker(
+        model: rae,
+        statistics: statistics,
+        frames: frames,
+        targetSize: (start.rows, start.cols),
+        foregroundModel: foregroundModel, backgroundModel: backgroundModel
+      )
+      tracker.optimizer.cgls_precision = 1e-5
+      tracker.optimizer.precision = 1e-3
+      tracker.optimizer.max_iteration = 200
+      let prediction = tracker.infer(knownStart: Tuple1(start.center))
+      return tracker.frameVariableIDs.map { varIds in
+        let poseId = varIds.head
+        return OrientedBoundingBox(center: prediction[poseId], rows: start.rows, cols: start.cols)
+      }
+    }
+
+    // Only do inference on the interesting tracks.
+    var evalDataset = OISTBeeVideo(truncate: truncate)!
+    evalDataset.tracks = [3, 5, 6, 7].map { evalDataset.tracks[$0] }
+    let trackerEvaluationDataset = TrackerEvaluationDataset(evalDataset)
+    let eval = trackerEvaluationDataset.evaluate(
+      tracker, sequenceCount: evalDataset.tracks.count, deltaAnchor: 500, outputFile: outputFile)
+    print(eval.trackerMetrics.accuracy)
+    print(eval.trackerMetrics.robustness)
+  }
+
+  func run() {
+    if verbose {
+      print("Loading dataset...")
+    }
+
+    startTimer("DATASET_LOAD")
+    let dataset: OISTBeeVideo = OISTBeeVideo(deferLoadingFrames: true)!
+    stopTimer("DATASET_LOAD")
+
+    if verbose {
+      print("Tracking...")
+    }
+
+    startTimer("RAE_TRACKING")
+    naiveRaeTrack(dataset: dataset)
+    stopTimer("RAE_TRACKING")
+
+    if verbose {
+      printTimers()
+    }
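Both run() bodies above hand a locally defined function to TrackerEvaluationDataset.evaluate, so a tracker here is just a closure from a frame sequence and a starting box to one predicted box per frame. As a reference for that contract, here is a minimal sketch; the constant-position baseline is hypothetical and not part of this diff:

// Hypothetical baseline with the same shape as `rawPixelTracker` above:
// it predicts the starting box for every frame.
func constantTracker(_ frames: [Tensor<Float>], _ start: OrientedBoundingBox) -> [OrientedBoundingBox] {
  return frames.map { _ in start }
}

Anything with this shape can be passed to evaluate(_:sequenceCount:deltaAnchor:outputFile:) exactly like the trackers in this file.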
Examples/OISTVisualizationTool/main.swift (68 additions, 74 deletions)
@@ -24,7 +24,7 @@ import Foundation

struct OISTVisualizationTool: ParsableCommand {
  static var configuration = CommandConfiguration(
-    subcommands: [VisualizeTrack.self, ViewFrame.self, RawTrack.self, PpcaTrack.self, NaiveRae.self, TrainRAE.self, NaivePca.self])
+    subcommands: [VisualizePrediction.self, VisualizeTrack.self, ViewFrame.self, RawTrack.self, PpcaTrack.self, NaiveRae.self, TrainRAE.self, NaivePca.self])
}

/// View a frame with bounding boxes
@@ -510,26 +510,23 @@ struct TrainRAE: ParsableCommand {
///
/// Tracking with a Naive Bayes model on PPCA appearance features.
struct NaivePca: ParsableCommand {
-  @Option(help: "Where to load the RAE weights")
-  var loadWeights: String = "./oist_rae_weight.npy"
-
-  @Option(help: "Which bounding box to track")
-  var boxId: Int = 0
-
-  @Option(help: "Track for how many frames")
-  var trackFrames: Int = 10
-
-  @Option(help: "Track the target from frame x")
-  var trackStartFrame: Int = 250
-
  @Option(help: "The dimension of the latent code in the PPCA appearance model")
-  var kLatentDimension = 10
+  var kLatentDimension = 20

  @Flag(help: "Print progress information")
  var verbose: Bool = false

+  @Flag(help: "Use random projections instead of learned PPCA vectors")
+  var randomProjections: Bool = false
+
+  @Option
+  var outputFile: String
+
+  @Option
+  var truncate: Int
+
  /// Runs the naive Bayes PPCA tracker on `dataset_` and prints the evaluation metrics.
-  func naivePpcaTrack(dataset dataset_: OISTBeeVideo, length: Int, startFrom: Int) -> [OrientedBoundingBox] {
+  func naivePpcaTrack(dataset dataset_: OISTBeeVideo) {
    var dataset = dataset_
    dataset.labels = dataset.labels.map {
      $0.filter({ $0.label == .Body })
@@ -549,7 +546,11 @@
    var ppca = PPCA(latentSize: kLatentDimension)

    ppca.train(images: batch)

+    if randomProjections {
+      ppca.W_inv = Tensor(randomNormal: ppca.W_inv!.shape)
+    }
+
    if verbose { print("Fitting Naive Bayes model") }

    var (foregroundModel, backgroundModel) = (
@@ -573,48 +574,32 @@
print("Background: \(backgroundModel)")
}

if verbose { print("Loading video frames...") }
startTimer("VIDEO_LOAD")
// Load the video and take a slice of it.
let videos = (0..<length).map { (i) -> Tensor<Float> in
return withDevice(.cpu) { dataset.loadFrame(dataset.frameIds[startFrom + i])! }
}
stopTimer("VIDEO_LOAD")

let startPose = dataset.labels[startFrom][boxId].location.center

if verbose {
print("Creating tracker, startPose = \(startPose)")
}

startTimer("MAKE_GRAPH")
var tracker = makeNaiveBayesPCATracker(
model: ppca,
statistics: statistics,
frames: videos,
targetSize: (dataset.labels[startFrom][boxId].location.rows, dataset.labels[startFrom][boxId].location.cols),
foregroundModel: foregroundModel, backgroundModel: backgroundModel
)
stopTimer("MAKE_GRAPH")

if verbose { print("Starting Optimization...") }
if verbose { tracker.optimizer.verbosity = .SUMMARY }

tracker.optimizer.cgls_precision = 1e-7
tracker.optimizer.precision = 1e-4
tracker.optimizer.max_iteration = 200

startTimer("GRAPH_INFER")
let prediction = tracker.infer(knownStart: Tuple1(startPose))
stopTimer("GRAPH_INFER")

let boxes = tracker.frameVariableIDs.map { frameVariableIDs -> OrientedBoundingBox in
let poseID = frameVariableIDs.head
return OrientedBoundingBox(
center: prediction[poseID], rows: dataset.labels[startFrom][boxId].location.rows, cols: dataset.labels[startFrom][boxId].location.cols)
func tracker(_ frames: [Tensor<Float>], _ start: OrientedBoundingBox) -> [OrientedBoundingBox] {
var tracker = makeNaiveBayesPCATracker(
model: ppca,
statistics: statistics,
frames: frames,
targetSize: (start.rows, start.cols),
foregroundModel: foregroundModel, backgroundModel: backgroundModel
)
tracker.optimizer.cgls_precision = 1e-9
tracker.optimizer.precision = 1e-6
tracker.optimizer.max_iteration = 200
let prediction = tracker.infer(knownStart: Tuple1(start.center))
return tracker.frameVariableIDs.map { varIds in
let poseId = varIds.head
return OrientedBoundingBox(center: prediction[poseId], rows: start.rows, cols: start.cols)
}
}

return boxes
// Only do inference on the interesting tracks.
var evalDataset = OISTBeeVideo(truncate: truncate)!
evalDataset.tracks = [3, 5, 6, 7].map { evalDataset.tracks[$0] }
let trackerEvaluationDataset = TrackerEvaluationDataset(evalDataset)
let eval = trackerEvaluationDataset.evaluate(
tracker, sequenceCount: evalDataset.tracks.count, deltaAnchor: 500, outputFile: outputFile)
print(eval.trackerMetrics.accuracy)
print(eval.trackerMetrics.robustness)
}

func run() {
@@ -631,25 +616,8 @@
    }

    startTimer("PPCA_TRACKING")
-    var bboxes: [OrientedBoundingBox]
-    bboxes = naivePpcaTrack(dataset: dataset, length: trackFrames, startFrom: trackStartFrame)
+    naivePpcaTrack(dataset: dataset)
    stopTimer("PPCA_TRACKING")
-
-    let frameRawId = dataset.frameIds[trackStartFrame + trackFrames]
-    let image = dataset.loadFrame(frameRawId)!
-
-    if verbose {
-      print("Creating output plot")
-    }
-    startTimer("PLOTTING")
-    plot(image, boxes: bboxes.indices.map {
-      ("\($0)", bboxes[$0])
-    }, margin: 10.0, scale: 0.5).show()
-    stopTimer("PLOTTING")
-
-    if verbose {
-      printTimers()
-    }
  }
}

@@ -666,6 +634,32 @@ struct VisualizeTrack: ParsableCommand {
  }
}

+struct VisualizePrediction: ParsableCommand {
+  @Option
+  var prediction: String
+
+  @Option
+  var subsequenceIndex: Int = 0
+
+  // TODO: Save this in the prediction file so that it does not need to be specified here.
+  @Option
+  var startFrame: Int
+
+  @Option
+  var output: String
+
+  func run() {
+    let dataset = OISTBeeVideo(deferLoadingFrames: true)!
+    let decoder = JSONDecoder()
+    let data = try! Data(contentsOf: URL(fileURLWithPath: prediction))
+    let sequence = try! decoder.decode(SequenceEvaluationResults.self, from: data)
+
+    let track = OISTBeeTrack(
+      startFrameIndex: startFrame, boxes: sequence.subsequences[subsequenceIndex].prediction)
+    track.render(to: output, video: dataset)
+  }
+}
+
// It is important to set the global threadpool before doing anything else, so that nothing
// accidentally uses the default threadpool.
ComputeThreadPools.global =
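Since VisualizePrediction decodes the JSON that evaluate(... outputFile:) writes, the results round-trip through Codable and can be inspected outside the tool as well. A minimal sketch, assuming only the fields referenced above ("rawpixel.json" is a placeholder path):

// Sketch: load an evaluation file and count the boxes in one subsequence.
import Foundation

let data = try! Data(contentsOf: URL(fileURLWithPath: "rawpixel.json"))  // placeholder
let results = try! JSONDecoder().decode(SequenceEvaluationResults.self, from: data)
print(results.subsequences[0].prediction.count)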
Sources/BeeTracking/OISTBeeVideo+Batches.swift (6 additions, 6 deletions)

@@ -91,12 +91,12 @@
      Double.random(in: Double(maxSide)..<Double(frame.shape[1] - maxSide), using: &deterministicEntropy),
      Double.random(in: Double(maxSide)..<Double(frame.shape[0] - maxSide), using: &deterministicEntropy))

-    // Conservatively reject any point that could possibly overlap with any of the labels.
-    for label in labels {
-      if (label.location.center.t - location).norm < Double(maxSide) {
-        continue
-      }
-    }
+    // // Conservatively reject any point that could possibly overlap with any of the labels.
+    // for label in labels {
+    //   if (label.location.center.t - location).norm < Double(maxSide) {
+    //     continue
+    //   }
+    // }

    // The point was not rejected, so return it.
    return location
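One note on the hunk above: even before it was commented out, the overlap check was a no-op, because `continue` inside the `for` loop merely advances to the next label; no sampled point was ever rejected, so commenting it out does not change behavior. If the conservative rejection is ever reinstated, the sampler needs to retry instead. A sketch under the assumptions that the surrounding names (labels, maxSide, frame, deterministicEntropy) are in scope and that the sampled point is a Vector2, as the center.t and .norm usage suggests:

// Sketch only, not part of this PR: resample until the point clears every label.
func sampleBackgroundPoint() -> Vector2 {
  while true {
    let location = Vector2(
      Double.random(in: Double(maxSide)..<Double(frame.shape[1] - maxSide), using: &deterministicEntropy),
      Double.random(in: Double(maxSide)..<Double(frame.shape[0] - maxSide), using: &deterministicEntropy))
    // Conservatively reject any point that could possibly overlap with a label.
    if labels.allSatisfy({ ($0.location.center.t - location).norm >= Double(maxSide) }) {
      return location
    }
  }
}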