Examples/BeeTrackingTool/main.swift (135 additions, 20 deletions)
@@ -9,7 +9,7 @@ import TensorFlow

struct BeeTrackingTool: ParsableCommand {
  static var configuration = CommandConfiguration(
-    subcommands: [TrainRAE.self, InferTrackRAE.self, InferTrackRawPixels.self])
+    subcommands: [TrainRAE.self, InferTrackRAE.self, InferTrackRawPixels.self, NaiveRae.self])
}

/// The dimension of the hidden layer in the appearance model.
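For reference, ArgumentParser derives kebab-case command and option names from the type and property names, so the NaiveRae subcommand registered above (and defined in the next hunk) should be reachable as naive-rae. A hedged sketch of a programmatic invocation, with placeholder values for every option:

// Hypothetical invocation; option names follow ArgumentParser's default
// derivation from the NaiveRae properties added later in this diff.
BeeTrackingTool.main([
  "naive-rae",
  "--load-weights", "./rae_weights.npy",  // placeholder path
  "--k-latent-dimension", "10",           // placeholder dimension
  "--output-file", "naive_rae.json",      // placeholder output
  "--truncate", "500",                    // placeholder frame count
  "--verbose",
])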
@@ -164,35 +164,150 @@ struct InferTrackRAE: ParsableCommand {

/// Infers a track on a VOT video, using the raw pixel tracker.
struct InferTrackRawPixels: ParsableCommand {
-  @Option(help: "Base directory of the VOT dataset")
-  var votBaseDirectory: String
-
-  @Option(help: "Name of the VOT video to use")
-  var videoName: String
-
-  @Option(help: "How many frames to track")
-  var frameCount: Int = 50
-
-  @Flag(help: "Print progress information")
-  var verbose: Bool = false
-
-  func run() {
-    let video = VOTVideo(votBaseDirectory: votBaseDirectory, videoName: videoName)!
-    let videoSlice = video[0..<min(video.frames.count, frameCount)]
-
-    let startPose = videoSlice.track[0].center
-    let startPatch = videoSlice.frames[0].patch(at: videoSlice.track[0])
-
-    var tracker = makeRawPixelTracker(frames: videoSlice.frames, target: startPatch)
-
-    if verbose { tracker.optimizer.verbosity = .SUMMARY }
-
-    let prediction = tracker.infer(knownStart: Tuple1(startPose))
-
-    let boxes = tracker.frameVariableIDs.map { frameVariableIDs -> OrientedBoundingBox in
-      let poseID = frameVariableIDs.head
-      return OrientedBoundingBox(
-        center: prediction[poseID], rows: video.track[0].rows, cols: video.track[0].cols)
-    }
-
-    print(boxes.count)
+  func run() {
+    func rawPixelTracker(_ frames: [Tensor<Float>], _ start: OrientedBoundingBox) -> [OrientedBoundingBox] {
+      var tracker = makeRawPixelTracker(frames: frames, target: frames[0].patch(at: start))
+      tracker.optimizer.precision = 1e0
+      let prediction = tracker.infer(knownStart: Tuple1(start.center))
+      return tracker.frameVariableIDs.map { varIds in
+        let poseId = varIds.head
+        return OrientedBoundingBox(center: prediction[poseId], rows: start.rows, cols: start.cols)
+      }
+    }
+
+    var dataset = OISTBeeVideo()!
+    // Only do inference on the interesting tracks.
+    dataset.tracks = [3, 5, 6, 7].map { dataset.tracks[$0] }
+    let trackerEvaluationDataset = TrackerEvaluationDataset(dataset)
+    let eval = trackerEvaluationDataset.evaluate(
+      rawPixelTracker, sequenceCount: dataset.tracks.count, deltaAnchor: 100, outputFile: "rawpixel.json")
+    print(eval.trackerMetrics.accuracy)
+    print(eval.trackerMetrics.robustness)
+  }
+}
+
+/// Tracking with a Naive Bayes model on RAE appearance features.
+struct NaiveRae: ParsableCommand {
+  @Option(help: "Where to load the RAE weights")
+  var loadWeights: String
+
+  @Option(help: "The dimension of the latent code in the RAE appearance model")
+  var kLatentDimension: Int
+
+  @Option(help: "The dimension of the hidden code in the RAE appearance model")
+  var kHiddenDimension = 100
+
+  @Flag
+  var verbose: Bool = false
+
+  @Option
+  var outputFile: String
+
+  @Option
+  var truncate: Int
+
+  /// Runs the naive Bayes RAE tracker on `dataset_` and prints the evaluation metrics.
+  func naiveRaeTrack(dataset dataset_: OISTBeeVideo) {
+    var dataset = dataset_
+    dataset.labels = dataset.labels.map {
+      $0.filter({ $0.label == .Body })
+    }
+    // Make batch and do RAE
+    let (batch, _) = dataset.makeBatch(appearanceModelSize: (40, 70), batchSize: 200)
+    var statistics = FrameStatistics(batch)
+    statistics.mean = Tensor(62.26806976644069)
+    statistics.standardDeviation = Tensor(37.44683834503672)
+
+    let backgroundBatch = dataset.makeBackgroundBatch(
+      patchSize: (40, 70), appearanceModelSize: (40, 70),
+      statistics: statistics,
+      batchSize: 300
+    )
+
+    let (imageHeight, imageWidth, imageChannels) =
+      (batch.shape[1], batch.shape[2], batch.shape[3])
+
+    if verbose { print("Loading RAE model, \(batch.shape)...") }
+
+    let np = Python.import("numpy")
+
+    var rae = DenseRAE(
+      imageHeight: imageHeight, imageWidth: imageWidth, imageChannels: imageChannels,
+      hiddenDimension: kHiddenDimension, latentDimension: kLatentDimension
+    )
+    rae.load(weights: np.load(loadWeights, allow_pickle: true))
+
+    if verbose { print("Fitting Naive Bayes model") }
+
+    var (foregroundModel, backgroundModel) = (
+      MultivariateGaussian(
+        dims: TensorShape([kLatentDimension]),
+        regularizer: 1e-3
+      ), GaussianNB(
+        dims: TensorShape([kLatentDimension]),
+        regularizer: 1e-3
+      )
+    )
+
+    let batchPositive = rae.encode(batch)
+    foregroundModel.fit(batchPositive)
+
+    let batchNegative = rae.encode(backgroundBatch)
+    backgroundModel.fit(batchNegative)
+
+    if verbose {
+      print("Foreground: \(foregroundModel)")
+      print("Background: \(backgroundModel)")
+    }
+
+    func tracker(_ frames: [Tensor<Float>], _ start: OrientedBoundingBox) -> [OrientedBoundingBox] {
+      var tracker = makeNaiveBayesAETracker(
+        model: rae,
+        statistics: statistics,
+        frames: frames,
+        targetSize: (start.rows, start.cols),
+        foregroundModel: foregroundModel, backgroundModel: backgroundModel
+      )
+      tracker.optimizer.cgls_precision = 1e-5
+      tracker.optimizer.precision = 1e-3
+      tracker.optimizer.max_iteration = 200
+      let prediction = tracker.infer(knownStart: Tuple1(start.center))
+      return tracker.frameVariableIDs.map { varIds in
+        let poseId = varIds.head
+        return OrientedBoundingBox(center: prediction[poseId], rows: start.rows, cols: start.cols)
+      }
+    }
+
+    // Only do inference on the interesting tracks.
+    var evalDataset = OISTBeeVideo(truncate: truncate)!
+    evalDataset.tracks = [3, 5, 6, 7].map { evalDataset.tracks[$0] }
+    let trackerEvaluationDataset = TrackerEvaluationDataset(evalDataset)
+    let eval = trackerEvaluationDataset.evaluate(
+      tracker, sequenceCount: evalDataset.tracks.count, deltaAnchor: 500, outputFile: outputFile)
+    print(eval.trackerMetrics.accuracy)
+    print(eval.trackerMetrics.robustness)
+  }
+
+  func run() {
+    if verbose {
+      print("Loading dataset...")
+    }
+
+    startTimer("DATASET_LOAD")
+    let dataset: OISTBeeVideo = OISTBeeVideo(deferLoadingFrames: true)!
+    stopTimer("DATASET_LOAD")
+
+    if verbose {
+      print("Tracking...")
+    }
+
+    startTimer("RAE_TRACKING")
+    naiveRaeTrack(dataset: dataset)
+    stopTimer("RAE_TRACKING")
+
+    if verbose {
+      printTimers()
+    }
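Both run() bodies above hand a locally defined function to TrackerEvaluationDataset.evaluate, so a tracker here is just a closure from a frame sequence and a starting box to one predicted box per frame. As a reference for that contract, here is a minimal sketch; the constant-position baseline is hypothetical and not part of this diff:

// Hypothetical baseline with the same shape as `rawPixelTracker` above:
// it predicts the starting box for every frame.
func constantTracker(_ frames: [Tensor<Float>], _ start: OrientedBoundingBox) -> [OrientedBoundingBox] {
  return frames.map { _ in start }
}

Anything with this shape can be passed to evaluate(_:sequenceCount:deltaAnchor:outputFile:) exactly like the trackers in this file.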
Examples/OISTVisualizationTool/main.swift (68 additions, 74 deletions)
@@ -24,7 +24,7 @@ import Foundation

struct OISTVisualizationTool: ParsableCommand {
  static var configuration = CommandConfiguration(
-    subcommands: [VisualizeTrack.self, ViewFrame.self, RawTrack.self, PpcaTrack.self, NaiveRae.self, TrainRAE.self, NaivePca.self])
+    subcommands: [VisualizePrediction.self, VisualizeTrack.self, ViewFrame.self, RawTrack.self, PpcaTrack.self, NaiveRae.self, TrainRAE.self, NaivePca.self])
}

/// View a frame with bounding boxes
@@ -510,26 +510,23 @@ struct TrainRAE: ParsableCommand {
///
/// Tracking with a Naive Bayes model on PPCA appearance features.
struct NaivePca: ParsableCommand {
-  @Option(help: "Where to load the RAE weights")
-  var loadWeights: String = "./oist_rae_weight.npy"
-
-  @Option(help: "Which bounding box to track")
-  var boxId: Int = 0
-
-  @Option(help: "Track for how many frames")
-  var trackFrames: Int = 10
-
-  @Option(help: "Track the target from frame x")
-  var trackStartFrame: Int = 250
-
  @Option(help: "The dimension of the latent code in the PPCA appearance model")
-  var kLatentDimension = 10
+  var kLatentDimension = 20

  @Flag(help: "Print progress information")
  var verbose: Bool = false

+  @Flag(help: "Use random projections instead of learned PPCA vectors")
+  var randomProjections: Bool = false
+
+  @Option
+  var outputFile: String
+
+  @Option
+  var truncate: Int
+
  /// Runs the naive Bayes PPCA tracker on `dataset_` and prints the evaluation metrics.
-  func naivePpcaTrack(dataset dataset_: OISTBeeVideo, length: Int, startFrom: Int) -> [OrientedBoundingBox] {
+  func naivePpcaTrack(dataset dataset_: OISTBeeVideo) {
    var dataset = dataset_
    dataset.labels = dataset.labels.map {
      $0.filter({ $0.label == .Body })
@@ -549,7 +546,11 @@
    var ppca = PPCA(latentSize: kLatentDimension)

    ppca.train(images: batch)

+    if randomProjections {
+      ppca.W_inv = Tensor(randomNormal: ppca.W_inv!.shape)
+    }
+
    if verbose { print("Fitting Naive Bayes model") }

    var (foregroundModel, backgroundModel) = (
@@ -573,48 +574,32 @@
print("Background: \(backgroundModel)")
}

if verbose { print("Loading video frames...") }
startTimer("VIDEO_LOAD")
// Load the video and take a slice of it.
let videos = (0..<length).map { (i) -> Tensor<Float> in
return withDevice(.cpu) { dataset.loadFrame(dataset.frameIds[startFrom + i])! }
}
stopTimer("VIDEO_LOAD")

let startPose = dataset.labels[startFrom][boxId].location.center

if verbose {
print("Creating tracker, startPose = \(startPose)")
}

startTimer("MAKE_GRAPH")
var tracker = makeNaiveBayesPCATracker(
model: ppca,
statistics: statistics,
frames: videos,
targetSize: (dataset.labels[startFrom][boxId].location.rows, dataset.labels[startFrom][boxId].location.cols),
foregroundModel: foregroundModel, backgroundModel: backgroundModel
)
stopTimer("MAKE_GRAPH")

if verbose { print("Starting Optimization...") }
if verbose { tracker.optimizer.verbosity = .SUMMARY }

tracker.optimizer.cgls_precision = 1e-7
tracker.optimizer.precision = 1e-4
tracker.optimizer.max_iteration = 200

startTimer("GRAPH_INFER")
let prediction = tracker.infer(knownStart: Tuple1(startPose))
stopTimer("GRAPH_INFER")

let boxes = tracker.frameVariableIDs.map { frameVariableIDs -> OrientedBoundingBox in
let poseID = frameVariableIDs.head
return OrientedBoundingBox(
center: prediction[poseID], rows: dataset.labels[startFrom][boxId].location.rows, cols: dataset.labels[startFrom][boxId].location.cols)
func tracker(_ frames: [Tensor<Float>], _ start: OrientedBoundingBox) -> [OrientedBoundingBox] {
var tracker = makeNaiveBayesPCATracker(
model: ppca,
statistics: statistics,
frames: frames,
targetSize: (start.rows, start.cols),
foregroundModel: foregroundModel, backgroundModel: backgroundModel
)
tracker.optimizer.cgls_precision = 1e-9
tracker.optimizer.precision = 1e-6
tracker.optimizer.max_iteration = 200
let prediction = tracker.infer(knownStart: Tuple1(start.center))
return tracker.frameVariableIDs.map { varIds in
let poseId = varIds.head
return OrientedBoundingBox(center: prediction[poseId], rows: start.rows, cols: start.cols)
}
}

return boxes
// Only do inference on the interesting tracks.
var evalDataset = OISTBeeVideo(truncate: truncate)!
evalDataset.tracks = [3, 5, 6, 7].map { evalDataset.tracks[$0] }
let trackerEvaluationDataset = TrackerEvaluationDataset(evalDataset)
let eval = trackerEvaluationDataset.evaluate(
tracker, sequenceCount: evalDataset.tracks.count, deltaAnchor: 500, outputFile: outputFile)
print(eval.trackerMetrics.accuracy)
print(eval.trackerMetrics.robustness)
}

func run() {
@@ -631,25 +616,8 @@
    }

    startTimer("PPCA_TRACKING")
-    var bboxes: [OrientedBoundingBox]
-    bboxes = naivePpcaTrack(dataset: dataset, length: trackFrames, startFrom: trackStartFrame)
+    naivePpcaTrack(dataset: dataset)
    stopTimer("PPCA_TRACKING")
-
-    let frameRawId = dataset.frameIds[trackStartFrame + trackFrames]
-    let image = dataset.loadFrame(frameRawId)!
-
-    if verbose {
-      print("Creating output plot")
-    }
-    startTimer("PLOTTING")
-    plot(image, boxes: bboxes.indices.map {
-      ("\($0)", bboxes[$0])
-    }, margin: 10.0, scale: 0.5).show()
-    stopTimer("PLOTTING")
-
-    if verbose {
-      printTimers()
-    }
  }
}

@@ -666,6 +634,32 @@ struct VisualizeTrack: ParsableCommand {
  }
}

+struct VisualizePrediction: ParsableCommand {
+  @Option
+  var prediction: String
+
+  @Option
+  var subsequenceIndex: Int = 0
+
+  // TODO: Save this in the prediction file so that it does not need to be specified here.
+  @Option
+  var startFrame: Int
+
+  @Option
+  var output: String
+
+  func run() {
+    let dataset = OISTBeeVideo(deferLoadingFrames: true)!
+    let decoder = JSONDecoder()
+    let data = try! Data(contentsOf: URL(fileURLWithPath: prediction))
+    let sequence = try! decoder.decode(SequenceEvaluationResults.self, from: data)
+
+    let track = OISTBeeTrack(
+      startFrameIndex: startFrame, boxes: sequence.subsequences[subsequenceIndex].prediction)
+    track.render(to: output, video: dataset)
+  }
+}
+
// It is important to set the global threadpool before doing anything else, so that nothing
// accidentally uses the default threadpool.
ComputeThreadPools.global =
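Since VisualizePrediction decodes the JSON that evaluate(... outputFile:) writes, the results round-trip through Codable and can be inspected outside the tool as well. A minimal sketch, assuming only the fields referenced above ("rawpixel.json" is a placeholder path):

// Sketch: load an evaluation file and count the boxes in one subsequence.
import Foundation

let data = try! Data(contentsOf: URL(fileURLWithPath: "rawpixel.json"))  // placeholder
let results = try! JSONDecoder().decode(SequenceEvaluationResults.self, from: data)
print(results.subsequences[0].prediction.count)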
Sources/BeeTracking/OISTBeeVideo+Batches.swift (6 additions, 6 deletions)

@@ -91,12 +91,12 @@
      Double.random(in: Double(maxSide)..<Double(frame.shape[1] - maxSide), using: &deterministicEntropy),
      Double.random(in: Double(maxSide)..<Double(frame.shape[0] - maxSide), using: &deterministicEntropy))

-    // Conservatively reject any point that could possibly overlap with any of the labels.
-    for label in labels {
-      if (label.location.center.t - location).norm < Double(maxSide) {
-        continue
-      }
-    }
+    // // Conservatively reject any point that could possibly overlap with any of the labels.
+    // for label in labels {
+    //   if (label.location.center.t - location).norm < Double(maxSide) {
+    //     continue
+    //   }
+    // }

    // The point was not rejected, so return it.
    return location
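One note on the hunk above: even before it was commented out, the overlap check was a no-op, because `continue` inside the `for` loop merely advances to the next label; no sampled point was ever rejected, so commenting it out does not change behavior. If the conservative rejection is ever reinstated, the sampler needs to retry instead. A sketch under the assumptions that the surrounding names (labels, maxSide, frame, deterministicEntropy) are in scope and that the sampled point is a Vector2, as the center.t and .norm usage suggests:

// Sketch only, not part of this PR: resample until the point clears every label.
func sampleBackgroundPoint() -> Vector2 {
  while true {
    let location = Vector2(
      Double.random(in: Double(maxSide)..<Double(frame.shape[1] - maxSide), using: &deterministicEntropy),
      Double.random(in: Double(maxSide)..<Double(frame.shape[0] - maxSide), using: &deterministicEntropy))
    // Conservatively reject any point that could possibly overlap with a label.
    if labels.allSatisfy({ ($0.location.center.t - location).norm >= Double(maxSide) }) {
      return location
    }
  }
}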