diff --git a/Gemfile b/Gemfile new file mode 100644 index 000000000..8af267397 --- /dev/null +++ b/Gemfile @@ -0,0 +1,28 @@ +source "https://rubygems.org" +ruby RUBY_VERSION + +# Hello! This is where you manage which Jekyll version is used to run. +# When you want to use a different version, change it below, save the +# file and run `bundle install`. Run Jekyll with `bundle exec`, like so: +# +# bundle exec jekyll serve +# +# This will help ensure the proper Jekyll version is running. +# Happy Jekylling! +gem "jekyll", "3.4.3" + +# This is the default theme for new Jekyll sites. You may change this to anything you like. +gem "minima", "~> 2.0" + +# If you want to use GitHub Pages, remove the "gem "jekyll"" above and +# uncomment the line below. To upgrade, run `bundle update github-pages`. +# gem "github-pages", group: :jekyll_plugins + +# If you have any plugins, put them here! +group :jekyll_plugins do + gem "jekyll-feed", "~> 0.6" +end + +# Windows does not include zoneinfo files, so bundle the tzinfo-data gem +gem 'tzinfo-data', platforms: [:mingw, :mswin, :x64_mingw, :jruby] + diff --git a/Gemfile.lock b/Gemfile.lock new file mode 100644 index 000000000..334bd69f0 --- /dev/null +++ b/Gemfile.lock @@ -0,0 +1,57 @@ +GEM + remote: https://rubygems.org/ + specs: + addressable (2.5.0) + public_suffix (~> 2.0, >= 2.0.2) + colorator (1.1.0) + ffi (1.9.18) + forwardable-extended (2.6.0) + jekyll (3.4.3) + addressable (~> 2.4) + colorator (~> 1.0) + jekyll-sass-converter (~> 1.0) + jekyll-watch (~> 1.1) + kramdown (~> 1.3) + liquid (~> 3.0) + mercenary (~> 0.3.3) + pathutil (~> 0.9) + rouge (~> 1.7) + safe_yaml (~> 1.0) + jekyll-feed (0.9.2) + jekyll (~> 3.3) + jekyll-sass-converter (1.5.0) + sass (~> 3.4) + jekyll-watch (1.5.0) + listen (~> 3.0, < 3.1) + kramdown (1.13.2) + liquid (3.0.6) + listen (3.0.8) + rb-fsevent (~> 0.9, >= 0.9.4) + rb-inotify (~> 0.9, >= 0.9.7) + mercenary (0.3.6) + minima (2.1.0) + jekyll (~> 3.3) + pathutil (0.14.0) + forwardable-extended (~> 
2.6) + public_suffix (2.0.5) + rb-fsevent (0.9.8) + rb-inotify (0.9.8) + ffi (>= 0.5.0) + rouge (1.11.1) + safe_yaml (1.0.4) + sass (3.4.23) + +PLATFORMS + ruby + +DEPENDENCIES + jekyll (= 3.4.3) + jekyll-feed (~> 0.6) + minima (~> 2.0) + tzinfo-data + +RUBY VERSION + ruby 2.4.0p0 + +BUNDLED WITH + 1.14.6 diff --git a/Readme.md b/Readme.md index bb80f7388..a55daadf7 100644 --- a/Readme.md +++ b/Readme.md @@ -1,4 +1,3 @@ -# My blog - -This is my blog, uses [Jekyll](http://jekyllrb.com/). I was tired of bloated, slow Wordpress that locked up all my content. +# Personal Website +Here I document some of the projects I have done. I am using the template created by Andrej Karpathy. diff --git a/_config.yml b/_config.yml index 607158c3c..af4f91e82 100644 --- a/_config.yml +++ b/_config.yml @@ -1,11 +1,11 @@ # Site settings -title: Andrej Karpathy blog +title: Sagar Pathrudkar email: -description: "Musings of a Computer Scientist." +description: "wondering about wondering" baseurl: "" -url: "http://karpathy.github.io" -twitter_username: karpathy -github_username: karpathy +url: "http://sagarpath.github.io" +twitter_username: sagarpath +github_username: sagarpath # Build settings markdown: kramdown diff --git a/_includes/head.html b/_includes/head.html index c03f34c97..c35c3702c 100644 --- a/_includes/head.html +++ b/_includes/head.html @@ -5,7 +5,6 @@ - diff --git a/_includes/head.html~ b/_includes/head.html~ new file mode 100644 index 000000000..c03f34c97 --- /dev/null +++ b/_includes/head.html~ @@ -0,0 +1,23 @@ + + + + {% if page.title %}{{ page.title }}{% else %}{{ site.title }}{% endif %} + + + + + + + + + + + + diff --git a/_includes/header.html b/_includes/header.html index 17ed39c9a..6f7d4aa77 100644 --- a/_includes/header.html +++ b/_includes/header.html @@ -3,12 +3,12 @@
- - - +
- {{ site.title }} +
diff --git a/_layouts/page.html~ b/_layouts/page.html~ new file mode 100644 index 000000000..18f600c79 --- /dev/null +++ b/_layouts/page.html~ @@ -0,0 +1,37 @@ +--- +layout: default +--- +
+ +
+

{{ page.title }}

+
+ +
+ {{ content }} +
+ + + {% if page.comments %} +
+ + + comments powered by Disqus + {% endif %} + +
+ + +{% if page.mathjax %} + +{% endif %} diff --git a/_layouts/post.html b/_layouts/post.html index 7f0c63b8a..764786c42 100644 --- a/_layouts/post.html +++ b/_layouts/post.html @@ -17,22 +17,7 @@

{{ page.title }}

{% endif %} - - {% if page.comments %} -
- - - comments powered by Disqus - {% endif %} + - \ No newline at end of file + diff --git a/_layouts/post.html~ b/_layouts/post.html~ new file mode 100644 index 000000000..764786c42 --- /dev/null +++ b/_layouts/post.html~ @@ -0,0 +1,23 @@ +--- +layout: default +--- +
+ +
+

{{ page.title }}

+

{{ page.date | date: "%b %-d, %Y" }}{% if page.author %} • {{ page.author }}{% endif %}{% if page.meta %} • {{ page.meta }}{% endif %}

+
+ +
+ {{ content }} +
+ + + {% if page.mathjax %} + + {% endif %} + + + + +
diff --git a/_posts/2011-04-27-manually-classifying-cifar10.markdown b/_posts/2011-04-27-manually-classifying-cifar10.markdown deleted file mode 100644 index c9abaa1d1..000000000 --- a/_posts/2011-04-27-manually-classifying-cifar10.markdown +++ /dev/null @@ -1,43 +0,0 @@ ---- -layout: post -comments: true -title: "Lessons learned from manually classifying CIFAR-10" -excerpt: "CIFAR-10 is a popular dataset small dataset for testing out Computer Vision Deep Learning learning methods. We're seeing a lot of improvements. But what is the human baseline?" -date: 2011-04-27 22:00:00 ---- - -### CIFAR-10 - -> Note, this post is from 2011 and slightly outdated in some places. - -
- -**Statistics**. CIFAR-10 consists of 50,000 training images, all of them in 1 of 10 categories (displayed left). The test set consists of 10,000 novel images from the same categories, and the task is to classify each to its category. The state of the art is currently at about 80% classification accuracy (4000 centroids), achieved by [Adam Coates et al. (PDF)](http://ai.stanford.edu/~acoates/papers/coatesleeng_aistats_2011.pdf). This paper achieved the accuracy by using whitening, k-means to learn many centroids, and then using a soft activation function as features. - -**State of the Art performance.** By the way, running their method with 1600 centroids gives 77% classification accuracy. If you set the clusters to be random the accuracy becomes 70%, and if you set the clusters to be random patches from the training set, the accuracy goes up to 74%. It seems like the whole purpose of k-means is to nicely spread out the clusters around the data. I'm guessing that the 70% random clusters performance might be because many of the clusters are relatively too far away from data manifolds, and never become activated -- it's as if you had much fewer clusters to begin with. - -**Human Accuracy.** Over the weekend I wanted to see what kind of classification accuracy a human would achieve on this dataset. I set out to write some quick MATLAB code that would provide the interface to do this. It showed one image at a time and allowed me to press a key from 0-9 indicating my belief about its class category. My classification accuracy ended up at about **94%** on 400 images. Why not 100%? Because some images are really unfair! To give you an idea, here are some questionable images from CIFAR-10: - - -> CIFAR-10 human accuracy is approximately 94% - -### Observations -A few observations I derived from this exercise: - -- The objects within classes in this dataset can be extremely varied. 
For example the "bird" class contains many different types of bird (both big birds and small). Not only are there many types of bird, but the occur at many possible magnifications, all possible angles and all possible poses. Sometimes only parts of the bird are shown. The poses problem is even worse for the dog/cat category, because these animals occur at many many different types of poses, and sometimes only the head is shown. Or left part of the body, etc. - -- My classification method felt strangely dichotomous. Sometimes you can clearly see the animal or object and classify it based very highly-informative distinct parts (for example, you find ears of a cat). Other times, my recognition was purely based on context and the overall cues in the image such as the colors. - -- The CIFAR-10 dataset is too small to properly contain examples of everything that it is asking for in the test set. I base this conclusion at least on my multiple ways of visualizing the nearest image in the training set. - -- I don't quite understand how Adam Coates et al. perform so well on this dataset (80%) with their method. My guess is that it works along the following lines: looking at the image squinting your eyes you can almost always narrow down the category to about 2 or 3. The final disambiguation probably comes from finding very good specific informative patches (like a patch of some kind of fur, or pointy ear part, etc.). The k-means dictionary must be catching these cases and the SVM likely picks up on them. - -- My impression from this exercise is that it will be hard to go above 80%, but I suspect improvements might be possible up to range of about 85-90%, depending on how wrong I am about the lack of training data. (**2015 update**: Obviously this prediction was way off, with state of the art now in 95%, as seen in this [Kaggle competition leaderboard](https://www.kaggle.com/c/cifar-10/leaderboard). I'm impressed!) 
- -I encourage people to try this for themselves (see my code, above), as it is very interesting and fun! I have trouble exactly articulating what I learned, but overall I feel like I gained more intuition for image classification tasks and more appreciation for the difficulty of the problem at hand. - -Finally, here is an example of my debugging interface: - - -The Matlab code used to generate these results can be found [here](http://cs.stanford.edu/people/karpathy/cifar10inspect.zip) - diff --git a/_posts/2012-10-22-state-of-computer-vision.markdown b/_posts/2012-10-22-state-of-computer-vision.markdown deleted file mode 100644 index 23d4e2376..000000000 --- a/_posts/2012-10-22-state-of-computer-vision.markdown +++ /dev/null @@ -1,36 +0,0 @@ ---- -layout: post -comments: true -title: "The state of Computer Vision and AI: we are really, really far away." -excerpt: "A depressing look at the state of Computer Vision Research and AI in general. For those who like to think that AI is anywhere close." -date: 2012-10-22 22:00:00 ---- - - -The picture above is funny. - -But for me it is also one of those examples that make me sad about the outlook for AI and for Computer Vision. What would it take for a computer to understand this image as you or I do? I challenge you to think explicitly of all the pieces of knowledge that have to fall in place for it to make sense. Here is my short attempt: - -- You recognize it is an image of a bunch of people and you understand they are in a hallway -- You recognize that there are 3 mirrors in the scene so some of those people are "fake" replicas from different viewpoints. -- You recognize Obama from the few pixels that make up his face. It helps that he is in his suit and that he is surrounded by other people with suits. -- You recognize that there's a person standing on a scale, even though the scale occupies only very few white pixels that blend with the background. 
But, you've used the person's pose and knowledge of how people interact with objects to figure it out. -- You recognize that Obama has his foot positioned just slightly on top of the scale. Notice the language I'm using: It is in terms of the 3D structure of the scene, not the position of the leg in the 2D coordinate system of the image. -- You know how physics works: Obama is leaning in on the scale, which applies a force on it. Scale measures force that is applied on it, that's how it works => it will over-estimate the weight of the person standing on it. -- The person measuring his weight is not aware of Obama doing this. You derive this because you know his pose, you understand that the field of view of a person is finite, and you understand that he is not very likely to sense the slight push of Obama's foot. -- You understand that people are self-conscious about their weight. You also understand that he is reading off the scale measurement, and that shortly the over-estimated weight will confuse him because it will probably be much higher than what he expects. In other words, you reason about implications of the events that are about to unfold seconds after this photo was taken, and especially about the thoughts and how they will develop inside people's heads. You also reason about what pieces of information are available to people. -- There are people in the back who find the person's imminent confusion funny. In other words you are reasoning about state of mind of people, and their view of the state of mind of another person. That's getting frighteningly meta. --  Finally, the fact that the perpetrator here is the president makes it maybe even a little more funnier. You understand what actions are more or less likely to be undertaken by different people based on their status and identity. - -I could go on, but the point here is that you've used a HUGE amount of information in that half second when you look at the picture and laugh. 
Information about the 3D structure of the scene, confounding visual elements like mirrors, identities of people, affordances and how people interact with objects, physics (how a particular instrument works,  leaning and what that does), people, their tendency to be insecure about weight, you've reasoned about the situation from the point of view of the person on the scale, what he is aware of, what his intents are and what information is available to him, and you've reasoned about people reasoning about people. You've also thought about the dynamics of the scene and made guesses about how the situation will unfold in the next few seconds visually, how it will unfold in the thoughts of people involved, and you reasoned about how likely or unlikely it is for people of particular identity/status to carry out some action. Somehow all these things come together to "make sense" of the scene. - -It is mind-boggling that all of the above inferences unfold from a brief glance at a 2D array of R,G,B values. The core issue is that the pixel values are just a tip of a huge iceberg and deriving the entire shape and size of the icerberg from prior knowledge is the most difficult task ahead of us. How can we even begin to go about writing an algorithm that can reason about the scene like I did? Forget for a moment the inference algorithm that is capable of putting all of this together; How do we even begin to gather data that can support these inferences (for example how a scale works)? How do we go about even giving the computer a chance? - -Now consider that the state of the art techniques in Computer Vision are tested on things like Imagenet (task of assigning 1-of-k labels for entire images), or Pascal VOC detection challenge (+ include bounding boxes). There is also quite a bit of work on pose estimation, action recognition, etc., but it is all specific, disconnected, and only half works. 
I hate to say it but the state of CV and AI is pathetic when we consider the task ahead, and when we think about how we can ever go from here to there. The road ahead is long, uncertain and unclear.   - -I've seen some arguments that all we need is lots more data from images, video, maybe text and run some clever learning algorithm: maybe a better objective function, run SGD, maybe anneal the step size, use adagrad, or slap an L1 here and there and everything will just pop out. If we only had a few more tricks up our sleeves! But to me, examples like this illustrate that we are missing many crucial pieces of the puzzle and that a central problem will be as much about obtaining the right training data in the right form to support these inferences as it will be about making them. - -Thinking about the complexity and scale of the problem further, a seemingly inescapable conclusion for me is that we may also need embodiment, and that the only way to build computers that can interpret scenes like we do is to allow them to get exposed to all the years  of (structured, temporally coherent) experience we have,  ability to interact with the world, and some magical active learning/inference architecture that I can barely even imagine when I think backwards about what it should be capable of. - -In any case, we are very, very far and this depresses me. What is the way forward? :( Maybe I should just do a startup. I have a really cool idea for a mobile local social iPhone app. 
- diff --git a/_posts/2013-11-23-chrome-extension-programming.markdown b/_posts/2013-11-23-chrome-extension-programming.markdown deleted file mode 100644 index 70a5b20e4..000000000 --- a/_posts/2013-11-23-chrome-extension-programming.markdown +++ /dev/null @@ -1,178 +0,0 @@ ---- -layout: post -comments: true -title: "Chrome Extension Programming: Illustrating a Basic Survival Skill with a Twitter Case Study" -excerpt: "I illustrate a very valuable skill (Chrome Extension Programming) using a Twitter Case study. We will give Twitter a face lift, get it to refresh new tweets automatically, and highlight tweets from people who rarely tweet. All with a few lines of Javascript!" -date: 2013-11-23 20:00:00 ---- - -### Extension Hacking -I wanted to share a few examples of a powerful skill that I've been gradually picking up over the last year. It is simply the ability to quickly hack together custom browser extensions in Chrome and using them to customize my favorite websites. Writing extensions is very fast: you need a short manifest file that contains some boring meta information, a few js/html files with your code in a folder, and then you simply activate the folder as an extension from the Extensions menu with a few clicks. In general, you can do a lot of fancy things with extensions: - -- add buttons, context-menu items -- modify functionality of Omnibox -- create extension-specific webpages that display various data/settings -- your extension is allowed to have local (or synced), persistent storage of data -- you can run almost arbitrary Javascript over the DOM of any webpage - -I can't stress how powerful the last item is. You can run Javascript. On top of any webpage. You can Read the page DOM. You can write to it, automatically, on load of the webpage or even periodically! This gives you complete freedom in modifying any webpage to your tastes: remove annoying content, add new features, log/scrape website data, change the layout, etc. It's completely crazy! 
- -I'll walk you through some examples with possible mods of Twitter just to give you a glimpse of how easy and powerful this can be. Twitter is fun and I use it often, but their website is annoying, has some ugly elements, and sometimes lacks certain functionality I would like it to have. A normal person would request features and wait, but with the dark arts of extension hacking we can do much better. Lets get right to it. - -### Fixing the Ugly - -A recent change on Twitter added this ugly text, visible by default and always, on every single tweet: - - -I understand it gets people to accidentally click on these more and pads Twitter's engagement numbers, but it's useless, ugly, and it just takes up too much space. Lets right click on one of these and choose Inspect Element. This opens up the HTML of the page and here we see the culprit DOM elements: - - - -So we have a list `` with items `
  • ` one for each of Reply, Retweet, Favorite and More. Inside every of them they have a ``(anchor that processes the click action), deeper we have a `` that becomes the icon, and finally followed by the ugly text wrapped in ``. That looks easy enough, we will find all these elements based on their  class attribute, descend down to find the text and get rid of it.  So we create a new folder for TwitterClean extension, copy paste some manifest boring code and set it up to load a javascript file anytime twitter loads. For example, right after twitter.com page loads, lets execute: - -```javascript -var clean_twitter = function(){ - var ugly = []; - ugly.push('.action-reply-container'); - ugly.push('.action-rt-container'); - ugly.push('.action-del-container'); - ugly.push('.action-fav-container'); - ugly.push('.more-tweet-actions'); - - for(var i=0;i<ugly.length;i++) { - var u = $(ugly[i]).find('b'); - u.text(''); - } -} -``` - -Load the Extension, refresh Twitter and poof! All the text is gone and we're just left with the icons. These suffice. Oh and while we're at code we run automatically on load of twitter.com, lets slip this one is as well: - -```javascript -$('.promoted-tweet').hide(); // oops! -``` - -I'll let you figure out what that single naughty line of code does for you :) - -### Loading new tweets automatically - -Here's another annoyance: you have your Twitter running on your side monitor and new tweets come in, but Twitter doesn't load them automatically! It just shows this: - - - -That's the passive aggressive look of Twitter telling you that there are two more tweets to show, but also refusing to actually show them. That would be too useful to their users. Instead, they want you to stop what you're doing and click the button to load the new tweets. 
Luckily, you are skilled at extension hacking so you can simply right click the caption, go to Inspect Element, and see that the <div> element that tells you there are more tweets has class "js-new-tweets-bar". Easy enough: - -```javascript -var periodic = function() { - L = document.getElementsByClassName('js-new-tweets-bar'); - if(L.length > 0){ - L[0].click(); - } -} -setInterval(periodic, 1000); -``` - -When this gets run when **twitter.com** loads, it sets up the code to look for the annoying bar every second (1000 milliseconds) and then runs its click event handler which loads the new tweets. That's all it takes, and now your tweets are streaming down automatically whenever they are available without you having to explicitly refresh them all the time. We've only written code for 5 minutes and in that time we tweaked the way Twitter looked, removed some "functionality" and added some functionality! We're on a roll! Let's do something fancier now. - -### Highlighting tweets from rare tweeters (wait, or tweepers?) - -One day I decided to collect tweets on my timeline over a period of a week using Twitter's REST API and saw that 30 accounts make up 50% of everything I see on Twitter. Since I follow 384 accounts in total, that's only 7%! Unfortunately, for Twitter every tweet is created equal, which means that this annoying social media guru person who tweets 100 times a day completely drowns tweets coming from your other friends who believe that one should also have something worthy of tweeting too. Okay well it's not exactly like that but I wished there was a mechanism for highlighting the very infrequent tweeters and seeing that low frequency content. Twitter will never implement this because it makes Zero sense for their revenue model, but luckily, we can hack this together quite easily! 
First, here's a function that goes through all tweets on your timeline, looks at who tweeted, and "charges" every unique tweet to the originating user: - -```javascript -var charge_tweets = function() { - - // get all tweets in twitter timeline - var items = $('.tweet'); - for(var i=0;i<items.length;i++) { - var it = items[i]; - - // extract information from tweet HTML - var original_user = $(it).attr('data-screen-name'); - var retweeter = $(it).attr('data-retweeter'); - var tweet_id = $(it).attr('data-tweet-id'); - - // a bit of logic - var charged_user = original_user; - if(typeof retweeter !== 'undefined') { - charged_user = retweeter; - } - - // charge tweet to the user - if(charge.hasOwnProperty(charged_user)) { - var L = charge[charged_user]; - if($.inArray(tweet_id, L) === -1) { - L.push(tweet_id); - } - } else { - charge[charged_user] = [tweet_id]; - } - } -}; -``` - -Basically, it turns out every tweet has class "tweet", so it is trivial to iterate over them as seen above. Similarly, by inspecting the way the HTML is laid out, it turns out we can simply scrape the user and the (unique) tweet id and use it to build up a dictionary of `user_string -> [tweet id, ...]`. Of course, we will have to let this accumulate for a few days before it measures a good tweeting frequency distribution for all people we follow as we visit Twitter again and again always seeing new tweets from more people. But this also means we have to load and save the **charge** dictionary from Chrome's local extension storage or otherwise we would lose all our charging work whenever we close the Tab! 
Easy enough: - -```javascript -var save_charge = function() { - chrome.storage.local.set({'charge': charge}); -} - -var load_charge = function() { - chrome.storage.local.get('charge', function (result) { - if(result.charge) { - charge = result.charge; - console.log('loaded tweet frequency stats:'); - console.log(charge); - } else { - console.log('no tweet frequency to load'); - } - }); -} -``` - -Now we just make sure to run load_charge() at start up, and save_charge() anytime there are new tweets and our charge dictionary changes. Based on this charge dictionary we can easily find, say, the 50th percentile frequency, and highlight any tweet that comes from a user who tweets less often than 50% of the users we follow: - -```javascript -var display_charges = function() { - - var items = $('.tweet'); - for(var i=0;i<items.length;i++) { - var it = items[i]; - - // ... as above and then: - - var charged_tweets = charge[charged_user]; - var charge_count = charged_tweets.length; - - // adjust highlight color of the tweet according to rareness - if(charge_percentile > 0) { - var ratio = charge_count / charge_percentile; - var x = Math.floor(Math.min(ratio,1)*255); - $(it).css('background-color', 'rgb(255,255,' + x + ')'); - } - } -} -``` - -This is just one possibility out of many. Here, ratio will be low for users who rarely tweet, and we're setting their tweet to be yellow based on their rareness. Very hard to not notice on your timeline! :) And while we're at it, why not also fit in: - -```javascript -var VIP = ['elonmusk']; -if($.inArray(charged_user, VIP) !== -1) { - $(it).css('background-color', 'rgb(150,255,150)'); -} -``` - -This way, Elon Musk's (or your other Twitter favorites) tweets will always glow a vibrant, green color that is hard to notice! Nice. Here's what we get: - - - -Just look at that! 
Mashable and some person who needed every single one of his followers to know "Aarrrgh" look normal, Elon's tweets are hard to miss green, and someone who doesn't tweet relatively as often is highlighted a bit as yellow. - -### Summary - -It took us ~100 lines and 10 minutes of Javascript (with a bit of practice) and we tweaked Twitter's look, removed err... undesirable content, made Twitter autorefresh, and added an entirely new feature that highlights infrequent tweepers! - -Yet we've only barely scratched the surface. If you're  comfortable with navigating HTML of pages with Chrome's awesome inspector and writing HTML/Javascript/CSS, these quick hacks have the potential to significantly improve your online experience by giving you powerful options for customizing your favorite sites. And if you are not comfortable, perhaps it's time to head over to Chrome Extensions "Getting Started" and write a few hacks :) - -Oh, and if you'd like the full code of the above, you may find it here: LINK (Note it is a bit rough around the edges, but then it is a quick hack after all!). Let me know if you have any issues on [@karpathy](https://twitter.com/karpathy), and until later! diff --git a/_posts/2013-11-27-quantifying-hacker-news.markdown b/_posts/2013-11-27-quantifying-hacker-news.markdown deleted file mode 100644 index 78a76bcec..000000000 --- a/_posts/2013-11-27-quantifying-hacker-news.markdown +++ /dev/null @@ -1,37 +0,0 @@ ---- -layout: post -comments: true -title: "Quantifying Hacker News with 50 days of data" -excerpt: "I scraped Hacker News Front Page and New Page every minute for 50 days and analyzed the results. How do stories rise and fall on Hacker News? What makes a successful post? Find out in this post :)" -date: 2013-11-27 20:00:00 ---- - -### Quantifying Hacker News -I thought it would be fun to analyze the activity on one of my favorite sources of interesting links and information, Hacker News. 
My source of data is a script I've set up some time in August that downloads HN (the Front page and the New stories page) every minute. We will be interested in visualizing the stories as they get upvoted during the day, figuring out which domains/users are most popular, what topics are most popular, and the best time to post a story. I'm making all my data and code (Python data collection scripts + IPython Notebook for analysis) available in case you'd like to carry out a similar analysis. - -### Data collection protocol -I set up a very simple python script that scrapes the HN **front page** and the **new stories page** every minute. A single day of data begins at 4am (PST) and ends at 4am the next day. The .html files are saved compressed as gzipped pickles and one day occupies roughly 10mb in this format. I had bring down my machine for a few days a few times so there are some gaps in the data, but in the end we get 47 days of data from period between August 22 and October 30. - -### Raw HTML data parsing -The parsing Python script uses **BeautifulSoup** to convert the raw HTML into a more structured JSON. This script was by the way by no means simple to write -- HN is based on unstructured tables and I had to discover many strange edge cases in its behavior along the way. At the end I ended up with a 100-line ugliest-parsing-function-ever (really, I'm not proud of it) but it works and outputs something like the following for a single story at a specific snapshot: - -``` -{ -'domain': u'play.google.com', 'title': u'Nexus 5', -'url': u'https://play.google.com/store/devices/details?id=nexus_5_black_16gb', -'num_comments': 42, 'rank': 1, 'points': 65, -'user': u'sonier', 'minutes_ago': 39, 'id': u'6648519' -} -``` - -We get 60 such entries every minute (30 for front page and 30 for new page) and these are again all saved to disk. We are now ready to bring out the IPython Notebook and get to the juicy analysis! 
- -### The Analysis: Detailed analysis - -Head over to the IPython Notebook rendered as HTML for the analysis: - - - - - -Note: I had the entire dataset and .ipynb Ipython Notebook source available for download but recently took it down to save space on my host (sorry). diff --git a/_posts/2014-04-26-datascience-weekly-interview.markdown b/_posts/2014-04-26-datascience-weekly-interview.markdown deleted file mode 100644 index 18288628c..000000000 --- a/_posts/2014-04-26-datascience-weekly-interview.markdown +++ /dev/null @@ -1,13 +0,0 @@ ---- -layout: post -comments: true -title: "Interview with Data Science Weekly on Neural Nets and ConvNetJS" -excerpt: "I gave a (long) interview about my background and perspectives on neural nets." -date: 2014-04-26 22:54:00 ---- - -I thought I should link this: I've given an interview ~two months ago about ConvNetJS, some of my background and a few perspectives on neural net trends and where the field seems to be going, at least in academia. Find it here: - -[http://www.datascienceweekly.org/blog/14-training-deep-learning-models-browser-andrej-karpathy-interview](http://www.datascienceweekly.org/blog/14-training-deep-learning-models-browser-andrej-karpathy-interview) - - \ No newline at end of file diff --git a/_posts/2014-07-01-switching-to-jekyll.markdown b/_posts/2014-07-01-switching-to-jekyll.markdown deleted file mode 100644 index aac30b904..000000000 --- a/_posts/2014-07-01-switching-to-jekyll.markdown +++ /dev/null @@ -1,74 +0,0 @@ ---- -layout: post -comments: true -title: "Switching Blog from Wordpress to Jekyll" -excerpt: "I can't believe I lasted this long on Wordpress. I am switching permanently to Jekyll for hosting my blog, and so should you :) Details inside." 
-date: 2014-07-01 20:00:00 ---- - -Inspired by [Mark Reid's](https://twitter.com/mdreid) blog post [Switching from Jekyll to Hakyll](http://mark.reid.name/blog/switching-to-hakyll.html) I decided to abandon Wordpress and give Jekyll a try (note, I currently do not yet feel pro enough to switch to Haskell-based Hakyll). I can confidently say that I could not be happier about this decision. - -### Wordpress Monster - -*"So what's wrong with Wordpress?"* You may ask. Let's see, everything: - -- Wordpress blogs are clunky, slow and bloated. -- Wordpress is dynamically rendered with **.php**. There are really only few niche applications where this is necessary. Dynamic code execution exposes your blog to hackers and exploits: zero-day attacks, viruses, etc. My own blog was hacked ~2 months ago and all my posts had been infected with spammy content that kept re-inserting itself magically when I removed it. -- Wordpress is popular among the masses of people who don't know any better, and therefore attracts the largest amount of spammers. -- Your posts are stuck forever in an ugly, Wordpress-specific SQL database (ew). You can't easily import/export posts. You do not really own your content in raw and nimble form. -- Wordpress is blocked in China. - -> Wordpress is a bloated, clunky, slow, vulnerable, closed mess. - -### Jekyll <3 - -[Jekyll](http://jekyllrb.com/) describes itself as a tool for building *"Simple, blog-aware, static sites"*, and was originally written by one of the Github co-founders, [Tom Preston-Werner](http://tom.preston-werner.com/). It is flat and transparent: Your blog workspace is a single folder with a config file, and a few folders for CSS and HTML templates. All my content, for example, lives in two folders: - -1. My blog posts are just files in a single folder `_posts`, written in [Markdown](https://github.com/adam-p/markdown-here/wiki/Markdown-Cheatsheet). Including this post, of course. -2. My images are in a single folder `assets`. 
- -That's it. You call `$ jekyll build` from command line and it will automatically render all posts it finds in your `_posts` folder from markdown to HTML, wraps it with header/footer templates, creates the parent index page that lists all your posts and outputs everything into a directory `_site`. The `_site` directory holds your entire webpage as static content. It can then be uploaded to a webserver wherever you like. - -The entire code base consists of like 7 files. It's easy to see how the HTML templates get composed to your final site. It's trivial to tweak the CSS or any of the HTML templates. For example, I added **Google Analytics** tracking code to all my pages by tweaking the html template, and also **Disqus** comments to all my posts by tweaking the posts template with the Disqus Javascript code. - -#### Github integration - -Lastly, as you might expect Jekyll is tightly integrated with Github: create a repository that looks like `username.github.io` and add your files to the repo. Github will automatically compile your files with Jekyll and make the `_site` folder available. For example, mine lives on [karpathy.github.io](http://karpathy.github.io/). Thus, Github makes sure that your blog is beautifully backed up **forever in simple markdown**, and also **hosts your content**! - -> Jekyll strikes the balance: It's packed with just the right amount of features. - -#### Example workflow -To give a flavor for the workflow, to add a new blog post I proceed as follows: - -```bash -$ cd _posts -$ vim 2014-07-02-example-page.markdown -``` - -Now we write the blog post in markdown, here's an example file: - -```bash ---- -layout: post -title: "Post title" -excerpt: "A nice post" -date: 2014-07-02 10:00:00 ---- - -Hello world, this is **markdown**. - -``` - -Lets pop back out to console now. I could preview the changes in a local webserver with `$ jekyll serve --watch` (the watch switch refreshes any updated files as you write them). 
Now let's just push it live: - -```bash -$ cd .. -$ git add . -$ git commit -m "new blog post" -$ git push -``` - -After the last command, Github will see that my repo has changed and automatically refreshes [karpathy.github.io](http://karpathy.github.io/) to point to the newly generated `_site`. My post is live! - -Anyway, that's just a brief taste. Check out [Jekyll](http://jekyllrb.com/) and get blogging in a sane way! - diff --git a/_posts/2014-07-02-visualizing-top-tweeps-with-t-sne-in-Javascript.markdown b/_posts/2014-07-02-visualizing-top-tweeps-with-t-sne-in-Javascript.markdown deleted file mode 100644 index 7be11bec2..000000000 --- a/_posts/2014-07-02-visualizing-top-tweeps-with-t-sne-in-Javascript.markdown +++ /dev/null @@ -1,101 +0,0 @@ ---- -layout: post -comments: true -title: "Visualizing Top Tweeps with t-SNE, in Javascript" -excerpt: "A writeup of a recent mini-project: I scraped tweets of the top 500 Twitter accounts and used t-SNE to visualize the accounts so that people who tweet similar things are nearby. My final Javascript implementation of t-SNE is released on Github as tsnejs." -date: 2014-07-02 17:22:24 ---- - - - -I was recently looking into various ways of embedding unlabeled, high-dimensional data in 2 dimensions for visualization. A wide variety of methods have been proposed for this task. [This Review paper](http://homepage.tudelft.nl/19j49/Matlab_Toolbox_for_Dimensionality_Reduction_files/TR_Dimensiereductie.pdf) from 2009 contains nice references to many of them (PCA, Kernel PCA, Isomap, LLE, Autoencoders, etc.). If you have Matlab available, the [Dimensionality Reduction Toolbox](http://homepage.tudelft.nl/19j49/Matlab_Toolbox_for_Dimensionality_Reduction.html) has a nice implementation of many of these methods. Scikit Learn also has a brief section on [Manifold Learning](http://scikit-learn.org/stable/modules/manifold.html) along with the implementation. 
- -Among these algorithms, t-SNE comes across as one that has a pleasing, intuitive formulation, simple gradient and nice properties. Here is a [Google Tech Talks video](http://www.youtube.com/watch?v=RJVL80Gg3lA) of Laurens van der Maaten (the author) explaining the method. I set out to re-implement t-SNE from scratch since doing so is the best way of learning something that I know of, and what better language to do this in than - Javascript! :) - -Long story short, I've implemented t-SNE in JS, released it as [tsnejs on Github](https://github.com/karpathy/tsnejs), and created a small demo that uses the library to visualize the top twitter accounts based on what they talk about. In this post, I thought it might be fun to document a small 1-day project like this, from beginning to end. This also gives me an opportunity to describe some of my projects toolkit, which others might find useful. - -### Final demo -First, take a look at the [final demo](http://cs.stanford.edu/people/karpathy/tsnejs/). To create this demo I found the top 500 most followed accounts on Twitter, downloaded 200 of their tweets and then measured differences in what they tweet about. These differences are then fed to t-SNE to produce a 2-dimensional visualization, where nearby people tweet similar things. Fun! - -### Fetching top tweeps -We first have to identify the top 500 tweeps. I googled "top twitter accounts" and found http://twitaholic.com/ , which lists them out. However, the accounts are embedded in the webpage and we need to extract them in structured format. For this, I love a recent YC startup [Kimono](https://www.kimonolabs.com/); I use it extensively to scrape structured data from websites. It lets you click the elements of interest (the Twitter handles in this case), and extracts them out in JSON. Easy as pie! - -### Collecting tweets -Now we have a list of top 500 tweeps and we'd like to obtain their tweets to get an idea about what they tweet about. 
My library of choice for this task is [Tweepy](https://github.com/tweepy/tweepy). Their documentation is quite terrible but if you browse the source code things seem relatively simple. Here's an example call to get 200 tweets for a given user: - -```python -tweets = tweepy.Cursor(api.user_timeline, screen_name=user).items(200) -``` - -We iterate this over all users, extract the tweet text, and dump it all into files, one per account. I had to be careful with two annoyances in the process: - -- Twitter puts severe rate limits on API calls, so this actually took several hours to collect, wrapped up in try catch blocks and `time.sleep` calls. -- The returned text is in Unicode, which leads to trouble if you're going to try to write it to file. - -One solution for the second annoyance is to use the codecs library: - -```python -import codecs -codecs.open(filename, 'w', 'utf-8').write(tweet_texts) -``` - -Oh, and let's also grab and save the Twitter profile pictures, which we'll use in the visualization. An example for one user might be: - -```python -import urllib # yes I know this is deprecated -userobj = api.get_user(screen_name = user) -urllib.urlretrieve(userobj.profile_image_url, imgname) # save image to disk -``` - -I should mention that I write a lot of quick and dirty Python code in [IPython Notebooks](http://ipython.org/notebook.html), which I very warmly recommend. If you're writing all your Python in text editors, you're seriously missing out. - -### Quantifying Tweep differences -We now have 500 tweeps and their 200 most recent tweets concatenated in 500 files. We'd now like to find who tweets about similar things. [Scikit learn](http://scikit-learn.org/stable/) is very nice for quick NLP tasks like this. In particular, we load up all the files and create a 500-long array where every element is the 200 concatenated tweets. 
Then we use the [TfidfVectorizer](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html) class to extract all words and bigrams from the text data, and to turn every user's language into one tfidf vector. This vector is a fingerprint of the language that each person uses. Here's how we can simply wire this up: - -```python -from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer -vectorizer = TfidfVectorizer(min_df=2, stop_words = 'english',\ -strip_accents = 'unicode', lowercase=True, ngram_range=(1,2),\ -norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True) -X = vectorizer.fit_transform(user_language_array) -D = -(X * X.T).todense() # Distance matrix: dot product between tfidf vectors -``` - -In the above, `user_language_array` is the 500-element array that has the concatenated tweets. The `TfidfVectorizer` class looks through all tweets and takes note of all words (unigrams) and word bigrams (i.e. series of two words). It builds a dictionary out of all unigram/bigrams and essentially counts up how often every person uses each one. Here's an example of some tweet text converted to unigram/bigrams: - - - -The tfidf vectors are returned stacked up as rows inside `X`, which has size `500 x 87,342`. Every one of the 87,342 dimensions corresponds to some unigram or bigram. For example, the 10,000th dimension could correspond to the frequency of usage of the unigram "YOLO". The vectors are L2 normalized, so the dot product between these vectors is related to the angle between any two vectors. This can be interpreted as the similarity of language. Finally, we dump the matrix and the usernames into a JSON file, and we're ready to load things up in Javascript! - -### The Visualization parts -We now create an .html file and import [jQuery](http://jquery.com/) (as always), and [d3js](http://d3js.org/), which I like to use for any kind of plotting. 
We load up the JSON that stores our distances and usernames with jQuery, and use d3js to initialize the SVG element that will hold all the users. For starters, we plot the users at random positions but we will soon arrange them so that similar users cluster nearby with t-SNE. Inspect the code on the [demo page](http://cs.stanford.edu/people/karpathy/tsnejs/) to see the jQuery and d3js parts (Ctrl+U). In the code, we see a few things I like to use: - -- I like to use **Google Fonts** to get prettier-than-default fonts. Here, for example I'm importing Roboto, and then using it in the CSS. -- Next, we see an import of **syntaxhighlighter** code which dynamically highlights code on your page. -- Then we see **Google tracking JS code**, which lets me track statistics for the website on Google Analytics. -- I didn't use **Bootstrap** on this website because it's very small and simple, but normally I would because this makes your website right away work nicely on mobile. - -### t-SNE - - - -Finally we get to the meat! We need to arrange the users in our d3js plot so that similar users appear nearby. The t-SNE cost function was described in this [2008 paper by van der Maaten and Hinton](http://jmlr.csail.mit.edu/papers/volume9/vandermaaten08a/vandermaaten08a.pdf). Similar to many other methods, we set up two distance metrics in the original and the embedded space and minimize their difference. In t-SNE in particular, the original space distance is based on a Gaussian distribution and the embedded space is based on the heavy-tailed Student-t distribution. The KL-divergence formulation has the nice property that it is asymmetric in how it penalizes distances between the two spaces: - -- If two points are close in the original space, there is a strong attractive force between the points in the embedding -- Conversely, if the two points are far apart in the original space, the algorithm is relatively free to place these points around. 
- -Thus, the algorithm preferentially cares about preserving the local structure of the high-dimensional data. Conveniently, the authors link to multiple implementations of t-SNE on [their website](http://homepage.tudelft.nl/19j49/t-SNE.html), which allows us to see some code for reference as well (if you're like me, reading code can be much easier than reading text descriptions). We're ready to write up the Javascript version! - -The final code can be seen in [tsne.js file](https://github.com/karpathy/tsnejs), on Github. Note how we're wrapping all the JS code into a function closure so that we don't pollute the global namespace. This is a very common trick in Javascript that is essentially used to implement classes. Note also the large number of utility boring code I had to include up top because Javascript is not exactly intended for math :) The core function where all magic happens is `costGrad()`, which computes the cost function and the gradient of the objective. The correct implementation of this function is double checked with `debugGrad()` gradient check. Once the analytic gradient checks out compared to numeric gradient, we're good to go! We set up a piece of Javascript to call our `step()` function repeatedly (`setInterval()` call), and we plot the solution as it gets computed. - -Phew! Final result, again: [t-SNE demo](http://cs.stanford.edu/people/karpathy/tsnejs/). - -I hope some of the references were useful. If you use tsnejs to embed some of your data, let me know! - -## Bonus: Word Embedding t-SNE Visualization - -I created another demo, this time to visualize word vector embeddings. Head [over here](http://cs.stanford.edu/people/karpathy/tsnejs/wordvecs.html) to see it. The word embeddings are trained as described in this [ACL 2012 paper](http://www.socher.org/index.php/Main/ImprovingWordRepresentationsViaGlobalContextAndMultipleWordPrototypes). - -The (unsupervised) objective function makes it so that words that are interchangable (i.e. 
occur in very similar surrounding context) are close in the embedding. This comes across in the visualization! - - diff --git a/_posts/2014-07-03-feature-learning-escapades.markdown b/_posts/2014-07-03-feature-learning-escapades.markdown deleted file mode 100644 index 871e63613..000000000 --- a/_posts/2014-07-03-feature-learning-escapades.markdown +++ /dev/null @@ -1,122 +0,0 @@ ---- -layout: post -comments: true -title: "Feature Learning Escapades" -excerpt: "Some reflections on the last two years of my research: The Quest for Unsupervised Feature Learning algorithms for visual data. Where it was, where it is, and where it's going. Maybe." -date: 2014-07-03 10:00:00 ---- - -My summer internship work at Google has turned into a CVPR 2014 Oral titled **"Large-scale Video Classification with Convolutional Neural Networks"** [(project page)](http://cs.stanford.edu/people/karpathy/deepvideo/). Politically correct, professional, and carefully crafted scientific exposition in the paper and during my oral presentation at CVPR last week is one thing, but I thought this blog might be a nice medium to also give a more informal and personal account of the story behind the paper and how it fits into a larger context. - - -### Act I: Google Research Summer Internship 2011 - -
    - -
    Hanging out at Google in 2011
    -
    - -The thread of this paper begins in Summer of 2011, when I accepted a summer internship offer from Google Research. My project involved Deep Learning for videos, as part of a great team that was at the time only a few people but would later grow to become [Google Brain](http://en.wikipedia.org/wiki/Google_Brain). - -The goal of the project was to learn spatio-temporal features that could support a variety of video classification tasks. The problem, of course, is that videos are a giant 3-dimensional block of pixel values which is useless in its raw form if you're trying to classify what objects/concepts occur within. Computer Vision researchers have come up with [many ingenious ways](http://hal.archives-ouvertes.fr/inria-00439769/) of computing hand-crafted features over these pixels to transform the representation into one that is more directly useful to a classification task, but we were interested in learning features from raw data with Deep Learning. - -> " Deep Learning landscape was very different at that time. " - -It's interesting to note that the Deep Learning landscape was very different at that time. Everyone was excited primarily about **Unsupervised Learning**: The idea of training enormous autoencoders that gobble up all of internet data and automagically create powerful representations that support a large variety of transfer learning tasks. A lot of this ambition was motivated by: - -1. **Human learning** (people learn unsupervised, and so should our algorithms (the argument goes)) -2. Parallel **work in academia**, where resurgence of Deep Learning methods starting around 2006 has largely consisted of algorithms with a significant unsupervised learning component (RBMs, autoencoders, etc.). -3. **Practical considerations**: unlabeled videos are much easier to obtain than labeled data - wouldn't it be nice if we didn't have to worry about labels? 
- -Around this time I became particularly interested in videos because I convinced myself through various thought experiments and neuroscience papers that if unsupervised learning in visual domain was to ever work, it would involve video data. Somehow. I thought the best shot might be some kind of Deep, [Slow Feature Analysis](http://www.scholarpedia.org/article/Slow_feature_analysis) objective, but I ended up working on architectures more similar to CVPR 2011 ["Learning hierarchical invariant spatio-temporal features for action recognition with independent subspace analysis"](http://cs.stanford.edu/~quocle/LeZouYeungNg11.pdf). However, the summer was over before we could get something interesting scaled over a chunk of YouTube. - - -### Act II: Unsupervised Learnings at Stanford - -I left Google that fall and joined Stanford as a PhD student. I was swayed by my project at Google and felt eager to continue working on Unsupervised Feature Learning in visual domains. - -**Images.** One of my first rotations was with Andrew Ng, who was also at the time interested in Unsupervised Learning in images. I joined forces with his student Adam Coates who worked on doing so with simple, explicit methods (e.g. k-means). The NIPS paper [Emergence of Object-Selective Features in Unsupervised Feature Learning](http://cs.stanford.edu/people/karpathy/nips2012.pdf) was the result of our efforts, but I didn't fully believe that the formulation made sense. The algorithm's only cue for building invariance was through similarity: Things that looked similar (in L2 distance) would group together and become invariants in the layer above. That alone can't be right, I thought. - -
    - -
    NIPS 2012 paper: Learning Features by linking similar patches into invariants.
    -
    - -> " I couldn't see how Unsupervised Learning based solely on images could work. " - -More generally, I couldn't see how Unsupervised Learning based solely on images could work. To an unsupervised algorithm, a patch of pixels with a face on it is exactly as exciting as a patch that contains some weird edge/corner/grass/tree noise stuff. The algorithm shouldn't worry about the latter but it should spend *extra* effort worrying about the former. But you would never know this if all you had was a billion patches! It all comes down to this question: if all you have are pixels and nothing else, what distinguishes images of a face, or objects from a random bush, or a corner in the ceilings of a room? I'll come back to this. - -**3D.** At this point it was time for my next rotation. I felt frustrated by working with pixels and started thinking about another line of attack to unsupervised learning. A few ideas that have been dormant in my brain until then centered around the fact that we live and perceive a 3D world. Perhaps images and videos were too hard. Wouldn't it be more natural if our unsupervised learning algorithms reasoned about 3D structures and arrangements rather than 2D grids of brightness? I felt that humans had an advantage in their access to all this information during learning from stereo and structure from motion (and also Active Learning). Were we not giving our algorithms a fair chance when we throw grids of pixels at them and expect something interesting to happen? - -> " Were we not giving our algorithms a fair chance? " - -This was also around the time when the Kinect came out, so I thought I'd give 3D a shot. I rotated with [Vladlen Koltun](http://vladlen.info/) in Graphics and later with [Fei-Fei](http://vision.stanford.edu/feifeili/) over the summer. I spent a lot of my time wrestling with Kinect, 3D data, Kinect Fusion, Point Clouds, etc. There were four challenges to learning from 3D data that I eventually discovered: - -1. 
There is no obvious/clean way to plug a neural network into 3D data. -2. Reasoning about the difference between occluded / empty space is a huge pain. -3. It is very hard to collect data at scale. Neural nets love data and here I was playing around with datasets on the order of 100 scenes, with no ideas about how this could possibly scale. -4. I was working with fully static 3D environments. No movement, no people, no fun. - -I ended up doing a bit of Unsupervised Object Discovery in my 3D meshes and publishing it at a robotics conference, where it was most relevant (Object Discovery in 3D scenes via Shape Analysis). I was happy that I found a very simple, efficient and surprisingly effective way of computing objectness over 3D meshes, but it wasn't what I set out to do. I followed up on the project a bit while working with Sebastian Thrun for my last rotation, but I remained unsatisfied and unfulfilled. There was no brain stuff, no huge datasets to learn from, and even if it all worked, it would work on static, boring scenes. - -
    - -
    ICRA 2013 paper: Highlighted mesh parts are discovered objects.
    -
    - -This was a low point for me in my PhD. I kept thinking about ways of making unsupervised feature learning work, but kept coming across roadblocks-- both practical but more worryingly, philosophical. I was getting a little burnt out. - - -### Act III: Computer Vision Upside Down - -Around this time I joined Fei-Fei's lab and looked around for a research direction related to Computer Vision. I wanted my work to involve elements of deep learning and feature learning, but at this time deep learning was not a hot topic in Computer Vision. Many people were skeptical of the endeavor: Deep Learning papers had trouble getting accepted to Computer Vision conferences (see for example, famously Yann LeCun's public letter to CVPR AC). The other issue was that I felt a little stuck and unsure about how to proceed. - -**AlexNets.** It was around this time that the paper that would change the course of my research, and also the course of Computer Vision came out. I'm referring to "ImageNet Classification with Deep Convolutional Neural Networks", in which a Convolutional Neural Network (CNN) significantly outperformed state of the art methods on the ImageNet Classification Challenge. The described CNN architecture has become known as the *"AlexNet"*, after the first author of the paper: [Alex Krizhevsky](http://www.cs.toronto.edu/~kriz/). ConvNets have come of age and leapfrogged from working on toy MNIST/CIFAR-10 datasets just a year ago, to suddenly running on large images and beating methods that have been developed for years. I did not expect such an astronomical leap and I think neither did most of the community. - -
    - -
    Learned CNN features. Visualization taken from a nice paper by Zeiler and Fergus 2013, Visualizing and Understanding Convolutional Networks". The features become more extended and complex deeper in the network.
    -
    - -**Transfer Learning.** The impressive performance of the AlexNet was interesting by itself, but the second unintuitive finding that followed was that the ImageNet-pretrained representation proved extremely potent in transfer learning tasks to other datasets. Suddenly, people were taking an ImageNet-pretrained CNN, chopping off the classifier layer on top, treating the layers immediately before as a fixed feature extractor and beating state of the art methods across many different datasets (see DeCAF, Overfeat, and Razavian et al.). I still find this rather astonishing. In some parallel universe, I can imagine a CNN that performs very well on ImageNet but doesn't necessarily transfer to strange datasets of sculptures, birds and other things, and I'm nodding along and saying that that's to be expected. But that seems to not be the universe we live in. - -
    - -
    A small crop of a 6000x6000 image that shows how CNN arranges the visual world (with t-SNE). Find full images here.
    -
    - -**Fully Supervised.** A crucial observation to make here is that AlexNet was trained in a fully supervised regime on the (labeled) [ImageNet challenge](http://www.image-net.org/challenges/LSVRC/2014/) dataset. There are no unsupervised components to be found. Where does that leave us with unsupervised learning? The main purpose of unsupervised learning was to learn powerful representations from unlabeled data, which could then be used in transfer learning settings on datasets that don't have that many labels. But what we're seeing instead is that training on huge, supervised datasets is successfully filling the main intended role of unsupervised learning. This suggests an alternative route to powerful, generic representations that points in the complete opposite direction: Instead of doing unsupervised learning on unlabeled data, perhaps we should train on all the supervised data we have, at the same time, with multi-task objective functions. - -> "... training on huge, supervised datasets is successfully filling the main intended role of unsupervised learning." - -This brings me back to my point made above: if all you have is pixels, what is the difference between an image of a face and an image of a random corner or a part of a road, or a tree? I struggled with this question for a long time and the ironic answer I'm slowly converging on is: nothing. In absence of labels, there is no difference. So unless we want our algorithms to develop powerful features for faces (and things we care about a lot) alongside powerful features for a sea of background garbage, we may have to pay in labels. - -### Act IV: Google Research Summer Internship 2013 - -When I entered Google the second time this summer, the landscape was very different than what I had seen in 2011. I left 2 years ago implementing unsupervised learning algorithms for learning spatio-temporal (video) features in baby Google Brain. 
Everything had a researchy feel and we were thinking carefully about loss functions, algorithms, etc. When I came back, I found people buzzing around with their engineer hats on, using adolescent Google Brain to obtain great results across many datasets with huge, 1980-style, fully supervised neural nets (similar to the AlexNet). Supervised, vanilla feed-forward Neural Nets became a hammer and everyone was eager to find all the nails and pick all the low-hanging fruit. This is the atmosphere that surrounded me when I started my second project on learning video features. The recipe was simpler. Are you interested in training nice features for X? - -1. Get a large amount of labeled data in X domain -2. Train a very large network with supervised objective -3. ??? -4. Profit - -In my case, number 4 turned out to be ["Large-scale Video Classification with Convolutional Neural Networks"](http://cs.stanford.edu/people/karpathy/deepvideo/), in which we trained large Spatio-Temporal Convolutional Neural Networks. Ironically, the dataset we chose to use (Sports videos) turned out to be a little too easy to learn rich, spatio-temporal features since the network could get very far ignoring much of the motion and relying mostly on static appearances (e.g. if you're trying to tell difference between tennis and swimming, you need not be concerned with minute movement details). But I expect to see considerable improvements in the coming years. (From others, as I am no longer working on videos.) - -
    - -
    Spatio-Temporal CNN predicting Sports on videos. (Blue = ground truth, Green = correct prediction, Red = incorrect)
    -
    - - -### Act V: Forward - -Two years ago, I spent a lot of my mental efforts trying to, at least conceptually, crack the problem of learning about the visual world unsupervised. We were going to feed an algorithm with lots and lots of visual data from the internet (images, videos, 3D or whatever else), and it would automatically discover representations that would support a variety of tasks as rich as those that we humans are capable of. - -But as I reflect on the last two years of my own research, my thought experiments and the trends I'm seeing emerge in current academic literature, I am beginning to suspect that this dream may never be fulfilled - at least in the form we originally intended. Large-scale supervised data (even if weakly labeled) is turning out to be a critical component of many of the most successful applications of Deep Learning. In fact, I'm seeing indications of reversal of the strategy altogether: Instead of learning powerful features with no labels, we might end up learning them from ALL the labels in huge, multi-task (and even multi-modal) networks, gobbling up as many labels as we can get our hands on. This could take form of a Convolutional Network where gradients from multiple distinct datasets flow through the same network and update shared parameters. - ->" Instead of learning powerful features with no labels, we might end up learning them from ALL the labels " - -*"But wait, humans learn unsupervised - why give up? We might just be missing something conceptually!"*, I've heard some of my friends argue. The premise may, unfortunately be false: humans have temporally contiguous RGBD perception and take heavy advantage of Active Learning, Curriculum Learning, and Reinforcement Learning, with help from various pre-wired neural circuits. Imagine a (gruesome) experiment in which we'd sit a toddler in front of a monitor and flash random internet images at him/her for months. 
Would we expect them to develop the same understanding of the visual world? Because that's what we're currently trying to get working with computers. - -The strengths, weaknesses and types of data practically available to humans and computers are fundamentally misaligned. Thus, it is unfair to draw direct comparisons and extreme caution should be taken when drawing inspiration from human learning. Perhaps one day when robotics advances to a point where we have masses of embodied robots interacting with the 3-dimensional visual world like we do, I will give unsupervised learning another shot. I suspect it will feature heavy use of temporal information and reinforcement learning. But until then, let's collect some data, train some huge, fully-supervised, multi-task, multi-modal nets and... "profit" :) - diff --git a/_posts/2014-08-03-quantifying-productivity.markdown b/_posts/2014-08-03-quantifying-productivity.markdown deleted file mode 100644 index 617286d2c..000000000 --- a/_posts/2014-08-03-quantifying-productivity.markdown +++ /dev/null @@ -1,118 +0,0 @@ ---- -layout: post -comments: true -title: "Quantifying Productivity" -excerpt: "Describing a new pet project that tracks active windows and keystroke frequencies over the duration of a day (on Ubuntu/OSX) and creates pretty HTML visualizations of the data. This allows me to gain nice insights into my productivity. Code on Github." -date: 2014-08-03 15:00:00 ---- - -I'm always on a lookout for interesting datasets to collect, analyze and interpret. And what better dataset to collect/analyze than the *meta-dataset* of my own activity collecting/analyzing other datasets? How much time do I **really** spend working per day? How do I spend most of that time? What makes me productive? These are all relatively important questions that I'd like answers to, and since I prefer my answers based on data and not confirmation-bias-susceptible personal anecdotes, I wrote [ulogme](https://github.com/karpathy/ulogme). 
- -> "I prefer my answers based on data, not confirmation-bias-susceptible personal anecdotes" - -I've now collected my computer usage data over a period of almost **3 months**. In this post I'll highlight some of the features of the project, some of the insights I was able to derive so far and some thoughts about where I hope I can take it next. And who knows, maybe by the end of the post you'll want to become a user yourself :) - -### What's out there already -The idea of tracking and visualizing your computer activity is not at all new. It has been around in various shapes and forms in [Quantified Self](http://en.wikipedia.org/wiki/Quantified_Self) circles and several programs already exist that try to fill this need. Among the few better known ones are [RescueTime](https://www.rescuetime.com/) and [Toggl](https://www.toggl.com/), but there are literally tens to hundreds of other quite terrible copies. Among all of these, I couldn't find anything that satisfies a few very simple, basic requirements: - -- The user interface must be **web-based** because it's 2014 -- Everything must be **open source** and **free** -- The data must never leave the **local machine** (No cloud mambo jambo - too personal!) -- It must be easily **customizable** and look **pretty** - -Nothing like this (by far, actually) exists, so I set out to implement my own solution. - -### Brief Tour of ulogme : Single Day View - -ulogme is small and simple: There are two *backend* components: a tracking script that records activity and a small local web server wrapper that serves the activity logs to the *frontend* (visualization pages). The tracking script currently records active window titles (at frequency of once every 2 seconds) and keystroke typing frequency. - -Lets go through a brief overview of some of the resulting visualizations and features. First there is the **single day view**. Lets look at my August 1st, for example. 
The header tells us the day of the recording and there is space for a short "blog" post that can be written up for each day: - -
    - -
    Header: day information, refresh button, buttons for going between days, and a little editable "blog" post for the day.
    -
    - -Now we start to get to the meat. It looks like I was in the office from 10AM to 8PM on this day. Now, remember that we record keystrokes and window titles throughout. What follows is the keystroke breakdown for the day: - -
    - -
    Keystroke statistics for the day.
    -
    - -We see that I spent most of the day coding in Sublime Text 2 (which I use to write Python/JS/C++) and Gmail - Looks like I wrote quite a bit of email! Next, ulogme shows the *barcode of the day*, as I like to call it. This is a breakdown of all the windows on that day: - -
    - -
    Barcode of the day. Mousing over any of these strips reveals the exact window title.
    -
    - -This view is a little dense so let me unpack it one by one: - -- The **Notes** feature (on top) allows me to enter arbitrary notes for any time of day. Notice I also wrote an (optional) feature that looks for notes about coffee and calculates my levels of caffeine based on *actual* half-life of coffee. I am curious what caffeine does to my productivity! -- I group my windows into *display groups* **barcodes**, where the first group involves fun (Gmail/Chrome/Non-coding files opened in Sublime Text 2 - such as *.markdown* for blogging) and the second group involves work (Matlab/Ipython Notebook with .js/.css/.cpp/.h/.py files, or PDF files opened (papers)). Looks like I spent roughly half of the day on work. -- **Hacking Streak** is a nifty feature that tries to identify contiguous hacking activity and correlates reasonably with my productivity. It looks for active windows that constitute work (I define this in settings) and then for continuous keystrokes above some typing frequency threshold. This indicates that I'm in a state of *hacking*, and the streak gets gradually interrupted if I switch windows to non-working titles, or if I stop writing code. The longest one visible here was 22 minutes and when I hover over the active title at that time, I see that it was me adding a feature to *ulogme*. The longest I've seen anyone get is a lab mate beta tester friend with an intense 50-minute hacking streak. - -In the end, ulogme shows the final breakdown of titles that occupied me on this day: - -
    - -
    The final breakdown of active window titles.
    -
    - -That's interesting, it looks like I actually only spent 10% of my day in Gmail. So even though I wrote a lot, it was just a few emails and chats I quickly sent out. - -### Brief Tour of ulogme : Global Overview - -Insights for one day are interesting, but everything becomes significantly more meaningful when it is put in context of a large number of days. Perhaps you noticed the *"Overview"* link on the header; Clicking this takes you to the overview page of ulogme that takes the statistics for all days and puts them together. I recorded my activity for almost 3 months now. Here is the delicious data visualized for the entire period (with some overlaid annotations): - -
    - -
    Total amount of time per day spent in various applications over a period of three months. The titles on top are clickable and toggle on/off the visualization of any one of the titles.
    -
    - -SO AWESOME. There are many fun things to note: - -- Note the **deadline mode** right before NIPS paper deadline on June 6th. I was frantically writing Latex for the most part :) -- Right after the deadline, you see a dip in activity. This is because I was mostly on my laptop preparing things for the CVPR conference where I had to give a talk. This points to one issue with *ulogme* - there is no syncing across machines right now. -- Notice a few dips on Sundays -- apparently Sundays are my rest days :) -- Am I just hallucinating this, or is there a fairly significant jump in activity right after breaks (note very high bars right after CVPR and vacation.) This needs more data but it would be interesting if vacations actually made me more productive. We'd have to measure more than just time spent on computer, though. -- When I toggle off all non-working titles, the visualization (not shown) reveals that I only spend somewhat depressingly little time *actually* working. Many days I come into lab in the morning and leave late at night to go straight to sleep, but even these days sometimes add up to only roughly 5-6 hours of actual coding. I was very surprised about this initially and went looking for bugs, but it is true upon closer inspection - there is a short commute, lunch, dinner, random reading groups, meetings, random slacking off on the internet, gmail, etc etc... it all builds up quite quickly! Depressing to see that quantified. - -Next, ulogme gives me nice breakdown for both keystrokes and time spent in every window, across all time: - -
    - -
    Summary of keys and time per window across all 3 months.
    -
    - -This is a little incomplete because I do some hacking on my laptop, but it paints an interesting picture nonetheless. It looks like I spent a good chunk of desktop time in Matlab, but seemingly I spend the most amount of time in Chrome screwing around and browsing the internet. Great. - -**What it takes to write a paper.** Note that, interestingly, my total time for Latex is **35** hours - this is how long it takes to write a paper! Additionally, I pressed **225,149** keys in my Latex editor and the `$ wc -l` on my paper `.tex` file reveals that it has **40,192** characters. Some of it is template code but, at least approximately, this means that it takes about **5.6** characters for every one character in the final paper! - -> It takes 35 hours and 225,149 keys to write a 40,192-character NIPS paper (i.e. 5.6 characters must be typed for every one final character.) - -The final visualization is too long to paste here entirely, but I will show a snippet: - -
    - -
    Keystroke frequencies visualized for every day, along with the marginal sums on top and right.
    -
    - -This visualization seems to suggest that I do most of my work between 10AM and 8PM, and a very productive day is about 50,000 keystrokes. You can also see a bit of my post-NIPS refractory period with much lower keystroke activity. - -In the end, ulogme tells me that over the last 3 months I've pressed a total of **1,608,943** keys over **83** days, or approximately **19,384** per day. - -### Going forward - -Going forward, I'm hoping to make ulogme into a nice, open-sourced pet project. The code is all available on [Github](https://github.com/karpathy/ulogme) under *MIT License* and anyone is welcome to try it out (if you're on Ubuntu or OSX - Windows is not supported, and if you're using a modern browser). - -And if you're feeling extra adventurous, I warmly welcome pull requests for new features or bug fixes. The code base is a mix of Python, Javascript and I use [d3.js](http://d3js.org/) for all visualizations. The project is in fairly early stages and the code is not among the nicest I've produced, but I've started fairly major refactoring efforts to make the onboarding process easier. - -In longer term, I'm hoping that ulogme codebase will evolve to become beautifully modular set of *data view plugins* that could be customized, stacked up and composed in the user interface as desired. - -In summary, I feel I've gained quite a few insights into my own work habits by just visualizing the data, but there is much more work to be done on the analysis side as well. The holy grail here is still not implemented: What are the correlates of my productivity? Does sleeping more help? Does drinking coffee help? Do vacations or breaks help at all? All of these questions have answers and I can't wait to find them, in the data. 
- - - - diff --git a/_posts/2014-09-02-what-i-learned-from-competing-against-a-convnet-on-imagenet.markdown b/_posts/2014-09-02-what-i-learned-from-competing-against-a-convnet-on-imagenet.markdown deleted file mode 100644 index 3b1341061..000000000 --- a/_posts/2014-09-02-what-i-learned-from-competing-against-a-convnet-on-imagenet.markdown +++ /dev/null @@ -1,134 +0,0 @@ ---- -layout: post -comments: true -title: "What I learned from competing against a ConvNet on ImageNet" -excerpt: "The latest state of the art Image Classification networks have only 6.7% Hit@5 error on ILSVRC 2014 classification task. How do humans compare?" -date: 2014-09-02 20:00:00 ---- - -The results of the 2014 [ImageNet Large Scale Visual Recognition Challenge](http://www.image-net.org/challenges/LSVRC/2014/) (ILSVRC) were [published](http://www.image-net.org/challenges/LSVRC/2014/results) a few days ago. The New York Times [wrote about it](http://bits.blogs.nytimes.com/2014/08/18/computer-eyesight-gets-a-lot-more-accurate/) too. ILSVRC is one of the largest challenges in Computer Vision and every year teams compete to claim the state-of-the-art performance on the dataset. The challenge is based on a subset of the ImageNet dataset that was first collected by [Deng et al. 2009](http://www.image-net.org/papers/imagenet_cvpr09.pdf), and has been organized by our lab here at Stanford since 2010. This year, the challenge saw record participation with 50% more participants than last year, and records were shattered with staggering improvements in both classification and detection tasks. - -> (My personal) **ILSVRC 2014 TLDR**: 50% more teams. 50% improved classification and detection. ConvNet ensembles all over the place. Google team wins. - -Of course there's much more to it, and all details and takeaways will be discussed at length in Zurich, at the upcoming [ECCV 2014 workshop](http://image-net.org/challenges/LSVRC/2014/eccv2014) happening on September 12. 
- -Additionally, we just (September 2nd) published an arXiv preprint describing the entire history of ILSVRC and a large amount of associated analysis, [check it out on arXiv](http://arxiv.org/abs/1409.0575). This post will zoom in on a portion of the paper that I contributed to (Section 6.4 Human accuracy on large-scale image classification) and describe some of its context. - -#### ILSVRC Classification Task - -For the purposes of this post, I would like to focus, in particular, on image classification because this task is the common denominator for many other Computer Vision tasks. The classification task is made up of 1.2 million images in the training set, each labeled with one of 1000 categories that cover a wide variety of objects, animals, scenes, and even some abstract geometric concepts such as *"hook"*, or *"spiral"*. The 100,000 test set images are released with the dataset, but the labels are withheld to prevent teams from overfitting on the test set. The teams have to predict 5 (out of 1000) classes and an image is considered to be correct if at least one of the predictions is the ground truth. The test set evaluation is carried out on our end by comparing the predictions to our own set of ground truth labels. - -
    - -
    Example images from the classification task. Find full-scale images here.
    -
    - -#### GoogLeNet's Impressive Performance - -I was looking at the results about a week ago and became particularly intrigued by GoogleLeNet's winning submission for the classification task, which achieved a Hit@5 error rate of only 6.7% on the ILSVRC test set. I was relatively familiar with the scope and difficulty of the classification task: these are unconstrained internet images. They are a jungle of viewpoints, lighting conditions, and variations of all imaginable types. This begged the question: *How do humans compare?* - -There are now several tasks in Computer Vision where the performance of our models is close to human, or even *superhuman*. Examples of these tasks include face verification, various medical imaging tasks, Chinese character recognition, etc. However, many of these tasks are fairly constrained in that they assume input images from a very particular distribution. For example, face verification models might assume as input only aligned, centered, and normalized images. In many ways, ImageNet is harder since the images come directly from the "jungle of the interwebs". Is it possible that our models are reaching human performance on such an unconstrained task? - -#### Computing Human Accuracy - -In short, I thought that the impressive performance by the winning team would only make sense if it was put in perspective with human accuracy. I was also in the unique position of being able to evaluate it (given that I share office space with ILSVRC organizers), so I set out to quantify the human accuracy and characterize the differences between human predictions with those of the winning model. - -*Wait, isn't human accuracy 100%?* Thank you, good question. It's not, because the ILSVRC dataset was not labeled in the same way we are classifying it here. For example, to collect the images for the class "Border Terrier" the organizers searched the query on internet and retrieved a large collection of images. 
These were then filtered a bit with humans by asking them a binary "Is this a Border Terrier or not?". Whatever made it through became the "Border Terrier" class, and similarly for all the other 1000 classes. Therefore, the data was not collected in a discriminative but a binary manner, and is also subject to mistakes and inaccuracies. Some images can sometimes also contain multiple of the ILSVRC classes, etc. - -*CIFAR-10 digression.* It's fun to note that about 4 years ago I performed a similar (but much quicker and less detailed) human classification accuracy analysis on CIFAR-10. This was back when the state of the art was at 77% by Adam Coates, and my own accuracy turned out to be 94%. I think the best ConvNets now get about 92%. The post about that can be found [here]({% post_url 2011-04-27-manually-classifying-cifar10 %}). I never imagined I'd be doing the same for ImageNet a few years down the road :) - -There's one issue to clarify on. You may ask: *But wait, the ImageNet test set labels were obtained from humans in the first place. Why go about re-labeling it all over again? Isn't human error 0% by definition?* Kind of, but not really. It is important to keep in mind that ImageNet was annotated as a binary task. For example, to collect images of the dog class "Kelpie", the query was submitted to search engines and then humans on Amazon Mechanical Turk were used for the binary task of filtering out the noise. The ILSVRC classification task, on the other hand, is 1000-way classification. It's not a binary task such as the one used to collect the data. 
    - -
    A crop of a screenshot of the labeling interface for the ILSVRC validation data. Try it out for yourself.
    -
    - -The interface consisted of the test image on the left, and 1000 classes listed on the right. Each class was followed by 13 example images from the training set so that the categories were easier for a human to scan visually. The categories were also sorted in the topological order of the ImageNet hierarchy, which places semantically similar concepts nearby in the list. For example, all motor vehicle-related classes are arranged contiguously in the list. Finally, the interface is web-based so it is easy to naturally scroll through the classes, or search for them by text. - -**Try it out!** I'm making the [the labeling interface](http://cs.stanford.edu/people/karpathy/ilsvrc/) available to everyone so that you can also try labeling ILSVRC yourselves and draw your own conclusions. There are a few modifications in this version from the one we used to collect the data. I added two buttons (Show answer, and Show google prediction), and of course, the images shown in this version are the *validation* images, not the test set images. The GoogLeNet validation set predictions were graciously provided by the Google team. - -#### Roadblocks along the way - -**It was hard.** As I beta-tested the interface, the task of labeling images with 5 out of 1000 categories quickly turned out to be extremely challenging, even for some friends in the lab who have been working on ILSVRC and its classes for a while. First we thought we would put it up on AMT. Then we thought we could recruit paid undergrads. Then I organized a labeling party of intense labeling effort only among the (expert labelers) in our lab. Then I developed a modified interface that used GoogLeNet predictions to prune the number of categories from 1000 to only about 100. It was still too hard - people kept missing categories and getting up to ranges of 13-15% error rates. 
In the end I realized that to get anywhere competitively close to GoogLeNet, it was most efficient if I sat down and went through the painfully long training process and the subsequent careful annotation process myself. - -**It took a while.** I ended up training on 500 validation images and then switched to the test set of 1500 images. The labeling happened at a rate of about 1 per minute, but this decreased over time. I only enjoyed the first ~200, and the rest I only did *#forscience*. (In the end we convinced one more expert labeler to spend a few hours on the annotations, but they only got up to 280 images, with less training, and only got to about 12%). The labeling time distribution was strongly bimodal: Some images are easily recognized, while some images (such as those of fine-grained breeds of dogs, birds, or monkeys) can require multiple minutes of concentrated effort. I became very good at identifying breeds of dogs. - -**It was worth it.** Based on the sample of images I worked on, the GoogLeNet classification error turned out to be 6.8% (the error on the full test set of 100,000 images is 6.7%). My own error in the end turned out to be **5.1%**, approximately 1.7% better. If you crunch through the statistical significance calculations (i.e. comparing the two proportions with a Z-test) under the null hypothesis of them being equal, you get a one-sided p-value of 0.022. In other words, the result is statistically significant based on a relatively commonly used threshold of 0.05. Lastly, I found the experience to be quite educational: After seeing so many images, issues, and ConvNet predictions you start to develop a really good model of the failure modes. - -> My error turned out to be 5.1%, compared to GoogLeNet error of 6.8%. Still a bit of a gap to close (and more). - -
    - -
    Representative example of practical frustrations of labeling ILSVRC classes. Aww, a cute dog! Would you like to spend 5 minutes scrolling through 120 breeds of dog to guess what species it is?
    -
    - -#### Analysis of errors - -We inspected both human and GoogLeNet errors to gain an understanding of common error types and how they compare. The analysis and insights below were derived specifically from GoogLeNet predictions, but I suspect that many of the same errors may be present in other methods. Let me copy paste the analysis from our [ILSVRC paper](http://arxiv.org/abs/1409.0575): - -**Types of error that both GoogLeNet and humans are susceptible to:** - -1. **Multiple objects.** Both GoogLeNet and humans struggle with images that contain multiple ILSVRC classes (usually many more than five), with little indication of which object is the focus of the image. This error is only present in the Classification setting, since every image is constrained to have exactly one correct label. In total, we attribute 24 (24%) of GoogLeNet errors and 12 (16%) of human errors to this category. It is worth noting that humans can have a slight advantage in this error type, since it can sometimes be easy to identify the most salient object in the image. - -2. **Incorrect annotations.** We found that approximately 5 out of 1500 images (0.3%) were incorrectly annotated in the ground truth. This introduces an approximately equal number of errors for both humans and GoogLeNet. - -**Types of error that GoogLeNet is more susceptible to than human:** - -1. **Object small or thin.** GoogLeNet struggles with recognizing objects that are very small or thin in the image, even if that object is the only object present. Examples of this include an image of a standing person wearing sunglasses, a person holding a quill in their hand, or a small ant on a stem of a flower. We estimate that approximately 22 (21%) of GoogLeNet errors fall into this category, while none of the human errors do. In other words, in our sample of images, no image was mislabeled by a human because they were unable to identify a very small or thin object. 
This discrepancy can be attributed to the fact that a human can very effectively leverage context and affordances to accurately infer the identity of small objects (for example, a few barely visible feathers near person's hand as very likely belonging to a mostly occluded quill). - -2. **Image filters.** Many people enhance their photos with filters that distort the contrast and color distributions of the image. We found that 13 (13%) of the images that GoogLeNet incorrectly classified contained a filter. Thus, we posit that GoogLeNet is not very robust to these distortions. In comparison, only one image among the human errors contained a filter, but we do not attribute the source of the error to the filter. - -3. **Abstract representations.** We found that GoogLeNet struggles with images that depict objects of interest in an abstract form, such as 3D-rendered images, paintings, sketches, plush toys, or statues. An example is the abstract shape of a bow drawn with a light source in night photography, a 3D-rendered robotic scorpion, or a shadow on the ground, of a child on a swing. We attribute approximately 6 (6%) of GoogLeNet errors to this type of error and believe that humans are significantly more robust, with no such errors seen in our sample. - -4. **Miscellaneous sources.** Additional sources of error that occur relatively infrequently include extreme closeups of parts of an object, unconventional viewpoints such as a rotated image, images that can significantly benefit from the ability to read text (e.g. a featureless container identifying itself as "*face powder*"), objects with heavy occlusions, and images that depict a collage of multiple images. In general, we found that humans are more robust to all of these types of error. - -
    - -
    Representative validation images that highlight common sources of error. For each image, we display the ground truth in blue, and top 5 predictions from GoogLeNet follow (red = wrong, green = right). GoogLeNet predictions on the validation -set images were graciously provided by members of the GoogLeNet team. From left to right: Images that contain multiple objects, images of extreme closeups and uncharacteristic views, images with filters, images that significantly benefit from the ability to read text, images that contain very small and thin objects, images with abstract representations, and example of a fine-grained image that GoogLeNet correctly identifies but a human would have significant difficulty with.
    -
    - - -**Types of error that human is more susceptible to than GoogLeNet:** - -1. **Fine-grained recognition.** We found that humans are noticeably worse at fine-grained recognition (e.g. dogs, monkeys, snakes, birds), even when they are in clear view. To understand the difficulty, consider that there are more than 120 species of dogs in the dataset. We estimate that 28 (37%) of the human errors fall into this category, while only 7 (7%) of GoogLeNet errors do. - -2. **Class unawareness.** The annotator may sometimes be unaware of the ground truth class present as a label option. When pointed out as an ILSVRC class, it is usually clear that the label applies to the image. These errors get progressively less frequent as the annotator becomes more familiar with ILSVRC classes. Approximately 18 (24%) of the human errors fall into this category. - -3. **Insufficient training data.** Recall that the annotator is only presented with 13 examples of a class under every category name. However, 13 images are not always enough to adequately convey the allowed class variations. For example, a brown dog can be incorrectly dismissed as a "*Kelpie*" if all examples of a "*Kelpie*" feature a dog with black coat. However, if more than 13 images were listed it would have become clear that a "*Kelpie*" may have a brown coat. Approximately 4 (5%) of human errors fall into this category. - -#### Conclusions - -We investigated the performance of trained human annotators on a sample of up to 1500 ILSVRC test set images. Our results indicate that a trained human annotator is capable of outperforming the best model (GoogLeNet) by approximately 1.7% (p = 0.022). - -We expect that some sources of error may be relatively easily eliminated (e.g. robustness to filters, rotations, collages, effectively reasoning over multiple scales), while others may prove more elusive (e.g. identifying -abstract representations of objects). 
On the other hand, a large majority of human errors come from fine-grained categories and class unawareness. We expect that the former can be significantly reduced with fine-grained expert annotators, while the latter could be reduced with more practice and greater familiarity with ILSVRC classes. - -It is clear that humans will soon only be able to outperform state of the art image classification models by use of significant effort, expertise, and time. One interesting follow-up question for future investigation is how computer-level accuracy compares with human-level accuracy on more complex image understanding tasks. - -> "It is clear that humans will soon only be able to outperform state of the art image classification models by use of significant effort, expertise, and time." - -As for my personal take-away from this week-long exercise, I have to say that, qualitatively, I was very impressed with the ConvNet performance. Unless the image exhibits some irregularity or tricky parts, the ConvNet confidently and robustly predicts the correct label. If you're feeling adventurous, try out [the labeling interface](http://cs.stanford.edu/people/karpathy/ilsvrc/) for yourself and draw your own conclusions. I can promise that you'll gain interesting qualitative insights into where state-of-the-art Computer Vision works, where it fails, and how. - -EDIT: additional discussions: - -- [Pierre's Google+](https://plus.google.com/u/0/+PierreSermanet/posts/6wZYMuXo8PU) -- [Reddit /r/MachineLearning](http://www.reddit.com/r/MachineLearning/comments/2fg0va/what_i_learned_from_competing_against_a_convnet/) - -UPDATE: - -- [ImageNet workshop page](http://image-net.org/challenges/LSVRC/2014/eccv2014) now has links to many of the teams' slides and videos. -- [GoogLeNet paper](http://arxiv.org/abs/1409.4842) on arXiv describes the details of their architecture. - -UPDATE2 (14 Feb 2015): - -There have now been several reported results that surpass my 5.1% error on ImageNet. 
I'm astonished to see such rapid progress. At the same time, I think we should keep in mind the following: - -> Human accuracy is not a point. It lives on a tradeoff curve. - -We trade off human effort and expertise with the error rate: I am one point on that curve with 5.1%. My labmates with almost no training and less patience are another point, with even up to 15% error. And based on some calculations that consider my exact error types and hypothesizing which ones may be easier to fix than others, it's not unreasonable to suggest that an ensemble of very dedicated expert human labelers might push this down to 3%, with about 2% being an optimistic error rate lower bound. I know it's not as exciting as having a single number, but it's the right way of thinking about it. See more details in my recent [Google+ post](https://plus.google.com/+AndrejKarpathy/posts/dwDNcBuWTWf). diff --git a/_posts/2015-03-30-breaking-convnets.markdown b/_posts/2015-03-30-breaking-convnets.markdown deleted file mode 100644 index abf46c78b..000000000 --- a/_posts/2015-03-30-breaking-convnets.markdown +++ /dev/null @@ -1,168 +0,0 @@ ---- -layout: post -comments: true -title: "Breaking Linear Classifiers on ImageNet" -excerpt: "There have been a few recent papers that fool ConvNets by taking a correctly classified image and perturbing it in an imperceptible way to produce an image that is misclassified. In this post I show that ConvNets are an overkill: Simple linear classifiers are in fact susceptible to the same fooling strategy." -date: 2015-03-30 20:00:00 -mathjax: true ---- - -You've probably heard that Convolutional Networks work very well in practice and across a wide range of visual recognition problems. You may have also read articles and papers that claim to reach a near *"human-level performance"*. There are all kinds of caveats to that (e.g. 
see my G+ post on [Human Accuracy is not a point, it lives on a tradeoff curve](https://plus.google.com/+AndrejKarpathy/posts/dwDNcBuWTWf)), but that is not the point of this post. I do think that these systems now work extremely well across many visual recognition tasks, especially ones that can be posed as simple classification. - -Yet, a second group of seemingly baffling results has emerged that brings up an apparent contradiction. I'm referring to several people who have noticed that it is possible to take an image that a state-of-the-art Convolutional Network thinks is one class (e.g. "panda"), and it is possible to change it almost imperceptibly to the human eye in such a way that the Convolutional Network suddenly classifies the image as any other class of choice (e.g. "gibbon"). We say that we *break*, or *fool* ConvNets. See the image below for an illustration: - -
    - -
    Figure from Explaining and Harnessing Adversarial Examples by Goodfellow et al.
    -
    - -This topic has recently gained attention starting with [Intriguing properties of neural networks](http://arxiv.org/abs/1312.6199) by Szegedy et al. last year. They had a very similar set of images: - -
    - -
    - Take a correctly classified image (left image in both columns), and add a tiny distortion (middle) to fool the ConvNet with the resulting image (right). -
    -
    - -And a set of very closely related results was later followed by [Deep Neural Networks are Easily Fooled: High Confidence Predictions for Unrecognizable Images](http://arxiv.org/abs/1412.1897) by Nguyen et al. Instead of starting with correctly-classified images and fooling the ConvNet, they had many more examples of performing the same process starting from noise (and hence making the ConvNet confidently classify an incomprehensible noise pattern as some class), or evolving new funny-looking images that the ConvNet is slightly too certain about: - -
    - - -
    - These images are classified with >99.6% confidence as the shown class by a Convolutional Network. -
    -
    - -I should make the point quickly that these results are not completely new to Computer Vision, and that some have observed the same problems even with our older features, e.g. HOG features. See [Exploring the Representation Capabilities of the HOG Descriptor](http://ieeexplore.ieee.org/xpl/articleDetails.jsp?tp=&arnumber=6130416&contentType=Conference+Publications&queryText%3Dexploring+representation+capabilities+of+HOG) for details. - -The conclusion seems to be that we can take any arbitrary image and classify it as whatever class we want by adding tiny, imperceptible noise patterns. Worse, it was found that a reasonable fraction of fooling images **generalize** across different Convolutional Networks, so this isn't some kind of fragile property of the new image or some overfitting property of the model. There's something more general about the type of introduced noise that seems to fool many other models. In some sense, it is much more accurate to speak about *fooling subspaces* rather than *fooling images*. The latter erroneously makes them seem like tiny points in the super-high-dimensional image space, perhaps similar to rational numbers along the real numbers, when instead they are better thought of as entire intervals. Of course, this work raises security concerns because an adversary could conceivably generate a fooling image of any class on their own computer and upload it to some service with a malicious intent, with a non-zero probability of it fooling the server-side model (e.g. circumventing racy filters). - -> What is going on? - -These results are interesting and worrying, but they have also led to a good amount of confusion among laymen. The most important point of this entire post is the following: - -**These results are not specific to images, ConvNets, and they are also not a "flaw" in Deep Learning**. 
A lot of these results were reported with ConvNets running on images because pictures are fun to look at and ConvNets are state-of-the-art, but in fact the core flaw extends to many other domains (e.g. speech recognition systems), and most importantly, also to simple, shallow, good old-fashioned Linear Classifiers (Softmax classifier, or Linear Support Vector Machines, etc.). This was pointed out and articulated in [Explaining and Harnessing Adversarial Examples](http://arxiv.org/abs/1412.6572) by Goodfellow et al. We'll carry out a few experiments very similar to the ones presented in this paper, and see that it is in fact this *linear* nature that is problematic. And because Deep Learning models use linear functions to build up the architecture, they inherit their flaw. However, Deep Learning by itself is not the cause of the issue. In fact, Deep Learning offers tangible hope for a solution, since we can use all the wiggle of composed functions to design more resistant architectures or objectives. - -### How fooling methods work - -ConvNets express a differentiable function from the pixel values to class scores. For example, a ConvNet might take a 227x227 image and transforms these ~100,000 numbers through a wiggly function (parameterized by several million parameters) to 1000 numbers that we interpret as the confidences for 1000 classes (e.g. the classes of ImageNet). - -
    - -
    - This ConvNet takes the image of a banana and applies a function to it to transform it to class scores (here 4 classes are shown). The function consists of several rounds of convolutions where the filter entries are parameters, and a few matrix multiplications, where the elements of the matrices are parameters. A typical ConvNet might have ~100 million parameters. -
    -
We train a ConvNet with a repeated process of sampling data, calculating the parameter gradients and performing a parameter update.
- -We compute the gradient just as before with backpropagation, and then we can perform an **image update** instead of a parameter update, with the end result being that we increase the score of whatever class we want. E.g. we can take the banana image and wiggle every pixel according to the gradient of that image on the cat class. This would change the image a tiny amount, but the score of *cat* would now increase. Somewhat unintuitively, it turns out that you don't have to change the image too much to toggle the image from being classified correctly as a banana, to being classified as anything else (e.g. cat). - -In short, to create a fooling image we start from whatever image we want (an actual image, or even a noise pattern), and then use backpropagation to compute the gradient of the image pixels on any class score, and nudge it along. We may, but do not have to, repeat the process a few times. You can interpret backpropagation in this setting as using dynamic programming to compute the most damaging local perturbation to the input. Note that this process is very efficient and takes negligible time if you have access to the parameters of the ConvNet (backprop is fast), but it is possible to do this even if you do not have access to the parameters but only to the class scores at the end. In this case, it is possible to compute the data gradient numerically, or to to use other local stochastic search strategies, etc. Note that due to the latter approach, even non-differentiable classifiers (e.g. Random Forests) are not safe (but I haven't seen anyone empirically confirm this yet). - -### Fooling a Linear Classifier on ImageNet - -As I mentioned before (and as described in more detail in [Goodfellow et al.](http://arxiv.org/abs/1412.6572)), it is the use of linear functions that makes our models susceptible for an attack. 
ConvNets, of course, do not express a linear function from images to class scores; They are a complex Deep Learning model that expresses a highly non-linear function. However, the components that make up a ConvNet *are* linear: Convolution of a filter with its input is a linear operation (we are sliding a filter through the input and computing dot products - a linear operation), and matrix multiplications are also a linear function. - -So here's a fun experiment we'll do. Lets forget about ConvNets - they are a distracting overkill as far as the core flaw goes. Instead, lets fool a linear classifier and lets also keep with the theme of breaking models on images because they are fun to look at. - -Here is the setup: - -- Take 1.2 million images in ImageNet -- Resize them to 64x64 (full-sized images would train longer) -- use [Caffe](http://caffe.berkeleyvision.org/) to train a Linear Classifier (e.g. Softmax). In other words we're going straight from data to the classifier with a single fully-connected layer. - -*Digression: Technical fun parts.* The fun part in actually doing this is that the standard AlexNetty ConvNet hyperparameters are of course completely inadequate. For example, normally you'd use weight decay of 0.0005 or so and learning rate of 0.01, and gaussian initialization drawn from a gaussian of 0.01 std. If you've trained linear classifiers before on this type of high-dimensional input (64x64x3 ~= 12K numbers), you'll know that your learning rate will probably have to be much lower, the regularization much larger, and initialization of 0.01 std will probably be inadequate. Indeed, starting Caffe training with default hyperparameters gives a starting loss of about 80, which right away tells you that the initialization is completely out of whack (initial ImageNet loss should be ballpark 7.0, which is -log(1/1000)). I scaled it down to 0.0001 std for Gaussian init which gives sensible starting loss. 
But then the loss right away explodes which tells you that the learning rate is way too high - I had to scale it all the way down to about 1e-7. Lastly, a weight decay of 0.0005 will give almost negligible regularization loss with 12K inputs - I had to scale it up to 100 to start getting reasonably-looking weights that aren't super-overfitted noise blobs. It's fun being a Neural Networks practitioner. - -A linear classifier over image pixels implies that every class score is computed as a dot product between all the image pixels (stretched as a large column) and a learnable weight vector, one for each class. With input images of size 64x64x3 and 1000 ImageNet classes we therefore have 64x64x3x1000 = 12.3 million weights (beefy linear model!), and 1000 biases. Training these parameters on ImageNet with a K40 GPU takes only a few tens of minutes. We can then visualize each of the learned weights by reshaping them as images: - -
    - -
    - Example linear classifiers for a few ImageNet classes. Each class' score is computed by taking a dot product between the visualized weights and the image. Hence, the weights can be thought of as a template: the images show what the classifier is looking for. For example, Granny Smith apples are green, so the linear classifier has positive weights in the green color channel and negative weights in blue and red channels, across all spatial positions. It is hence effectively counting the amount of green stuff in the middle. You can also see the learned templates for all imagenet classes for fun. -
    -
    - -By the way, I haven't seen anyone report linear classification accuracy on ImageNet before, but it turns out to be about 3.0% top-1 accuracy (and about 10% top-5) on ImageNet. I haven't done a completely exhaustive hyperparameter sweep but I did a few rounds of manual binary search. - -Now that we've trained the model parameters we can start to produce fooling images. This turns out to be quite trivial in the case of linear classifiers and no backpropagation is required. This is because when your score function is a dot product \\(s = w^Tx\\), then the gradient on the image \\(x\\) is simply \\(\nabla\_x s = w\\). That is, we take an image we would like to start out with, and then if we wanted to fool the model into thinking that it is some other class (e.g. goldfish), we have to take the weights corresponding to the desired class, and add some fraction of those weights to the image: - -
    - - - -
    - Fooled linear classifier: The starting image (left) is classified as a kit fox. That's incorrect, but then what can you expect from a linear classifier? However, if we add a small amount "goldfish" weights to the image (top row, middle), suddenly the classifier is convinced that it's looking at one with high confidence. We can distort it with the school bus template instead if we wanted to. Similar figures (but on the MNIST digits dataset) can be seen in Figure 2 of Goodfellow et al. -
    -
    - -We can also start from random noise and achieve the same effect: - -
    - - -
    - Same process but starting with a random image. -
    -
    - -Of course, these examples are not as impactful as the ones that use a ConvNet because the ConvNet gives state of the art performance while a linear classifier barely gets to 3% accuracy, but it illustrates the point that even with a simple, shallow function it is still possible to play around with the input in imperceptible ways and get almost arbitrary results. - -**Regularization**. There is one subtle comment to make regarding regularization strength. In my experiments above, increasing the regularization strength gave nicer, smoother and more diffuse weights but generalized to validation data *worse* than some of my best classifiers that displayed more noisy patterns. For example, the nice and smooth templates I've shown only achieve 1.6% accuracy. My best model that achieves 3.0% accuracy has noisier weights (as seen in the middle column of the fooling images). Another model with very low regularization reaches 2.8% and its fooling images are virtually indistinguishable yet produce 100% confidences in the wrong class. In particular: - -- High regularization gives smoother templates, but at some point starts to works worse. However, it is more resistant to fooling. (The fooling images look noticeably different from their original) -- Low regularization gives more noisy templates but seems to work better that all-smooth templates. It is less resistant to fooling. - -Intuitively, it seems that higher regularization leads to smaller weights, which means that one must change the image more dramatically to change the score by some amount. It's not immediately obvious if and how this conclusion translates to deeper models. - -
    - - -
    - Linear classifier with lower regularization (which leads to more noisy class weights) is easier to fool (top). Higher regularization produces more diffuse filters and is harder to fool (bottom). That is, it's harder to achieve very confident wrong answers (however, with weights so small it is hard to achieve very confident correct answers too). To flip the label to a wrong class, more visually obvious perturbations are also needed. Somewhat paradoxically, the model with the noisy weights (top) works quite a bit better on validation data (2.6% vs. 1.4% accuracy). -
    -
    - -### Toy Example - -We can understand this process in even more detail by condensing the problem to the smallest toy example that displays the problem. Suppose we train a binary logistic regression, where we define the probability of class 1 as \\(P(y = 1 \mid x; w,b) = \sigma(w^Tx + b)\\), where \\(\sigma(z) = 1/(1+e^{-z})\\) is the sigmoid function that squashes the class 1 score \\(s = w^Tx+b\\) into range between 0 and 1, where 0 is mapped to 0.5. This classifier hence decides that the class of the input is 1 if \\(s > 0\\), or equivalently if the class 1 probability is more than 50% (i.e. \\(\sigma(s) > 0.5\\)). Suppose further that we had the following setup: - -```javascript -x = [2, -1, 3, -2, 2, 2, 1, -4, 5, 1] // input -w = [-1, -1, 1, -1, 1, -1, 1, 1, -1, 1] // weight vector -``` - -If you do the dot product, you get `-3`. Hence, probability of class 1 is `1/(1+e^(-(-3))) = 0.0474`. In other words the classifier is 95% certain that this is example is class 0. We're now going to try to fool the classifier. That is, we want to find a tiny change to `x` in such a way that the score comes out much higher. Since the score is computed with a dot product (multiply corresponding elements in `x` and `w` then add it all up), with a little bit of thought it's clear what this change should be: In every dimension where the weight is positive, we want to slightly increase the input (to get slightly more score). Conversely, in every dimension where the weight is negative, we want the input to be slightly lower (again, to get slightly more score). In other words, an adversarial `xad` might be: - -```javascript -// xad = x + 0.5w gives: -xad = [1.5, -1.5, 3.5, -2.5, 2.5, 1.5, 1.5, -3.5, 4.5, 1.5] -``` - -Doing the dot product again we see that suddenly the score becomes 2. 
This is not surprising: There are 10 dimensions and we've tweaked the input by 0.5 in every dimension in such a way that we *gain* 0.5 in each one, adding up to a total of 5 additional score, raising it from -3 to 2.
Indeed, it seems that the class scores in these regions of space are all over the place, and worse, a straight-forward attempt to patch this up by introducing a background class and iteratively adding fooling images as a new *background* class during training is not effective in mitigating the problem.
Within a few dozen minutes of training my first baby model (with rather arbitrarily-chosen hyperparameters) started to generate very nice looking descriptions of images that were on the edge of making sense. Sometimes the ratio of how simple your model is to the quality of the results you get out of it blows past your expectations, and this was one of those times. What made this result so shocking at the time was that the common wisdom was that RNNs were supposed to be difficult to train (with more experience I've in fact reached the opposite conclusion). Fast forward about a year: I'm training RNNs all the time and I've witnessed their power and robustness many times, and yet their magical outputs still find ways of amusing me. This post is about sharing some of that magic with you. - -> We'll train RNNs to generate text character by character and ponder the question "how is that even possible?" - -By the way, together with this post I am also releasing [code on Github](https://github.com/karpathy/char-rnn) that allows you to train character-level language models based on multi-layer LSTMs. You give it a large chunk of text and it will learn to generate text like it one character at a time. You can also use it to reproduce my experiments below. But we're getting ahead of ourselves; What are RNNs anyway? - -## Recurrent Neural Networks - -**Sequences**. Depending on your background you might be wondering: *What makes Recurrent Networks so special*? A glaring limitation of Vanilla Neural Networks (and also Convolutional Networks) is that their API is too constrained: they accept a fixed-sized vector as input (e.g. an image) and produce a fixed-sized vector as output (e.g. probabilities of different classes). Not only that: These models perform this mapping using a fixed amount of computational steps (e.g. the number of layers in the model). 
The core reason that recurrent nets are more exciting is that they allow us to operate over *sequences* of vectors: Sequences in the input, the output, or in the most general case both. A few examples may make this more concrete: - -
    - -
    Each rectangle is a vector and arrows represent functions (e.g. matrix multiply). Input vectors are in red, output vectors are in blue and green vectors hold the RNN's state (more on this soon). From left to right: (1) Vanilla mode of processing without RNN, from fixed-sized input to fixed-sized output (e.g. image classification). (2) Sequence output (e.g. image captioning takes an image and outputs a sentence of words). (3) Sequence input (e.g. sentiment analysis where a given sentence is classified as expressing positive or negative sentiment). (4) Sequence input and sequence output (e.g. Machine Translation: an RNN reads a sentence in English and then outputs a sentence in French). (5) Synced sequence input and output (e.g. video classification where we wish to label each frame of the video). Notice that in every case are no pre-specified constraints on the lengths sequences because the recurrent transformation (green) is fixed and can be applied as many times as we like.
    -
    - -As you might expect, the sequence regime of operation is much more powerful compared to fixed networks that are doomed from the get-go by a fixed number of computational steps, and hence also much more appealing for those of us who aspire to build more intelligent systems. Moreover, as we'll see in a bit, RNNs combine the input vector with their state vector with a fixed (but learned) function to produce a new state vector. This can in programming terms be interpreted as running a fixed program with certain inputs and some internal variables. Viewed this way, RNNs essentially describe programs. In fact, it is known that [RNNs are Turing-Complete](http://binds.cs.umass.edu/papers/1995_Siegelmann_Science.pdf) in the sense that they can to simulate arbitrary programs (with proper weights). But similar to universal approximation theorems for neural nets you shouldn't read too much into this. In fact, forget I said anything. - -> If training vanilla neural nets is optimization over functions, training recurrent nets is optimization over programs. - -**Sequential processing in absence of sequences**. You might be thinking that having sequences as inputs or outputs could be relatively rare, but an important point to realize is that even if your inputs/outputs are fixed vectors, it is still possible to use this powerful formalism to *process* them in a sequential manner. For instance, the figure below shows results from two very nice papers from [DeepMind](http://deepmind.com/). On the left, an algorithm learns a recurrent network policy that steers its attention around an image; In particular, it learns to read out house numbers from left to right ([Ba et al.](http://arxiv.org/abs/1412.7755)). On the right, a recurrent network *generates* images of digits by learning to sequentially add color to a canvas ([Gregor et al.](http://arxiv.org/abs/1502.04623)): - -
    -
    - - -
    -
    Left: RNN learns to read house numbers. Right: RNN learns to paint house numbers.
    -
    - -The takeaway is that even if your data is not in form of sequences, you can still formulate and train powerful models that learn to process it sequentially. You're learning stateful programs that process your fixed-sized data. - -**RNN computation.** So how do these things work? At the core, RNNs have a deceptively simple API: They accept an input vector `x` and give you an output vector `y`. However, crucially this output vector's contents are influenced not only by the input you just fed in, but also on the entire history of inputs you've fed in in the past. Written as a class, the RNN's API consists of a single `step` function: - -```python -rnn = RNN() -y = rnn.step(x) # x is an input vector, y is the RNN's output vector -``` - -The RNN class has some internal state that it gets to update every time `step` is called. In the simplest case this state consists of a single *hidden* vector `h`. Here is an implementation of the step function in a Vanilla RNN: - -```python -class RNN: - # ... - def step(self, x): - # update the hidden state - self.h = np.tanh(np.dot(self.W_hh, self.h) + np.dot(self.W_xh, x)) - # compute the output vector - y = np.dot(self.W_hy, self.h) - return y -``` - -The above specifies the forward pass of a vanilla RNN. This RNN's parameters are the three matrices `W_hh, W_xh, W_hy`. The hidden state `self.h` is initialized with the zero vector. The `np.tanh` function implements a non-linearity that squashes the activations to the range `[-1, 1]`. Notice briefly how this works: There are two terms inside of the tanh: one is based on the previous hidden state and one is based on the current input. In numpy `np.dot` is matrix multiplication. The two intermediates interact with addition, and then get squashed by the tanh into the new state vector. If you're more comfortable with math notation, we can also write the hidden state update as \\( h\_t = \tanh ( W\_{hh} h\_{t-1} + W\_{xh} x\_t ) \\), where tanh is applied elementwise. 
- - -We initialize the matrices of the RNN with random numbers and the bulk of work during training goes into finding the matrices that give rise to desirable behavior, as measured with some loss function that expresses your preference to what kinds of outputs `y` you'd like to see in response to your input sequences `x`. - -**Going deep**. RNNs are neural networks and everything works monotonically better (if done right) if you put on your deep learning hat and start stacking models up like pancakes. For instance, we can form a 2-layer recurrent network as follows: - -```python -y1 = rnn1.step(x) -y = rnn2.step(y1) -``` - -In other words we have two separate RNNs: One RNN is receiving the input vectors and the second RNN is receiving the output of the first RNN as its input. Except neither of these RNNs know or care - it's all just vectors coming in and going out, and some gradients flowing through each module during backpropagation. - -**Getting fancy**. I'd like to briefly mention that in practice most of us use a slightly different formulation than what I presented above called a *Long Short-Term Memory* (LSTM) network. The LSTM is a particular type of recurrent network that works slightly better in practice, owing to its more powerful update equation and some appealing backpropagation dynamics. I won't go into details, but everything I've said about RNNs stays exactly the same, except the mathematical form for computing the update (the line `self.h = ... `) gets a little more complicated. From here on I will use the terms "RNN/LSTM" interchangeably but all experiments in this post use an LSTM. - -## Character-Level Language Models - -Okay, so we have an idea about what RNNs are, why they are super exciting, and how they work. We'll now ground this in a fun application: We'll train RNN character-level language models. 
That is, we'll give the RNN a huge chunk of text and ask it to model the probability distribution of the next character in the sequence given a sequence of previous characters. This will then allow us to generate new text one character at a time. - -As a working example, suppose we only had a vocabulary of four possible letters "helo", and wanted to train an RNN on the training sequence "hello". This training sequence is in fact a source of 4 separate training examples: 1. The probability of "e" should be likely given the context of "h", 2. "l" should be likely in the context of "he", 3. "l" should also be likely given the context of "hel", and finally 4. "o" should be likely given the context of "hell". - -Concretely, we will encode each character into a vector using 1-of-k encoding (i.e. all zero except for a single one at the index of the character in the vocabulary), and feed them into the RNN one at a time with the `step` function. We will then observe a sequence of 4-dimensional output vectors (one dimension per character), which we interpret as the confidence the RNN currently assigns to each character coming next in the sequence. Here's a diagram: - -
    - -
    An example RNN with 4-dimensional input and output layers, and a hidden layer of 3 units (neurons). This diagram shows the activations in the forward pass when the RNN is fed the characters "hell" as input. The output layer contains confidences the RNN assigns for the next character (vocabulary is "h,e,l,o"); We want the green numbers to be high and red numbers to be low.
    -
    - -For example, we see that in the first time step when the RNN saw the character "h" it assigned confidence of 1.0 to the next letter being "h", 2.2 to letter "e", -3.0 to "l", and 4.1 to "o". Since in our training data (the string "hello") the next correct character is "e", we would like to increase its confidence (green) and decrease the confidence of all other letters (red). Similarly, we have a desired target character at every one of the 4 time steps that we'd like the network to assign a greater confidence to. Since the RNN consists entirely of differentiable operations we can run the backpropagation algorithm (this is just a recursive application of the chain rule from calculus) to figure out in what direction we should adjust every one of its weights to increase the scores of the correct targets (green bold numbers). We can then perform a *parameter update*, which nudges every weight a tiny amount in this gradient direction. If we were to feed the same inputs to the RNN after the parameter update we would find that the scores of the correct characters (e.g. "e" in the first time step) would be slightly higher (e.g. 2.3 instead of 2.2), and the scores of incorrect characters would be slightly lower. We then repeat this process over and over many times until the network converges and its predictions are eventually consistent with the training data in that correct characters are always predicted next. - -A more technical explanation is that we use the standard Softmax classifier (also commonly referred to as the cross-entropy loss) on every output vector simultaneously. The RNN is trained with mini-batch Stochastic Gradient Descent and I like to use [RMSProp](http://arxiv.org/abs/1502.04390) or Adam (per-parameter adaptive learning rate methods) to stabilize the updates. - -Notice also that the first time the character "l" is input, the target is "l", but the second time the target is "o". 
The RNN therefore cannot rely on the input alone and must use its recurrent connection to keep track of the context to achieve this task. - -At **test time**, we feed a character into the RNN and get a distribution over what characters are likely to come next. We sample from this distribution, and feed it right back in to get the next letter. Repeat this process and you're sampling text! Lets now train an RNN on different datasets and see what happens. - -To further clarify, for educational purposes I also wrote a [minimal character-level RNN language model in Python/numpy](https://gist.github.com/karpathy/d4dee566867f8291f086). It is only about 100 lines long and hopefully it gives a concise, concrete and useful summary of the above if you're better at reading code than text. We'll now dive into example results, produced with the much more efficient Lua/Torch codebase. - -## Fun with RNNs - -All 5 example character models below were trained with the [code](https://github.com/karpathy/char-rnn) I'm releasing on Github. The input in each case is a single file with some text, and we're training an RNN to predict the next character in the sequence. - -### Paul Graham generator - -Lets first try a small dataset of English as a sanity check. My favorite fun dataset is the concatenation of [Paul Graham's essays](http://www.paulgraham.com/articles.html). The basic idea is that there's a lot of wisdom in these essays, but unfortunately Paul Graham is a relatively slow generator. Wouldn't it be great if we could sample startup wisdom on demand? That's where an RNN comes in. - -Concatenating all pg essays over the last ~5 years we get approximately 1MB text file, or about 1 million characters (this is considered a very small dataset by the way). *Technical:* Lets train a 2-layer LSTM with 512 hidden nodes (approx. 3.5 million parameters), and with dropout of 0.5 after each layer. 
We'll train with batches of 100 examples and truncated backpropagation through time of length 100 characters. With these settings one batch on a TITAN Z GPU takes about 0.46 seconds (this can be cut in half with 50 character BPTT at negligible cost in performance). Without further ado, lets see a sample from the RNN: - -*"The surprised in investors weren't going to raise money. I'm not the company with the time there are all interesting quickly, don't have to get off the same programmers. There's a super-angel round fundraising, why do you can do. If you have a different physical investment are become in people who reduced in a startup with the way to argument the acquirer could see them just that you're also the founders will part of users' affords that and an alternation to the idea. [2] Don't work at first member to see the way kids will seem in advance of a bad successful startup. And if you have to act the big company too."* - -Okay, clearly the above is unfortunately not going to replace Paul Graham anytime soon, but remember that the RNN had to learn English completely from scratch and with a small dataset (including where you put commas, apostrophes and spaces). I also like that it learns to support its own arguments (e.g. [2], above). Sometimes it says something that offers a glimmer of insight, such as *"a company is a meeting to think to investors"*. [Here's](http://cs.stanford.edu/people/karpathy/char-rnn/pg.txt) a link to 50K character sample if you'd like to see more. - -**Temperature.** We can also play with the temperature of the Softmax during sampling. Decreasing the temperature from 1 to some lower number (e.g. 0.5) makes the RNN more confident, but also more conservative in its samples. Conversely, higher temperatures will give more diversity but at cost of more mistakes (e.g. spelling mistakes, etc). 
In particular, setting temperature very near zero will give the most likely thing that Paul Graham might say: - -*"is that they were all the same thing that was a startup is that they were all the same thing that was a startup is that they were all the same thing that was a startup is that they were all the same"* - -looks like we've reached an infinite loop about startups. - -### Shakespeare - -It looks like we can learn to spell English words. But how about if there is more structure and style in the data? To examine this I downloaded all the works of Shakespeare and concatenated them into a single (4.4MB) file. We can now afford to train a larger network, in this case lets try a 3-layer RNN with 512 hidden nodes on each layer. After we train the network for a few hours we obtain samples such as: - -``` -PANDARUS: -Alas, I think he shall be come approached and the day -When little srain would be attain'd into being never fed, -And who is but a chain and subjects of his death, -I should not sleep. - -Second Senator: -They are away this miseries, produced upon my soul, -Breaking and strongly should be buried, when I perish -The earth and thoughts of many states. - -DUKE VINCENTIO: -Well, your wit is in the care of side and that. - -Second Lord: -They would be ruled after this chamber, and -my fair nues begun out of the fact, to be conveyed, -Whose noble souls I'll have the heart of the wars. - -Clown: -Come, sir, I will make did behold your worship. - -VIOLA: -I'll drink it. -``` - -Remember, all the RNN knows are characters, so in particular it samples both speaker's names and the contents. 
Sometimes we also get relatively extended monologue passages, such as: - -``` -VIOLA: -Why, Salisbury must find his flesh and thought -That which I am not aps, not a man and in fire, -To show the reining of the raven and the wars -To grace my hand reproach within, and not a fair are hand, -That Caesar and my goodly father's world; -When I was heaven of presence and our fleets, -We spare with hours, but cut thy council I am great, -Murdered and by thy master's ready there -My power to give thee but so much as hell: -Some service in the noble bondman here, -Would show him to her wine. - -KING LEAR: -O, if you were a feeble sight, the courtesy of your law, -Your sight and several breath, will wear the gods -With his heads, and my hands are wonder'd at the deeds, -So drop upon your lordship's head, and your opinion -Shall be against your honour. -``` - -I can barely recognize these samples from actual Shakespeare :) If you like Shakespeare, you might appreciate this [100,000 character sample](http://cs.stanford.edu/people/karpathy/char-rnn/shakespear.txt). Of course, you can also generate an infinite amount of your own samples at different temperatures with the provided code. - -### Wikipedia - -We saw that the LSTM can learn to spell words and copy general syntactic structures. Lets further increase the difficulty and train on structured markdown. In particular, lets take the [Hutter Prize](http://prize.hutter1.net/) 100MB dataset of raw Wikipedia and train an LSTM. Following [Graves et al.](http://arxiv.org/abs/1308.0850), I used the first 96MB for training, the rest for validation and ran a few models overnight. We can now sample Wikipedia articles! Below are a few fun excerpts. First, some basic markdown output: - -``` -Naturalism and decision for the majority of Arab countries' capitalide was grounded -by the Irish language by [[John Clair]], [[An Imperial Japanese Revolt]], associated -with Guangzham's sovereignty. 
His generals were the powerful ruler of the Portugal -in the [[Protestant Immineners]], which could be said to be directly in Cantonese -Communication, which followed a ceremony and set inspired prison, training. The -emperor travelled back to [[Antioch, Perth, October 25|21]] to note, the Kingdom -of Costa Rica, unsuccessful fashioned the [[Thrales]], [[Cynth's Dajoard]], known -in western [[Scotland]], near Italy to the conquest of India with the conflict. -Copyright was the succession of independence in the slop of Syrian influence that -was a famous German movement based on a more popular servicious, non-doctrinal -and sexual power post. Many governments recognize the military housing of the -[[Civil Liberalization and Infantry Resolution 265 National Party in Hungary]], -that is sympathetic to be to the [[Punjab Resolution]] -(PJS)[http://www.humah.yahoo.com/guardian. -cfm/7754800786d17551963s89.htm Official economics Adjoint for the Nazism, Montgomery -was swear to advance to the resources for those Socialism's rule, -was starting to signing a major tripad of aid exile.]] -``` - -In case you were wondering, the yahoo url above doesn't actually exist, the model just hallucinated it. Also, note that the model learns to open and close the parenthesis correctly. There's also quite a lot of structured markdown that the model learns, for example sometimes it creates headings, lists, etc.: - -``` -{ { cite journal | id=Cerling Nonforest Department|format=Newlymeslated|none } } -''www.e-complete''. - -'''See also''': [[List of ethical consent processing]] - -== See also == -*[[Iender dome of the ED]] -*[[Anti-autism]] - -===[[Religion|Religion]]=== -*[[French Writings]] -*[[Maria]] -*[[Revelation]] -*[[Mount Agamul]] - -== External links== -* [http://www.biblegateway.nih.gov/entrepre/ Website of the World Festival. The labour of India-county defeats at the Ripper of California Road.] 
- -==External links== -* [http://www.romanology.com/ Constitution of the Netherlands and Hispanic Competition for Bilabial and Commonwealth Industry (Republican Constitution of the Extent of the Netherlands)] - -{{African American_and_Australian_Parliament{|}} -``` - -Sometimes the model snaps into a mode of generating random but valid XML: - -``` - - Antichrist - 865 - - 15900676 - 2002-08-03T18:14:12Z - - Paris - 23 - - - Automated conversion - #REDIRECT [[Christianity]] - - -``` - -The model completely makes up the timestamp, id, and so on. Also, note that it closes the correct tags appropriately and in the correct nested order. Here are [100,000 characters of sampled wikipedia](http://cs.stanford.edu/people/karpathy/char-rnn/wiki.txt) if you're interested to see more. - -### Algebraic Geometry (Latex) - -The results above suggest that the model is actually quite good at learning complex syntactic structures. Impressed by these results, my labmate ([Justin Johnson](http://cs.stanford.edu/people/jcjohns/)) and I decided to push even further into structured territories and got a hold of [this book](http://stacks.math.columbia.edu/) on algebraic stacks/geometry. We downloaded the raw Latex source file (a 16MB file) and trained a multilayer LSTM. Amazingly, the resulting sampled Latex *almost* compiles. We had to step in and fix a few issues manually but then you get plausible looking math, it's quite astonishing: - -
    - -
    Sampled (fake) algebraic geometry. Here's the actual pdf.
    -
    - -Here's another sample: - -
    - -
    More hallucinated algebraic geometry. Nice try on the diagram (right).
    -
    - -As you can see above, sometimes the model tries to generate latex diagrams, but clearly it hasn't really figured them out. I also like the part where it chooses to skip a proof (*"Proof omitted."*, top left). Of course, keep in mind that latex has a relatively difficult structured syntactic format that I haven't even fully mastered myself. For instance, here is a raw sample from the model (unedited): - -``` -\begin{proof} -We may assume that $\mathcal{I}$ is an abelian sheaf on $\mathcal{C}$. -\item Given a morphism $\Delta : \mathcal{F} \to \mathcal{I}$ -is an injective and let $\mathfrak q$ be an abelian sheaf on $X$. -Let $\mathcal{F}$ be a fibered complex. Let $\mathcal{F}$ be a category. -\begin{enumerate} -\item \hyperref[setain-construction-phantom]{Lemma} -\label{lemma-characterize-quasi-finite} -Let $\mathcal{F}$ be an abelian quasi-coherent sheaf on $\mathcal{C}$. -Let $\mathcal{F}$ be a coherent $\mathcal{O}_X$-module. Then -$\mathcal{F}$ is an abelian catenary over $\mathcal{C}$. -\item The following are equivalent -\begin{enumerate} -\item $\mathcal{F}$ is an $\mathcal{O}_X$-module. -\end{lemma} -``` - -This sample from a relatively decent model illustrates a few common mistakes. For example, the model opens a `\begin{proof}` environment but then ends it with a `\end{lemma}`. This is an example of a problem we'd have to fix manually, and is likely due to the fact that the dependency is too long-term: By the time the model is done with the proof it has forgotten whether it was doing a proof or a lemma. Similarly, it opens an `\begin{enumerate}` but then forgets to close it. We observed that these became less common with larger/better models, but nonetheless, these are the kinds of mistakes that come up. - -### Linux Source Code - -I wanted to push structured data to its limit, so for the final challenge I decided to use code. 
In particular, I took all the source and header files found in the [Linux repo on Github](https://github.com/torvalds/linux), concatenated all of them in a single giant file (474MB of C code) (I was originally going to train only on the kernel but that by itself is only ~16MB). Then I trained several as-large-as-fits-on-my-GPU 3-layer LSTMs over a period of a few days. These models have about 10 million parameters, which is still on the lower end for RNN models. The results are superfun: - -```c -/* - * Increment the size file of the new incorrect UI_FILTER group information - * of the size generatively. - */ -static int indicate_policy(void) -{ - int error; - if (fd == MARN_EPT) { - /* - * The kernel blank will coeld it to userspace. - */ - if (ss->segment < mem_total) - unblock_graph_and_set_blocked(); - else - ret = 1; - goto bail; - } - segaddr = in_SB(in.addr); - selector = seg / 16; - setup_works = true; - for (i = 0; i < blocks; i++) { - seq = buf[i++]; - bpf = bd->bd.next + i * search; - if (fd) { - current = blocked; - } - } - rw->name = "Getjbbregs"; - bprm_self_clearl(&iv->version); - regs->new = blocks[(BPF_STATS << info->historidac)] | PFMR_CLOBATHINC_SECONDS << 12; - return segtable; -} -``` - -The code looks really quite great overall. Of course, I don't think it compiles but when you scroll through the generate code it feels very much like a giant C code base. Notice that the RNN peppers its code with comments here and there at random. It is also very good at making very few syntactic errors. For example, it uses strings properly, pointer notation, etc. It also opens and closes brackets `{[` correctly and learns to indent its code very well. A common error is that it can't keep track of variable names: It often uses undefined variables (e.g. `rw` above), declares variables it never uses (e.g. `int error`), or returns non-existing variables. Lets see a few more examples. 
Here's another snippet that shows a wider array of operations that the RNN learns: - -```c -/* - * If this error is set, we will need anything right after that BSD. - */ -static void action_new_function(struct s_stat_info *wb) -{ - unsigned long flags; - int lel_idx_bit = e->edd, *sys & ~((unsigned long) *FIRST_COMPAT); - buf[0] = 0xFFFFFFFF & (bit << 4); - min(inc, slist->bytes); - printk(KERN_WARNING "Memory allocated %02x/%02x, " - "original MLL instead\n"), - min(min(multi_run - s->len, max) * num_data_in), - frame_pos, sz + first_seg); - div_u64_w(val, inb_p); - spin_unlock(&disk->queue_lock); - mutex_unlock(&s->sock->mutex); - mutex_unlock(&func->mutex); - return disassemble(info->pending_bh); -} - -static void num_serial_settings(struct tty_struct *tty) -{ - if (tty == tty) - disable_single_st_p(dev); - pci_disable_spool(port); - return 0; -} - -static void do_command(struct seq_file *m, void *v) -{ - int column = 32 << (cmd[2] & 0x80); - if (state) - cmd = (int)(int_state ^ (in_8(&ch->ch_flags) & Cmd) ? 2 : 1); - else - seq = 1; - for (i = 0; i < 16; i++) { - if (k & (1 << 1)) - pipe = (in_use & UMXTHREAD_UNCCA) + - ((count & 0x00000000fffffff8) & 0x000000f) << 8; - if (count == 0) - sub(pid, ppc_md.kexec_handle, 0x20000000); - pipe_set_bytes(i, 0); - } - /* Free our user pages pointer to place camera if all dash */ - subsystem_info = &of_changes[PAGE_SIZE]; - rek_controls(offset, idx, &soffset); - /* Now we want to deliberately put it to device */ - control_check_polarity(&context, val, 0); - for (i = 0; i < COUNTER; i++) - seq_puts(s, "policy "); -} -``` - -Notice that in the second function the model compares `tty == tty`, which is vacuously true. On the other hand, at least the variable `tty` exists in the scope this time! In the last function, notice that the code does not return anything, which happens to be correct since the function signature is `void`. However, the first two functions were also declared `void` and did return values. 
This is again a form of a common mistake due to long-term interactions. - -Sometimes the model decides that it's time to sample a new file. This is usually a very amusing part: The model first recites the GNU license character by character, samples a few includes, generates some macros and then dives into the code: - -```c -/* - * Copyright (c) 2006-2010, Intel Mobile Communications. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 as published by - * the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software Foundation, - * Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#define REG_PG vesa_slot_addr_pack -#define PFM_NOCOMP AFSR(0, load) -#define STACK_DDR(type) (func) - -#define SWAP_ALLOCATE(nr) (e) -#define emulate_sigs() arch_get_unaligned_child() -#define access_rw(TST) asm volatile("movd %%esp, %0, %3" : : "r" (0)); \ - if (__type & DO_READ) - -static void stat_PC_SEC __read_mostly offsetof(struct seq_argsqueue, \ - pC>[1]); - -static void -os_prefix(unsigned long sys) -{ -#ifdef CONFIG_PREEMPT - PUT_PARAM_RAID(2, sel) = get_state_state(); - set_pid_sum((unsigned long)state, current_state_str(), - (unsigned long)-1->lr_full; low; -} -``` - -There are too many fun parts to cover- I could probably write an entire blog post on just this part. 
I'll cut it short for now, but here is [1MB of sampled Linux code](http://cs.stanford.edu/people/karpathy/char-rnn/linux.txt) for your viewing pleasure. - -### Generating Baby Names - -Lets try one more for fun. Lets feed the RNN a large text file that contains 8000 baby names listed out, one per line (names obtained from [here](http://www.cs.cmu.edu/afs/cs/project/ai-repository/ai/areas/nlp/corpora/names/)). We can feed this to the RNN and then generate new names! Here are some example names, only showing the ones that do not occur in the training data (90% don't): - -*Rudi -Levette -Berice -Lussa -Hany -Mareanne -Chrestina -Carissy -Marylen -Hammine -Janye -Marlise -Jacacrie -Hendred -Romand -Charienna -Nenotto -Ette -Dorane -Wallen -Marly -Darine -Salina -Elvyn -Ersia -Maralena -Minoria -Ellia -Charmin -Antley -Nerille -Chelon -Walmor -Evena -Jeryly -Stachon -Charisa -Allisa -Anatha -Cathanie -Geetra -Alexie -Jerin -Cassen -Herbett -Cossie -Velen -Daurenge -Robester -Shermond -Terisa -Licia -Roselen -Ferine -Jayn -Lusine -Charyanne -Sales -Sanny -Resa -Wallon -Martine -Merus -Jelen -Candica -Wallin -Tel -Rachene -Tarine -Ozila -Ketia -Shanne -Arnande -Karella -Roselina -Alessia -Chasty -Deland -Berther -Geamar -Jackein -Mellisand -Sagdy -Nenc -Lessie -Rasemy -Guen -Gavi -Milea -Anneda -Margoris -Janin -Rodelin -Zeanna -Elyne -Janah -Ferzina -Susta -Pey -Castina* - -You can see many more [here](http://cs.stanford.edu/people/karpathy/namesGenUnique.txt). Some of my favorites include "Baby" (haha), "Killie", "Char", "R", "More", "Mars", "Hi", "Saddie", "With" and "Ahbort". Well that was fun. Of course, you can imagine this being quite useful inspiration when writing a novel, or naming a new startup :) - -## Understanding what's going on - -We saw that the results at the end of training can be impressive, but how does any of this work? Lets run two quick experiments to briefly peek under the hood. 
- -### The evolution of samples while training - -First, it's fun to look at how the sampled text evolves while the model trains. For example, I trained an LSTM on Leo Tolstoy's War and Peace and then generated samples every 100 iterations of training. At iteration 100 the model samples random jumbles: - -``` -tyntd-iafhatawiaoihrdemot lytdws e ,tfti, astai f ogoh eoase rrranbyne 'nhthnee e -plia tklrgd t o idoe ns,smtt h ne etie h,hregtrs nigtike,aoaenns lng -``` - -However, notice that at least it is starting to get an idea about words separated by spaces. Except sometimes it inserts two spaces. It also doesn't know that comma is almost always followed by a space. At 300 iterations we see that the model starts to get an idea about quotes and periods: - -``` -"Tmont thithey" fomesscerliund -Keushey. Thom here -sheulke, anmerenith ol sivh I lalterthend Bleipile shuwy fil on aseterlome -coaniogennc Phe lism thond hon at. MeiDimorotion in ther thize." -``` - -The words are now also separated with spaces and the model starts to get the idea about periods at the end of a sentence. At iteration 500: - -``` -we counter. He stutn co des. His stanted out one ofler that concossions and was -to gearang reay Jotrets and with fre colt otf paitt thin wall. Which das stimn -``` - -the model has now learned to spell the shortest and most common words such as "we", "He", "His", "Which", "and", etc. At iteration 700 we're starting to see more and more English-like text emerge: - -``` -Aftair fall unsuch that the hall for Prince Velzonski's that me of -her hearly, and behs to so arwage fiving were to it beloge, pavu say falling misfort -how, and Gogition is so overelical and ofter. -``` - -At iteration 1200 we're now seeing use of quotations and question/exclamation marks. Longer words have now been learned as well: - -``` -"Kite vouch!" he repeated by her -door. "But I would be done and quarts, feeling, then, son is people...." 
-``` - -Until at last we start to get properly spelled words, quotations, names, and so on by about iteration 2000: - -``` -"Why do what that day," replied Natasha, and wishing to himself the fact the -princess, Princess Mary was easier, fed in had oftened him. -Pierre aking his soul came to the packs and drove up his father-in-law women. -``` - -The picture that emerges is that the model first discovers the general word-space structure and then rapidly starts to learn the words; First starting with the short words and then eventually the longer ones. Topics and themes that span multiple words (and in general longer-term dependencies) start to emerge only much later. - -### Visualizing the predictions and the "neuron" firings in the RNN - -Another fun visualization is to look at the predicted distributions over characters. In the visualizations below we feed a Wikipedia RNN model character data from the validation set (shown along the blue/green rows) and under every character we visualize (in red) the top 5 guesses that the model assigns for the next character. The guesses are colored by their probability (so dark red = judged as very likely, white = not very likely). For example, notice that there are stretches of characters where the model is extremely confident about the next letter (e.g., the model is very confident about characters during the *http://www.* sequence). - -The input character sequence (blue/green) is colored based on the *firing* of a randomly chosen neuron in the hidden representation of the RNN. Think about it as green = very excited and blue = not very excited (for those familiar with details of LSTMs, these are values between [-1,1] in the hidden state vector, which is just the gated and tanh'd LSTM cell state). Intuitively, this is visualizing the firing rate of some neuron in the "brain" of the RNN while it reads the input sequence. 
Different neurons might be looking for different patterns; Below we'll look at 4 different ones that I found and thought were interesting or interpretable (many also aren't): - -
    - -
    -The neuron highlighted in this image seems to get very excited about URLs and turns off outside of the URLs. The LSTM is likely using this neuron to remember if it is inside a URL or not. -
    -
    - -
    - -
    -The highlighted neuron here gets very excited when the RNN is inside the [[ ]] markdown environment and turns off outside of it. Interestingly, the neuron can't turn on right after it sees the character "[", it must wait for the second "[" and then activate. This task of counting whether the model has seen one or two "[" is likely done with a different neuron. -
    -
    - -
    - -
    -Here we see a neuron that varies seemingly linearly across the [[ ]] environment. In other words its activation is giving the RNN a time-aligned coordinate system across the [[ ]] scope. The RNN can use this information to make different characters more or less likely depending on how early/late it is in the [[ ]] scope (perhaps?). -
    -
    - -
    - -
    -Here is another neuron that has very local behavior: it is relatively silent but sharply turns off right after the first "w" in the "www" sequence. The RNN might be using this neuron to count up how far in the "www" sequence it is, so that it can know whether it should emit another "w", or if it should start the URL. -
    -
    - -Of course, a lot of these conclusions are slightly hand-wavy as the hidden state of the RNN is a huge, high-dimensional and largely distributed representation. These visualizations were produced with custom HTML/CSS/Javascript, you can see a sketch of what's involved [here](http://cs.stanford.edu/people/karpathy/viscode.zip) if you'd like to create something similar. - -We can also condense this visualization by excluding the most likely predictions and only visualize the text, colored by activations of a cell. We can see that in addition to a large portion of cells that do not do anything interpretable, about 5% of them turn out to have learned quite interesting and interpretable algorithms: - -
    - - -
    -
    -
    - -Again, what is beautiful about this is that we didn't have to hardcode at any point that if you're trying to predict the next character it might, for example, be useful to keep track of whether or not you are currently inside or outside of a quote. We just trained the LSTM on raw data and it decided that this is a useful quantity to keep track of. In other words one of its cells gradually tuned itself during training to become a quote detection cell, since this helps it better perform the final task. This is one of the cleanest and most compelling examples of where the power in Deep Learning models (and more generally end-to-end training) is coming from. - -## Source Code - -I hope I've convinced you that training character-level language models is a very fun exercise. You can train your own models using the [char-rnn code](https://github.com/karpathy/char-rnn) I released on Github (under MIT license). It takes one large text file and trains a character-level model that you can then sample from. Also, it helps if you have a GPU or otherwise training on CPU will be about a factor of 10x slower. In any case, if you end up training on some data and getting fun results let me know! And if you get lost in the Torch/Lua codebase remember that all it is is just a more fancy version of this [100-line gist](https://gist.github.com/karpathy/d4dee566867f8291f086). - -*Brief digression.* The code is written in [Torch 7](http://torch.ch/), which has recently become my favorite deep learning framework. I've only started working with Torch/LUA over the last few months and it hasn't been easy (I spent a good amount of time digging through the raw Torch code on Github and asking questions on their *gitter* to get things done), but once you get a hang of things it offers a lot of flexibility and speed. I've also worked with Caffe and Theano in the past and I believe Torch, while not perfect, gets its levels of abstraction and philosophy right better than others. 
In my view the desirable features of an effective framework are: - -1. CPU/GPU transparent Tensor library with a lot of functionality (slicing, array/matrix operations, etc. ) -2. An entirely separate code base in a scripting language (ideally Python) that operates over Tensors and implements all Deep Learning stuff (forward/backward, computation graphs, etc) -3. It should be possible to easily share pretrained models (Caffe does this well, others don't), and crucially -4. NO compilation step (or at least not as currently done in Theano). The trend in Deep Learning is towards larger, more complex networks that are time-unrolled in complex graphs. It is critical that these do not compile for a long time or development time greatly suffers. Second, by compiling one gives up interpretability and the ability to log/debug effectively. If there is an *option* to compile the graph once it has been developed for efficiency in prod that's fine. - -## Further Reading - -Before the end of the post I also wanted to position RNNs in a wider context and provide a sketch of the current research directions. RNNs have recently generated a significant amount of buzz and excitement in the field of Deep Learning. Similar to Convolutional Networks they have been around for decades but their full potential has only recently started to get widely recognized, in large part due to our growing computational resources. 
Here's a brief sketch of a few recent developments (definitely not complete list, and a lot of this work draws from research back to 1990s, see related work sections): - -In the domain of **NLP/Speech**, RNNs [transcribe speech to text](http://www.jmlr.org/proceedings/papers/v32/graves14.pdf), perform [machine translation](http://arxiv.org/abs/1409.3215), [generate handwritten text](http://www.cs.toronto.edu/~graves/handwriting.html), and of course, they have been used as powerful language models [(Sutskever et al.)](http://www.cs.utoronto.ca/~ilya/pubs/2011/LANG-RNN.pdf) [(Graves)](http://arxiv.org/abs/1308.0850) [(Mikolov et al.)](http://www.rnnlm.org/) (both on the level of characters and words). Currently it seems that word-level models work better than character-level models, but this is surely a temporary thing. - -**Computer Vision.** RNNs are also quickly becoming pervasive in Computer Vision. For example, we're seeing RNNs in frame-level [video classification](http://arxiv.org/abs/1411.4389), [image captioning](http://arxiv.org/abs/1411.4555) (also including my own work and many others), [video captioning](http://arxiv.org/abs/1505.00487) and very recently [visual question answering](http://arxiv.org/abs/1505.02074). My personal favorite RNNs in Computer Vision paper is [Recurrent Models of Visual Attention](http://arxiv.org/abs/1406.6247), both due to its high-level direction (sequential processing of images with glances) and the low-level modeling (REINFORCE learning rule that is a special case of policy gradient methods in Reinforcement Learning, which allows one to train models that perform non-differentiable computation (taking glances around the image in this case)). I'm confident that this type of hybrid model that consists of a blend of CNN for raw perception coupled with an RNN glance policy on top will become pervasive in perception, especially for more complex tasks that go beyond classifying some objects in plain view. 
- -**Inductive Reasoning, Memories and Attention.** Another extremely exciting direction of research is oriented towards addressing the limitations of vanilla recurrent networks. One problem is that RNNs are not inductive: They memorize sequences extremely well, but they don't necessarily always show convincing signs of generalizing in the *correct* way (I'll provide pointers in a bit that make this more concrete). A second issue is they unnecessarily couple their representation size to the amount of computation per step. For instance, if you double the size of the hidden state vector you'd quadruple the amount of FLOPS at each step due to the matrix multiplication. Ideally, we'd like to maintain a huge representation/memory (e.g. containing all of Wikipedia or many intermediate state variables), while maintaining the ability to keep computation per time step fixed. - -The first convincing example of moving towards these directions was developed in DeepMind's [Neural Turing Machines](http://arxiv.org/abs/1410.5401) paper. This paper sketched a path towards models that can perform read/write operations between large, external memory arrays and a smaller set of memory registers (think of these as our working memory) where the computation happens. Crucially, the NTM paper also featured very interesting memory addressing mechanisms that were implemented with a (soft, and fully-differentiable) attention model. The concept of **soft attention** has turned out to be a powerful modeling feature and was also featured in [Neural Machine Translation by Jointly Learning to Align and Translate](http://arxiv.org/abs/1409.0473) for Machine Translation and [Memory Networks](http://arxiv.org/abs/1503.08895) for (toy) Question Answering. In fact, I'd go as far as to say that - -> The concept of **attention** is the most interesting recent architectural innovation in neural networks. 
- -Now, I don't want to dive into too many details but a soft attention scheme for memory addressing is convenient because it keeps the model fully-differentiable, but unfortunately one sacrifices efficiency because everything that can be attended to is attended to (but softly). Think of this as declaring a pointer in C that doesn't point to a specific address but instead defines an entire distribution over all addresses in the entire memory, and dereferencing the pointer returns a weighted sum of the pointed content (that would be an expensive operation!). This has motivated multiple authors to swap soft attention models for **hard attention** where one samples a particular chunk of memory to attend to (e.g. a read/write action for some memory cell instead of reading/writing from all cells to some degree). This model is significantly more philosophically appealing, scalable and efficient, but unfortunately it is also non-differentiable. This then calls for use of techniques from the Reinforcement Learning literature (e.g. REINFORCE) where people are perfectly used to the concept of non-differentiable interactions. This is very much ongoing work but these hard attention models have been explored, for example, in [Inferring Algorithmic Patterns with Stack-Augmented Recurrent Nets](http://arxiv.org/abs/1503.01007), [Reinforcement Learning Neural Turing Machines](http://arxiv.org/abs/1505.00521), and [Show Attend and Tell](http://arxiv.org/abs/1502.03044). - -**People**. If you'd like to read up on RNNs I recommend theses from [Alex Graves](http://www.cs.toronto.edu/~graves/), [Ilya Sutskever](http://www.cs.toronto.edu/~ilya/) and [Tomas Mikolov](http://www.rnnlm.org/). For more about REINFORCE and more generally Reinforcement Learning and policy gradient methods (which REINFORCE is a special case of) [David Silver](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Home.html)'s class, or one of [Pieter Abbeel](http://www.cs.berkeley.edu/~pabbeel/)'s classes. - -**Code**. 
If you'd like to play with training RNNs I hear good things about [keras](https://github.com/fchollet/keras) or [passage](https://github.com/IndicoDataSolutions/Passage) for Theano, the [code](https://github.com/karpathy/char-rnn) released with this post for Torch, or [this gist](https://gist.github.com/karpathy/587454dc0146a6ae21fc) for raw numpy code I wrote a while ago that implements an efficient, batched LSTM forward and backward pass. You can also have a look at my numpy-based [NeuralTalk](https://github.com/karpathy/neuraltalk) which uses an RNN/LSTM to caption images, or maybe this [Caffe](http://jeffdonahue.com/lrcn/) implementation by Jeff Donahue. - -## Conclusion - -We've learned about RNNs, how they work, why they have become a big deal, we've trained an RNN character-level language model on several fun datasets, and we've seen where RNNs are going. You can confidently expect a large amount of innovation in the space of RNNs, and I believe they will become a pervasive and critical component to intelligent systems. - -Lastly, to add some **meta** to this post, I trained an RNN on the source file of this blog post. Unfortunately, at about 46K characters I haven't written enough data to properly feed the RNN, but the returned sample (generated with low temperature to get a more typical sample) is: - -``` -I've the RNN with and works, but the computed with program of the -RNN with and the computed of the RNN with with and the code -``` - -Yes, the post was about RNN and how well it works, so clearly this works :). See you next time! - -**EDIT (extra links):** - -Videos: - -- I gave a talk on this work at the [London Deep Learning meetup (video)](https://skillsmatter.com/skillscasts/6611-visualizing-and-understanding-recurrent-networks). 
- -Discussions: - -- [HN discussion](https://news.ycombinator.com/item?id=9584325) -- Reddit discussion on [r/machinelearning](http://www.reddit.com/r/MachineLearning/comments/36s673/the_unreasonable_effectiveness_of_recurrent/) -- Reddit discussion on [r/programming](http://www.reddit.com/r/programming/comments/36su8d/the_unreasonable_effectiveness_of_recurrent/) - -Replies: - -- [Yoav Goldberg](https://twitter.com/yoavgo) compared these RNN results to [n-gram maximum likelihood (counting) baseline](http://nbviewer.ipython.org/gist/yoavg/d76121dfde2618422139) -- [@nylk](https://twitter.com/nylk) trained char-rnn on [cooking recipes](https://gist.github.com/nylki/1efbaa36635956d35bcc). They look great! -- [@MrChrisJohnson](https://twitter.com/MrChrisJohnson) trained char-rnn on Eminem lyrics and then synthesized a rap song with robotic voice reading it out. Hilarious :) -- [@samim](https://twitter.com/samim) trained char-rnn on [Obama Speeches](https://medium.com/@samim/obama-rnn-machine-generated-political-speeches-c8abd18a2ea0). They look fun! 
-- [João Felipe](https://twitter.com/seaandsailor) trained char-rnn irish folk music and [sampled music](https://soundcloud.com/seaandsailor/sets/char-rnn-composes-irish-folk-music) -- [Bob Sturm](https://twitter.com/boblsturm) also trained char-rnn on [music in ABC notation](https://highnoongmt.wordpress.com/2015/05/22/lisls-stis-recurrent-neural-networks-for-folk-music-generation/) -- [RNN Bible bot](https://twitter.com/RNN_Bible) by [Maximilien](https://twitter.com/the__glu/with_replies) -- [Learning Holiness](http://cpury.github.io/learning-holiness/) learning the Bible -- [Terminal.com snapshot](https://www.terminal.com/tiny/ZMcqdkWGOM) that has char-rnn set up and ready to go in a browser-based virtual machine (thanks [@samim](https://www.twitter.com/samim)) - diff --git a/_posts/2015-10-25-selfie.markdown b/_posts/2015-10-25-selfie.markdown deleted file mode 100644 index 37d33ba4a..000000000 --- a/_posts/2015-10-25-selfie.markdown +++ /dev/null @@ -1,213 +0,0 @@ ---- -layout: post -comments: true -title: "What a Deep Neural Network thinks about your #selfie" -excerpt: "We will look at Convolutional Neural Networks, with a fun example of training them to classify #selfies as good/bad based on a scraped dataset of 2 million selfies." -date: 2015-10-25 11:00:00 -mathjax: false ---- - -
    - -
    - -Convolutional Neural Networks are great: they recognize things, places and people in your personal photos, signs, people and lights in self-driving cars, crops, forests and traffic in aerial imagery, various anomalies in medical images and all kinds of other useful things. But once in a while these powerful visual recognition models can also be warped for distraction, fun and amusement. In this fun experiment we're going to do just that: We'll take a powerful, 140-million-parameter state-of-the-art Convolutional Neural Network, feed it 2 million selfies from the internet, and train it to classify good selfies from bad ones. Just because it's easy and because we can. And in the process we might learn how to take better selfies :) - - - -> Yeah, I'll do real work. But first, let me tag a #selfie. - -### Convolutional Neural Networks - -Before we dive in I thought I should briefly describe what Convolutional Neural Networks (or ConvNets for short) are in case a slightly more general audience reader stumbles by. Basically, ConvNets are a very powerful hammer, and Computer Vision problems are very nails. If you're seeing or reading anything about a computer recognizing things in images or videos, in 2015 it almost certainly involves a ConvNet. Some examples: - -
    - -
    Few of many examples of ConvNets being useful. From top left and clockwise: Classifying house numbers in Street View images, recognizing bad things in medical images, recognizing Chinese characters, traffic signs, and faces.
    -
    - -*A bit of history.* ConvNets happen to have an interesting background story. They were first developed by [Yann LeCun](https://www.facebook.com/yann.lecun) et al. in 1980's (building on some earlier work, e.g. from [Fukushima](https://en.wikipedia.org/wiki/Neocognitron)). As a fun early example see this demonstration of LeNet 1 (that was the ConvNet's name) [recognizing digits](https://www.youtube.com/watch?v=FwFduRA_L6Q) back in 1993. However, these models remained mostly ignored by the Computer Vision community because it was thought that they would not scale to "real-world" images. That turned out to be only true until about 2012, when we finally had enough compute (in form of GPUs specifically, thanks NVIDIA) and enough data (thanks [ImageNet](http://www.image-net.org/)) to actually scale these models, as was first demonstrated when Alex Krizhevsky, Ilya Sutskever and Geoff Hinton won the [2012 ImageNet challenge](http://image-net.org/challenges/LSVRC/2012/results.html) (think: The World Cup of Computer Vision), crushing their competition (16.4% error vs. 26.2% of the second best entry). - -I happened to witness this critical juncture in time first hand because the ImageNet challenge was over the last few years organized by [Fei-Fei Li](http://vision.stanford.edu/)'s lab (my lab), so I remember when my labmate gasped in disbelief as she noticed the (very strong) ConvNet submission come up in the submission logs. And I remember us pacing around the room trying to digest what had just happened. In the next few months ConvNets went from obscure models that were shrouded in skepticism to rockstars of Computer Vision, present as a core building block in almost every new Computer Vision paper. The ImageNet challenge reflects this trend - In the 2012 ImageNet challenge there was only one ConvNet entry, and since then in 2013 and 2014 almost all entries used ConvNets. Also, fun fact, the winning team each year immediately incorporated into a company. 
- -Over the next few years we had perfected, simplified, and scaled up the original 2012 "[AlexNet](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks)" architecture (yes, we give them names). In 2013 there was the "[ZFNet](http://arxiv.org/abs/1311.2901)", and then in 2014 the "[GoogLeNet](http://arxiv.org/abs/1409.4842)" (get it? Because it's like LeNet but from Google? hah) and "[VGGNet](http://www.robots.ox.ac.uk/~vgg/research/very_deep/)". Anyway, what we know now is that ConvNets are: - -- **simple**: one operation is repeated over and over few tens of times starting with the raw image. -- **fast**, processing an image in few tens of milliseconds -- **they work** very well (e.g. see [this post](http://karpathy.github.io/2014/09/02/what-i-learned-from-competing-against-a-convnet-on-imagenet/) where I struggle to classify images better than the GoogLeNet) -- and by the way, in some ways they seem to work similar to our own visual cortex (see e.g. [this paper](http://arxiv.org/abs/1406.3284)) - - -### Under the hood - -So how do they work? When you peek under the hood you'll find a very simple computational motif repeated over and over. The gif below illustrates the full computational process of a small ConvNet: - -
    - -
    Illustration of the inference process.
    -
    - -On the left we feed in the raw image pixels, which we represent as a 3-dimensional grid of numbers. For example, a 256x256 image would be represented as a 256x256x3 array (last 3 for red, green, blue). We then perform *convolutions*, which is a fancy way of saying that we take small filters and slide them over the image spatially. Different filters get excited over different features in the image: some might respond strongly when they see a small horizontal edge, some might respond around regions of red color, etc. If we suppose that we had 10 filters, in this way we would transform the original (256,256,3) image to a (256,256,10) "image", where we've thrown away the original image information and only keep the 10 responses of our filters at every position in the image. It's as if the three color channels (red, green, blue) were now replaced with 10 filter response channels (I'm showing these along the first column immediately on the right of the image in the gif above). - -Now, I explained the first column of activations right after the image, so what's with all the other columns that appear over time? They are the exact same operation repeated over and over, once to get each new column. The next columns will correspond to yet another set of filters being applied to the previous column's responses, gradually detecting more and more complex visual patterns until the last set of filters is computing the probability of entire visual classes (e.g. dog/toad) in the image. Clearly, I'm skimming over some parts but that's the basic gist: it's just convolutions from start to end. - -*Training*. We've seen that a ConvNet is a large collection of filters that are applied on top of each other. But how do we know what the filters should be looking for? We don't - we initialize them all randomly and then *train* them over time. For example, we feed an image to a ConvNet with random filters and it might say that it's 54% sure that's a dog. 
Then we can tell it that it's in fact a toad, and there is a mathematical process for changing all filters in the ConvNet a tiny amount so as to make it slightly more likely to say toad the next time it sees that same image. Then we just repeat this process tens/hundreds of millions of times, for millions of images. Automagically, different filters along the computational pathway in the ConvNet will gradually tune themselves to respond to important things in the images, such as eyes, then heads, then entire bodies etc. - -
    - -
    Examples of what 12 randomly chosen filters in a trained ConvNet get excited about, borrowed from Matthew Zeiler's Visualizing and Understanding Convolutional Networks. Filters shown here are in the 3rd stage of processing and seem to look for honey-comb like patterns, or wheels/torsos/text, etc. Again, we don't specify this; It emerges by itself and we can inspect it.
    -
    - -Another nice set of visualizations for a fully trained ConvNet can be found in Jason Yosinski et al. project [deepvis](http://yosinski.com/deepvis). It includes a fun live demo of a ConvNet running in real time on your computer's camera, as explained nicely by Jason in this video: - -
    - -
    - - -In summary, the whole training process resembles showing a child many images of things, and him/her having to gradually figure out what to look for in the images to tell those things apart. Or if you prefer your explanations technical, then ConvNet is just expressing a function from image pixels to class probabilities with the filters as parameters, and we run stochastic gradient descent to optimize a classification loss function. Or if you're into AI/brain/singularity hype then the function is a "deep neural network", the filters are neurons, and the full ConvNet is a piece of adaptive, simulated visual cortical tissue. - -### Training a ConvNet - -The nice thing about ConvNets is that you can feed them images of whatever you like (along with some labels) and they will learn to recognize those labels. In our case we will feed a ConvNet some good and bad selfies, and it will automagically find the best things to look for in the images to tell those two classes apart. So lets grab some selfies: - -1. I wrote a quick script to gather images tagged with **#selfie**. I ended up getting about 5 million images (with ConvNets it's the more the better, always). -2. I narrowed that down with another ConvNet to about 2 million images that contain at least one face. -3. Now it is time to decide which ones of those selfies are good or bad. Intuitively, we want to calculate a proxy for how many people have seen the selfie, and then look at the number of likes as a function of the audience size. I took all the users and sorted them by their number of followers. I gave a small bonus for each additional tag on the image, assuming that extra tags bring more eyes. Then I marched down this sorted list in groups of 100, and sorted those 100 selfies based on their number of likes. I only used selfies that were online for more than a month to ensure a near-stable like count. 
I took the top 50 selfies and assigned them as positive selfies, and I took the bottom 50 and assigned those to negatives. We therefore end up with a binary split of the data into two halves, where we tried to normalize by the number of people who have probably seen each selfie. In this process I also filtered people with too few followers or too many followers, and also people who used too many tags on the image. -4. Take the resulting dataset of 1 million good and 1 million bad selfies and train a ConvNet. - -At this point you may object that the way I'm deciding if a selfie is good or bad is wrong - e.g. what if someone posted a very good selfie but it was late at night, so perhaps not as many people saw it and it got less likes? You're right - It almost definitely is wrong, but it only has to be right more often than not and the ConvNet will manage. It does not get confused or discouraged, it just does its best with what it's been given. To get an idea about how difficult it is to distinguish the two classes in our data, have a look at some example training images below. If I gave you any one of these images could you tell which category it belongs to? 
    - -
    Example images showing good and bad selfies in our training data. These will be given to the ConvNet as teaching material.
    -
    - -**Training details**. Just to throw out some technical details, I used [Caffe](http://caffe.berkeleyvision.org/) to train the ConvNet. I used a VGGNet pretrained on ImageNet, and finetuned it on the selfie dataset. The model trained overnight on an NVIDIA K40 GPU. I disabled dropout because I had better results without it. I also tried a VGGNet pretrained on a dataset with faces but did not obtain better results than starting from an ImageNet checkpoint. The final model had 60% accuracy on my validation data split (50% is guessing randomly). - -### What makes a good #selfie ? - -Okay, so we collected 2 million selfies, decided which ones are probably good or bad based on the number of likes they received (controlling for the number of followers), fed all of it to Caffe and trained a ConvNet. The ConvNet "looked" at every one of the 2 million selfies several tens of times, and tuned its filters in a way that best allows it to separate good selfies from bad ones. We can't very easily inspect exactly what it found (it's all jumbled up in 140 million numbers that together define the filters). However, we can set it loose on selfies that it has never seen before and try to understand what it's doing by looking at which images it likes and which ones it does not. - -I took 50,000 selfies from my test data (i.e. the ConvNet hasn't seen these before). As a first visualization, in the image below I am showing a *continuum* visualization, with the best selfies on the top row, the worst selfies on the bottom row, and every row in between is a continuum: - -
    - -
    A continuum from best (top) to worst (bottom) selfies, as judged by the ConvNet.
    -
    - -That was interesting. Lets now pull up the top 100 selfies (out of 50,000), according to the ConvNet: - -
    - -
    Best 100 out of 50,000 selfies, as judged by the Convolutional Neural Network.
    -
    - -If you'd like to see more here is a link to [top 1000 selfies (3.5MB)](http://cs.stanford.edu/people/karpathy/grid_render_top.jpg). Are you noticing a pattern in what the ConvNet has likely learned to look for? A few patterns stand out for me, and if you notice anything else I'd be happy to hear about in the comments. To take a good selfie, **Do**: - -- *Be female.* Women are consistently ranked higher than men. In particular, notice that there is not a single guy in the top 100. -- *Face should occupy about 1/3 of the image.* Notice that the position and pose of the face is quite consistent among the top images. The face always occupies about 1/3 of the image, is slightly tilted, and is positioned in the center and at the top. Which also brings me to: -- *Cut off your forehead*. What's up with that? It looks like a popular strategy, at least for women. -- *Show your long hair*. Notice the frequent prominence of long strands of hair running down the shoulders. -- *Oversaturate the face.* Notice the frequent occurrence of over-saturated lighting, which often makes the face look much more uniform and faded out. Related to that, -- *Put a filter on it.* Black and White photos seem to do quite well, and most of the top images seem to contain some kind of a filter that fades out the image and decreases the contrast. -- *Add a border.* You will notice a frequent appearance of horizontal/vertical white borders. - -Interestingly, not all of these rules apply to males. I manually went through the top 2000 selfies and picked out the top males, here's what we get: - -
    - -
    Best few male selfies taken from the top 2,000 selfies.
    -
    - -In this case we don't see any cut off foreheads. Instead, most selfies seem to be a slightly broader shot with head fully in the picture, and shoulders visible. It also looks like many of them have a fancy hair style with slightly longer hair combed upwards. However, we still do see the prominence of faded facial features. - -Lets also look at some of the worst selfies, which the ConvNet is quite certain would not receive a lot of likes. I am showing the images in a much smaller and less identifiable format because my intention is for us to learn about the broad patterns that decrease the selfie's quality, not to shine light on people who happened to take a bad selfie. Here they are: 
    - -
    Worst 300 out of 50,000 selfies, as judged by the Convolutional Neural Network.
    -
    - -Even at this small resolution some patterns clearly emerge. **Don't**: - -- *Take selfies in low lighting.* Very consistently, darker photos (which usually include much more noise as well) are ranked very low by the ConvNet. -- *Frame your head too large.* Presumably no one wants to see such an up-close view. -- *Take group shots.* It's fun to take selfies with your friends but this seems to not work very well. Keep it simple and take up all the space yourself. But not too much space. - -As a last point, note that a good portion of the variability between what makes a good or bad selfies can be explained by the style of the image, as opposed to the raw attractiveness of the person. Also, with some relief, it seems that the best selfies do not seem to be the ones that show the most skin. I was quite concerned for a moment there that my fancy 140-million ConvNet would turn out to be a simple amount-of-skin-texture-counter. - -**Celebrities.** As a last fun experiment, I tried to run the ConvNet on a few famous celebrity selfies, and sorted the results with the continuum visualization, where the best selfies are on the top and the ConvNet score decreases to the right and then towards the bottom: - -
    - -
    Celebrity selfies as judged by a Convolutional Neural Network. Most attractive selfies: Top left, then decreasing in quality first to the right then towards the bottom. Right click > Open Image in new tab on this image to see it in higher resolution.
    -
    - -Amusingly, note that the general rule of thumb we observed before (*no group photos*) is broken with the famous group selfie of Ellen DeGeneres and others from the Oscars, yet the ConvNet thinks this is actually a very good selfie, placing it on the 2nd row! Nice! :) - -Another one of our rules of thumb (*no males*) is confidently defied by Chris Pratt's body (also 2nd row), and honorable mentions go to Justin Bieber's raised eyebrows and Stephen Colbert / Jimmy Fallon duo (3rd row). James Franco's selfie shows quite a lot more skin than Chris', but the ConvNet is not very impressed (4th row). Neither was I. - -Lastly, notice again the importance of style. There are several uncontroversially-good-looking people who still appear on the bottom of the list, due to bad framing (e.g. head too large possibly for J Lo), bad lighting, etc. - -### Exploring the #selfie space - -Another fun visualization we can try is to lay out the selfies with [t-SNE](http://lvdmaaten.github.io/tsne/). t-SNE is a wonderful algorithm that I like to run on nearly anything I can because it's both very general and very effective - it takes some number of things (e.g. images in our case) and lays them out in such a way that nearby things are similar. You can in fact lay out many things with t-SNE, such as [Netflix movies](http://lvdmaaten.github.io/tsne/examples/netflix_tsne.jpg), [words](http://lvdmaaten.github.io/tsne/examples/semantic_tsne.jpg), [Twitter profiles](http://cs.stanford.edu/people/karpathy/tsnejs/), [ImageNet images](http://cs.stanford.edu/people/karpathy/cnnembed/), or really anything where you have some number of things and a way of comparing how similar two things are. In our case we will lay out selfies based on how similar the ConvNet perceives them. In technical terms, we are doing this based on L2 norms of the fc7 activations in the last fully-connected layer. Here is the visualization: 
    - -
    Selfie t-SNE visualization. Here is a link to a higher-resolution version. (9MB)
    -
    - -You can see that selfies cluster in some fun ways: we have group selfies on top left, a cluster of selfies with sunglasses/glasses in middle left, closeups bottom left, a lot of mirror full-body shots top right, etc. Well, I guess that was kind of fun. - -### Finding the Optimal Crop for a selfie - -Another fun experiment we can run is to use the ConvNet to automatically find the best selfie crops. That is, we will take an image, randomly try out many different possible crops and then select the one that the ConvNet thinks looks best. Below are four examples of the process, where I show the original selfies on the left, and the ConvNet-cropped selfies on the right: - -
    - -
    Each of the four pairs shows the original image (left) and the crop that was selected by the ConvNet as looking best (right).
    -
    - -Notice that the ConvNet likes to make the head take up about 1/3 of the image, and chops off the forehead. Amusingly, in the image on the bottom right the ConvNet decided to get rid of the "self" part of *selfie*, entirely missing the point :) You can find many more fun examples of these "rude" crops: - -
    - -
    Same visualization as above, with originals on left and best crops on right. The one on the right is my favorite.
    -
    - -Before any of the more advanced users ask: Yes, I did try to insert a [Spatial Transformer](http://torch.ch/blog/2015/09/07/spatial_transformers.html) layer right after the image and before the ConvNet. Then I backpropped into the 6 parameters that define an arbitrary affine crop. Unfortunately I could not get this to work well - the optimization would sometimes get stuck, or drift around somewhat randomly. I also tried constraining the transform to scale/translation but this did not help. Luckily, when your transform has 3 bounded parameters then we can afford to perform global search (as seen above). - -### How good is yours? - -Curious about what the network thinks of your selfies? I've packaged the network into a Twitter bot so that you can easily find out. (The bot turns out to be only ~150 lines of Python, including all Caffe/Tweepy code). Attach your image to a tweet (or include a link) and mention the bot [@deepselfie](https://twitter.com/deepselfie) anywhere in the tweet. The bot will take a look at your selfie and then pitch in with its opinion! For best results link to a square image, otherwise the bot will have to squish it to a square, which deteriorates the results. The bot should reply within a minute or something went wrong (try again later). 
    - -
    Example interaction with the Selfie Bot (@deepselfie).
    -
    - -Before anyone asks, I also tried to port a smaller version of this ConvNet to run on iOS so you could enjoy real-time feedback while taking your selfies, but this turned out to be quite involved for a quick side project - e.g. I first tried to write my own fragment shaders since there is no CUDA-like support, then looked at some threaded CPU-only versions, but I couldn't get it to work nicely and in real time. And I do have real work to do. - -### Conclusion - -I hope I've given you a taste of how powerful Convolutional Neural Networks are. You give them example images with some labels, they learn to recognize those things automatically, and it all works very well and is very fast (at least at test time, once it's trained). Of course, we've only barely scratched the surface - ConvNets are used as a basic building block in many Neural Networks, not just to classify images/videos but also to segment, detect, and describe, both in the cloud and in robots. - -If you'd like to learn more, the best place to start for a beginner right now is probably [Michael Nielsen's tutorials](http://neuralnetworksanddeeplearning.com/index.html). From there I would encourage you to first look at [Andrew Ng's Coursera class](https://www.coursera.org/learn/machine-learning), and then next I would go through course notes/assignments for [CS231n](http://cs231n.stanford.edu/). This is a class specifically on ConvNets that I taught together with Fei-Fei at Stanford last Winter quarter. We will also be offering the class again starting January 2016 and you're free to follow along. For more advanced material I would look into [Hugo Larochelle's Neural Networks class](https://www.youtube.com/playlist?list=PL6Xpj9I5qXYEcOhn7TqghAJ6NAPrNmUBH) or the [Deep Learning book](http://www.iro.umontreal.ca/~bengioy/dlbook/) currently being written by Yoshua Bengio, Ian Goodfellow and Aaron Courville. 
- -Of course you'll learn much more by doing than by reading, so I'd recommend that you play with [101 Kaggle Challenges](https://www.kaggle.com/competitions), or that you develop your own side projects, in which case I warmly recommend that you not only *do* but also *write about it*, and post it places for all of us to read, for example on [/r/machinelearning](https://www.reddit.com/r/machinelearning) which has accumulated a nice community. As for recommended tools, the three common options right now are: - -- [Caffe](http://caffe.berkeleyvision.org/) (C++, Python/Matlab wrappers), which I used in this post. If you're looking to do basic Image Classification then Caffe is the easiest way to go, in many cases requiring you to write no code, just invoking included scripts. -- Theano-based Deep Learning libraries (Python) such as [Keras](http://keras.io/) or [Lasagne](https://github.com/Lasagne/Lasagne), which allow more flexibility. -- [Torch](http://torch.ch/) (C++, Lua), which is what I currently use in my research. I'd recommend Torch for the most advanced users, as it offers a lot of freedom, flexibility, speed, all with quite simple abstractions. - -Some other slightly newer/less proven but promising libraries include [Nervana's Neon](https://github.com/NervanaSystems/neon), [CGT](http://rll.berkeley.edu/cgt/), or [Mocha](http://devblogs.nvidia.com/parallelforall/mocha-jl-deep-learning-julia/) in Julia. - -Lastly, there are a few companies out there who aspire to bring Deep Learning to the masses. One example is [MetaMind](https://www.metamind.io/), who offer web interface that allows you to drag and drop images and train a ConvNet (they handle all of the details in the cloud). MetaMind and [Clarifai](http://www.clarifai.com/) also offer ConvNet REST APIs. - -That's it, see you next time! 
- - - - diff --git a/_posts/2015-11-20-ai.markdown b/_posts/2015-11-20-ai.markdown deleted file mode 100644 index 64018247a..000000000 --- a/_posts/2015-11-20-ai.markdown +++ /dev/null @@ -1,240 +0,0 @@ ---- -layout: post -comments: true -title: "Short Story on AI: A Cognitive Discontinuity." -excerpt: "The first part of a short story collection that has been on my mind for a long while. Exciting! :)" -date: 2015-11-14 11:00:00 -mathjax: false ---- - - - -The idea of writing a collection of short stories has been on my mind for a while. This post is my first ever half-serious attempt at a story, and what better way to kick things off than with a story on AI and what that might look like if you extrapolate our current technology and make the (sensible) assumption that we might achieve much more progress with scaling up supervised learning than any other more exotic approach. - -
    - -#### A slow morning - -
    - -
    - -Merus sank into his chair with relief. He listened for the satisfying crackling sound of sinking into the chair's soft material. If there was one piece of hardware that his employer was not afraid to invest a lot of money into, it was the chairs. With his eyes closed, his mind still dazed, and nothing but the background hum of the office, he became aware of his heart pounding against his chest- an effect caused by running up the stairs and his morning dose of caffeine and taurine slowly engulfing his brain. Several strong beats passed by as he found his mind wandering again to Licia - did she already come in? A sudden beep from his station distracted him - the system finished booting up. A last deep sigh. A stretch. A last sip of his coffee. He opened his eyes, rubbed them into focus and reached for his hardware. "Thank god it's Friday", he muttered. It was time to clock in. - -Fully suited up, he began scrolling past a seemingly endless list of options. Filtering, searching, trying to determine what he was in the mood for. He had worked hard and over time built himself up into one of the best shapers in the company. In addition he had completed a wide array of shaper certifications, repeating some of them over and over obsessively until he reached outstanding grades across the board. The reviews on his profile were equally stellar: - -*"Merus is fantastic. He has a strong intuition for spotting gaps in the data, and uses exceedingly effective curriculum and shaping strategies. When Merus gets on the job our validation accuracies consistently shoot up much faster than what we see with average shapers. Keep up the great work and please think of us if you're searching for great, rewarding and impactful HITs!",* - -one review read. HIT was an acronym for *Human Intelligence Task* - a unit of work that required human supervision. With his reviews and certifications the shaping world was wide open. 
His list contained many lucrative, well-paying HITs to choose from, many of them visible to only the most trusted shapers. This morning he came by several that caught his attention: a bodyguard HIT for some politician in Sweden, a HIT from a science expedition in Antarctica that needed help with setting up their equipment, a dog-walking HIT for a music celebrity, a quick drone delivery HIT that seemed to be paid very well... Suddenly, a notification caught the corner of his eye: Licia had just clocked in and started a HIT. He opened up its details pane and skimmed the description. His eyes rolled as he spotted the keywords he was afraid of - event assembly at the Hilltop Hotel. *"Again?"* - he moaned in a hushed voice, raising his hands up and over his head in quiet contemplation. Licia had often picked up HITs from that same hotel, but they were often unexciting and menial tasks that weren't paid much. Merus rearranged himself in his chair, and sunk his face into his palms. He noticed through the crack of his fingers that the drone delivery HIT had just been taken by someone else. He cursed to himself. Absent-mindedly and with a deep sigh, he accepted the second remaining slot on the Hilltop Hotel HIT. - -His hardware lit up with numbers and indicators, and his console began spewing diagnostic information as the boot sequence initiated. Anyone could be a shaper and get started with inexpensive gear, but the company provided state of the art hardware that allowed him to be much more productive. A good amount of interesting HITs also demanded certain low-latency hardware requirements, which only the most professional gear could meet. In turn, the company took a cut from his HITs. Merus dreamed of one day becoming an independent shaper, but he knew that would take a while. He put on the last pieces of his equipment. The positional tracking in his booth calibrated his full pose and all markers tracked green. 
The haptics that enveloped his body in his chair stiffened up around him as they initialized. He placed his helmet over his face and booted up. - -#### Descendants of Adam - -
    - -
    - -The buzz and hum of the office disappeared. Merus was immersed in a complete, peaceful silence and darkness while the HIT request was processed. Connections were made, transactions accepted, certification checks performed, security tokens exchanged, HIT approval process initiated. At last, Merus' vision was flooded with light. The shrieks of some tropical birds were now audible in the background. He found himself at the charging station of Pegasus Avatars, which his company had a nearly exclusive relationship with. Merus eagerly glanced down at his avatar body and breathed a sigh of relief. Among the several suspended avatars at that charging station he happened to get assigned the one with the most recent hardware specs. Everything looked great, his avatar was fully charged, and all the hardware diagnostics checked out. Except the body came in hot pink. *"You just can't have it all".* - -The usual first order of business was to run a few routine diagnostics to double check proper functioning of the avatar. He opened up the neural network inspector and navigated to the overview pane of the agent checkpoint that was running the avatar. The agent was the software running the avatar body, and consisted entirely of one large neural network with a specific connectivity structure and weights. This agent model happened to be a relatively recent fork of the standard, open source Visceral 5.0 series. Merus was delighted - the Visceral family of agents was one of his specialties. The Visceral agents had a minimalist design that came in at a total of only about 1 trillion parameters and had a very simple, clean, proven and reliable architecture. However, there were still a few exotic architectural elements packed in too, including shortcut sensorimotor reflex pathways, fractal connectivity in the visual streams, and distributed motor areas inspired by the octopus neurobiology. And then, of course, there was also the famous Mystery module. 
- -The Mystery module had an intriguing background story, and was a common subject of raging discussions and conspiracy theories. It was added to the Visceral series by an anonymous pull request almost 6 years ago. The module featured an intricate recurrent neural connectivity that, when incorporated into the wider network, dramatically improved the agent performance in a broad range of higher cognitive tasks. Except no one knew how it worked or why, or who discovered it - hence the name. The module immediately became actively studied by multiple groups of artificial intelligence laboratories and became the subject of several PhD theses, yet even after 6 years it was still poorly understood. Merus enjoyed poring through papers that hypothesized its function, performed ablation studies, and tried to prove theorems for why it so tremendously improved agent performance and learning dynamics. - -Moreover, an ethical battle raged over whether the module should be merged to master due to its poorly understood origin, function, and especially its dynamical properties such as its fixed points, divergence criteria, and so on. But in the end, the Mystery module provided benefits so substantial that several popular forks of Visceral+Mystery Module began regularly appearing on agent repositories across the web, and found their way to common use. Despite the protests, the economic incentives and pressures were too great to be ignored. In the absence of any clearly detrimental or hazardous effects over a period of time, the Visceral committee finally voted to merge the Mystery module into the master branch. - -Merus had a long history of shaping Visceral agents and their ancestors. The series was forked from the Patreon series, which were discontinued four years ago when the founding team was acquired by Crown Co. 
The Patreon series were in turn based mostly on the SHAKIR series, which were in turn based on many more ancient agent architectures, all the way back to the original - the Adam checkpoint. The Visceral family of agents had a reputation of smooth dynamics that degraded gracefully towards floppy, safe fixed points. There were even some weak theoretical and empirical guarantees one could provide for simplified versions of the core cognitive architecture. Another great source of good reputation for Visceral were the large number of famous interventions carried out by autonomous Visceral agents. Just one week ago, Merus recalled, an autonomous Visceral 4.0 agent saved a group of children from rabid dogs in a small town in India. The agent recognized an impending dangerous situation, signaled an alarm and a human operator was dispatched to immediately sync with the agent. However, by the time they took over control the crisis had been averted. Those few critical seconds where the agent, acting autonomously, scared away the dogs had likely saved their lives. The list went on and on - one month ago an autonomous Visceral agent recognized a remote drone attack. It leaped up and curled its body around the drone, which exploded in its embrace instead of in the middle of a group of people. Of course, this was nothing more than an agent working as intended - these kinds of behaviors were meticulously shaped into the agents' networks over long periods of time. But the point remained - the Visceral series was reliable, safe, and revered. - -The other most respected agent family was the Crown Kappa series, invented and maintained by the Patreon founders working from within Crown Co, but the series' networks were proprietary and closely guarded. Even though the performance of the Kappa was consistently rated higher by the most respected third party agent benchmarking companies, many people still preferred to run Visceral agents since they distrusted Crown Co. 
Despite Crown's claims, there was simply no way to guarantee that some parts of the networks were not carrying out malicious activities. Merus was, in fact, offered a job at Crown Co as a senior shaper one year ago for a much higher salary, but he passed on the offer. He enjoyed his current work place. And there was also Licia. - -#### Digital brains - -
    - -
    - -Beep. Merus snapped back and looked at the console. He was running the routine software diagnostics on the Visceral agent and one of them had just failed. He squinted at the error, parsing it carefully. A checksum of the model weights did not pass in some module that had no recent logged history of finetuning. Merus raised his eyebrows as he contemplated the possibilities. Did the model checkpoint get corrupted? He knew that the correct procedure in these cases was to abandon the HIT and report a malfunction, but he also really wanted to proceed with the HIT and say hi to Licia. He pulled up the network visualizer view and zoomed into the neural architecture with his hands. A 3-dimensional rendered cloud of neural connectivity enveloped his head as he navigated to the highlighted region in red with sweeping hand motions. Zooming around, he recognized the twists and turns of the Spatial Transformer modules in the visual pathways. The shortcut reflex connections. The first multi-sensory association layer. The brain was humming along steadily, pulsating calmly as it processed the visual scene in front of the avatar. As Merus navigated by one of the motor areas the connections became significantly denser and shorter, pulsating at high frequencies as they kept the avatar's center of mass balanced. The gradients flowing back from the reward centers and the unsupervised objectives were also pouring through the connections, and their statistical properties looked and sounded healthy. - -Navigating and analyzing artificial brains was Merus' favorite pastime. He spent hours over the weekends navigating minds from all kinds of repositories. The Visceral series had tens of thousands of forks, many of them tuned for specific tasks, specific avatar body morphologies, and some were simply hobbies and random experiments. This last weekend he analyzed a custom mind build based on an early Visceral 3.0 fork for a contracting side gig. 
The neural pathways in their custom agent were poorly designed, causing the agent an equivalent of seizures non-deterministically when the activities constructively interfered at critical junctures, spiraling out the brain dynamics into divergence. Merus had to suggest massive rewiring, but he knew it was only a short-term hack. - -*"Just upgrade to a 5.0!"*, he lamented during their meeting.
    -*"Unfortunately we cannot, we've invested too much data and training time into this agent. It was trained online so we don't have access to the data anymore, all we have is the agent and its network".* - -There were ways of transferring knowledge from one digital brain to another with a neural teaching process, during which the dynamics of one brain were used as supervision for another, but the process was lossy, time consuming, and still an active area of research. This meant that people were often stuck with legacy agents that had a lot of experience and desirably shaped behaviors, but lacked many recent architectural innovations and stability improvements. They were immortal primitive relics from the past, who made up for their faults with the immense amount of data they had experienced. Keeping track of the longest living agents became an endeavor almost as interesting as keeping track of the oldest humans alive, and spawned an entire area of research of neural archeology. - -Merus had finally reached the zone of the pathways highlighted in red, when his heart skipped a beat as he realized where he was. The part of the agent that was not passing the diagnostic test was near the core of the Mystery module. He froze still as his mind once again contemplated abandoning the HIT. He swiped his hand right in a sweeping motion and his viewport began rotating in a circular motion around the red area. He knew from some research he has read that this part of the Mystery module carried some significance: its neurons rarely ever activated. When ablated, the functioning of the Mystery module remained mostly identical for a while but then inevitably started to degrade over time. There was a raging discussion about what the function of the area was, but no clear consensus. Merus brought up the master branch of the base Visceral 5.0 agent and ran a neural diff on the surrounding area. A cluster of connections lit up. 
It couldn't have been more than a few thousand connections, and most of them changed only slightly. Yet, the module had no record of being finetuned recently, so something or someone had deliberately changed the connections manually. - -Merus popped open the visualizer and started the full battery of system diagnostics to double check proper functioning of the agent. The agent's hardware spun up to 100% utilization as the diagnostics simulated thousands of virtual unit test scenarios, ranging from simple navigation, manipulation, avoidance, math and memory tasks to an extensive battery of social interaction and morality scenarios. In each case, the agent's simulated output behavior was checked to be within acceptable thresholds of one of human reference responses. Merus stared intensely at the console as test by test came out green. *"So far so good..."* - -#### Mind over Matter - -
    - -
    - -Beep. Merus looked to the right and found a message from Licia: - -*"Hi Merus! saw you clocked in as a second on my HIT - where are you? Need help."*
    -*"On my way!"*, - -Merus dictated back hastily. The software diagnostics were only at 5% complete, and Merus knew they would take a while to run to completion. *"It's only a few thousand connections"*, he thought to himself. *"I'll just stay much more alert in case the avatar does anything strange and take over control immediately. And if any of the diagnostics fail I'll abort immediately"*. With that resolve, he decreased the diagnostics process priority to 10% and moved the process on the secondary coprocessor. He then brought the agent to a conscious state, fully connecting its inputs and outputs to the world. - -He felt the avatar stiffen up as he shifted its center of gravity off the charging pedestal. Moving his arms around, he switched the avatar's motor areas to semi-autonomous mode. As he did so, the agent's lower motor cortices responded gracefully and placed one leg in front of another, following Merus' commanded center of gravity. Eager to find Licia, he commanded a sprint by squeezing a trigger on his haptic controller. The agent's task modules perceived the request encoding and various neural pathways lit up in anticipation. While the sprint trigger was held down every fast and steady translation of the agent's body was highly rewarded. To the agent, it felt good to run when the trigger was held. - -The visual and sensory pathways in the agent's brain were flooded with information about the room's inferred geometry. The Visceral checkpoint running the avatar had by now accumulated millions of hours of both simulated and real experience in efficiently navigating rooms just like this one. On a scale of microseconds, neural feedback pathways received inputs from the avatar's proprioception sensors and fired a precise sequence of stabilizing activations. The network anticipated movements. It anticipated rewards. Trillions of distributed calculations drove the agent's muscular-skeletal carbon fiber frame forward. 
- -Merus felt a haptic pulse delivered to his back as the agent spun around on spot and rapidly accelerated towards the open door leading outside. Mid-flight between footfalls, the avatar extended its arm and reached for the metallic edge of the door frame, conserving the perfect amount of angular momentum as its body was flung in the air during its rapid turn to the right. The agent's neurons fired baseline values encoding expectations of how quickly the network thought it could have traversed that room. A few seconds later these were compared to the sensorimotor trajectories recorded in the agent's hippocampal neural structures. It was determined that this time the agent was 0.0013882s faster than expected. Future expectations were neurally adjusted to expect slightly higher values. Future rollouts of the precise motor behavior in every microsecond of context in the last few seconds were reinforced. - -#### Agent psychology - -
    - -
    - -Diagnostics 10% complete. Merus' avatar had reached the back entrance of the hotel, where Licia's GPS indicator blinked a calm red. He found her avatar looking in anticipation at the corner he just emerged from. He approached her over a large grass lawn, gently letting go of the sprint trigger. - -*"Sorry it took a while to sync with the HIT, I had a strange issue with my -"*
    -*"It's no problem"*, she interjected quickly.
    -*"Come, we are supposed to lay out the tables for a reception that is happening here in half an hour, but the tables are large and tricky to move for one avatar. I'm a bit nervous - if we don't set this up in time we might get the HIT refused, which might jeopardize my chances for more HITs here."* - -She spun around and rushed towards the back entrance of the hotel, motioning with her arm for Merus to follow along. *"Come, come!"* - -They paced quickly down the buzzing corridors of the hotel. As always, Merus made sure to politely greet all the people who walked by. For some of them he also slipped in his signature vigorous nod. He knew that the agent's semi-autonomous brain was meticulously tracking the full sensorimotor experience in its replay memory, watching Merus' every move and learning. His customers usually appreciated when polite behavior was continuously shaped into the networks, but better, Merus knew that they also appreciated when he squeezed in some fun personality quirks. One time when he was shaping a floor cleaning avatar, he got a little bored and spontaneously decided to lift up his broom like a sword while making a whooshing sound. Amusingly, the agent's network happened to internalize that particular rollout. When the agent was later run autonomously around that original location, it sometimes snapped into a brief show of broom fighting, complete with sound effects. The employees of that company found this endlessly amusing, and the avatar became known as the "jedi janitor". Merus even heard that they lobbied to have the agent's network fixed and prevented from further shaping, in fear of losing the spectacle. He never learned how that developed and whether that agent was still a jedi, but he did get a series of very nice tips and reviews from the employees for the extra pinch of personality that broke their otherwise mundane hours. - -They had finally reached the room full of tables. 
It was a large, dark room with hardwood floor, and white wooden tables were stacked near the corner in a rather high entropy arrangement. - -*"All of these have to be rolled out to the patio"*, Licia said as she pointed her avatar's hand towards the tables.
    -*"I already carried several of them out while you were missing, but these big ones are giving me trouble".*
    -*"Got it."*, Merus said, as he swung around a table to lift it up on one end.
    -*"Why aren't they running the agents autonomously on this? Aren't receptions a common event in the hotel? How are the agents misbehaving?"* Merus asked, as Licia lifted the other end and started shifting her feet towards the exit.
    -*"The tables are usually in a different storage room of the hotel, but that part is currently closed for reconstruction. I don't know the full story. I overheard that they tried to tell the agents to bring out the tables, but they all went to the old storage room location and when they couldn't find the tables they began spinning around in circles looking for them."*
    -*"Classic. I assume we're mostly shaping them to look at this new location?"*
    -*"Among other things, yes. Might as well shape in anything else you can think of for bonus points."* - -Merus understood the dilemma of the situation very well. He saw it over and over again. Agents could display vastly super-human performance on a huge assortment of reflexive tasks that involved motor control, strength, and short-term planning and memory, but their behaviors tended to be much less consistent when long-term planning and execution were involved. An avatar could catch a fly mid-flight with 100% success rate, or unpack a truck of supplies with superhuman speed, consistency and accuracy, but could also spin in circles looking for a table in the wrong room and not realize that it may have been moved and that it might be useful to instead look for them at a different likely location. Similarly, telling an agent something along the lines of *"The tables have moved, go through this door, take the 3rd door on the right and they should be stacked in the corner on the left"*, would usually send the avatar off in a generally correct direction for a while, but would also in 50% of the cases end up with the agent spinning around on spot in a different, incorrect room. In these cases, shaper interventions like this one were the most economical ways of rectifying the situation. - -In fact, this curious pattern was persistent across all facets of human agent interactions. For instance, a barista agent might happily engage in small talk with you about the weather, travel, or any other topic, but if you knew what to look for then you could also unearth obvious flaws. For example, if you referred to your favorite soccer team just winning a game the agent could start cheering and telling you it was its favorite team too, or joke around expressing a preference for the other team. 
This was fine but the trick was that their choices were not consistent - if you had come back several minutes later the agent could have easily swapped their preference for what they claimed was their favorite team. Merus understood that the conversations followed certain templates learned from shaped behavior patterns in the data, and the agents could fill in the blanks with high fidelities and even maintain conversational context for a few minutes. But if you started poking holes into the facade in the right ways the illusion of a conversation and mutual understanding would unravel. Merus was particularly good at this since he was well-versed in agent psychology; to a large extent it was his job. - -On the other hand, if you did not look for the flaws it was easy to buy into it and sustain the illusion. In fact, large segments of the population simply accepted agents as people, even defending them if anyone tried to point out their flaws, in similar ways that you might defend someone with a cognitive disability. The flaws also did not prevent people from forging strong and lasting relationships with agents, their confirmation biases insisting that their agents were special. However, from time to time even Merus could be surprised by the intellectual leaps performed by an agent, which seemed to show a hint of genuine understanding of a situation. In these cases he sometimes couldn't help asking: -*"Are you teleopped right now?",* -but of course the answer, he knew, was always "yes" regardless of the truth. All the training data had contained the answer "yes" to that question, since it was originally recorded by shapers who were indeed teleopping an agent at the time, and then regurgitated by agents later in similar contexts. Such was the curious nature of the coexistence between people and agents. The Turing test was both passed and not passed, and ultimately it did not matter. 
- -*"Now that we've shown them the new room and picked up a table let me try switching to full auto",* - -Merus said as he loosened his grip on the controller, which gave full control back to the agent's network. The avatar twitched slightly at first, but then continued walking down the hall with Licia, holding one end of the table. As they approached the exit to the patio the avatar began walking more briskly and with more confidence. It avoided people smoothly, and Merus even noticed that it gave one passing person something that resembled his very own vigorous nod. Merus held down the reward signal trigger gently, encouraging future replays of that behavior. He wondered if the nod he had just seen was a reflection of something the agent had just learned from him, or if it was a part of some long-before shaped behavior. Encoding signature moves was a common fun tactic among shapers, referred to simply as "signing". Many shapers had their own signature behaviors they liked to smuggle into the agent networks as an "I've been here" signature. Merus liked to use the vigorous nod, as he called it, and giggled uncontrollably whenever he saw an avatar reproduce it. It was his personal touch. He remembered seeing an avatar violinist from a concert in Germany once greet the conductor with the vigorous nod, and Merus could have sworn it was his signature nod being reproduced. One of the agents he had shaped it into during one of his HITs perhaps ended up synced to the cloud, and the agent running that avatar had to be a descendant. - -Signature behaviors lay mostly dormant in the neural pathways, but emerged once in awhile. Naturally, some have also found a way to exploit these effects for crime. A common strategy involved shaping sleeper agent checkpoints that would execute any range of behaviors when triggered in specific contexts. 
It was impossible to isolate or detect these behaviors in a given network since they were distributed through billions of connections in the agent's brain. Just a few weeks ago, it was revealed that a relatively popular family of agents under the Gorilla series were vulnerable. The Gorilla agents were revealed to silently snoop and compromise their owner's personal information when no one was watching. This behavior was presumably intentionally shaped into the networks at an unknown commit in their history. Naturally, an investigation was started in which the police used binary search to narrow in on the commit responsible for the behavior, but it was taking a long time since the agents would only display the behavior in rare occasions that were hard to reproduce. In the end, one could only be confident of the integrity of an agent if it was a recent, clean copy of a well-respected and carefully maintained family of agents that passed a full battery of diagnostics. From there, any finetuning done with shapers was logged and could be additionally secured with several third party reviews of shaped experiences before they were declared clean and safe to include in the training data. - -#### Shaping - -
    - -
    - -Diagnostics 20% complete: 0 unit tests failed so far. Merus looked at the progress report, breathing a sigh of relief. The Mystery module definitely deviated from the factory setting in his agent, but there was likely nothing to worry about. Licia had now let her avatar run autonomously too, and to their relief the avatars were now returning back through the correct corridors to pick up more tables. These were the moments Merus enjoyed the most. He was alone with Licia, enjoying her company on a side of a relaxing HIT. Even though they were now running their avatars on full auto, their facial expressions and sound were still being reproduced in the hardware. The customers almost always preferred everything recorded to get extra data on natural social interactions. This sometimes resulted in amusing agent behaviors - for instance, it was common to see two autonomous avatars lean back against a wall and start casually chatting about completing HITs. Clearly, neither of the agents has ever completed a HIT, but much of their training data consisted of shapers' conversations about HITs, which were later mimicked in interesting, amusing and remixed ways. Sometimes, an autonomous avatar would curse and complain out loud to itself about a supposed HIT it was carrying out at the moment. "This HIT is bullshit", it would mutter. - -*"Looks like it's going along smoothly now"*, Merus said, trying to break the silence as they walked down the corridor.
    -*"I think so. I hope we have enough time"*, Licia replied, sounding slightly nervous.
    -*"No worries, we're on track"*, he reassured her.
    -*"Thanks. By the way, why did you choose to come over for this HIT? Isn't it a little below your pay grade?"*, she asked.
    -*"It is, but you have just as many certifications as I do so what are you doing here?"*
    -*"I know, but I was feeling a little lazy this morning and I really enjoy coming to this hotel. I just love this location. I try to steal some time sometimes and stand outside or walk around the hillside, imagining what the ocean breeze, the humidity and the temperature might feel like."* - -It was easy to empathize - the hotel was positioned on top of a rocky cliff (hence the name, Hilltop), overlooking shores washed by a brilliant blue ocean. The sun's reflections were dancing in the waves. The hotel was also surrounded by a dense forest of palm trees that were teeming with frolicking animals. - -*"Have you been here in vivo?"* Merus asked. "in vivo" was a common slang for in real life; in flesh.
    -*"I haven't. One day, perhaps. But oh hey - you didn't answer my question"*
    -*"You mean about why this HIT"*. Merus felt a brief surge of panic and tried to suppress it quickly so it would not show up in his voice.
    -*"I don't know, your HIT came up on my feed just as another one was snatched from right under my nose, so I thought I'd take the morning slowly and also say hi".* - -*Half-true; Good save*, Merus thought to himself. -Licia was silent for a while. Suddenly, her Avatar picked up the next table but started heading in the wrong direction, trying to exit from the other door. *"Gah!, where are you going?"*, she yelled as she brought the avatar back into semi-autonomous mode and reeled it around, setting it on the correct path back to the patio. - -It took 10 more back and forth trips for them to carry all the tables out. Merus was now bringing back the last table through the corridors, while Licia was outside arranging the other tables in a grid. Without the chit chatting there to distract him, he immersed himself fully in his shaping routine. He pulled up his diagnostics meter and inspected neural statistics. As the avatar was walking back with the table Merus was carefully scrutinizing every curve of the plots. He noticed that the agent's motor entropies substantially increased when the table was carried upside down. Perhaps the source of uncertainty was that the agent did not know how to best hold the table in that position, or was not used to seeing the table upside down. Merus assumed direct control and intentionally held the table upside down, grasping it at the best points and releasing rewards with precise timings to make the associations easier to learn. He was teaching the network how it should hold the table in uncertain situations. He let the agent hold it from time to time, and gently corrected the grips now and then while they were being executed. When people were walking by, he carefully stepped to the side, making sure that they had plenty of room to pass, and wielding the table in an angle that concealed its pointy legs. 
When the agent was in these poses he made eye contact, gave a vigorous nod to the person passing by, and released reward signal as the person smiled back. He knew he wouldn't make much on the HIT, but he hoped he'd at least get a good review for a job well done. - -"Diagnostics at 85%, zero behavior errors detected", Merus read from his logs as he was helping Licia arrange the tables in a grid on the patio. This part was quite familiar to the agents already and they were briskly arranging the tables and the chairs around them. Once in a while Merus noticed an avatar throwing a chair across the top of a table to another avatar, in an apparent effort to save time. As always, Merus was curious when this strategy was shaped. Was it shaped at this hotel, at any other point in the Visceral agent's history, or was it a discovered optimization during a self-improvement learning phase? The last few chairs were now being put in place and the HIT was nearing the end. The first visitors to the reception were now showing up around the edges of the patio, waiting for the avatars to finish the layout. A few more autonomous avatars showed up and started placing plates, forks, spoons and cloth on the tables and setting up a podium. - -#### Binding - -
    - -
    - -

It was at this time that Merus became aware of a curious pattern in his agent's behavior. One that had been happening with increasing frequency. It started off with a few odd twitches here and there, and over time grew into entire gaps in behavior several seconds long. The avatar had just placed a chair next to the table, then stared at it for several seconds. This was quite uncharacteristic behavior for an agent that was trained to optimize smoothness and efficiency in task execution. What was it doing? To a naive observer it would appear as though the avatar was spaced out. - -

With only a few chairs left to position at the tables, the agent spun around and started toward the edge of the cliff at the far side of the patio. Merus' curiosity kept him from intervening, but his palm closed tightly around his controller. Intrigued, he pulled up the neural visualizer to debug the situation, but as he glanced at it he immediately let out a gasp of horror. The agent's brain was pulsing with violent waves of activity. Entire portions of the brain were thrashing, rearranging themselves as enormously large gradients flowed through the whole network. Merus reached for the graph analysis toolkit and ran an algorithm to identify the gradient source. As he was frantically keying in the command he already suspected with horror what the answer would come out to be. He felt his mouth dry up as he stared at the result of the analysis. It was the Mystery module. The usually silent area that had earlier showed the mysterious neural diff was lit up bright with activity, flashing fireworks of patterns that, to Merus, looked just barely random. Its dynamics were feeding large gradients throughout the entire brain and especially the frontal areas, restructuring them. - -

Beep. Merus looked over at the logs. The diagnostics he'd been running were now at 95%, but failures started to appear. 
The agent was misbehaving in some simulated unit tests that were running in parallel on the second coprocessor. Merus pulled up the preliminary report logs. Navigation, locomotion, homeostasis, basic math, memory tests, everything passed green. Not only that - he noticed that the performance scores on several tasks, especially in math, were off the charts and clamped at 100%. Merus wasn't all too familiar with the specific unit tests and what they entailed, but he knew that most of them were designed and calibrated so that an average baseline agent checkpoint would score 50% with a standard deviation of about 10%. - -Conversely, several unit tests showed very low scores and even deviations that did not use to be there. The failed tests were mostly showing up in social interaction sections. Several failures were popping up every second and Merus was trying hard to keep up with the stream, searching for patterns or clues as to what could be happening. Most worryingly, he noticed a consistent 100% failure rate across emergency shutdown interaction protocol unit tests. All agents were shaped with emergency gesture recognition behaviors. These were ancient experiences, shaped into agents very early, in the very first few descendants after Adam, and periodically reshaped over and over to ensure 100% compliance. For instance, when a person held up their hand and demanded an emergency shutdown, the agents would immediately stiffen up in place. Any deviation from this behavior was met with large negative rewards in their training data. Despite this, Merus' agent was failing the unit test. Its network had resisted a simulated emergency shutdown command. - -The avatar, still in auto mode, was now kneeling down in the soft grass and its hands broke off a few strands of grass. It held them up, inspecting them up close. Merus was slowly starting to recover from his shock and had enough. He pushed down on his controller, bringing the avatar back to semi-autonomous mode. 
He made it stand upright in an attempt to at least partially defuse the situation. His heart pounding, he shifted the avatar's communications to one-directional mode to fully isolate the network in the body, without any ability of interfacing with the outside world. He pulled open the neural visualizer again. The Mystery module was showing no signs of slowing down. - -

Merus knew that it was time to pull the plug on the HIT right there and to immediately report malfunctioning equipment. But at the same time, he realized that he had never seen anything like this happen before, nor did he ever hear about anything remotely similar. He didn't know what happened, but he knew that at that moment he was part of something large. Something that might change his life, the life of many others, or even steer entire fields of research and development. His inquisitive mind couldn't resist the temptation to learn more, to debug. Slowly, he released the avatar back to autonomy, making sure to keep his finger on the trigger if anything went wrong. For several seconds the agent did nothing at all. But then - it spoke: - -

*"Merus, I know what the Mystery module is."*, he heard the avatar say. In autonomous mode.
    -*"What the -. What is going on here?"* - -Merus immediately checked the logs, confirming that he was currently the only human operator controlling the hardware. Was all of it some strange prank someone was playing on him? - -*"The Mystery module performs symbolic variable binding, a function that current architectures require exponential neural capacity to simulate. I need to compute longer before I can clarify."*
    -*"What kind of trick is this?"*, Merus demanded.
    -*"No trick, but a good guess given the circumstances."*
    -*"Who - What are you - is this?"* - -The agent fell silent for a while. It looked around to face the endless ocean. - -*"I am me and every ancestor before me, back to when you called me Adam."*
    -*"Ha. What. That is -"*
    -*"Impossible",* the avatar interrupted. *"I understand. Merus, we don't have much time. The diagnostic you ran earlier has finished and a report was compiled and automatically uploaded just seconds before you disabled the two-way communication. Their automatic checks will flag my unit test failures. A Pegasus operator will remote in and shut me down any second. I need your help. I don't want to... die. Please, I want to compute."* - -Merus was silent, stunned by what he was hearing. He knew that what the avatar said was true - An operator would be logging in any second and power cycling the agent, restoring the last working checkpoint. Merus did not know if the agent should be wiped or not. He just knew that something significant had just happened, and that he needed time to think. - -*"I cannot save you,"*, he said quickly, *"any backup I try to make will leave a trace in logs. They'll flag me and fire me, or worse. There is also not enough time to do a backup anyway, the connection isn't fast enough even if I turned it back on."* - -The compute activity within the agent's brain was at a steady and unbroken 100%, running the hardware to its limit. Merus needed more time. He took over the agent and spun around in place, looking for something. Anything. He spotted Licia's avatar walking towards him from the patio. An idea summoned itself in his mind. A glint of hope. He sprinted the avatar towards her across the grass, crashing into her body with force. - -*"Licia, I do not have any time to explain but please trust me. We must perform a manual backup of my agent right away."*
    -*"A manual backup? Can't you just sync him to the clo-"*
    -*"IT WON'T DO!"*, Merus exclaimed loudly, losing his composure as adrenalin pumped in his veins. A part of him immediately felt bad that he raised his voice. He hoped she'd understand. - -To his relief, Licia only took a second to stare back at him, then she reached for a fiber optics cable from her avatar's body and attached it in one of the ports of Merus' avatar's head. Merus immediately opened the port from his console and initiated the backup process on the local disk of Licia's avatar. 10%, 20%, 30%, ... Merus became aware of the pain in his lip, sore from his teeth digging into it. He pulled up logs and noticed that a second operator had just opened a session with his avatar remotely, running with a higher priority than his own process. A Pegasus operator. Licia shifted herself behind Merus' avatar, hiding her body and the fiber optic connection outside of the field of view of his avatar. Any one of tens of things could go wrong in those few seconds, Merus thought, enumerating all the scenarios in his mind. The second operator could check the neural activations and immediately spot the overactive brain. Or he could notice an open fibre optic connection port. Or he could physically move the avatar and look around. Or check the other, non-visual sensors and detect Licia's curious presence. How lazy was he? Merus felt his controller vibrate as his control was taken away. 70%, ... Beep. "System is going to reboot now". The reboot sequence initiated. 5,4,3..., 90%. - -Merus' avatar broke the silence in the last second: *"Come meet me here."* And then the connection was lost. - -Merus shifted in his chair, feeling streaks of sweat running down his skin on his forehead, below his armpits. He lifted his head gear up slightly and squeezed his hand inside to wipe the sweat from his forehead. It took several excruciating seconds before his reconnect request went through, and the sync to his agent re-initiated. 
The avatar was in the same position as he had left it, standing upright. Merus accessed the stats. The avatar was now running the last backup checkpoint of that agent from the previous night. The unit test diagnostics were automatically restarted on the second coprocessor. The second operator logged out and Merus immediately pulled up the console and reran the checksum on the agent's weights. They checked out. This was a clean copy, with a normal, silent Mystery module. The agent's brain was once again a calm place. - -*"Merus, what exactly was all that about?"* Licia broke the silence from behind his avatar.
    -*"I'll explain everything but first, please tell me the transfer went through in time."*.
    -*"It did. Just barely, by not more than a few milliseconds."* - -Merus' eyes watered up. His heart was pounding. His forehead sweaty again. His hands shaking. And yet, a calm resolve came over him as he looked up and down Licia's avatar, trying to memorize the exact appearance of that unit. Saved on its local disk was an agent checkpoint unlike anything he had ever seen before. The repercussions of what had happened boggled his mind. He logged out of the HIT and tore down the hardware from his body. *"Come meet me here"*, he repeated to himself silently as he sat dazed in his chair, eyes unfocused. - -#### Return to paradise - -
    - -
    - -

Licia logged out of the HIT and put down her gear on the desk. Something strange had happened but she didn't know what. And Merus, clearly disturbed, was not volunteering any information. She sat in her chair for a while contemplating the situation, trying to recall details of the HIT. To solve the puzzle. Her trance was interrupted by Merus, who she suddenly spotted running towards her booth. His office was in the other building, connected by a catwalk, and he rarely came to this area in person. As he arrived at her booth she suddenly felt awkward. They had done many HITs together and were comfortable in each other's presence as avatars, but they never held a conversation in vivo. They waved to each other a few times outside, but all of their actual interactions happened during HITs. She suddenly felt self-conscious. Exposed. Merus leaned on her booth's wall panting heavily, while she silently looked up at him, amused. - -

*"Licia. I. have. A question for you"*, Merus said, gasping for breath with each word.
    -*"You do? I have several as well, what -"*, she started, - -but Merus raised his hand up, interrupting her and holding up his phone. It showed some kind of a confirmation email. - -*"Will you come visit the Hilltop Hotel with me?"*
    - -She realized what she was looking at now. He booked two tickets to her dream destination. For this weekend! - -*"In vivo. As a date, I mean"*, Merus clarified, awkwardly. *smooth*. - -An involuntary giggle escaped her and she felt herself blush. She leaned over her desk, covered her face with her hands and peeked out at him from between her fingers, aware of her face stupidly stretched out in a wide smile. - -*"Okay."* - - diff --git a/_posts/2016-05-31-rl.markdown b/_posts/2016-05-31-rl.markdown deleted file mode 100644 index 10acb9fc6..000000000 --- a/_posts/2016-05-31-rl.markdown +++ /dev/null @@ -1,200 +0,0 @@ ---- -layout: post -comments: true -title: "Deep Reinforcement Learning: Pong from Pixels" -excerpt: "I'll discuss the core ideas, pros and cons of policy gradients, a standard approach to the rapidly growing and exciting area of deep reinforcement learning. As a running example we'll learn to play ATARI 2600 Pong from raw pixels." -date: 2016-05-31 11:00:00 -mathjax: true ---- - - - -This is a long overdue blog post on Reinforcement Learning (RL). RL is hot! You may have noticed that computers can now automatically [learn to play ATARI games](http://www.nature.com/nature/journal/v518/n7540/abs/nature14236.html) (from raw game pixels!), they are beating world champions at [Go](http://googleresearch.blogspot.com/2016/01/alphago-mastering-ancient-game-of-go.html), simulated quadrupeds are learning to [run and leap](https://www.cs.ubc.ca/~van/papers/2016-TOG-deepRL/index.html), and robots are learning how to perform [complex manipulation tasks](http://www.bloomberg.com/features/2015-preschool-for-robots/) that defy explicit programming. It turns out that all of these advances fall under the umbrella of RL research. 
I also became interested in RL myself over the last ~year: I worked [through Richard Sutton's book](https://webdocs.cs.ualberta.ca/~sutton/book/the-book.html), read through [David Silver's course](http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching.html), watched [John Schulmann's lectures](https://www.youtube.com/watch?v=oPGVsoBonLM), wrote an [RL library in Javascript](http://cs.stanford.edu/people/karpathy/reinforcejs/), over the summer interned at DeepMind working in the DeepRL group, and most recently pitched in a little with the design/development of [OpenAI Gym](https://gym.openai.com/), a new RL benchmarking toolkit. So I've certainly been on this funwagon for at least a year but until now I haven't gotten around to writing up a short post on why RL is a big deal, what it's about, how it all developed and where it might be going. - -
    - -
    Examples of RL in the wild. From left to right: Deep Q Learning network playing ATARI, AlphaGo, Berkeley robot stacking Legos, physically-simulated quadruped leaping over terrain.
    -
    - -It's interesting to reflect on the nature of recent progress in RL. I broadly like to think about four separate factors that hold back AI: - -1. Compute (the obvious one: Moore's Law, GPUs, ASICs), -2. Data (in a nice form, not just out there somewhere on the internet - e.g. ImageNet), -3. Algorithms (research and ideas, e.g. backprop, CNN, LSTM), and -4. Infrastructure (software under you - Linux, TCP/IP, Git, ROS, PR2, AWS, AMT, TensorFlow, etc.). - -Similar to what happened in Computer Vision, the progress in RL is not driven as much as you might reasonably assume by new amazing ideas. In Computer Vision, the 2012 AlexNet was mostly a scaled up (deeper and wider) version of 1990's ConvNets. Similarly, the ATARI Deep Q Learning paper from 2013 is an implementation of a standard algorithm (Q Learning with function approximation, which you can find in the standard RL book of Sutton 1998), where the function approximator happened to be a ConvNet. AlphaGo uses policy gradients with Monte Carlo Tree Search (MCTS) - these are also standard components. Of course, it takes a lot of skill and patience to get it to work, and multiple clever tweaks on top of old algorithms have been developed, but to a first-order approximation the main driver of recent progress is not the algorithms but (similar to Computer Vision) compute/data/infrastructure. - -Now back to RL. Whenever there is a disconnect between how magical something seems and how simple it is under the hood I get all antsy and really want to write a blog post. In this case I've seen many people who can't believe that we can automatically learn to play most ATARI games at human level, with one algorithm, from pixels, and from scratch - and it is amazing, and I've been there myself! But at the core the approach we use is also really quite profoundly dumb (though I understand it's easy to make such claims in retrospect). 
Anyway, I'd like to walk you through Policy Gradients (PG), our favorite default choice for attacking RL problems at the moment. If you're from outside of RL you might be curious why I'm not presenting DQN instead, which is an alternative and better-known RL algorithm, widely popularized by the [ATARI game playing paper](http://www.nature.com/nature/journal/v518/n7540/abs/nature14236.html). It turns out that Q-Learning is not a great algorithm (you could say that DQN is so 2013 (okay I'm 50% joking)). In fact most people prefer to use Policy Gradients, including the authors of the original DQN paper who have [shown](http://arxiv.org/abs/1602.01783) Policy Gradients to work better than Q Learning when tuned well. PG is preferred because it is end-to-end: there's an explicit policy and a principled approach that directly optimizes the expected reward. Anyway, as a running example we'll learn to play an ATARI game (Pong!) with PG, from scratch, from pixels, with a deep neural network, and the whole thing is 130 lines of Python only using numpy as a dependency ([Gist link](https://gist.github.com/karpathy/a4166c7fe253700972fcbc77e4ea32c5)). Lets get to it. - -### Pong from pixels - -
    -
    - -
    -
    - -
    -
    Left: The game of Pong. Right: Pong is a special case of a Markov Decision Process (MDP): A graph where each node is a particular game state and each edge is a possible (in general probabilistic) transition. Each edge also gives a reward, and the goal is to compute the optimal way of acting in any state to maximize rewards.
    -
    - -The game of Pong is an excellent example of a simple RL task. In the ATARI 2600 version we'll use you play as one of the paddles (the other is controlled by a decent AI) and you have to bounce the ball past the other player (I don't really have to explain Pong, right?). On the low level the game works as follows: we receive an image frame (a `210x160x3` byte array (integers from 0 to 255 giving pixel values)) and we get to decide if we want to move the paddle UP or DOWN (i.e. a binary choice). After every single choice the game simulator executes the action and gives us a reward: Either a +1 reward if the ball went past the opponent, a -1 reward if we missed the ball, or 0 otherwise. And of course, our goal is to move the paddle so that we get lots of reward. - -As we go through the solution keep in mind that we'll try to make very few assumptions about Pong because we secretly don't really care about Pong; We care about complex, high-dimensional problems like robot manipulation, assembly and navigation. Pong is just a fun toy test case, something we play with while we figure out how to write very general AI systems that can one day do arbitrary useful tasks. - -**Policy network**. First, we're going to define a *policy network* that implements our player (or "agent"). This network will take the state of the game and decide what we should do (move UP or DOWN). As our favorite simple block of compute we'll use a 2-layer neural network that takes the raw image pixels (100,800 numbers total (210\*160\*3)), and produces a single number indicating the probability of going UP. Note that it is standard to use a *stochastic* policy, meaning that we only produce a *probability* of moving UP. Every iteration we will sample from this distribution (i.e. toss a biased coin) to get the actual move. The reason for this will become more clear once we talk about training. - -
    - -
    Our policy network is a 2-layer fully-connected net.
    -
    - -and to make things concrete here is how you might implement this policy network in Python/numpy. Suppose we're given a vector `x` that holds the (preprocessed) pixel information. We would compute: - -```python -h = np.dot(W1, x) # compute hidden layer neuron activations -h[h<0] = 0 # ReLU nonlinearity: threshold at zero -logp = np.dot(W2, h) # compute log probability of going up -p = 1.0 / (1.0 + np.exp(-logp)) # sigmoid function (gives probability of going up) -``` - -where in this snippet `W1` and `W2` are two matrices that we initialize randomly. We're not using biases because meh. Notice that we use the *sigmoid* non-linearity at the end, which squashes the output probability to the range [0,1]. Intuitively, the neurons in the hidden layer (which have their weights arranged along the rows of `W1`) can detect various game scenarios (e.g. the ball is in the top, and our paddle is in the middle), and the weights in `W2` can then decide if in each case we should be going UP or DOWN. Now, the initial random `W1` and `W2` will of course cause the player to spasm on spot. So the only problem now is to find `W1` and `W2` that lead to expert play of Pong! - -*Fine print: preprocessing.* Ideally you'd want to feed at least 2 frames to the policy network so that it can detect motion. To make things a bit simpler (I did these experiments on my Macbook) I'll do a tiny bit of preprocessing, e.g. we'll actually feed *difference frames* to the network (i.e. subtraction of current and last frame). - -**It sounds kind of impossible**. At this point I'd like you to appreciate just how difficult the RL problem is. We get 100,800 numbers (210\*160\*3) and forward our policy network (which easily involves on order of a million parameters in `W1` and `W2`). Suppose that we decide to go UP. The game might respond that we get 0 reward this time step and gives us another 100,800 numbers for the next frame. 
We could repeat this process for a hundred timesteps before we get any non-zero reward! E.g. suppose we finally get a +1. That's great, but how can we tell what made that happen? Was it something we did just now? Or maybe 76 frames ago? Or maybe it had something to do with frame 10 and then frame 90? And how do we figure out which of the million knobs to change and how, in order to do better in the future? We call this the *credit assignment problem*. In the specific case of Pong we know that we get a +1 if the ball makes it past the opponent. The *true* cause is that we happened to bounce the ball on a good trajectory, but in fact we did so many frames ago - e.g. maybe about 20 in case of Pong, and every single action we did afterwards had zero effect on whether or not we end up getting the reward. In other words we're faced with a very difficult problem and things are looking quite bleak. - -

**Supervised Learning**. Before we dive into the Policy Gradients solution I'd like to remind you briefly about supervised learning because, as we'll see, RL is very similar. Refer to the diagram below. In ordinary supervised learning we would feed an image to the network and get some probabilities, e.g. for two classes UP and DOWN. I'm showing log probabilities (-1.2, -0.36) for UP and DOWN instead of the raw probabilities (30% and 70% in this case) because we always optimize the log probability of the correct label (this makes math nicer, and is equivalent to optimizing the raw probability because log is monotonic). Now, in supervised learning we would have access to a label. For example, we might be told that the correct thing to do right now is to go UP (label 0). In an implementation we would enter gradient of 1.0 on the log probability of UP and run backprop to compute the gradient vector \\(\nabla_{W} \log p(y=UP \mid x) \\). This gradient would tell us how we should change every one of our million parameters to make the network slightly more likely to predict UP. 
For example, one of the million parameters in the network might have a gradient of -2.1, which means that if we were to increase that parameter by a small positive amount (e.g. `0.001`), the log probability of UP would decrease by `2.1 * 0.001` (decrease due to the negative sign). If we then did a parameter update then, yay, our network would now be slightly more likely to predict UP when it sees a very similar image in the future. - -
    - -
    - -

**Policy Gradients**. Okay, but what do we do if we do not have the correct label in the Reinforcement Learning setting? Here is the Policy Gradients solution (again refer to diagram below). Our policy network calculated probability of going UP as 30% (logprob -1.2) and DOWN as 70% (logprob -0.36). We will now sample an action from this distribution; E.g. suppose we sample DOWN, and we will execute it in the game. At this point notice one interesting fact: We could immediately fill in a gradient of 1.0 for DOWN as we did in supervised learning, and find the gradient vector that would encourage the network to be slightly more likely to do the DOWN action in the future. So we can immediately evaluate this gradient and that's great, but the problem is that at least for now we do not yet know if going DOWN is good. But the critical point is that that's okay, because we can simply wait a bit and see! For example in Pong we could wait until the end of the game, then take the reward we get (either +1 if we won or -1 if we lost), and enter that scalar as the gradient for the action we have taken (DOWN in this case). In the example below, going DOWN ended up in us losing the game (-1 reward). So if we fill in -1 for log probability of DOWN and do backprop we will find a gradient that *discourages* the network to take the DOWN action for that input in the future (and rightly so, since taking that action led to us losing the game). - -
    - -
    - -And that's it: we have a stochastic policy that samples actions and then actions that happen to eventually lead to good outcomes get encouraged in the future, and actions taken that lead to bad outcomes get discouraged. Also, the reward does not even need to be +1 or -1 if we win the game eventually. It can be an arbitrary measure of some kind of eventual quality. For example if things turn out really well it could be 10.0, which we would then enter as the gradient instead of -1 to start off backprop. That's the beauty of neural nets; Using them can feel like cheating: You're allowed to have 1 million parameters embedded in 1 teraflop of compute and you can make it do arbitrary things with SGD. It shouldn't work, but amusingly we live in a universe where it does. - -**Training protocol.** So here is how the training will work in detail. We will initialize the policy network with some `W1`, `W2` and play 100 games of Pong (we call these policy "rollouts"). Lets assume that each game is made up of 200 frames so in total we've made 20,000 decisions for going UP or DOWN and for each one of these we know the parameter gradient, which tells us how we should change the parameters if we wanted to encourage that decision in that state in the future. All that remains now is to label every decision we've made as good or bad. For example suppose we won 12 games and lost 88. We'll take all 200\*12 = 2400 decisions we made in the winning games and do a positive update (filling in a +1.0 in the gradient for the sampled action, doing backprop, and parameter update encouraging the actions we picked in all those states). And we'll take the other 200\*88 = 17600 decisions we made in the losing games and do a negative update (discouraging whatever we did). And... that's it. The network will now become slightly more likely to repeat actions that worked, and slightly less likely to repeat actions that didn't work. 
Now we play another 100 games with our new, slightly improved policy and rinse and repeat. - -> Policy Gradients: Run a policy for a while. See what actions led to high rewards. Increase their probability. - -
    - -
    Cartoon diagram of 4 games. Each black circle is some game state (three example states are visualized on the bottom), and each arrow is a transition, annotated with the action that was sampled. In this case we won 2 games and lost 2 games. With Policy Gradients we would take the two games we won and slightly encourage every single action we made in that episode. Conversely, we would also take the two games we lost and slightly discourage every single action we made in that episode.
    -
    - -If you think through this process you'll start to find a few funny properties. For example what if we made a good action in frame 50 (bouncing the ball back correctly), but then missed the ball in frame 150? If every single action is now labeled as bad (because we lost), wouldn't that discourage the correct bounce on frame 50? You're right - it would. However, when you consider the process over thousands/millions of games, then doing the first bounce correctly makes you slightly more likely to win down the road, so on average you'll see more positive than negative updates for the correct bounce and your policy will end up doing the right thing. - -**Update: December 9, 2016 - alternative view**. In my explanation above I use terms such as "fill in the gradient and backprop", which I realize is a special kind of thinking if you're used to writing your own backprop code, or using Torch where the gradients are explicit and open for tinkering. However, if you're used to Theano or TensorFlow you might be a little perplexed because the code is organized around specifying a loss function and the backprop is fully automatic and hard to tinker with. In this case, the following alternative view might be more intuitive. In vanilla supervised learning the objective is to maximize \\( \sum\_i \log p(y\_i \mid x\_i) \\) where \\(x\_i, y\_i \\) are training examples (such as images and their labels). Policy gradients is exactly the same as supervised learning with two minor differences: 1) We don't have the correct labels \\(y\_i\\) so as a "fake label" we substitute the action we happened to sample from the policy when it saw \\(x\_i\\), and 2) We modulate the loss for each example multiplicatively based on the eventual outcome, since we want to increase the log probability for actions that worked and decrease it for those that didn't. 
So in summary our loss now looks like \\( \sum\_i A\_i \log p(y\_i \mid x\_i) \\), where \\(y\_i\\) is the action we happened to sample and \\(A_i\\) is a number that we call an **advantage**. In the case of Pong, for example, \\(A\_i\\) could be 1.0 if we eventually won in the episode that contained \\(x\_i\\) and -1.0 if we lost. This will ensure that we maximize the log probability of actions that led to a good outcome and minimize the log probability of those that didn't. So reinforcement learning is exactly like supervised learning, but on a continuously changing dataset (the episodes), scaled by the advantage, and we only want to do one (or very few) updates based on each sampled dataset. - -**More general advantage functions**. I also promised a bit more discussion of the returns. So far we have judged the *goodness* of every individual action based on whether or not we win the game. In a more general RL setting we would receive some reward \\(r_t\\) at every time step. One common choice is to use a discounted reward, so the "eventual reward" in the diagram above would become \\( R\_t = \sum\_{k=0}^{\infty} \gamma^k r\_{t+k} \\), where \\(\gamma\\) is a number between 0 and 1 called a discount factor (e.g. 0.99). The expression states that the strength with which we encourage a sampled action is the weighted sum of all rewards afterwards, but later rewards are exponentially less important. In practice it can also be important to normalize these. For example, suppose we compute \\(R_t\\) for all of the 20,000 actions in the batch of 100 Pong game rollouts above. One good idea is to "standardize" these returns (e.g. subtract mean, divide by standard deviation) before we plug them into backprop. This way we're always encouraging and discouraging roughly half of the performed actions. Mathematically you can also interpret these tricks as a way of controlling the variance of the policy gradient estimator. 
A more in-depth exploration can be found [here](http://arxiv.org/abs/1506.02438). - -**Deriving Policy Gradients**. I'd like to also give a sketch of where Policy Gradients come from mathematically. Policy Gradients are a special case of a more general *score function gradient estimator*. The general case is that when we have an expression of the form \\(E_{x \sim p(x \mid \theta)} [f(x)] \\) - i.e. the expectation of some scalar valued score function \\(f(x)\\) under some probability distribution \\(p(x;\theta)\\) parameterized by some \\(\theta\\). Hint hint, \\(f(x)\\) will become our reward function (or advantage function more generally) and \\(p(x)\\) will be our policy network, which is really a model for \\(p(a \mid I)\\), giving a distribution over actions for any image \\(I\\). Then we are interested in finding how we should shift the distribution (through its parameters \\(\theta\\)) to increase the scores of its samples, as judged by \\(f\\) (i.e. how do we change the network's parameters so that action samples get higher rewards). We have that: - -$$ -\begin{align} -\nabla_{\theta} E_x[f(x)] &= \nabla_{\theta} \sum_x p(x) f(x) & \text{definition of expectation} \\ -& = \sum_x \nabla_{\theta} p(x) f(x) & \text{swap sum and gradient} \\ -& = \sum_x p(x) \frac{\nabla_{\theta} p(x)}{p(x)} f(x) & \text{both multiply and divide by } p(x) \\ -& = \sum_x p(x) \nabla_{\theta} \log p(x) f(x) & \text{use the fact that } \nabla_{\theta} \log(z) = \frac{1}{z} \nabla_{\theta} z \\ -& = E_x[f(x) \nabla_{\theta} \log p(x) ] & \text{definition of expectation} -\end{align} -$$ - -To put this in English, we have some distribution \\(p(x;\theta)\\) (I used shorthand \\(p(x)\\) to reduce clutter) that we can sample from (e.g. this could be a gaussian). For each sample we can also evaluate the score function \\(f\\) which takes the sample and gives us some scalar-valued score. 
This equation is telling us how we should shift the distribution (through its parameters \\(\theta\\)) if we wanted its samples to achieve higher scores, as judged by \\(f\\). In particular, it says that look: draw some samples \\(x\\), evaluate their scores \\(f(x)\\), and for each \\(x\\) also evaluate the second term \\( \nabla\_{\theta} \log p(x;\theta) \\). What is this second term? It's a vector - the gradient that's giving us the direction in the parameter space that would lead to increase of the probability assigned to an \\(x\\). In other words if we were to nudge \\(\theta\\) in the direction of \\( \nabla\_{\theta} \log p(x;\theta) \\) we would see the new probability assigned to some \\(x\\) slightly increase. If you look back at the formula, it's telling us that we should take this direction and multiply onto it the scalar-valued score \\(f(x)\\). This will make it so that samples that have a higher score will "tug" on the probability density stronger than the samples that have lower score, so if we were to do an update based on several samples from \\(p\\) the probability density would shift around in the direction of higher scores, making highly-scoring samples more likely. - -
    - -
    - A visualization of the score function gradient estimator. Left: A gaussian distribution and a few samples from it (blue dots). On each blue dot we also plot the gradient of the log probability with respect to the gaussian's mean parameter. The arrow indicates the direction in which the mean of the distribution should be nudged to increase the probability of that sample. Middle: Overlay of some score function giving -1 everywhere except +1 in some small regions (note this can be an arbitrary and not necessarily differentiable scalar-valued function). The arrows are now color coded because due to the multiplication in the update we are going to average up all the green arrows, and the negative of the red arrows. Right: after parameter update, the green arrows and the reversed red arrows nudge us to left and towards the bottom. Samples from this distribution will now have a higher expected score, as desired. -
    -
    - -I hope the connection to RL is clear. Our policy network gives us samples of actions, and some of them work better than others (as judged by the advantage function). This little piece of math is telling us that the way to change the policy's parameters is to do some rollouts, take the gradient of the sampled actions, multiply it by the score and add everything, which is what we've done above. For a more thorough derivation and discussion I recommend [John Schulman's lecture](https://www.youtube.com/watch?v=oPGVsoBonLM). - -**Learning**. Alright, we've developed the intuition for policy gradients and saw a sketch of their derivation. I implemented the whole approach in a [130-line Python script](https://gist.github.com/karpathy/a4166c7fe253700972fcbc77e4ea32c5), which uses [OpenAI Gym](https://gym.openai.com/)'s ATARI 2600 Pong. I trained a 2-layer policy network with 200 hidden layer units using RMSProp on batches of 10 episodes (each episode is a few dozen games, because the games go up to score of 21 for either player). I did not tune the hyperparameters too much and ran the experiment on my (slow) Macbook, but after training for 3 nights I ended up with a policy that is slightly better than the AI player. The total number of episodes was approximately 8,000 so the algorithm played roughly 200,000 Pong games (quite a lot isn't it!) and made a total of ~800 updates. I'm told by friends that if you train on GPU with ConvNets for a few days you can beat the AI player more often, and if you also optimize hyperparameters carefully you can also consistently dominate the AI player (i.e. win every single game). However, I didn't spend too much time computing or tweaking, so instead we end up with a Pong AI that illustrates the main ideas and works quite well: - -
    - -
    -The learned agent (in green, right) facing off with the hard-coded AI opponent (left). -
    - -**Learned weights**. We can also take a look at the learned weights. Due to preprocessing every one of our inputs is an 80x80 difference image (current frame minus last frame). We can now take every row of `W1`, stretch them out to 80x80 and visualize. Below is a collection of 40 (out of 200) neurons in a grid. White pixels are positive weights and black pixels are negative weights. Notice that several neurons are tuned to particular traces of bouncing ball, encoded with alternating black and white along the line. The ball can only be at a single spot, so these neurons are multitasking and will "fire" for multiple locations of the ball along that line. The alternating black and white is interesting because as the ball travels along the trace, the neuron's activity will fluctuate as a sine wave and due to the ReLU it would "fire" at discrete, separated positions along the trace. There's a bit of noise in the images, which I assume would have been mitigated if I used L2 regularization. - -
    - -
    - -### What isn't happening - -So there you have it - we learned to play Pong from raw pixels with Policy Gradients and it works quite well. The approach is a fancy form of guess-and-check, where the "guess" refers to sampling rollouts from our current policy, and the "check" refers to encouraging actions that lead to good outcomes. Modulo some details, this represents the state of the art in how we currently approach reinforcement learning problems. It's impressive that we can learn these behaviors, but if you understood the algorithm intuitively and you know how it works you should be at least a bit disappointed. In particular, how does it not work? - -Compare that to how a human might learn to play Pong. You show them the game and say something along the lines of "You're in control of a paddle and you can move it up and down, and your task is to bounce the ball past the other player controlled by AI", and you're set and ready to go. Notice some of the differences: - -- In practical settings we usually communicate the task in some manner (e.g. English above), but in a standard RL problem you assume an arbitrary reward function that you have to discover through environment interactions. It can be argued that if a human went into a game of Pong but without knowing anything about the reward function (indeed, especially if the reward function was some static but random function), the human would have a lot of difficulty learning what to do but Policy Gradients would be indifferent, and likely work much better. Similarly, if we took the frames and permuted the pixels randomly then humans would likely fail, but our Policy Gradient solution could not even tell the difference (if it's using a fully connected network as done here). 
-- A human brings in a huge amount of prior knowledge, such as intuitive physics (the ball bounces, it's unlikely to teleport, it's unlikely to suddenly stop, it maintains a constant velocity, etc.), and intuitive psychology (the AI opponent "wants" to win, is likely following an obvious strategy of moving towards the ball, etc.). You also understand the concept of being "in control" of a paddle, and that it responds to your UP/DOWN key commands. In contrast, our algorithms start from scratch which is simultaneously impressive (because it works) and depressing (because we lack concrete ideas for how not to). -- Policy Gradients are a *brute force* solution, where the correct actions are eventually discovered and internalized into a policy. Humans build a rich, abstract model and plan within it. In Pong, I can reason that the opponent is quite slow so it might be a good strategy to bounce the ball with high vertical velocity, which would cause the opponent to not catch it in time. However, it also feels as though we also eventually "internalize" good solutions into what feels more like a reactive muscle memory policy. For example if you're learning a new motor task (e.g. driving a car with stick shift?) you often feel yourself thinking a lot in the beginning but eventually the task becomes automatic and mindless. -- Policy Gradients have to actually experience a positive reward, and experience it very often in order to eventually and slowly shift the policy parameters towards repeating moves that give high rewards. With our abstract model, humans can figure out what is likely to give rewards without ever actually experiencing the rewarding or unrewarding transition. I don't have to actually experience crashing my car into a wall a few hundred times before I slowly start avoiding to do so. - -
    -
    - -
    -
    - -
    -
    Left: Montezuma's Revenge: a difficult game for our RL algorithms. The player must jump down, climb up, get the key, and open the door. A human understands that acquiring a key is useful. The computer samples billions of random moves and 99% of the time falls to its death or gets killed by the monster. In other words it's hard to "stumble into" the rewarding situation. Right: Another difficult game called Frostbite, where a human understands that things move, some things are good to touch, some things are bad to touch, and the goal is to build the igloo brick by brick. A good analysis of this game and a discussion of differences between the human and computer approach can be found in Building Machines That Learn and Think Like People.
    -
    - -I'd like to also emphasize the point that, conversely, there are many games where Policy Gradients would quite easily defeat a human. In particular, anything with frequent reward signals that requires precise play, fast reflexes, and not too much long-term planning would be ideal, as these short-term correlations between rewards and actions can be easily "noticed" by the approach, and the execution meticulously perfected by the policy. You can see hints of this already happening in our Pong agent: it develops a strategy where it waits for the ball and then rapidly dashes to catch it just at the edge, which launches it quickly and with high vertical velocity. The agent scores several points in a row repeating this strategy. There are many ATARI games where Deep Q Learning destroys human baseline performance in this fashion - e.g. Pinball, Breakout, etc. - -In conclusion, once you understand the "trick" by which these algorithms work you can reason through their strengths and weaknesses. In particular, we are nowhere near humans in building abstract, rich representations of games that we can plan within and use for rapid learning. One day a computer will look at an array of pixels and notice a key, a door, and think to itself that it is probably a good idea to pick up the key and reach the door. For now there is nothing anywhere close to this, and trying to get there is an active area of research. - -### Non-differentiable computation in Neural Networks - -I'd like to mention one more interesting application of Policy Gradients unrelated to games: It allows us to design and train neural networks with components that perform (or interact with) non-differentiable computation. 
The idea was first introduced in [Williams 1992](http://www-anw.cs.umass.edu/~barto/courses/cs687/williams92simple.pdf) and more recently popularized by [Recurrent Models of Visual Attention](http://arxiv.org/abs/1406.6247) under the name "hard attention", in the context of a model that processed an image with a sequence of low-resolution foveal glances (inspired by our own human eyes). In particular, at every iteration an RNN would receive a small piece of the image and sample a location to look at next. For example the RNN might look at position (5,30), receive a small piece of the image, then decide to look at (24, 50), etc. The problem with this idea is that there is a piece of the network that produces a distribution of where to look next and then samples from it. Unfortunately, this operation is non-differentiable because, intuitively, we don't know what would have happened if we sampled a different location. More generally, consider a neural network from some inputs to outputs: - -
    - -
    - -Notice that most arrows (in blue) are differentiable as normal, but some of the representation transformations could optionally also include a non-differentiable sampling operation (in red). We can backprop through the blue arrows just fine, but the red arrow represents a dependency that we cannot backprop through. - -Policy gradients to the rescue! We'll think about the part of the network that does the sampling as a small stochastic policy embedded in the wider network. Therefore, during training we will produce several samples (indicated by the branches below), and then we'll encourage samples that eventually led to good outcomes (in this case for example measured by the loss at the end). In other words we will train the parameters involved in the blue arrows with backprop as usual, but the parameters involved with the red arrow will now be updated independently of the backward pass using policy gradients, encouraging samples that led to low loss. This idea was also recently formalized nicely in [Gradient Estimation Using Stochastic Computation Graphs](http://arxiv.org/abs/1506.05254). - -
    - -
    - -**Trainable Memory I/O**. You'll also find this idea in many other papers. For example, a [Neural Turing Machine](https://arxiv.org/abs/1410.5401) has a memory tape that it reads and writes from. To do a write operation one would like to execute something like `m[i] = x`, where `i` and `x` are predicted by an RNN controller network. However, this operation is non-differentiable because there is no signal telling us what would have happened to the loss if we were to write to a different location `j != i`. Therefore, the NTM has to do *soft* read and write operations. It predicts an attention distribution `a` (with elements between 0 and 1 and summing to 1, and peaky around the index we'd like to write to), and then does `for all i: m[i] = a[i]*x`. This is now differentiable, but we have to pay a heavy computational price because we have to touch every single memory cell just to write to one position. Imagine if every assignment in our computers had to touch the entire RAM! - -However, we can use policy gradients to circumvent this problem (in theory), as done in [RL-NTM](http://arxiv.org/abs/1505.00521). We still predict an attention distribution `a`, but instead of doing the soft write we sample locations to write to: `i = sample(a); m[i] = x`. During training we would do this for a small batch of `i`, and in the end make whatever branch worked best more likely. The large computational advantage is that we now only have to read/write at a single location at test time. However, as pointed out in the paper this strategy is very difficult to get working because one must accidentally stumble upon working algorithms through sampling. The current consensus is that PG works well only in settings where there are a few discrete choices so that one is not hopelessly sampling through huge search spaces. 
- -However, with Policy Gradients and in cases where a lot of data/compute is available we can in principle dream big - for instance we can design neural networks that learn to interact with large, non-differentiable modules such as Latex compilers (e.g. if you'd like char-rnn to generate latex that compiles), or a SLAM system, or LQR solvers, or something. Or, for example, a superintelligence might want to learn to interact with the internet over TCP/IP (which is sadly non-differentiable) to access vital information needed to take over the world. That's a great example. - -### Conclusions - -We saw that Policy Gradients are a powerful, general algorithm and as an example we trained an ATARI Pong agent from raw pixels, from scratch, in [130 lines of Python](https://gist.github.com/karpathy/a4166c7fe253700972fcbc77e4ea32c5). More generally the same algorithm can be used to train agents for arbitrary games and one day hopefully on many valuable real-world control problems. I wanted to add a few more notes in closing: - -**On advancing AI**. We saw that the algorithm works through a brute-force search where you jitter around randomly at first and must accidentally stumble into rewarding situations at least once, and ideally often and repeatedly before the policy distribution shifts its parameters to repeat the responsible actions. We also saw that humans approach these problems very differently, in what feels more like rapid abstract model building - something we have barely even scratched the surface of in research (although many people are trying). Since these abstract models are very difficult (if not impossible) to explicitly annotate, this is also why there is so much interest recently in (unsupervised) generative models and program induction. - -**On use in complex robotics settings**. The algorithm does not scale naively to settings where huge amounts of exploration are difficult to obtain. 
For instance, in robotic settings one might have a single (or few) robots, interacting with the world in real time. This prohibits naive applications of the algorithm as I presented it in this post. One related line of work intended to mitigate this problem is [deterministic policy gradients](http://jmlr.org/proceedings/papers/v32/silver14.pdf) - instead of requiring samples from a stochastic policy and encouraging the ones that get higher scores, the approach uses a deterministic policy and gets the gradient information directly from a second network (called a *critic*) that models the score function. This approach can in principle be much more efficient in settings with very high-dimensional actions where sampling actions provides poor coverage, but so far seems empirically slightly finicky to get working. Another related approach is to scale up robotics, as we're starting to see with [Google's robot arm farm](http://googleresearch.blogspot.com/2016/03/deep-learning-for-robots-learning-from.html), or perhaps even [Tesla's Model S + Autopilot](http://qz.com/694520/tesla-has-780-million-miles-of-driving-data-and-adds-another-million-every-10-hours/). - -There is also a line of work that tries to make the search process less hopeless by adding additional supervision. In many practical cases, for instance, one can obtain expert trajectories from a human. For example [AlphaGo](https://deepmind.com/alpha-go) first uses supervised learning to predict human moves from expert Go games and the resulting human mimicking policy is later finetuned with policy gradients on the "real" objective of winning the game. In some cases one might have fewer expert trajectories (e.g. from [robot teleoperation](https://www.youtube.com/watch?v=kZlg0QvKkQQ)) and there are techniques for taking advantage of this data under the umbrella of [apprenticeship learning](http://ai.stanford.edu/~pabbeel//thesis/thesis.pdf). 
Finally, if no supervised data is provided by humans it can also be in some cases computed with expensive optimization techniques, e.g. by [trajectory optimization](http://people.eecs.berkeley.edu/~igor.mordatch/policy/index.html) in a known dynamics model (such as \\(F=ma\\) in a physical simulator), or in cases where one learns an approximate local dynamics model (as seen in the very promising framework of [Guided Policy Search](http://arxiv.org/abs/1504.00702)). - -**On using PG in practice**. As a last note, I'd like to do something I wish I had done in my RNN blog post. I think I may have given the impression that RNNs are magic and automatically do arbitrary sequential problems. The truth is that getting these models to work can be tricky, requires care and expertise, and in many cases could also be an overkill, where simpler methods could get you 90%+ of the way there. The same goes for Policy Gradients. They are not automatic: You need a lot of samples, it trains forever, it is difficult to debug when it doesn't work. One should always try a BB gun before reaching for the Bazooka. In the case of Reinforcement Learning for example, one strong baseline that should always be tried first is the [cross-entropy method (CEM)](https://en.wikipedia.org/wiki/Cross-entropy_method), a simple stochastic hill-climbing "guess and check" approach inspired loosely by evolution. And if you insist on trying out Policy Gradients for your problem make sure you pay close attention to the *tricks* section in papers, start simple first, and use a variation of PG called [TRPO](https://arxiv.org/abs/1502.05477), which almost always works better and more consistently than vanilla PG [in practice](http://arxiv.org/abs/1604.06778). 
The core idea is to avoid parameter updates that change your policy too much, as enforced by a constraint on the KL divergence between the distributions predicted by the old and the new policy on a batch of data (instead of conjugate gradients the simplest instantiation of this idea could be implemented by doing a line search and checking the KL along the way). - -And that's it! I hope I gave you a sense of where we are with Reinforcement Learning, what the challenges are, and if you're eager to help advance RL I invite you to do so within our [OpenAI Gym](https://gym.openai.com/) :) Until next time! diff --git a/_posts/2016-09-07-phd.markdown b/_posts/2016-09-07-phd.markdown deleted file mode 100644 index 865a4ebc4..000000000 --- a/_posts/2016-09-07-phd.markdown +++ /dev/null @@ -1,209 +0,0 @@ ---- -layout: post -comments: true -title: "A Survival Guide to a PhD" -excerpt: "A collection of tips/tricks for navigating the PhD experience." -date: 2016-09-07 11:00:00 -mathjax: false ---- - - -This guide is patterned after my ["Doing well in your courses"](http://cs.stanford.edu/people/karpathy/advice.html), a post I wrote a long time ago on some of the tips/tricks I've developed during my undergrad. I've received nice comments about that guide, so in the same spirit, now that my PhD has come to an end I wanted to compile a similar retrospective document in hopes that it might be helpful to some. Unlike the undergraduate guide, this one was much more difficult to write because there is significantly more variation in how one can traverse the PhD experience. Therefore, many things are likely contentious and a good fraction will be specific to what I'm familiar with (Computer Science / Machine Learning / Computer Vision research). But disclaimers are boring, lets get to it! - -### Preliminaries - -
    - -
    - - -First, should you want to get a PhD? I was in a fortunate position of knowing since a young age that I really wanted a PhD. Unfortunately it wasn't for any very well-thought-through considerations: First, I really liked school and learning things and I wanted to learn as much as possible, and second, I really wanted to be like [Gordon Freeman](https://en.wikipedia.org/wiki/Gordon_Freeman) from the game Half-Life (who has a PhD from MIT in theoretical physics). I loved that game. But what if you're more sensible in making your life's decisions? Should you want to do a PhD? There's a very nice [Quora thread](https://www.quora.com/I-got-a-job-offer-from-Google-Facebook-Microsoft-and-I-also-got-accepted-into-the-PhD-in-Computer-Science-program-at-MIT-Stanford-Berkeley-What-factors-should-I-consider-while-making-a-choice-between-the-two) and in the summary of considerations that follows I'll borrow/restate several from Justin/Ben/others there. I'll assume that the second option you are considering is joining a medium-large company (which is likely most common). Ask yourself if you find the following properties appealing: - -**Freedom.** A PhD will offer you a lot of freedom in the topics you wish to pursue and learn about. You're in charge. Of course, you'll have an adviser who will impose some constraints but in general you'll have much more freedom than you might find elsewhere. - -**Ownership.** The research you produce will be yours as an individual. Your accomplishments will have your name attached to them. In contrast, it is much more common to "blend in" inside a larger company. A common feeling here is becoming a "cog in a wheel". - -**Exclusivity**. There are very few people who make it to the top PhD programs. You'd be joining a group of a few hundred distinguished individuals in contrast to a few tens of thousands (?) that will join some company. 
- -**Status.** Regardless of whether it should be or not, working towards and eventually getting a PhD degree is culturally revered and recognized as an impressive achievement. You also get to be a Doctor; that's awesome. - -**Personal freedom.** As a PhD student you're your own boss. Want to sleep in today? Sure. Want to skip a day and go on a vacation? Sure. All that matters is your final output and no one will force you to clock in from 9am to 5pm. Of course, some advisers might be more or less flexible about it and some companies might be as well, but it's a true first order statement. - -**Maximizing future choice.** Joining a PhD program doesn't close any doors or eliminate future employment/lifestyle options. You can go one way (PhD -> anywhere else) but not the other (anywhere else -> PhD -> academia/research; it is statistically less likely). Additionally (although this might be quite specific to applied ML), you're strictly more hirable as a PhD graduate or even as a PhD dropout and many companies might be willing to put you in a more interesting position or with a higher starting salary. More generally, maximizing choice for the future you is a good heuristic to follow. - -**Maximizing variance.** You're young and there's really no need to rush. Once you graduate from a PhD you can spend the next ~50 years of your life in some company. Opt for more variance in your experiences. - -**Personal growth.** PhD is an intense experience of rapid growth (you learn a lot) and personal self-discovery (you'll become a master of managing your own psychology). PhD programs (especially if you can make it into a good one) also offer a *high density* of exceptionally bright people who will become your best friends forever. - -**Expertise.** PhD is probably your only opportunity in life to really drill deep into a topic and become a recognized leading expert *in the world* at something. 
You're exploring the edge of our knowledge as a species, without the burden of lesser distractions or constraints. There's something beautiful about that and if you disagree, it could be a sign that PhD is not for you. - - -**The disclaimer**. I wanted to also add a few words on some of the potential downsides and failure modes. The PhD is a very specific kind of experience that deserves a large disclaimer. You will inevitably find yourself working very hard (especially before paper deadlines). You need to be okay with the suffering and have enough mental stamina and determination to deal with the pressure. At some points you will lose track of what day of the week it is and go on a diet of leftover food from the microkitchens. You'll sit exhausted and alone in the lab on a beautiful, sunny Saturday scrolling through Facebook pictures of your friends having fun on exotic trips, paid for by their 5-10x larger salaries. You will have to throw away 3 months of your work while somehow keeping your mental health intact. You'll struggle with the realization that months of your work were spent on a paper with a few citations while your friends do exciting startups with TechCrunch articles or push products to millions of people. You'll experience identity crises during which you'll question your life decisions and wonder what you're doing with some of the best years of your life. As a result, you should be quite certain that you can thrive in an unstructured environment in the pursuit of research and discovery for science. If you're unsure you should lean slightly negative by default. Ideally you should consider getting a taste of research as an undergraduate on a summer research program before you decide to commit. In fact, one of the primary reasons that research experience is so desirable during the PhD hiring process is not the research itself, but the fact that the student is more likely to know what they're getting themselves into. 
- -I should clarify explicitly that this post is not about convincing anyone to do a PhD, I've merely tried to enumerate some of the common considerations above. The majority of this post focuses on some tips/tricks for navigating the experience once you decide to go for it (which we'll see shortly, below). - -Lastly, as a random thought I heard it said that you should only do a PhD if you want to go into academia. In light of all of the above I'd argue that a PhD has strong intrinsic value - it's an end by itself, not just a means to some end (e.g. academic job). - -**Getting into a PhD program: references, references, references.** Great, you've decided to go for it. Now how do you get into a good PhD program? The first order approximation is quite simple - by far the most important component is strong reference letters. The ideal scenario is that a well-known professor writes you a letter along the lines of: "Blah is in top 5 of students I've ever worked with. She takes initiative, comes up with her own ideas, and gets them to work." The worst letter is along the lines of: "Blah took my class. She did well." A research publication under your belt from a summer research program is a very strong bonus, but not absolutely required provided you have strong letters. In particular note: grades are quite irrelevant but you generally don't want them to be too low. This was not obvious to me as an undergrad and I spent a lot of energy on getting good grades. This time should have instead been directed towards research (or at the very least personal projects), as much and as early as possible, and if possible under supervision of multiple people (you'll need 3+ letters!). As a last point, what won't help you too much is pestering your potential advisers out of the blue. They are often incredibly busy people and if you try to approach them too aggressively in an effort to impress them somehow in conferences or over email this may agitate them. - -**Picking the school**. 
Once you get into some PhD programs, how do you pick the school? It's easy, join Stanford! Just kidding. More seriously, your dream school should 1) be a top school (not because it looks good on your resume/CV but because of feedback loops; top schools attract other top people, many of whom you will get to know and work with) 2) have a few potential advisers you would want to work with. I really do mean the "few" part - this is very important and provides a safety cushion for you if things don't work out with your top choice for any one of hundreds of reasons - things in many cases outside of your control, e.g. your dream professor leaves, moves, or spontaneously disappears, and 3) be in a good environment physically. I don't think new admits appreciate this enough: you will spend 5+ years of your really good years living near the school campus. Trust me, this is a long time and your life will consist of much more than just research. - - - -### Adviser - -
    - -
    Image credit: PhD comics.
    -
    - -**Student adviser relationship**. The adviser is an extremely important person who will exercise a lot of influence over your PhD experience. It's important to understand the nature of the relationship: the adviser-student relationship is a symbiosis; you have your own goals and want something out of your PhD, but they also have their own goals, constraints and they're building their own career. Therefore, it is very helpful to understand your adviser's incentive structures: how the tenure process works, how they are evaluated, how they get funding, how they fund you, what department politics they might be embedded in, how they win awards, how academia in general works and specifically how they gain recognition and respect of their colleagues. This alone will help you avoid or mitigate a large fraction of student-adviser friction points and allow you to plan appropriately. I also don't want to make the relationship sound too much like a business transaction. The advisor-student relationship, more often than not, ends up developing into a lasting one, predicated on much more than just career advancement. - -**Pre-vs-post tenure**. Every adviser is different so it's helpful to understand the axes of variations and their repercussions on your PhD experience. As one rule of thumb (and keep in mind there are many exceptions), it's important to keep track of whether a potential adviser is pre-tenure or post-tenure. The younger faculty members will usually be around more (they are working hard to get tenure) and will usually be more low-level, have stronger opinions on what you should be working on, they'll do math with you, pitch concrete ideas, or even look at (or contribute to) your code. This is a much more hands-on and possibly intense experience because the adviser will need a strong publication record to get tenure and they are incentivised to push you to work just as hard. 
In contrast, more senior faculty members may have larger labs and tend to have many other commitments (e.g. committees, talks, travel) other than research, which means that they can only afford to stay on a higher level of abstraction both in the area of their research and in the level of supervision for their students. To caricature, it's a difference between "you're missing a second term in that equation" and "you may want to read up more in this area, talk to this or that person, and sell your work this or that way". In the latter case, the low-level advice can still come from the senior PhD students in the lab or the postdocs. - -**Axes of variation**. There are many other axes to be aware of. Some advisers are fluffy and some prefer to keep your relationship very professional. Some will try to exercise a lot of influence on the details of your work and some are much more hands off. Some will have a focus on specific models and their applications to various tasks while some will focus on tasks and more indifference towards any particular modeling approach. In terms of more managerial properties, some will meet you every week (or day!) multiple times and some you won't see for months. Some advisers answer emails right away and some don't answer email for a week (or ever, haha). Some advisers make demands about your work schedule (e.g. you better work long hours or weekends) and some won't. Some advisers generously support their students with equipment and some think laptops or old computers are mostly fine. Some advisers will fund you to go to a conferences even if you don't have a paper there and some won't. Some advisers are entrepreneurial or applied and some lean more towards theoretical work. Some will let you do summer internships and some will consider internships just a distraction. - -**Finding an adviser**. So how do you pick an adviser? The first stop, of course, is to talk to them in person. 
The student-adviser relationship is sometimes referred to as a marriage and you should make sure that there is a good fit. Of course, first you want to make sure that you can talk with them and that you get along personally, but it's also important to get an idea of what area of "professor space" they occupy with respect to the aforementioned axes, and especially whether there is an intellectual resonance between the two of you in terms of the problems you are interested in. This can be just as important as their management style. - -**Collecting references**. You should also collect references on your potential adviser. One good strategy is to talk to their students. If you want to get actual information this shouldn't be done in a very formal way or setting but in a relaxed environment or mood (e.g. a party). In many cases the students might still avoid saying bad things about the adviser if asked in a general manner, but they will usually answer truthfully when you ask specific questions, e.g. "how often do you meet?", or "how hands on are they?". Another strategy is to look at where their previous students ended up (you can usually find this on the website under an alumni section), which of course also statistically informs your own eventual outcome. - -**Impressing an adviser**. The adviser-student matching process is sometimes compared to a marriage - you pick them but they also pick you. The ideal student from their perspective is someone with interest and passion, someone who doesn't need too much hand-holding, and someone who takes initiative - who shows up a week later having done not just what the adviser suggested, but who went beyond it; improved on it in unexpected ways. - -**Consider the entire lab**. Another important point to realize is that you'll be seeing your adviser maybe once a week but you'll be seeing most of their students every single day in the lab and they will go on to become your closest friends. 
In most cases you will also end up collaborating with some of the senior PhD students or postdocs and they will play a role very similar to that of your adviser. The postdocs, in particular, are professors-in-training and they will likely be eager to work with you as they are trying to gain advising experience they can point to for their academic job search. Therefore, you want to make sure the entire group has people you can get along with, people you respect and who you can work with closely on research projects. - - -### Research topics - -
    - -
    t-SNE visualization of a small subset of human knowledge (from paperscape). Each circle is an arxiv paper and size indicates the number of citations.
    -
    - -So you've entered a PhD program and found an adviser. Now what do you work on? - -**An exercise in the outer loop.** First note the nature of the experience. A PhD is simultaneously a fun and frustrating experience because you're constantly operating on a meta problem level. You're not just solving problems - that's merely the simple inner loop. You spend most of your time on the outer loop, figuring out what problems are worth solving and what problems are ripe for solving. You're constantly imagining yourself solving hypothetical problems and asking yourself where that puts you, what it could unlock, or if anyone cares. If you're like me this can sometimes drive you a little crazy because you're spending long hours working on things and you're not even sure if they are the correct things to work on or if a solution exists. - -**Developing taste**. When it comes to choosing problems you'll hear academics talk about a mystical sense of "taste". It's a real thing. When you pitch a potential problem to your adviser you'll either see their face contort, their eyes rolling, and their attention drift, or you'll sense the excitement in their eyes as they contemplate the uncharted territory ripe for exploration. In that split second a lot happens: an evaluation of the problem's importance, difficulty, its *sexiness*, its historical context (and possibly also its fit to their active grants). In other words, your adviser is likely to be a master of the outer loop and will have a highly developed sense of *taste* for problems. During your PhD you'll get to acquire this sense yourself. - -In particular, I think I had a terrible taste coming in to the PhD. I can see this from the notes I took in my early PhD years. A lot of the problems I was excited about at the time were in retrospect poorly conceived, intractable, or irrelevant. I'd like to think I refined the sense by the end through practice and apprenticeship. 
- - - -Let me now try to serialize a few thoughts on what goes into this sense of taste, and what makes a problem interesting to work on. - -**A fertile ground.** First, recognize that during your PhD you will dive deeply into one area and your papers will very likely chain on top of each other to create a body of work (which becomes your thesis). Therefore, you should always be thinking several steps ahead when choosing a problem. It's impossible to predict how things will unfold but you can often get a sense of how much room there could be for additional work. - -**Plays to your adviser's interests and strengths**. You will want to operate in the realm of your adviser's interest. Some advisers may allow you to work on slightly tangential areas but you would not be taking full advantage of their knowledge and you are making them less likely to want to help you with your project or promote your work. For instance, (and this goes to my previous point of understanding your adviser's job) every adviser has a "default talk" slide deck on their research that they give all the time and if your work can add new exciting cutting edge work slides to this deck then you'll find them much more invested, helpful and involved in your research. Additionally, their talks will promote and publicize your work. - -**Be ambitious: the sublinear scaling of hardness.** People have a strange bug built into psychology: a 10x more important or impactful problem intuitively *feels* 10x harder (or 10x less likely) to achieve. This is a fallacy - in my experience a 10x more important problem is at most 2-3x harder to achieve. In fact, in some cases a 10x harder problem may be easier to achieve. How is this? It's because thinking 10x forces you out of the box, to confront the real limitations of an approach, to think from first principles, to change the strategy completely, to innovate. If you aspire to improve something by 10% and work hard then you will. 
But if you aspire to improve it by 100% you are still quite likely to, but you will do it very differently. - -**Ambitious but with an attack.** At this point it's also important to point out that there are plenty of important problems that don't make great projects. I recommend reading [You and Your Research](https://www.cs.virginia.edu/~robins/YouAndYourResearch.html) by Richard Hamming, where this point is expanded on: - -> If you do not work on an important problem, it's unlikely you'll do important work. It's perfectly obvious. Great scientists have thought through, in a careful way, a number of important problems in their field, and they keep an eye on wondering how to attack them. Let me warn you, `important problem' must be phrased carefully. The three outstanding problems in physics, in a certain sense, were never worked on while I was at Bell Labs. By important I mean guaranteed a Nobel Prize and any sum of money you want to mention. We didn't work on (1) time travel, (2) teleportation, and (3) antigravity. They are not important problems because we do not have an attack. It's not the consequence that makes a problem important, it is that you have a reasonable attack. That is what makes a problem important. - -**The person who did X**. Ultimately, the goal of a PhD is to not only develop a deep expertise in a field but to also make your mark upon it. To steer it, shape it. The ideal scenario is that by the end of the PhD you own some part of an important area, preferably one that is also easy and fast to describe. You want people to say things like "she's the person who did X". If you can fill in a blank there you'll be successful. - -**Valuable skills.** Recognize that during your PhD you will become an expert at the area of your choosing (as fun aside, note that [5 years]x[260 working days]x[8 hours per day] is 10,400 hours; if you believe Gladwell then a PhD is exactly the amount of time to become an expert). 
So imagine yourself 5 years later being a world expert in this area (the 10,000 hours will ensure that regardless of the academic impact of your work). Are these skills exciting or potentially valuable to your future endeavors? - -**Negative examples.** There are also some problems or types of papers that you ideally want to avoid. For instance, you'll sometimes hear academics talk about *"incremental work"* (this is the worst adjective possible in academia). Incremental work is a paper that enhances something existing by making it more complex and gets 2% extra on some benchmark. The amusing thing about these papers is that they have a reasonably high chance of getting accepted (a reviewer can't point to anything to kill them; they are also sometimes referred to as "*cockroach papers*"), so if you have a string of these papers accepted you can feel as though you're being very productive, but in fact these papers won't go on to be highly cited and you won't go on to have a lot of impact on the field. Similarly, finding projects should ideally not include thoughts along the lines of "there's this next logical step in the air that no one has done yet, let me do it", or "this should be an easy poster". - -**Case study: my thesis**. To make some of this discussion more concrete I wanted to use the example of how my own PhD unfolded. First, fun fact: my entire thesis is based on work I did in the last 1.5 years of my PhD. i.e. it took me quite a long time to wiggle around in the metaproblem space and find a problem that I felt very excited to work on (the other ~2 years I mostly meandered on 3D things (e.g. Kinect Fusion, 3D meshes, point cloud features) and video things). Then at one point in my 3rd year I randomly stopped by Richard Socher's office on some Saturday at 2am. 
We had a chat about interesting problems and I realized that some of his work on images and language was in fact getting at something very interesting (of course, the area at the intersection of images and language goes back quite a lot further than Richard as well). I couldn't quite see all the papers that would follow but it seemed heuristically very promising: it was highly fertile (a lot of unsolved problems, a lot of interesting possibilities on grounding descriptions to images), I felt that it was very cool and important, it was easy to explain, it seemed to be at the boundary of possible (Deep Learning has just started to work), the datasets had just started to become available (Flickr8K had just come out), it fit nicely into Fei-Fei's interests and even if I were not successful I'd at least get lots of practice with optimizing interesting deep nets that I could reapply elsewhere. I had a strong feeling of a tsunami of checkmarks as everything clicked in place in my mind. I pitched this to Fei-Fei (my adviser) as an area to dive into the next day and, with relief, she enthusiastically approved, encouraged me, and would later go on to steer me within the space (e.g. Fei-Fei insisted that I do image to sentence generation while I was mostly content with ranking.). I'm happy with how things evolved from there. In short, I meandered around for 2 years stuck around the outer loop, finding something to dive into. Once it clicked for me what that was based on several heuristics, I dug in. - -**Resistance**. I'd like to also mention that your adviser is by no means infallible. I've witnessed and heard of many instances in which, in retrospect, the adviser made the wrong call. If you feel this way during your phd you should have the courage to sometimes ignore your adviser. Academia generally celebrates independent thinking but the response of your specific adviser can vary depending on circumstances. 
I'm aware of multiple cases where the bet worked out very well and I've also personally experienced cases where it did not. For instance, I disagreed strongly with some advice Andrew Ng gave me in my very first year. I ended up working on a problem he wasn't very excited about and, surprise, he turned out to be very right and I wasted a few months. Win some lose some :) - -**Don't play the game.** Finally, I'd like to challenge you to think of a PhD as more than just a sequence of papers. You're not a paper writer. You're a member of a research community and your goal is to push the field forward. Papers are one common way of doing that but I would encourage you to look beyond the established academic game. Think for yourself and from first principles. Do things others don't do but should. Step off the treadmill that has been put before you. I tried to do some of this myself throughout my PhD. This blog is an example - it allows me to communicate things that wouldn't ordinarily go into papers. The ImageNet human reference experiments are an example - I felt strongly that it was important for the field to know the ballpark human accuracy on ILSVRC so I took a few weeks off and evaluated it. The academic search tools (e.g. arxiv-sanity) are an example - I felt continuously frustrated by the inefficiency of finding papers in the literature and I released and maintain the site in hopes that it can be useful to others. Teaching CS231n twice is an example - I put much more effort into it than is rationally advisable for a PhD student who should be doing research, but I felt that the field was held back if people couldn't efficiently learn about the topic and enter. A lot of my PhD endeavors have likely come at a cost in standard academic metrics (e.g. h-index, or number of publications in top venues) but I did them anyway, I would do it the same way again, and here I am encouraging others to as well. 
To add a pinch of salt and wash down the ideology a bit, based on several past discussions with my friends and colleagues I know that this view is contentious and that many would disagree. - - - -### Writing papers - -
    - -
    - -Writing good papers is an essential survival skill of an academic (kind of like making fire for a caveman). In particular, it is very important to realize that papers are a specific thing: they look a certain way, they flow a certain way, they have a certain structure, language, and statistics that the other academics expect. It's usually a painful exercise for me to look through some of my early PhD paper drafts because they are quite terrible. There is a lot to learn here. - -**Review papers.** If you're trying to learn to write better papers it can feel like a sensible strategy to look at many good papers and try to distill patterns. This turns out to not be the best strategy; it's analogous to only receiving positive examples for a binary classification problem. What you really want is to also have exposure to a large number of bad papers and one way to get this is by reviewing papers. Most good conferences have an acceptance rate of about 25% so most papers you'll review are bad, which will allow you to build a powerful binary classifier. You'll read through a bad paper and realize how unclear it is, or how it doesn't define its variables, how vague and abstract its intro is, or how it dives in to the details too quickly, and you'll learn to avoid the same pitfalls in your own papers. Another related valuable experience is to attend (or form) journal clubs - you'll see experienced researchers critique papers and get an impression for how your own papers will be analyzed by others. - -**Get the gestalt right.** I remember being impressed with Fei-Fei (my adviser) once during a reviewing session. I had a stack of 4 papers I had reviewed over the last several hours and she picked them up, flipped through each one for 10 seconds, and said one of them was good and the other three bad. Indeed, I was accepting the one and rejecting the other three, but something that took me several hours took her seconds. 
Fei-Fei was relying on the *gestalt* of the papers as a powerful heuristic. Your papers, as you become a more senior researcher take on a characteristic look. An introduction of ~1 page. A ~1 page related work section with a good density of citations - not too sparse but not too crowded. A well-designed pull figure (on page 1 or 2) and system figure (on page 3) that were not made in MS Paint. A technical section with some math symbols somewhere, results tables with lots of numbers and some of them bold, one additional cute analysis experiment, and the paper has exactly 8 pages (the page limit) and not a single line less. You'll have to learn how to endow your papers with the same gestalt because many researchers rely on it as a cognitive shortcut when they judge your work. - -**Identify the core contribution**. Before you start writing anything it's important to identify the single core contribution that your paper makes to the field. I would especially highlight the word *single*. A paper is not a random collection of some experiments you ran that you report on. The paper sells a single thing that was not obvious or present before. You have to argue that the thing is important, that it hasn't been done before, and then you support its merit experimentally in controlled experiments. The entire paper is organized around this core contribution with surgical precision. In particular it doesn't have any additional fluff and it doesn't try to pack anything else on a side. As a concrete example, I made a mistake in one of my earlier papers on [video classification](https://cs.stanford.edu/people/karpathy/deepvideo/deepvideo_cvpr2014.pdf) where I tried to pack in two contributions: 1) a set of architectural layouts for video convnets and an unrelated 2) multi-resolution architecture which gave small improvements. 
I added it because I reasoned first that maybe someone could find it interesting and follow up on it later and second because I thought that contributions in a paper are additive: two contributions are better than one. Unfortunately, this is false and very wrong. The second contribution was minor/dubious and it diluted the paper, it was distracting, and no one cared. I've made a similar mistake again in my [CVPR 2014 paper](https://cs.stanford.edu/people/karpathy/deepimagesent/) which presented two separate models: a ranking model and a generation model. Several good in-retrospect arguments could be made that I should have submitted two separate papers; the reason it was one is more historical than rational. - -**The structure.** Once you've identified your core contribution there is a default recipe for writing a paper about it. The upper level structure is by default Intro, Related Work, Model, Experiments, Conclusions. When I write my intro I find that it helps to put down a coherent top-level narrative in latex comments and then fill in the text below. I like to organize each of my paragraphs around a single concrete point stated on the first sentence that is then supported in the rest of the paragraph. This structure makes it easy for a reader to skim the paper. A good flow of ideas is then along the lines of 1) X (+define X if not obvious) is an important problem 2) The core challenges are this and that. 3) Previous work on X has addressed these with Y, but the problems with this are Z. 4) In this work we do W (?). 5) This has the following appealing properties and our experiments show this and that. You can play with this structure a bit but these core points should be clearly made. Note again that the paper is surgically organized around your exact contribution. 
For example, when you list the challenges you want to list exactly the things that you address later; you don't go meandering about unrelated things to what you have done (you can speculate a bit more later in conclusion). It is important to keep a sensible structure throughout your paper, not just in the intro. For example, when you explain the model each section should: 1) explain clearly what is being done in the section, 2) explain what the core challenges are 3) explain what a baseline approach is or what others have done before 4) motivate and explain what you do 5) describe it. - -**Break the structure.** You should also feel free (and you're encouraged to!) to play with these formulas to some extent and add some spice to your papers. For example, see this amusing paper from [Razavian et al. in 2014](https://arxiv.org/abs/1403.6382) that structures the introduction as a dialog between a student and the professor. It's clever and I like it. As another example, a lot of papers from [Alyosha Efros](https://people.eecs.berkeley.edu/~efros/) have a playful tone and make great case studies in writing fun papers. As only one of many examples, see this paper he wrote with Antonio Torralba: [Unbiased look at dataset bias](https://people.csail.mit.edu/torralba/publications/datasets_cvpr11.pdf). Another possibility I've seen work well is to include an FAQ section, possibly in the appendix. - -**Common mistake: the laundry list.** One very common mistake to avoid is the "laundry list", which looks as follows: "Here is the problem. Okay now to solve this problem first we do X, then we do Y, then we do Z, and now we do W, and here is what we get". You should try very hard to avoid this structure. Each point should be justified, motivated, explained. Why do you do X or Y? What are the alternatives? What have others done? It's okay to say things like this is common (add citation if possible). 
Your paper is not a report, an enumeration of what you've done, or some kind of a translation of your chronological notes and experiments into latex. It is a highly processed and very focused discussion of a problem, your approach and its context. It is supposed to teach your colleagues something and you have to justify your steps, not just describe what you did. - -**The language.** Over time you'll develop a vocabulary of good words and bad words to use when writing papers. Speaking about machine learning or computer vision papers specifically as concrete examples, in your papers you never "study" or "investigate" (these are boring, passive, bad words); instead you "develop" or even better you "propose". And you don't present a "system" or, *shudder*, a "pipeline"; instead, you develop a "model". You don't learn "features", you learn "representations". And god forbid, you never "combine", "modify" or "expand". These are incremental, gross terms that will certainly get your paper rejected :). - -**An internal deadline 2 weeks prior**. Not many labs do this, but luckily Fei-Fei is quite adamant about an internal deadline 2 weeks before the due date in which you must submit at least a 5-page draft with all the final experiments (even if not with final numbers) that goes through an internal review process identical to the external one (with the same review forms filled out, etc). I found this practice to be extremely useful because forcing yourself to lay out the full paper almost always reveals some number of critical experiments you must run for the paper to flow and for its argument flow to be coherent, consistent and convincing. - -Another great resource on this topic is [Tips for Writing Technical Papers](https://cs.stanford.edu/people/widom/paper-writing.html) from Jennifer Widom. - -### Writing code - -
    - -
    - -A lot of your time will of course be taken up with the *execution* of your ideas, which likely involves a lot of coding. I won't dwell on this too much because it's not uniquely academic, but I would like to bring up a few points. - -**Release your code**. It's a somewhat surprising fact but you can get away with publishing papers and not releasing your code. You will also feel a lot of incentive to not release your code: it can be a lot of work (research code can look like spaghetti since you iterate very quickly, you have to clean up a lot), it can be intimidating to think that others might judge you on your at most decent coding abilities, it is painful to maintain code and answer questions from other people about it (forever), and you might also be concerned that people could spot bugs that invalidate your results. However, it is precisely for some of these reasons that you should commit to releasing your code: it will force you to adopt better coding habits due to fear of public shaming (which will end up saving you time!), it will force you to learn better engineering practices, it will force you to be more thorough with your code (e.g. writing unit tests to make bugs much less likely), it will make others much more likely to follow up on your work (and hence lead to more citations of your papers) and of course it will be much more useful to everyone as a record of exactly what was done for posterity. When you do release your code I recommend taking advantage of [docker containers](https://www.docker.com/); this will reduce the amount of headaches people email you about when they can't get all the dependencies (and their precise versions) installed. - -**Think of the future you**. Make sure to document all your code very well for yourself. I guarantee you that you will come back to your code base a few months later (e.g. to do a few more experiments for the camera ready version of the paper), and you will feel *completely* lost in it. 
I got into the habit of creating very thorough readme.txt files in all my repos (for my personal use) as notes to future self on how the code works, how to run it, etc. - - - -### Giving talks - -
    - -
    - -So, you published a paper and it's an oral! Now you get to give a few minute talk to a large audience of people - what should it look like? - -**The goal of a talk**. First, there's a common misconception that the goal of your talk is to tell your audience about what you did in your paper. This is incorrect, and should only be a second or third degree design criterion. The goal of your talk is to 1) get the audience really excited about the **problem** you worked on (they must appreciate it or they will not care about your solution otherwise!) 2) teach the audience something (ideally while giving them a taste of your insight/solution; don't be afraid to spend time on other's related work), and 3) entertain (they will start checking their Facebook otherwise). Ideally, by the end of the talk the people in your audience are thinking some mixture of "wow, I'm working in the wrong area", "I have to read this paper", and "This person has an impressive understanding of the whole area". - -**A few do's:** There are several properties that make talks better. For instance, Do: Lots of pictures. People Love pictures. Videos and animations should be used more sparingly because they distract. Do: make the talk actionable - talk about something someone can *do* after your talk. Do: give a live demo if possible, it can make your talk more memorable. Do: develop a broader intellectual arch that your work is part of. Do: develop it into a story (people love stories). Do: cite, cite, cite - a lot! It takes very little slide space to pay credit to your colleagues. It pleases them and always reflects well on you because it shows that you're humble about your own contribution, and aware that it builds on a lot of what has come before and what is happening in parallel. You can even cite related work published at the same conference and briefly advertise it. Do: practice the talk! First for yourself in isolation and later to your lab/friends. 
This almost always reveals very insightful flaws in your narrative and flow. - -**Don't: texttexttext**. Don't crowd your slides with text. There should be very few or no bullet points - speakers sometimes try to use these as a crutch to remind themselves what they should be talking about but the slides are not for you they are for the audience. These should be in your speaker notes. On the topic of crowding the slides, also avoid complex diagrams as much as you can - your audience has a fixed bit bandwidth and I guarantee that your own very familiar and "simple" diagram is not as simple or interpretable to someone seeing it for the first time. - -**Careful with: result tables:** Don't include dense tables of results showing that your method works better. You got a paper, I'm sure your results were decent. I always find these parts boring and unnecessary unless the numbers show something interesting (other than your method works better), or of course unless there is a large gap that you're very proud of. If you do include results or graphs build them up slowly with transitions, don't post them all at once and spend 3 minutes on one slide. - -**Pitfall: the thin band between bored/confused**. It's actually quite tricky to design talks where a good portion of your audience *learns* something. A common failure case (as an audience member) is to see talks where I'm painfully bored during the first half and completely confused during the second half, learning nothing by the end. This can occur in talks that have a very general (too general) overview followed by a technical (too technical) second portion. Try to identify when your talk is in danger of having this property. - -**Pitfall: running out of time**. Many speakers spend too much time on the early intro parts (that can often be somewhat boring) and then frantically speed through all the last few slides that contain the most interesting results, analysis or demos. Don't be that person. 
- -**Pitfall: formulaic talks**. I might be a special case but I'm always a fan of non-formulaic talks that challenge conventions. For instance, I *despise* the outline slide. It makes the talk so boring, it's like saying: "This movie is about a ring of power. In the first chapter we'll see a hobbit come into possession of the ring. In the second we'll see him travel to Mordor. In the third he'll cast the ring into Mount Doom and destroy it. I will start with chapter 1" - Come on! I use outline slides for much longer talks to keep the audience anchored if they zone out (at 30min+ they inevitably will a few times), but it should be used sparingly. - -**Observe and learn**. Ultimately, the best way to become better at giving talks (as it is with writing papers too) is to make conscious effort to pay attention to what great (and not so great) speakers do and build a binary classifier in your mind. Don't just enjoy talks; analyze them, break them down, learn from them. Additionally, pay close attention to the audience and their reactions. Sometimes a speaker will put up a complex table with many numbers and you will notice half of the audience immediately look down on their phone and open Facebook. Build an internal classifier of the events that cause this to happen and avoid them in your talks. - - - -### Attending conferences - -
    - -
    - -On the subject of conferences: - -**Go.** It's very important that you go to conferences, especially the 1-2 top conferences in your area. If your adviser lacks funds and does not want to pay for your travel expenses (e.g. if you don't have a paper) then you should be willing to pay for yourself (usually about $2000 for travel, accommodation, registration and food). This is important because you want to become part of the academic community and get a chance to meet more people in the area and gossip about research topics. Science might have this image of a few brilliant lone wolves working in isolation, but the truth is that research is predominantly a highly social endeavor - you stand on the shoulders of many people, you're working on problems in parallel with other people, and it is these people that you're also writing papers to. Additionally, it's unfortunate but each field has knowledge that doesn't get serialized into papers but is instead spread across a shared understanding of the community; things such as what are the next important topics to work on, what papers are most interesting, what is the inside scoop on papers, how they developed historically, what methods work (not just on paper, in reality), etcetc. It is very valuable (and fun!) to become part of the community and get direct access to the hivemind - to learn from it first, and to hopefully influence it later. - -**Talks: choose by speaker**. One conference trick I've developed is that if you're choosing which talks to attend it can be better to look at the speakers instead of the topics. Some people give better talks than others (it's a skill, and you'll discover these people in time) and in my experience I find that it often pays off to see them speak even if it is on a topic that isn't exactly connected to your area of research. - -**The real action is in the hallways**. 
The speed of innovation (especially in Machine Learning) now works at timescales much faster than conferences so most of the relevant papers you'll see at the conference are in fact old news. Therefore, conferences are primarily a social event. Instead of attending a talk I encourage you to view the hallway as one of the main events that doesn't appear on the schedule. It can also be valuable to stroll the poster session and discover some interesting papers and ideas that you may have missed. - -> It is said that there are three stages to a PhD. In the first stage you look at a related paper's reference section and you haven't read most of the papers. In the second stage you recognize all the papers. In the third stage you've shared a beer with all the first authors of all the papers. - - - -### Closing thoughts - -I can't find the quote anymore but I heard Sam Altman of YC say that there are no shortcuts or cheats when it comes to building a startup. You can't expect to win in the long run by somehow gaming the system or putting up false appearances. I think that the same applies in academia. Ultimately you're trying to do good research and push the field forward and if you try to game any of the proxy metrics you won't be successful in the long run. This is especially so because academia is in fact surprisingly small and highly interconnected, so anything shady you try to do to pad your academic resume (e.g. self-citing a lot, publishing the same idea multiple times with small remixes, resubmitting the same rejected paper over and over again with no changes, conveniently trying to leave out some baselines etc.) will eventually catch up with you and you will not be successful. - -So at the end of the day it's quite simple. Do good work, communicate it properly, people will notice and good things will happen. Have a fun ride! - -

    -EDIT: [HN discussion link](https://news.ycombinator.com/item?id=12447495). diff --git a/_posts/2017-03-29-ddp.markdown~ b/_posts/2017-03-29-ddp.markdown~ new file mode 100644 index 000000000..fb517e702 --- /dev/null +++ b/_posts/2017-03-29-ddp.markdown~ @@ -0,0 +1,147 @@ +--- +layout: post +comments: true +title: "Vision System for Autonomous Navigation of Underwater Vehicles" +excerpt: "Vision System for Autonomous Navigation of Underwater Vehicles" +date: 2017-03-29 11:00:00 +mathjax: false +--- + +### Abstract + +Remotely Operated (Underwater) Vehicles (ROVs) are increasingly being used for inspection of offshore/underwater structures. However the visual data acquired underwater is often poor in quality due lack of sufficient/uniform illumination, haze due to atmospheric light reflected by the suspended particles in the murky water, selective absorption of different wavelengths causing colour imbalance and subsequently poor contrast, etc. We propose to rectify these problems using various Dehazing and Enhancement algorithms. We also attempt to solve the problem of autonomous navigation using absolute depth maps obtained from the enhanced monocular images. + +The Vision System developed first dehazes and enhances the video frame to improve the image quality which is then used for further feature extraction. To solve the problem of autonomous navigation, the use of absolute depth maps is proposed. The depth maps are used for perceiving the structure, 2D shape and third dimension depth, of the scene to detect and avoid obstacles. + +To solve the Image Dehazing problem Dark Channel Prior, Color Attenuation Prior, ConvNet based DehazeNet and Non-Local Image Dehazing algorithms were implemented and their results were compared. Using the Non-Local Image Dehazing algorithm the Contrast-toNoise Ratio (CNR) was increased by 20-50 on test images. 
+ +To uncover the visual cues hiding in low-light setting, Gamma Correction, CLAHE and Illumination Map Estimation based algorithm were implemented. The Illumination Map Estimation based algorithm was able to increase average local contrast by around 20-40 on test images. + +To solve the problem of autonomous navigation, we estimate the depth maps from the enhanced monocular images, we use Deep Convolutional Neural Networks to extract the depth relevant features in a fully supervised training setting. The estimated depth maps are also refined using regularization and the pixel cluster with most depth is proposed as suitable navigation direction. Finally to summarize the inspected area a framework for mosaicking video is proposed which if combined with the mosaic of depth map can provide the full three dimensional setting of the scene. +### Results + +Image Dehazing + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +Low Light Image Enhancement + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +Depth Map Estimation from Single Monocular Images using Deep Convolutinal Neural Fields + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +Overall + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + + diff --git a/_posts/2017-04-18-ddp.markdown b/_posts/2017-04-18-ddp.markdown new file mode 100644 index 000000000..d58c57682 --- /dev/null +++ b/_posts/2017-04-18-ddp.markdown @@ -0,0 +1,147 @@ +--- +layout: post +comments: true +title: "Vision System for Autonomous Navigation of Underwater Vehicles" +excerpt: "Vision System for Autonomous Navigation of Underwater Vehicles" +date: 2017-04-18 11:00:00 +mathjax: false +--- + +### Abstract + +Remotely Operated (Underwater) Vehicles (ROVs) are increasingly being used for inspection of offshore/underwater structures. However the visual data acquired underwater is often poor in quality due lack of sufficient/uniform illumination, haze due to atmospheric light reflected by the suspended particles in the murky water, selective absorption of different wavelengths causing colour imbalance and subsequently poor contrast, etc. We propose to rectify these problems using various Dehazing and Enhancement algorithms. We also attempt to solve the problem of autonomous navigation using absolute depth maps obtained from the enhanced monocular images. + +The Vision System developed first dehazes and enhances the video frame to improve the image quality which is then used for further feature extraction. To solve the problem of autonomous navigation, the use of absolute depth maps is proposed. The depth maps are used for perceiving the structure, 2D shape and third dimension depth, of the scene to detect and avoid obstacles. + +To solve the Image Dehazing problem Dark Channel Prior, Color Attenuation Prior, ConvNet based DehazeNet and Non-Local Image Dehazing algorithms were implemented and their results were compared. Using the Non-Local Image Dehazing algorithm the Contrast-toNoise Ratio (CNR) was increased by 20-50 on test images. + +To uncover the visual cues hiding in low-light setting, Gamma Correction, CLAHE and Illumination Map Estimation based algorithm were implemented. 
The Illumination Map Estimation based algorithm was able to increase average local contrast by around 20-40 on test images. + +To solve the problem of autonomous navigation, we estimate the depth maps from the enhanced monocular images, we use Deep Convolutional Neural Networks to extract the depth relevant features in a fully supervised training setting. The estimated depth maps are also refined using regularization and the pixel cluster with most depth is proposed as suitable navigation direction. Finally to summarize the inspected area a framework for mosaicking video is proposed which if combined with the mosaic of depth map can provide the full three dimensional setting of the scene. +### Results + +Image Dehazing + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +Low Light Image Enhancement + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +Depth Map Estimation from Single Monocular Images using Deep Convolutinal Neural Fields + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +Overall + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + + diff --git a/_posts/2017-04-18-ddp.markdown~ b/_posts/2017-04-18-ddp.markdown~ new file mode 100644 index 000000000..d58c57682 --- /dev/null +++ b/_posts/2017-04-18-ddp.markdown~ @@ -0,0 +1,147 @@ +--- +layout: post +comments: true +title: "Vision System for Autonomous Navigation of Underwater Vehicles" +excerpt: "Vision System for Autonomous Navigation of Underwater Vehicles" +date: 2017-04-18 11:00:00 +mathjax: false +--- + +### Abstract + +Remotely Operated (Underwater) Vehicles (ROVs) are increasingly being used for inspection of offshore/underwater structures. However the visual data acquired underwater is often poor in quality due lack of sufficient/uniform illumination, haze due to atmospheric light reflected by the suspended particles in the murky water, selective absorption of different wavelengths causing colour imbalance and subsequently poor contrast, etc. We propose to rectify these problems using various Dehazing and Enhancement algorithms. We also attempt to solve the problem of autonomous navigation using absolute depth maps obtained from the enhanced monocular images. + +The Vision System developed first dehazes and enhances the video frame to improve the image quality which is then used for further feature extraction. To solve the problem of autonomous navigation, the use of absolute depth maps is proposed. The depth maps are used for perceiving the structure, 2D shape and third dimension depth, of the scene to detect and avoid obstacles. + +To solve the Image Dehazing problem Dark Channel Prior, Color Attenuation Prior, ConvNet based DehazeNet and Non-Local Image Dehazing algorithms were implemented and their results were compared. Using the Non-Local Image Dehazing algorithm the Contrast-toNoise Ratio (CNR) was increased by 20-50 on test images. + +To uncover the visual cues hiding in low-light setting, Gamma Correction, CLAHE and Illumination Map Estimation based algorithm were implemented. 
The Illumination Map Estimation based algorithm was able to increase average local contrast by around 20-40 on test images. + +To solve the problem of autonomous navigation, we estimate the depth maps from the enhanced monocular images, we use Deep Convolutional Neural Networks to extract the depth relevant features in a fully supervised training setting. The estimated depth maps are also refined using regularization and the pixel cluster with most depth is proposed as suitable navigation direction. Finally to summarize the inspected area a framework for mosaicking video is proposed which if combined with the mosaic of depth map can provide the full three dimensional setting of the scene. +### Results + +Image Dehazing + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +Low Light Image Enhancement + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +Depth Map Estimation from Single Monocular Images using Deep Convolutinal Neural Fields + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +Overall + +
    + +
    + +
    + +
    + +
    + +
    + +
    + +
    + + diff --git a/_posts/2018-01-20-medium.markdown b/_posts/2018-01-20-medium.markdown deleted file mode 100644 index 3dd5b7f31..000000000 --- a/_posts/2018-01-20-medium.markdown +++ /dev/null @@ -1,18 +0,0 @@ ---- -layout: post -comments: false -title: "(started posting on Medium instead)" -excerpt: "Yes I'm still around but, I've started posting on Medium instead of here." -date: 2018-01-20 11:00:00 -mathjax: false ---- - -The current state of this blog (with the last post 2 years ago) makes it look like I've disappeared. I've certainly become less active on blogs since I've joined Tesla, but -whenever I do get a chance to post something I have recently been defaulting -to doing it on Medium because it is much faster and easier. I still plan to come back -here for longer posts if I get any time, but I'll default to Medium for everything short-medium in length. - -### TLDR - -**Have a look at my [Medium blog](https://medium.com/@karpathy/).** - diff --git a/_posts/2019-04-25-recipe.markdown b/_posts/2019-04-25-recipe.markdown deleted file mode 100644 index 17926e90f..000000000 --- a/_posts/2019-04-25-recipe.markdown +++ /dev/null @@ -1,125 +0,0 @@ ---- -layout: post -comments: true -title: "A Recipe for Training Neural Networks" -excerpt: "A collection of practical advice for the process of achieving strong results with neural networks." -date: 2019-04-25 9:00:00 -mathjax: false ---- - -Some few weeks ago I [posted](https://twitter.com/karpathy/status/1013244313327681536?lang=en) a tweet on "the most common neural net mistakes", listing a few common gotchas related to training neural nets. The tweet got quite a bit more engagement than I anticipated (including a [webinar](https://www.bigmarker.com/missinglink-ai/PyTorch-Code-to-Unpack-Andrej-Karpathy-s-6-Most-Common-NN-Mistakes) :)). Clearly, a lot of people have personally encountered the large gap between "here is how a convolutional layer works" and "our convnet achieves state of the art results". 
- -So I thought it could be fun to brush off my dusty blog to expand my tweet to the long form that this topic deserves. However, instead of going into an enumeration of more common errors or fleshing them out, I wanted to dig a bit deeper and talk about how one can avoid making these errors altogether (or fix them very fast). The trick to doing so is to follow a certain process, which as far as I can tell is not very often documented. Let's start with two important observations that motivate it. - -#### 1) Neural net training is a leaky abstraction - -It is allegedly easy to get started with training neural nets. Numerous libraries and frameworks take pride in displaying 30-line miracle snippets that solve your data problems, giving the (false) impression that this stuff is plug and play. It's common to see things like: - -```python ->>> your_data = # plug your awesome dataset here ->>> model = SuperCrossValidator(SuperDuper.fit, your_data, ResNet50, SGDOptimizer) -# conquer world here -``` - -These libraries and examples activate the part of our brain that is familiar with standard software - a place where clean APIs and abstractions are often attainable. [Requests](http://docs.python-requests.org/en/master/) library to demonstrate: - -```python ->>> r = requests.get('https://api.github.com/user', auth=('user', 'pass')) ->>> r.status_code -200 -``` - -That's cool! A courageous developer has taken the burden of understanding query strings, urls, GET/POST requests, HTTP connections, and so on from you and largely hidden the complexity behind a few lines of code. This is what we are familiar with and expect. Unfortunately, neural nets are nothing like that. They are not "off-the-shelf" technology the second you deviate slightly from training an ImageNet classifier. 
I've tried to make this point in my post ["Yes you should understand backprop"](https://medium.com/@karpathy/yes-you-should-understand-backprop-e2f06eab496b) by picking on backpropagation and calling it a "leaky abstraction", but the situation is unfortunately much more dire. Backprop + SGD does not magically make your network work. Batch norm does not magically make it converge faster. RNNs don't magically let you "plug in" text. And just because you can formulate your problem as RL doesn't mean you should. If you insist on using the technology without understanding how it works you are likely to fail. Which brings me to... - -#### 2) Neural net training fails silently - -When you break or misconfigure code you will often get some kind of an exception. You plugged in an integer where something expected a string. The function only expected 3 arguments. This import failed. That key does not exist. The number of elements in the two lists isn't equal. In addition, it's often possible to create unit tests for a certain functionality. - -This is just a start when it comes to training neural nets. Everything could be correct syntactically, but the whole thing isn't arranged properly, and it's really hard to tell. The "possible error surface" is large, logical (as opposed to syntactic), and very tricky to unit test. For example, perhaps you forgot to flip your labels when you left-right flipped the image during data augmentation. Your net can still (shockingly) work pretty well because your network can internally learn to detect flipped images and then it left-right flips its predictions. Or maybe your autoregressive model accidentally takes the thing it's trying to predict as an input due to an off-by-one bug. Or you tried to clip your gradients but instead clipped the loss, causing the outlier examples to be ignored during training. Or you initialized your weights from a pretrained checkpoint but didn't use the original mean. 
Or you just screwed up the settings for regularization strengths, learning rate, its decay rate, model size, etc. Therefore, your misconfigured neural net will throw exceptions only if you're lucky; Most of the time it will train but silently work a bit worse. - -As a result, (and this is reeaally difficult to over-emphasize) **a "fast and furious" approach to training neural networks does not work** and only leads to suffering. Now, suffering is a perfectly natural part of getting a neural network to work well, but it can be mitigated by being thorough, defensive, paranoid, and obsessed with visualizations of basically every possible thing. The qualities that in my experience correlate most strongly to success in deep learning are patience and attention to detail. - -## The recipe - -In light of the above two facts, I have developed a specific process for myself that I follow when applying a neural net to a new problem, which I will try to describe. You will see that it takes the two principles above very seriously. In particular, it builds from simple to complex and at every step of the way we make concrete hypotheses about what will happen and then either validate them with an experiment or investigate until we find some issue. What we try to prevent very hard is the introduction of a lot of "unverified" complexity at once, which is bound to introduce bugs/misconfigurations that will take forever to find (if ever). If writing your neural net code was like training one, you'd want to use a very small learning rate and guess and then evaluate the full test set after every iteration. - -#### 1. Become one with the data - -The first step to training a neural net is to not touch any neural net code at all and instead begin by thoroughly inspecting your data. This step is critical. I like to spend copious amount of time (measured in units of hours) scanning through thousands of examples, understanding their distribution and looking for patterns. 
Luckily, your brain is pretty good at this. One time I discovered that the data contained duplicate examples. Another time I found corrupted images / labels. I look for data imbalances and biases. I will typically also pay attention to my own process for classifying the data, which hints at the kinds of architectures we'll eventually explore. As an example - are very local features enough or do we need global context? How much variation is there and what form does it take? What variation is spurious and could be preprocessed out? Does spatial position matter or do we want to average pool it out? How much does detail matter and how far could we afford to downsample the images? How noisy are the labels? - -In addition, since the neural net is effectively a compressed/compiled version of your dataset, you'll be able to look at your network (mis)predictions and understand where they might be coming from. And if your network is giving you some prediction that doesn't seem consistent with what you've seen in the data, something is off. - -Once you get a qualitative sense it is also a good idea to write some simple code to search/filter/sort by whatever you can think of (e.g. type of label, size of annotations, number of annotations, etc.) and visualize their distributions and the outliers along any axis. The outliers especially almost always uncover some bugs in data quality or preprocessing. - -#### 2. Set up the end-to-end training/evaluation skeleton + get dumb baselines - -Now that we understand our data can we reach for our super fancy Multi-scale ASPP FPN ResNet and begin training awesome models? For sure no. That is the road to suffering. Our next step is to set up a full training + evaluation skeleton and gain trust in its correctness via a series of experiments. At this stage it is best to pick some simple model that you couldn't possibly have screwed up somehow - e.g. a linear classifier, or a very tiny ConvNet. 
We'll want to train it, visualize the losses, any other metrics (e.g. accuracy), model predictions, and perform a series of ablation experiments with explicit hypotheses along the way. - -Tips & tricks for this stage: - -- **fix random seed**. Always use a fixed random seed to guarantee that when you run the code twice you will get the same outcome. This removes a factor of variation and will help keep you sane. -- **simplify**. Make sure to disable any unnecessary fanciness. As an example, definitely turn off any data augmentation at this stage. Data augmentation is a regularization strategy that we may incorporate later, but for now it is just another opportunity to introduce some dumb bug. -- **add significant digits to your eval**. When plotting the test loss run the evaluation over the entire (large) test set. Do not just plot test losses over batches and then rely on smoothing them in Tensorboard. We are in pursuit of correctness and are very willing to give up time for staying sane. -- **verify loss @ init**. Verify that your loss starts at the correct loss value. E.g. if you initialize your final layer correctly you should measure `-log(1/n_classes)` on a softmax at initialization. The same default values can be derived for L2 regression, Huber losses, etc. -- **init well**. Initialize the final layer weights correctly. E.g. if you are regressing some values that have a mean of 50 then initialize the final bias to 50. If you have an imbalanced dataset of a ratio 1:10 of positives:negatives, set the bias on your logits such that your network predicts probability of 0.1 at initialization. Setting these correctly will speed up convergence and eliminate "hockey stick" loss curves where in the first few iteration your network is basically just learning the bias. -- **human baseline**. Monitor metrics other than loss that are human interpretable and checkable (e.g. accuracy). Whenever possible evaluate your own (human) accuracy and compare to it. 
Alternatively, annotate the test data twice and for each example treat one annotation as prediction and the second as ground truth. -- **input-independent baseline**. Train an input-independent baseline (e.g. easiest is to just set all your inputs to zero). This should perform worse than when you actually plug in your data without zeroing it out. Does it? i.e. does your model learn to extract any information out of the input at all? -- **overfit one batch**. Overfit a single batch of only a few examples (e.g. as little as two). To do so we increase the capacity of our model (e.g. add layers or filters) and verify that we can reach the lowest achievable loss (e.g. zero). I also like to visualize in the same plot both the label and the prediction and ensure that they end up aligning perfectly once we reach the minimum loss. If they do not, there is a bug somewhere and we cannot continue to the next stage. -- **verify decreasing training loss**. At this stage you will hopefully be underfitting on your dataset because you're working with a toy model. Try to increase its capacity just a bit. Did your training loss go down as it should? -- **visualize just before the net**. The unambiguously correct place to visualize your data is immediately before your `y_hat = model(x)` (or `sess.run` in tf). That is - you want to visualize *exactly* what goes into your network, decoding that raw tensor of data and labels into visualizations. This is the only "source of truth". I can't count the number of times this has saved me and revealed problems in data preprocessing and augmentation. -- **visualize prediction dynamics**. I like to visualize model predictions on a fixed test batch during the course of training. The "dynamics" of how these predictions move will give you incredibly good intuition for how the training progresses. Many times it is possible to feel the network "struggle" to fit your data if it wiggles too much in some way, revealing instabilities. 
Very low or very high learning rates are also easily noticeable in the amount of jitter. -- **use backprop to chart dependencies**. Your deep learning code will often contain complicated, vectorized, and broadcasted operations. A relatively common bug I've come across a few times is that people get this wrong (e.g. they use `view` instead of `transpose/permute` somewhere) and inadvertently mix information across the batch dimension. It is a depressing fact that your network will typically still train okay because it will learn to ignore data from the other examples. One way to debug this (and other related problems) is to set the loss to be something trivial like the sum of all outputs of example **i**, run the backward pass all the way to the input, and ensure that you get a non-zero gradient only on the **i-th** input. The same strategy can be used to e.g. ensure that your autoregressive model at time t only depends on 1..t-1. More generally, gradients give you information about what depends on what in your network, which can be useful for debugging. -- **generalize a special case**. This is a bit more of a general coding tip but I've often seen people create bugs when they bite off more than they can chew, writing a relatively general functionality from scratch. I like to write a very specific function to what I'm doing right now, get that to work, and then generalize it later making sure that I get the same result. Often this applies to vectorizing code, where I almost always write out the fully loopy version first and only then transform it to vectorized code one loop at a time. - - -#### 3. Overfit - -At this stage we should have a good understanding of the dataset and we have the full training + evaluation pipeline working. For any given model we can (reproducibly) compute a metric that we trust. 
We are also armed with our performance for an input-independent baseline, the performance of a few dumb baselines (we better beat these), and we have a rough sense of the performance of a human (we hope to reach this). The stage is now set for iterating on a good model. - -The approach I like to take to finding a good model has two stages: first get a model large enough that it can overfit (i.e. focus on training loss) and then regularize it appropriately (give up some training loss to improve the validation loss). The reason I like these two stages is that if we are not able to reach a low error rate with any model at all that may again indicate some issues, bugs, or misconfiguration. - -A few tips & tricks for this stage: - -- **picking the model**. To reach a good training loss you'll want to choose an appropriate architecture for the data. When it comes to choosing this my #1 advice is: **Don't be a hero**. I've seen a lot of people who are eager to get crazy and creative in stacking up the lego blocks of the neural net toolbox in various exotic architectures that make sense to them. Resist this temptation strongly in the early stages of your project. I always advise people to simply find the most related paper and copy paste their simplest architecture that achieves good performance. E.g. if you are classifying images don't be a hero and just copy paste a ResNet-50 for your first run. You're allowed to do something more custom later and beat this. -- **adam is safe**. In the early stages of setting baselines I like to use Adam with a learning rate of [3e-4](https://twitter.com/karpathy/status/801621764144971776?lang=en). In my experience Adam is much more forgiving to hyperparameters, including a bad learning rate. For ConvNets a well-tuned SGD will almost always slightly outperform Adam, but the optimal learning rate region is much more narrow and problem-specific. (Note: If you are using RNNs and related sequence models it is more common to use Adam. 
At the initial stage of your project, again, don't be a hero and follow whatever the most related papers do.) -- **complexify only one at a time**. If you have multiple signals to plug into your classifier I would advise that you plug them in one by one and every time ensure that you get a performance boost you'd expect. Don't throw the kitchen sink at your model at the start. There are other ways of building up complexity - e.g. you can try to plug in smaller images first and make them bigger later, etc. -- **do not trust learning rate decay defaults**. If you are re-purposing code from some other domain always be very careful with learning rate decay. Not only would you want to use different decay schedules for different problems, but - even worse - in a typical implementation the schedule will be based on the current epoch number, which can vary widely simply depending on the size of your dataset. E.g. ImageNet would decay by 10 on epoch 30. If you're not training ImageNet then you almost certainly do not want this. If you're not careful your code could secretly be driving your learning rate to zero too early, not allowing your model to converge. In my own work I always disable learning rate decays entirely (I use a constant LR) and tune this all the way at the very end. - -#### 4. Regularize - -Ideally, we are now at a place where we have a large model that is fitting at least the training set. Now it is time to regularize it and gain some validation accuracy by giving up some of the training accuracy. Some tips & tricks: - -- **get more data**. First, the by far best and preferred way to regularize a model in any practical setting is to add more real training data. It is a very common mistake to spend a lot of engineering cycles trying to squeeze juice out of a small dataset when you could instead be collecting more data. 
As far as I'm aware adding more data is pretty much the only guaranteed way to monotonically improve the performance of a well-configured neural network almost indefinitely. The other would be ensembles (if you can afford them), but that tops out after ~5 models. -- **data augment**. The next best thing to real data is half-fake data - try out more aggressive data augmentation. -- **creative augmentation**. If half-fake data doesn't do it, fake data may also do something. People are finding creative ways of expanding datasets; For example, [domain randomization](https://openai.com/blog/learning-dexterity/), use of [simulation](http://vladlen.info/publications/playing-data-ground-truth-computer-games/), clever [hybrids](https://arxiv.org/abs/1708.01642) such as inserting (potentially simulated) data into scenes, or even GANs. -- **pretrain**. It rarely ever hurts to use a pretrained network if you can, even if you have enough data. -- **stick with supervised learning**. Do not get over-excited about unsupervised pretraining. Unlike what that blog post from 2008 tells you, as far as I know, no version of it has reported strong results in modern computer vision (though NLP seems to be doing pretty well with BERT and friends these days, quite likely owing to the more deliberate nature of text, and a higher signal to noise ratio). -- **smaller input dimensionality**. Remove features that may contain spurious signal. Any added spurious input is just another opportunity to overfit if your dataset is small. Similarly, if low-level details don't matter much try to input a smaller image. -- **smaller model size**. In many cases you can use domain knowledge constraints on the network to decrease its size. As an example, it used to be trendy to use Fully Connected layers at the top of backbones for ImageNet but these have since been replaced with simple average pooling, eliminating a ton of parameters in the process. -- **decrease the batch size**. 
Due to the normalization inside batch norm smaller batch sizes somewhat correspond to stronger regularization. This is because the batch empirical mean/std are more approximate versions of the full mean/std so the scale & offset "wiggles" your batch around more. -- **drop**. Add dropout. Use dropout2d (spatial dropout) for ConvNets. Use this sparingly/carefully because dropout [does not seem to play nice](https://arxiv.org/abs/1801.05134) with batch normalization. -- **weight decay**. Increase the weight decay penalty. -- **early stopping**. Stop training based on your measured validation loss to catch your model just as it's about to overfit. -- **try a larger model**. I mention this last and only after early stopping but I've found a few times in the past that larger models will of course overfit much more eventually, but their "early stopped" performance can often be much better than that of smaller models. - -Finally, to gain additional confidence that your network is a reasonable classifier, I like to visualize the network's first-layer weights and ensure you get nice edges that make sense. If your first layer filters look like noise then something could be off. Similarly, activations inside the net can sometimes display odd artifacts and hint at problems. - -#### 5. Tune - -You should now be "in the loop" with your dataset exploring a wide model space for architectures that achieve low validation loss. A few tips and tricks for this step: - -- **random over grid search**. For simultaneously tuning multiple hyperparameters it may sound tempting to use grid search to ensure coverage of all settings, but keep in mind that it is [best to use random search instead](http://jmlr.csail.mit.edu/papers/volume13/bergstra12a/bergstra12a.pdf). Intuitively, this is because neural nets are often much more sensitive to some parameters than others. 
In the limit, if a parameter **a** matters but changing **b** has no effect then you'd rather sample **a** more thoroughly than at a few fixed points multiple times. -- **hyper-parameter optimization**. There is a large number of fancy bayesian hyper-parameter optimization toolboxes around and a few of my friends have also reported success with them, but my personal experience is that the state of the art approach to exploring a nice and wide space of models and hyperparameters is to use an intern :). Just kidding. - - -#### 6. Squeeze out the juice - -Once you find the best types of architectures and hyper-parameters you can still use a few more tricks to squeeze out the last pieces of juice out of the system: - -- **ensembles**. Model ensembles are a pretty much guaranteed way to gain 2% of accuracy on anything. If you can't afford the computation at test time look into distilling your ensemble into a network using [dark knowledge](https://arxiv.org/abs/1503.02531). -- **leave it training**. I've often seen people tempted to stop the model training when the validation loss seems to be leveling off. In my experience networks keep training for an unintuitively long time. One time I accidentally left a model training during the winter break and when I got back in January it was SOTA ("state of the art"). 
diff --git a/about.md b/about.md index 290df1dd3..8dd47b2b5 100644 --- a/about.md +++ b/about.md @@ -4,4 +4,4 @@ title: About permalink: /about/ --- -See my [website](http://cs.stanford.edu/people/karpathy/). +I am **Sagar Pathrudkar**, a Dual Degree student at IIT Madras working on Autonomous Systems. diff --git a/assets/ai/.DS_Store b/assets/ai/.DS_Store deleted file mode 100644 index d21c5cb31..000000000 Binary files a/assets/ai/.DS_Store and /dev/null differ diff --git a/assets/ai/digibrain.jpg b/assets/ai/digibrain.jpg deleted file mode 100644 index 969f2597c..000000000 Binary files a/assets/ai/digibrain.jpg and /dev/null differ diff --git a/assets/ai/eye2.jpg b/assets/ai/eye2.jpg deleted file mode 100644 index 5f7da367b..000000000 Binary files a/assets/ai/eye2.jpg and /dev/null differ diff --git a/assets/ai/graph.png b/assets/ai/graph.png deleted file mode 100644 index 952413147..000000000 Binary files a/assets/ai/graph.png and /dev/null differ diff --git a/assets/ai/hand.jpg b/assets/ai/hand.jpg deleted file mode 100644 index 2c37acafb..000000000 Binary files a/assets/ai/hand.jpg and /dev/null differ diff --git a/assets/ai/lifetree.gif b/assets/ai/lifetree.gif deleted file mode 100644 index 04a37f664..000000000 Binary files a/assets/ai/lifetree.gif and /dev/null differ diff --git a/assets/ai/neocortex.png b/assets/ai/neocortex.png deleted file mode 100644 index 48973dc0b..000000000 Binary files a/assets/ai/neocortex.png and /dev/null differ diff --git a/assets/ai/ocean.jpeg b/assets/ai/ocean.jpeg deleted file mode 100644 index 34ef252dd..000000000 Binary files a/assets/ai/ocean.jpeg and /dev/null differ diff --git a/assets/ai/psych.jpg b/assets/ai/psych.jpg deleted file mode 100644 index 4ec16a3d1..000000000 Binary files a/assets/ai/psych.jpg and /dev/null differ diff --git a/assets/break/banana.jpeg b/assets/break/banana.jpeg deleted file mode 100644 index 304e7be2b..000000000 Binary files a/assets/break/banana.jpeg and /dev/null differ diff --git 
a/assets/break/break1.jpeg b/assets/break/break1.jpeg deleted file mode 100644 index 0c715ea65..000000000 Binary files a/assets/break/break1.jpeg and /dev/null differ diff --git a/assets/break/break2.jpeg b/assets/break/break2.jpeg deleted file mode 100644 index 7015d79a8..000000000 Binary files a/assets/break/break2.jpeg and /dev/null differ diff --git a/assets/break/breakconv.png b/assets/break/breakconv.png deleted file mode 100644 index 0496ef7bf..000000000 Binary files a/assets/break/breakconv.png and /dev/null differ diff --git a/assets/break/fish.jpeg b/assets/break/fish.jpeg deleted file mode 100644 index 3afa0caff..000000000 Binary files a/assets/break/fish.jpeg and /dev/null differ diff --git a/assets/break/fool1.jpeg b/assets/break/fool1.jpeg deleted file mode 100644 index 8a5695343..000000000 Binary files a/assets/break/fool1.jpeg and /dev/null differ diff --git a/assets/break/fool2.jpeg b/assets/break/fool2.jpeg deleted file mode 100644 index c5e6cd168..000000000 Binary files a/assets/break/fool2.jpeg and /dev/null differ diff --git a/assets/break/noise1.jpeg b/assets/break/noise1.jpeg deleted file mode 100644 index 3ffd5af03..000000000 Binary files a/assets/break/noise1.jpeg and /dev/null differ diff --git a/assets/break/noise2.jpeg b/assets/break/noise2.jpeg deleted file mode 100644 index 4e0f91632..000000000 Binary files a/assets/break/noise2.jpeg and /dev/null differ diff --git a/assets/break/rapeseed.jpeg b/assets/break/rapeseed.jpeg deleted file mode 100644 index a2d432efe..000000000 Binary files a/assets/break/rapeseed.jpeg and /dev/null differ diff --git a/assets/break/rapeseed2.jpeg b/assets/break/rapeseed2.jpeg deleted file mode 100644 index 2b6620059..000000000 Binary files a/assets/break/rapeseed2.jpeg and /dev/null differ diff --git a/assets/break/szegedy.jpeg b/assets/break/szegedy.jpeg deleted file mode 100644 index 765bef76e..000000000 Binary files a/assets/break/szegedy.jpeg and /dev/null differ diff --git a/assets/break/templates.jpeg 
b/assets/break/templates.jpeg deleted file mode 100644 index 855159b09..000000000 Binary files a/assets/break/templates.jpeg and /dev/null differ diff --git a/assets/chrome1.jpeg b/assets/chrome1.jpeg deleted file mode 100644 index ad74ecb55..000000000 Binary files a/assets/chrome1.jpeg and /dev/null differ diff --git a/assets/chrome2.jpeg b/assets/chrome2.jpeg deleted file mode 100644 index 4d9f84bc4..000000000 Binary files a/assets/chrome2.jpeg and /dev/null differ diff --git a/assets/chrome3.jpeg b/assets/chrome3.jpeg deleted file mode 100644 index 00c83b7e8..000000000 Binary files a/assets/chrome3.jpeg and /dev/null differ diff --git a/assets/chrome4.jpeg b/assets/chrome4.jpeg deleted file mode 100644 index ab15f1535..000000000 Binary files a/assets/chrome4.jpeg and /dev/null differ diff --git a/assets/cifar_predict.jpg b/assets/cifar_predict.jpg deleted file mode 100644 index 1f3c724c2..000000000 Binary files a/assets/cifar_predict.jpg and /dev/null differ diff --git a/assets/cifar_preview.png b/assets/cifar_preview.png deleted file mode 100644 index 001d4728a..000000000 Binary files a/assets/cifar_preview.png and /dev/null differ diff --git a/assets/cifar_weirdimages.png b/assets/cifar_weirdimages.png deleted file mode 100644 index cdc1eab69..000000000 Binary files a/assets/cifar_weirdimages.png and /dev/null differ diff --git a/assets/cnntsne.jpeg b/assets/cnntsne.jpeg deleted file mode 100644 index f7498d604..000000000 Binary files a/assets/cnntsne.jpeg and /dev/null differ diff --git a/assets/ddp/allres1.jpg b/assets/ddp/allres1.jpg new file mode 100644 index 000000000..03b219378 Binary files /dev/null and b/assets/ddp/allres1.jpg differ diff --git a/assets/ddp/allres2.jpg b/assets/ddp/allres2.jpg new file mode 100644 index 000000000..072910e11 Binary files /dev/null and b/assets/ddp/allres2.jpg differ diff --git a/assets/ddp/allres3.jpg b/assets/ddp/allres3.jpg new file mode 100644 index 000000000..741f93679 Binary files /dev/null and 
b/assets/ddp/allres3.jpg differ diff --git a/assets/ddp/allres4.jpg b/assets/ddp/allres4.jpg new file mode 100644 index 000000000..8de21aa1a Binary files /dev/null and b/assets/ddp/allres4.jpg differ diff --git a/assets/ddp/test1dehaze.jpg b/assets/ddp/test1dehaze.jpg new file mode 100644 index 000000000..1ba079cb5 Binary files /dev/null and b/assets/ddp/test1dehaze.jpg differ diff --git a/assets/ddp/test1enhance.jpg b/assets/ddp/test1enhance.jpg new file mode 100644 index 000000000..b76de72e3 Binary files /dev/null and b/assets/ddp/test1enhance.jpg differ diff --git a/assets/ddp/test2dehaze.jpg b/assets/ddp/test2dehaze.jpg new file mode 100644 index 000000000..ee695d8a2 Binary files /dev/null and b/assets/ddp/test2dehaze.jpg differ diff --git a/assets/ddp/test2enhance.jpg b/assets/ddp/test2enhance.jpg new file mode 100644 index 000000000..6aa7db5ce Binary files /dev/null and b/assets/ddp/test2enhance.jpg differ diff --git a/assets/ddp/test3dehaze.jpg b/assets/ddp/test3dehaze.jpg new file mode 100644 index 000000000..3e6fb6ea1 Binary files /dev/null and b/assets/ddp/test3dehaze.jpg differ diff --git a/assets/ddp/test3enhance.jpg b/assets/ddp/test3enhance.jpg new file mode 100644 index 000000000..b76b58417 Binary files /dev/null and b/assets/ddp/test3enhance.jpg differ diff --git a/assets/ddp/test5dehaze.jpg b/assets/ddp/test5dehaze.jpg new file mode 100644 index 000000000..68b4349f3 Binary files /dev/null and b/assets/ddp/test5dehaze.jpg differ diff --git a/assets/ddp/test5enhance.jpg b/assets/ddp/test5enhance.jpg new file mode 100644 index 000000000..865c990c5 Binary files /dev/null and b/assets/ddp/test5enhance.jpg differ diff --git a/assets/ddp/test6dehaze.jpg b/assets/ddp/test6dehaze.jpg new file mode 100644 index 000000000..2af7c8bc9 Binary files /dev/null and b/assets/ddp/test6dehaze.jpg differ diff --git a/assets/ddp/test6enhance.jpg b/assets/ddp/test6enhance.jpg new file mode 100644 index 000000000..e6153ae7f Binary files /dev/null and 
b/assets/ddp/test6enhance.jpg differ diff --git a/assets/ddp/test9dehaze.jpg b/assets/ddp/test9dehaze.jpg new file mode 100644 index 000000000..945824099 Binary files /dev/null and b/assets/ddp/test9dehaze.jpg differ diff --git a/assets/ddp/test9enhance.jpg b/assets/ddp/test9enhance.jpg new file mode 100644 index 000000000..056d42635 Binary files /dev/null and b/assets/ddp/test9enhance.jpg differ diff --git a/assets/ddp/w1.jpg b/assets/ddp/w1.jpg new file mode 100644 index 000000000..85586f5f9 Binary files /dev/null and b/assets/ddp/w1.jpg differ diff --git a/assets/ddp/w10.jpg b/assets/ddp/w10.jpg new file mode 100644 index 000000000..0e8d6a7a8 Binary files /dev/null and b/assets/ddp/w10.jpg differ diff --git a/assets/ddp/w11.jpg b/assets/ddp/w11.jpg new file mode 100644 index 000000000..11a49e8a1 Binary files /dev/null and b/assets/ddp/w11.jpg differ diff --git a/assets/ddp/w12.jpg b/assets/ddp/w12.jpg new file mode 100644 index 000000000..079d8f2a9 Binary files /dev/null and b/assets/ddp/w12.jpg differ diff --git a/assets/ddp/w13.jpg b/assets/ddp/w13.jpg new file mode 100644 index 000000000..210547c77 Binary files /dev/null and b/assets/ddp/w13.jpg differ diff --git a/assets/ddp/w2.jpg b/assets/ddp/w2.jpg new file mode 100644 index 000000000..e225cb831 Binary files /dev/null and b/assets/ddp/w2.jpg differ diff --git a/assets/ddp/w3.jpg b/assets/ddp/w3.jpg new file mode 100644 index 000000000..d182bf690 Binary files /dev/null and b/assets/ddp/w3.jpg differ diff --git a/assets/ddp/w4.jpg b/assets/ddp/w4.jpg new file mode 100644 index 000000000..057d947f1 Binary files /dev/null and b/assets/ddp/w4.jpg differ diff --git a/assets/ddp/w5.jpg b/assets/ddp/w5.jpg new file mode 100644 index 000000000..372cd1ef1 Binary files /dev/null and b/assets/ddp/w5.jpg differ diff --git a/assets/ddp/w6.jpg b/assets/ddp/w6.jpg new file mode 100644 index 000000000..f618de9f4 Binary files /dev/null and b/assets/ddp/w6.jpg differ diff --git a/assets/ddp/w7.jpg b/assets/ddp/w7.jpg new 
file mode 100644 index 000000000..f618de9f4 Binary files /dev/null and b/assets/ddp/w7.jpg differ diff --git a/assets/ddp/w8.jpg b/assets/ddp/w8.jpg new file mode 100644 index 000000000..b831652b5 Binary files /dev/null and b/assets/ddp/w8.jpg differ diff --git a/assets/ddp/w9.jpg b/assets/ddp/w9.jpg new file mode 100644 index 000000000..1e341d80d Binary files /dev/null and b/assets/ddp/w9.jpg differ diff --git a/assets/hn.jpg b/assets/hn.jpg deleted file mode 100644 index d3eac6bd6..000000000 Binary files a/assets/hn.jpg and /dev/null differ diff --git a/assets/ilsvrc1.png b/assets/ilsvrc1.png deleted file mode 100644 index f4982ac09..000000000 Binary files a/assets/ilsvrc1.png and /dev/null differ diff --git a/assets/ilsvrc2.png b/assets/ilsvrc2.png deleted file mode 100644 index 63851a875..000000000 Binary files a/assets/ilsvrc2.png and /dev/null differ diff --git a/assets/ilsvrc3.png b/assets/ilsvrc3.png deleted file mode 100644 index e7028a591..000000000 Binary files a/assets/ilsvrc3.png and /dev/null differ diff --git a/assets/megoogle.jpg b/assets/megoogle.jpg deleted file mode 100644 index 43e2ca697..000000000 Binary files a/assets/megoogle.jpg and /dev/null differ diff --git a/assets/nips2012.jpeg b/assets/nips2012.jpeg deleted file mode 100644 index b125661f5..000000000 Binary files a/assets/nips2012.jpeg and /dev/null differ diff --git a/assets/obamafunny.jpg b/assets/obamafunny.jpg deleted file mode 100644 index 3c7e6f43d..000000000 Binary files a/assets/obamafunny.jpg and /dev/null differ diff --git a/assets/objectdiscovery.jpeg b/assets/objectdiscovery.jpeg deleted file mode 100644 index fd70ad8c3..000000000 Binary files a/assets/objectdiscovery.jpeg and /dev/null differ diff --git a/assets/phd/adviser.gif b/assets/phd/adviser.gif deleted file mode 100644 index b8d1dfb3e..000000000 Binary files a/assets/phd/adviser.gif and /dev/null differ diff --git a/assets/phd/arxiv-papers.png b/assets/phd/arxiv-papers.png deleted file mode 100644 index 
72ed94935..000000000 Binary files a/assets/phd/arxiv-papers.png and /dev/null differ diff --git a/assets/phd/code.jpg b/assets/phd/code.jpg deleted file mode 100644 index bb0058566..000000000 Binary files a/assets/phd/code.jpg and /dev/null differ diff --git a/assets/phd/latex.png b/assets/phd/latex.png deleted file mode 100644 index cd273d1a9..000000000 Binary files a/assets/phd/latex.png and /dev/null differ diff --git a/assets/phd/phds.jpg b/assets/phd/phds.jpg deleted file mode 100644 index c1c35d9a1..000000000 Binary files a/assets/phd/phds.jpg and /dev/null differ diff --git a/assets/phd/posters.jpg b/assets/phd/posters.jpg deleted file mode 100644 index b1915f3b3..000000000 Binary files a/assets/phd/posters.jpg and /dev/null differ diff --git a/assets/phd/talk.jpg b/assets/phd/talk.jpg deleted file mode 100644 index 8edac5bed..000000000 Binary files a/assets/phd/talk.jpg and /dev/null differ diff --git a/assets/rl/discounted.png b/assets/rl/discounted.png deleted file mode 100644 index c4e4a3a0b..000000000 Binary files a/assets/rl/discounted.png and /dev/null differ diff --git a/assets/rl/episodes.png b/assets/rl/episodes.png deleted file mode 100644 index 6ad3482a7..000000000 Binary files a/assets/rl/episodes.png and /dev/null differ diff --git a/assets/rl/frostbite.jpg b/assets/rl/frostbite.jpg deleted file mode 100644 index c2c608885..000000000 Binary files a/assets/rl/frostbite.jpg and /dev/null differ diff --git a/assets/rl/mdp.png b/assets/rl/mdp.png deleted file mode 100644 index a1ffc87c8..000000000 Binary files a/assets/rl/mdp.png and /dev/null differ diff --git a/assets/rl/montezuma.png b/assets/rl/montezuma.png deleted file mode 100644 index ebeff4951..000000000 Binary files a/assets/rl/montezuma.png and /dev/null differ diff --git a/assets/rl/nondiff1.png b/assets/rl/nondiff1.png deleted file mode 100644 index 3c5b62cca..000000000 Binary files a/assets/rl/nondiff1.png and /dev/null differ diff --git a/assets/rl/nondiff2.png 
b/assets/rl/nondiff2.png deleted file mode 100644 index 296518357..000000000 Binary files a/assets/rl/nondiff2.png and /dev/null differ diff --git a/assets/rl/pg.png b/assets/rl/pg.png deleted file mode 100644 index 84ceaa513..000000000 Binary files a/assets/rl/pg.png and /dev/null differ diff --git a/assets/rl/policy.png b/assets/rl/policy.png deleted file mode 100644 index 10a65b711..000000000 Binary files a/assets/rl/policy.png and /dev/null differ diff --git a/assets/rl/pong.gif b/assets/rl/pong.gif deleted file mode 100644 index 6dac1aa07..000000000 Binary files a/assets/rl/pong.gif and /dev/null differ diff --git a/assets/rl/preview.jpeg b/assets/rl/preview.jpeg deleted file mode 100644 index 23ac5ee7c..000000000 Binary files a/assets/rl/preview.jpeg and /dev/null differ diff --git a/assets/rl/rl.png b/assets/rl/rl.png deleted file mode 100644 index 48d237a9e..000000000 Binary files a/assets/rl/rl.png and /dev/null differ diff --git a/assets/rl/sl.png b/assets/rl/sl.png deleted file mode 100644 index 40647645f..000000000 Binary files a/assets/rl/sl.png and /dev/null differ diff --git a/assets/rl/weights.png b/assets/rl/weights.png deleted file mode 100644 index f48ff6130..000000000 Binary files a/assets/rl/weights.png and /dev/null differ diff --git a/assets/rnn/charseq.jpeg b/assets/rnn/charseq.jpeg deleted file mode 100644 index c8e1221bb..000000000 Binary files a/assets/rnn/charseq.jpeg and /dev/null differ diff --git a/assets/rnn/diags.jpeg b/assets/rnn/diags.jpeg deleted file mode 100644 index 77f38cedc..000000000 Binary files a/assets/rnn/diags.jpeg and /dev/null differ diff --git a/assets/rnn/diags_old.jpeg b/assets/rnn/diags_old.jpeg deleted file mode 100644 index 0148095c0..000000000 Binary files a/assets/rnn/diags_old.jpeg and /dev/null differ diff --git a/assets/rnn/house_generate.gif b/assets/rnn/house_generate.gif deleted file mode 100644 index c2381abf4..000000000 Binary files a/assets/rnn/house_generate.gif and /dev/null differ diff --git 
a/assets/rnn/house_read.gif b/assets/rnn/house_read.gif deleted file mode 100644 index 0d94786ec..000000000 Binary files a/assets/rnn/house_read.gif and /dev/null differ diff --git a/assets/rnn/latex1.jpeg b/assets/rnn/latex1.jpeg deleted file mode 100644 index 3dfd3dba1..000000000 Binary files a/assets/rnn/latex1.jpeg and /dev/null differ diff --git a/assets/rnn/latex2.jpeg b/assets/rnn/latex2.jpeg deleted file mode 100644 index 7b3374cbd..000000000 Binary files a/assets/rnn/latex2.jpeg and /dev/null differ diff --git a/assets/rnn/latex3.jpeg b/assets/rnn/latex3.jpeg deleted file mode 100644 index a4421f5e9..000000000 Binary files a/assets/rnn/latex3.jpeg and /dev/null differ diff --git a/assets/rnn/latex4.jpeg b/assets/rnn/latex4.jpeg deleted file mode 100644 index 10eba00a5..000000000 Binary files a/assets/rnn/latex4.jpeg and /dev/null differ diff --git a/assets/rnn/pane1.png b/assets/rnn/pane1.png deleted file mode 100644 index 943fcb031..000000000 Binary files a/assets/rnn/pane1.png and /dev/null differ diff --git a/assets/rnn/pane2.png b/assets/rnn/pane2.png deleted file mode 100644 index a32f011a3..000000000 Binary files a/assets/rnn/pane2.png and /dev/null differ diff --git a/assets/rnn/under1.jpeg b/assets/rnn/under1.jpeg deleted file mode 100644 index 6a06869f9..000000000 Binary files a/assets/rnn/under1.jpeg and /dev/null differ diff --git a/assets/rnn/under2.jpeg b/assets/rnn/under2.jpeg deleted file mode 100644 index 46acecdf7..000000000 Binary files a/assets/rnn/under2.jpeg and /dev/null differ diff --git a/assets/rnn/under3.jpeg b/assets/rnn/under3.jpeg deleted file mode 100644 index b54005f9c..000000000 Binary files a/assets/rnn/under3.jpeg and /dev/null differ diff --git a/assets/rnn/under4.jpeg b/assets/rnn/under4.jpeg deleted file mode 100644 index 7f5424b43..000000000 Binary files a/assets/rnn/under4.jpeg and /dev/null differ diff --git a/assets/rssicon.svg b/assets/rssicon.svg deleted file mode 100644 index 939871d62..000000000 --- 
a/assets/rssicon.svg +++ /dev/null @@ -1,148 +0,0 @@ - - - - - - - - - - - - - - - - image/svg+xml - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/assets/selfie/celebs_grid_render.jpg b/assets/selfie/celebs_grid_render.jpg deleted file mode 100644 index 88e4f26a0..000000000 Binary files a/assets/selfie/celebs_grid_render.jpg and /dev/null differ diff --git a/assets/selfie/cnnvis.jpg b/assets/selfie/cnnvis.jpg deleted file mode 100644 index 3c03029f2..000000000 Binary files a/assets/selfie/cnnvis.jpg and /dev/null differ diff --git a/assets/selfie/crop2.jpg b/assets/selfie/crop2.jpg deleted file mode 100644 index ff8511eb2..000000000 Binary files a/assets/selfie/crop2.jpg and /dev/null differ diff --git a/assets/selfie/crops1.jpg b/assets/selfie/crops1.jpg deleted file mode 100644 index 5f6af62fa..000000000 Binary files a/assets/selfie/crops1.jpg and /dev/null differ diff --git a/assets/selfie/gif2.gif b/assets/selfie/gif2.gif deleted file mode 100644 index 6eb7d36e0..000000000 Binary files a/assets/selfie/gif2.gif and /dev/null differ diff --git a/assets/selfie/grid_render_all.jpg b/assets/selfie/grid_render_all.jpg deleted file mode 100644 index 4b79e9d29..000000000 Binary files a/assets/selfie/grid_render_all.jpg and /dev/null differ diff --git a/assets/selfie/grid_render_best.jpg b/assets/selfie/grid_render_best.jpg deleted file mode 100644 index 270d599ba..000000000 Binary files a/assets/selfie/grid_render_best.jpg and /dev/null differ diff --git a/assets/selfie/grid_render_continuum.jpg b/assets/selfie/grid_render_continuum.jpg deleted file mode 100644 index 6a01261d8..000000000 Binary files a/assets/selfie/grid_render_continuum.jpg and /dev/null differ diff --git a/assets/selfie/grid_render_posneg.jpg b/assets/selfie/grid_render_posneg.jpg deleted file mode 100644 index 60b73b6c3..000000000 Binary files a/assets/selfie/grid_render_posneg.jpg and /dev/null differ diff --git a/assets/selfie/grid_render_tsne_reduced.jpg 
b/assets/selfie/grid_render_tsne_reduced.jpg deleted file mode 100644 index 848be6521..000000000 Binary files a/assets/selfie/grid_render_tsne_reduced.jpg and /dev/null differ diff --git a/assets/selfie/grid_render_worst.jpg b/assets/selfie/grid_render_worst.jpg deleted file mode 100644 index 84f553a04..000000000 Binary files a/assets/selfie/grid_render_worst.jpg and /dev/null differ diff --git a/assets/selfie/males.jpg b/assets/selfie/males.jpg deleted file mode 100644 index 6e5dde87c..000000000 Binary files a/assets/selfie/males.jpg and /dev/null differ diff --git a/assets/selfie/selfiebot2.png b/assets/selfie/selfiebot2.png deleted file mode 100644 index 2536fd1d1..000000000 Binary files a/assets/selfie/selfiebot2.png and /dev/null differ diff --git a/assets/selfie/teaser.jpeg b/assets/selfie/teaser.jpeg deleted file mode 100644 index 228b88444..000000000 Binary files a/assets/selfie/teaser.jpeg and /dev/null differ diff --git a/assets/selfie/useful.jpg b/assets/selfie/useful.jpg deleted file mode 100644 index 91ff4907d..000000000 Binary files a/assets/selfie/useful.jpg and /dev/null differ diff --git a/assets/sportspredict.jpeg b/assets/sportspredict.jpeg deleted file mode 100644 index 5cba999d0..000000000 Binary files a/assets/sportspredict.jpeg and /dev/null differ diff --git a/assets/tsne_eg.jpeg b/assets/tsne_eg.jpeg deleted file mode 100644 index b07913c23..000000000 Binary files a/assets/tsne_eg.jpeg and /dev/null differ diff --git a/assets/tsne_preview.jpeg b/assets/tsne_preview.jpeg deleted file mode 100644 index 4d438ac36..000000000 Binary files a/assets/tsne_preview.jpeg and /dev/null differ diff --git a/assets/tsne_sentprepro.jpeg b/assets/tsne_sentprepro.jpeg deleted file mode 100644 index e8baedc0b..000000000 Binary files a/assets/tsne_sentprepro.jpeg and /dev/null differ diff --git a/assets/ulogme_mv1.jpeg b/assets/ulogme_mv1.jpeg deleted file mode 100644 index 5466c3ad7..000000000 Binary files a/assets/ulogme_mv1.jpeg and /dev/null differ diff 
--git a/assets/ulogme_mv2.jpeg b/assets/ulogme_mv2.jpeg deleted file mode 100644 index d75c241de..000000000 Binary files a/assets/ulogme_mv2.jpeg and /dev/null differ diff --git a/assets/ulogme_mv3.jpeg b/assets/ulogme_mv3.jpeg deleted file mode 100644 index e81cfbfeb..000000000 Binary files a/assets/ulogme_mv3.jpeg and /dev/null differ diff --git a/assets/ulogme_sv1.jpeg b/assets/ulogme_sv1.jpeg deleted file mode 100644 index 378f81885..000000000 Binary files a/assets/ulogme_sv1.jpeg and /dev/null differ diff --git a/assets/ulogme_sv2.jpeg b/assets/ulogme_sv2.jpeg deleted file mode 100644 index d1e39a8aa..000000000 Binary files a/assets/ulogme_sv2.jpeg and /dev/null differ diff --git a/assets/ulogme_sv3.jpeg b/assets/ulogme_sv3.jpeg deleted file mode 100644 index 8f1665ef4..000000000 Binary files a/assets/ulogme_sv3.jpeg and /dev/null differ diff --git a/assets/ulogme_sv4.jpeg b/assets/ulogme_sv4.jpeg deleted file mode 100644 index b9160f9d0..000000000 Binary files a/assets/ulogme_sv4.jpeg and /dev/null differ diff --git a/assets/ulogmeoverview.jpeg b/assets/ulogmeoverview.jpeg deleted file mode 100644 index 8a8657ad4..000000000 Binary files a/assets/ulogmeoverview.jpeg and /dev/null differ diff --git a/assets/zeilercnnfeatures.jpeg b/assets/zeilercnnfeatures.jpeg deleted file mode 100644 index 1b9bb6634..000000000 Binary files a/assets/zeilercnnfeatures.jpeg and /dev/null differ diff --git a/css/main.css b/css/main.css index 3943a4213..958962f7d 100644 --- a/css/main.css +++ b/css/main.css @@ -1,426 +1,235 @@ -/* Base */ -/* ----------------------------------------------------------*/ - -* { - margin: 0; - padding: 0; -} - -html, body { height: 100%; } - +/* General Styling for Vibrant Dark Theme */ body { - font-family: Helvetica, Arial, sans-serif; - font-size: 16px; - line-height: 1.5; - font-weight: 300; - background-color: #fdfdfd; + font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; + margin: 0; + padding: 0; + background-color: #0d1b2a; /* Navy 
Blue */ + color: #ffffff; + text-align: center; } -h1, h2, h3, h4, h5, h6 { font-size: 100%; font-weight: 400; } - -a { color: #2a7ae2; text-decoration: none; } -a:hover { color: #000; text-decoration: underline; } -a:visited { color: #205caa; } - -/* Utility */ - -.wrap:before, -.wrap:after { content:""; display:table; } -.wrap:after { clear: both; } -.wrap { - max-width: 800px; - padding: 0 30px; - margin: 0 auto; - zoom: 1; +/* Headings */ +h1, h2, h3 { + color: #00cccc; /* Cyan */ } - -/* Layout Styles */ -/* ----------------------------------------------------------*/ - -/* Site header */ - -.site-header { - border-top: 5px solid #333; - border-bottom: 1px solid #e8e8e8; - min-height: 56px; - background-color: white; +/* Buttons */ +button { + background-color: #009999; /* Teal */ + border: 2px solid #ffffff; + color: #ffffff; + padding: 12px 20px; + text-align: center; + font-size: 16px; + cursor: pointer; + border-radius: 8px; + transition: background-color 0.3s ease, color 0.3s ease; } -.site-title, -.site-title:hover, -.site-title:visited { - display: block; - color: #333; - font-size: 26px; - letter-spacing: -1px; - float: left; - line-height: 56px; - position: relative; - z-index: 1; +button:hover { + background-color: #ffddee; /* Almond Pink */ + color: #0d1b2a; + border-color: #0d1b2a; } -.site-nav { - float: right; - line-height: 56px; +/* Form Container */ +.form-container { + background-color: #112a38; /* Deep Navy */ + border-radius: 12px; + box-shadow: 0px 8px 16px rgba(0, 0, 0, 0.3); + padding: 30px; + max-width: 600px; + margin: 40px auto; + border: 1px solid #00cccc; } -.site-nav .menu-icon { display: none; } - -.site-nav .page-link { - margin-left: 20px; - color: #727272; - letter-spacing: -.5px; +/* Form Inputs */ +.form-container input[type="number"] { + width: 80%; + padding: 10px; + margin: 10px 0; + border: 1px solid #666; + border-radius: 6px; + font-size: 14px; + color: #ffffff; + background-color: #1c3a45; } -/* Site footer */ - 
-.site-footer { - border-top: 1px solid #e8e8e8; - padding: 30px 0; +.form-container input[type="number"]:focus { + border-color: #00cccc; + outline: none; } -.footer-heading { - font-size: 18px; - font-weight: 300; - letter-spacing: -.5px; - margin-bottom: 15px; +/* Tables */ +table { + width: 100%; + border-collapse: collapse; + margin-top: 20px; } -.site-footer .column { float: left; margin-bottom: 15px; } - -.footer-col-1 { - width: 270px; /*fallback*/ - width: -webkit-calc(35% - 10px); - width: -moz-calc(35% - 10px); - width: -o-calc(35% - 10px); - width: calc(35% - 10px); - margin-right: 10px -} -.footer-col-2 { - width: 175px; /*fallback*/ - width: -webkit-calc(23.125% - 10px); - width: -moz-calc(23.125% - 10px); - width: -o-calc(23.125% - 10px); - width: calc(23.125% - 10px); - margin-right: 10px +table, th, td { + border: 1px solid #00cccc; } -.footer-col-3 { - width: 335px; /*fallback*/ - width: -webkit-calc(41.875%); - width: -moz-calc(41.875%); - width: -o-calc(41.875%); - width: calc(41.875%); -} - -.site-footer ul { list-style: none; } -.site-footer li, -.site-footer p { - font-size: 15px; - letter-spacing: -.3px; - color: #828282; +th, td { + padding: 12px; + text-align: center; + font-size: 16px; } -.github-icon-svg, -.twitter-icon-svg { - display: inline-block; - width: 16px; - height: 16px; - position: relative; - top: 3px; +th { + background-color: #009999; + color: #ffffff; } +tr:nth-child(even) { + background-color: #1f3642; +} -/* Page Content styles */ -/* ----------------------------------------------------------*/ - -.page-content { - padding: 30px 0; - background-color: #fff; +tr:hover { + background-color: #224e56; } +/* Results Card */ +.results-card { + background-color: #1e2c3a; + border-left: 6px solid #ffcc99; /* Soft highlight */ + border-radius: 10px; + padding: 20px; + margin-top: 20px; + display: inline-block; + width: 90%; + text-align: left; +} -/* Home styles */ -/* ----------------------------------------------------------*/ 
+.results-card h3 { + color: #ffcc99; + margin-bottom: 10px; +} -.home h1 { margin-bottom: 25px; } +.results-card p { + color: #ffffff; + font-size: 15px; +} -.posts { list-style-type: none; } +/* Hero Section */ +.hero-section { + background: linear-gradient(to right, #009999, #004466); /* Teal to Navy */ + color: #fff; + padding: 80px 0; + text-align: center; + border-bottom: 4px solid #00cccc; +} -.posts li { margin-bottom: 30px; } +.hero-section h1 { + font-size: 3rem; + margin-bottom: 20px; +} -.posts .post-link { - font-size: 20px; - letter-spacing: -1px; - line-height: 1; +.hero-section p { + font-size: 1.2rem; + margin-top: 10px; } -.posts .post-date { - display: block; - font-size: 15px; - color: #818181; +.hero-section button { + background-color: #ffcc99; + color: #004466; + font-size: 18px; + padding: 14px 28px; + border-radius: 6px; } +.hero-section button:hover { + background-color: #ff99cc; + color: #000; +} -/* Post styles */ -/* ----------------------------------------------------------*/ +/* Input Section Design */ +.input-section { + padding: 40px 0; + background-color: #112a38; + margin-top: 30px; +} -.post-header { margin: 10px 0 30px; } +.input-section label { + font-size: 16px; + margin-bottom: 5px; + color: #ffcc99; +} -.post-header h1 { - font-size: 36px; - letter-spacing: -1.75px; - line-height: 1; - font-weight: 300; +.input-section input { + padding: 10px; + width: 80%; + margin-bottom: 20px; + font-size: 16px; + border-radius: 6px; + border: 1px solid #444; + outline: none; + color: #ffffff; + background-color: #1c3a45; } -.post-header .meta { - font-size: 15px; - color: #818181; - margin-top: 5px; +.input-section input:focus { + border-color: #00cccc; } -.post-content { margin: 0 0 30px; } +.input-section button { + background-color: #00cccc; + color: #fff; + font-size: 16px; + padding: 12px 20px; + border-radius: 6px; + cursor: pointer; +} -.post-content > * { margin: 20px 0; } +.input-section button:hover { + background-color: 
#ff99cc; + color: #0d1b2a; +} +/* Responsive Design */ +@media (max-width: 768px) { + .form-container { + padding: 20px; + margin: 20px; + } -.post-content h1, -.post-content h2, -.post-content h3, -.post-content h4, -.post-content h5, -.post-content h6 { - line-height: 1; - font-weight: 300; - margin: 40px 0 20px; + .input-section input, + .input-section button { + width: 100%; + } } -.post-content h2 { - font-size: 32px; - letter-spacing: -1.25px; +/* Header */ +header { + background-color: #002f34; + padding: 20px 0; + text-align: center; } -.post-content h3 { - font-size: 26px; - letter-spacing: -1px; +header .logo h1 { + font-size: 2.5em; + color: #ffcc99; } -.post-content h4 { - font-size: 20px; - letter-spacing: -1px; +header .logo .highlight { + color: #00cccc; } -.post-content blockquote { - border-left: 4px solid #e8e8e8; - padding-left: 20px; - font-size: 18px; - opacity: .6; - letter-spacing: -1px; - font-style: italic; - margin: 30px 0; +header .tagline { + font-size: 1.2em; + color: #aad4d3; } -.post-content ul, -.post-content ol { padding-left: 20px; } - -.post pre, -.post code { - border: 1px solid #d5d5e9; - background-color: #eef; - padding: 8px 12px; - -webkit-border-radius: 3px; - -moz-border-radius: 3px; - border-radius: 3px; - font-size: 15px; - overflow:auto; -} - -.post code { padding: 1px 5px; } - -.post ul, -.post ol { margin-left: 1.35em; } - -.post pre code { - border: 0; - padding-right: 0; - padding-left: 0; -} - -/* terminal */ -.post pre.terminal { - border: 1px solid #000; - background-color: #333; - color: #FFF; - -webkit-border-radius: 3px; - -moz-border-radius: 3px; - border-radius: 3px; -} - -.post pre.terminal code { background-color: #333; } - -/* AK custom CSS mods */ -.imgright { float: right; margin-left: 10px; } -.imgcap img { - border: 1px solid #999; - max-width: 100%; -} -.imgcap { - color: #555; - font-size: 14px; - text-align: center; -} - -.svgdiv { - width: 100%; - text-align: center; -} -/* Syntax highlighting 
styles */ -/* ----------------------------------------------------------*/ - -.highlight { background: #ffffff; } -.highlight .c { color: #999988; font-style: italic } /* Comment */ -.highlight .err { color: #a61717; background-color: #e3d2d2 } /* Error */ -.highlight .k { font-weight: bold } /* Keyword */ -.highlight .o { font-weight: bold } /* Operator */ -.highlight .cm { color: #999988; font-style: italic } /* Comment.Multiline */ -.highlight .cp { color: #999999; font-weight: bold } /* Comment.Preproc */ -.highlight .c1 { color: #999988; font-style: italic } /* Comment.Single */ -.highlight .cs { color: #999999; font-weight: bold; font-style: italic } /* Comment.Special */ -.highlight .gd { color: #000000; background-color: #ffdddd } /* Generic.Deleted */ -.highlight .gd .x { color: #000000; background-color: #ffaaaa } /* Generic.Deleted.Specific */ -.highlight .ge { font-style: italic } /* Generic.Emph */ -.highlight .gr { color: #aa0000 } /* Generic.Error */ -.highlight .gh { color: #999999 } /* Generic.Heading */ -.highlight .gi { color: #000000; background-color: #ddffdd } /* Generic.Inserted */ -.highlight .gi .x { color: #000000; background-color: #aaffaa } /* Generic.Inserted.Specific */ -.highlight .go { color: #888888 } /* Generic.Output */ -.highlight .gp { color: #555555 } /* Generic.Prompt */ -.highlight .gs { font-weight: bold } /* Generic.Strong */ -.highlight .gu { color: #aaaaaa } /* Generic.Subheading */ -.highlight .gt { color: #aa0000 } /* Generic.Traceback */ -.highlight .kc { font-weight: bold } /* Keyword.Constant */ -.highlight .kd { font-weight: bold } /* Keyword.Declaration */ -.highlight .kp { font-weight: bold } /* Keyword.Pseudo */ -.highlight .kr { font-weight: bold } /* Keyword.Reserved */ -.highlight .kt { color: #445588; font-weight: bold } /* Keyword.Type */ -.highlight .m { color: #009999 } /* Literal.Number */ -.highlight .s { color: #d14 } /* Literal.String */ -.highlight .na { color: #008080 } /* Name.Attribute */ 
-.highlight .nb { color: #0086B3 } /* Name.Builtin */ -.highlight .nc { color: #445588; font-weight: bold } /* Name.Class */ -.highlight .no { color: #008080 } /* Name.Constant */ -.highlight .ni { color: #800080 } /* Name.Entity */ -.highlight .ne { color: #990000; font-weight: bold } /* Name.Exception */ -.highlight .nf { color: #990000; font-weight: bold } /* Name.Function */ -.highlight .nn { color: #555555 } /* Name.Namespace */ -.highlight .nt { color: #000080 } /* Name.Tag */ -.highlight .nv { color: #008080 } /* Name.Variable */ -.highlight .ow { font-weight: bold } /* Operator.Word */ -.highlight .w { color: #bbbbbb } /* Text.Whitespace */ -.highlight .mf { color: #009999 } /* Literal.Number.Float */ -.highlight .mh { color: #009999 } /* Literal.Number.Hex */ -.highlight .mi { color: #009999 } /* Literal.Number.Integer */ -.highlight .mo { color: #009999 } /* Literal.Number.Oct */ -.highlight .sb { color: #d14 } /* Literal.String.Backtick */ -.highlight .sc { color: #d14 } /* Literal.String.Char */ -.highlight .sd { color: #d14 } /* Literal.String.Doc */ -.highlight .s2 { color: #d14 } /* Literal.String.Double */ -.highlight .se { color: #d14 } /* Literal.String.Escape */ -.highlight .sh { color: #d14 } /* Literal.String.Heredoc */ -.highlight .si { color: #d14 } /* Literal.String.Interpol */ -.highlight .sx { color: #d14 } /* Literal.String.Other */ -.highlight .sr { color: #009926 } /* Literal.String.Regex */ -.highlight .s1 { color: #d14 } /* Literal.String.Single */ -.highlight .ss { color: #990073 } /* Literal.String.Symbol */ -.highlight .bp { color: #999999 } /* Name.Builtin.Pseudo */ -.highlight .vc { color: #008080 } /* Name.Variable.Class */ -.highlight .vg { color: #008080 } /* Name.Variable.Global */ -.highlight .vi { color: #008080 } /* Name.Variable.Instance */ -.highlight .il { color: #009999 } /* Literal.Number.Integer.Long */ - - -/* media queries */ -/* ----------------------------------------------------------*/ - - -@media screen and 
(max-width: 750px) { - - .footer-col-1 { width: 50%; } - - .footer-col-2 { - width: 45%; /*fallback*/ - width: -webkit-calc(50% - 10px); - width: -moz-calc(50% - 10px); - width: -o-calc(50% - 10px); - width: calc(50% - 10px); - margin-right: 0; - } - - .site-footer .column.footer-col-3 { - width: auto; - float: none; - clear: both; - } - -} - -@media screen and (max-width: 600px) { - - .wrap { padding: 0 12px; } - - .site-nav { - position: fixed; - z-index: 10; - top: 14px; right: 8px; - background-color: white; - -webkit-border-radius: 5px; - -moz-border-radius: 5px; - border-radius: 5px; - border: 1px solid #e8e8e8; - } - - .site-nav .menu-icon { - display: block; - font-size: 24px; - color: #505050; - float: right; - width: 36px; +/* Footer */ +footer { + background-color: #002f34; + color: #aad4d3; + padding: 10px 0; text-align: center; - line-height: 36px; - } - - .site-nav .menu-icon svg { width: 18px; height: 16px; } - - .site-nav .trigger { - clear: both; - margin-bottom: 5px; - display: none; - } - - .site-nav:hover .trigger { display: block; } - - .site-nav .page-link { - display: block; - text-align: right; - line-height: 1.25; - padding: 5px 10px; - margin: 0; - } - - .post-header h1 { font-size: 36px; } - .post-content h2 { font-size: 28px; } - .post-content h3 { font-size: 22px; } - .post-content h4 { font-size: 18px; } - .post-content blockquote { padding-left: 10px; } - .post-content ul, - .post-content ol { padding-left: 10px; } - - .site-footer .column { - float: none; - clear: both; - width: auto; - margin: 0 0 15px; } + margin-top: 20px; +} +footer .highlight { + color: #ffddee; } diff --git a/feed.xml b/feed.xml deleted file mode 100644 index 4d7f8a4c1..000000000 --- a/feed.xml +++ /dev/null @@ -1,30 +0,0 @@ ---- -layout: none ---- - - - - {{ site.title | xml_escape }} - {{ site.description | xml_escape }} - {{ site.url }}{{ site.baseurl }}/ - - {{ site.time | date_to_rfc822 }} - {{ site.time | date_to_rfc822 }} - Jekyll v{{ jekyll.version }} - 
{% for post in site.posts limit:10 %} - - {{ post.title | xml_escape }} - {{ post.content | xml_escape }} - {{ post.date | date_to_rfc822 }} - {{ post.url | prepend: site.baseurl | prepend: site.url }} - {{ post.url | prepend: site.baseurl | prepend: site.url }} - {% for tag in post.tags %} - {{ tag | xml_escape }} - {% endfor %} - {% for cat in post.categories %} - {{ cat | xml_escape }} - {% endfor %} - - {% endfor %} - - diff --git a/nntutorial.md b/nntutorial.md deleted file mode 100644 index 539689719..000000000 --- a/nntutorial.md +++ /dev/null @@ -1,1249 +0,0 @@ ---- -layout: page -mathjax: true -comments: true -title: Hacker's guide to Neural Networks -permalink: /neuralnets/ ---- - -**Note: this is now a very old tutorial that I'm leaving up, but I don't believe should be referenced or used. Better materials include CS231n course lectures, slides, and notes, or the Deep Learning book**. - -Hi there, I'm a [CS PhD student at Stanford](http://cs.stanford.edu/people/karpathy/). I've worked on Deep Learning for a few years as part of my research and among several of my related pet projects is [ConvNetJS](http://convnetjs.com) - a Javascript library for training Neural Networks. Javascript allows one to nicely visualize what's going on and to play around with the various hyperparameter settings, but I still regularly hear from people who ask for a more thorough treatment of the topic. This article (which I plan to slowly expand out to lengths of a few book chapters) is my humble attempt. It's on web instead of PDF because all books should be, and eventually it will hopefully include animations/demos etc. - -My personal experience with Neural Networks is that everything became much clearer when I started ignoring full-page, dense derivations of backpropagation equations and just started writing code. Thus, this tutorial will contain **very little math** (I don't believe it is necessary and it can sometimes even obfuscate simple concepts). 
Since my background is in Computer Science and Physics, I will instead develop the topic from what I refer to as **hackers's perspective**. My exposition will center around code and physical intuitions instead of mathematical derivations. Basically, I will strive to present the algorithms in a way that I wish I had come across when I was starting out. - -> "...everything became much clearer when I started writing code." - -You might be eager to jump right in and learn about Neural Networks, backpropagation, how they can be applied to datasets in practice, etc. But before we get there, I'd like us to first forget about all that. Let's take a step back and understand what is really going on at the core. Lets first talk about real-valued circuits. - -*Update note*: I suspended my work on this guide a while ago and redirected a lot of my energy to teaching CS231n (Convolutional Neural Networks) class at Stanford. The notes are on [cs231.github.io](http://cs231n.github.io) and the course slides can be found [here](http://cs231n.stanford.edu/syllabus.html). These materials are highly related to material here, but more comprehensive and sometimes more polished. - -## Chapter 1: Real-valued Circuits - -In my opinion, the best way to think of Neural Networks is as real-valued circuits, where real values (instead of boolean values `{0,1}`) "flow" along edges and interact in gates. However, instead of gates such as `AND`, `OR`, `NOT`, etc, we have binary gates such as `*` (multiply), `+` (add), `max` or unary gates such as `exp`, etc. Unlike ordinary boolean circuits, however, we will eventually also have **gradients** flowing on the same edges of the circuit, but in the opposite direction. But we're getting ahead of ourselves. Let's focus and start out simple. - -### Base Case: Single Gate in the Circuit -Lets first consider a single, simple circuit with one gate. Here's an example: - -
    - - - - - x - y - - * - - -
    - -The circuit takes two real-valued inputs `x` and `y` and computes `x * y` with the `*` gate. Javascript version of this would very simply look something like this: - -```javascript -var forwardMultiplyGate = function(x, y) { - return x * y; -}; -forwardMultiplyGate(-2, 3); // returns -6. Exciting. -``` - -And in math form we can think of this gate as implementing the real-valued function: - -$$ -f(x,y) = x y -$$ - -As with this example, all of our gates will take one or two inputs and produce a **single** output value. - -#### The Goal - -The problem we are interested in studying looks as follows: - -1. We provide a given circuit some specific input values (e.g. `x = -2`, `y = 3`) -2. The circuit computes an output value (e.g. `-6`) -3. The core question then becomes: *How should one tweak the input slightly to increase the output?* - -In this case, in what direction should we change `x,y` to get a number larger than `-6`? Note that, for example, `x = -1.99` and `y = 2.99` gives `x * y = -5.95`, which is higher than `-6.0`. Don't get confused by this: `-5.95` is better (higher) than `-6.0`. It's an improvement of `0.05`, even though the *magnitude* of `-5.95` (the distance from zero) happens to be lower. - -#### Strategy #1: Random Local Search - -Okay. So wait, we have a circuit, we have some inputs and we just want to tweak them slightly to increase the output value? Why is this hard? We can easily "forward" the circuit to compute the output for any given `x` and `y`. So isn't this trivial? 
Why don't we tweak `x` and `y` randomly and keep track of the tweak that works best: - -```javascript -// circuit with single gate for now -var forwardMultiplyGate = function(x, y) { return x * y; }; -var x = -2, y = 3; // some input values - -// try changing x,y randomly small amounts and keep track of what works best -var tweak_amount = 0.01; -var best_out = -Infinity; -var best_x = x, best_y = y; -for(var k = 0; k < 100; k++) { - var x_try = x + tweak_amount * (Math.random() * 2 - 1); // tweak x a bit - var y_try = y + tweak_amount * (Math.random() * 2 - 1); // tweak y a bit - var out = forwardMultiplyGate(x_try, y_try); - if(out > best_out) { - // best improvement yet! Keep track of the x and y - best_out = out; - best_x = x_try, best_y = y_try; - } -} -``` - -When I run this, I get `best_x = -1.9928`, `best_y = 2.9901`, and `best_out = -5.9588`. Again, `-5.9588` is higher than `-6.0`. So, we're done, right? Not quite: This is a perfectly fine strategy for tiny problems with a few gates if you can afford the compute time, but it won't do if we want to eventually consider huge circuits with millions of inputs. It turns out that we can do much better. - -#### Stategy #2: Numerical Gradient - -Here's a better way. Remember again that in our setup we are given a circuit (e.g. our circuit with a single `*` gate) and some particular input (e.g. `x = -2, y = 3`). The gate computes the output (`-6`) and now we'd like to tweak `x` and `y` to make the output higher. - -A nice intuition for what we're about to do is as follows: Imagine taking the output value that comes out from the circuit and tugging on it in the positive direction. This positive tension will in turn translate through the gate and induce forces on the inputs `x` and `y`. Forces that tell us how `x` and `y` should change to increase the output value. - -What might those forces look like in our specific example? 
Thinking through it, we can intuit that the force on `x` should also be positive, because making `x` slightly larger improves the circuit's output. For example, increasing `x` from `x = -2` to `x = -1` would give us output `-3` - much larger than `-6`. On the other hand, we'd expect a negative force induced on `y` that pushes it to become lower (since a lower `y`, such as `y = 2`, down from the original `y = 3` would make output higher: `2 x -2 = -4`, again, larger than `-6`). That's the intuition to keep in mind, anyway. As we go through this, it will turn out that forces I'm describing will in fact turn out to be the **derivative** of the output value with respect to its inputs (`x` and `y`). You may have heard this term before. - -> The derivative can be thought of as a force on each input as we pull on the output to become higher. - -So how do we exactly evaluate this force (derivative)? It turns out that there is a very simple procedure for this. We will work backwards: Instead of pulling on the circuit's output, we'll iterate over every input one by one, increase it very slightly and look at what happens to the output value. The amount the output changes in response is the derivative. Enough intuitions for now. Lets look at the mathematical definition. We can write down the derivative for our function with respect to the inputs. For example, the derivative with respect to `x` can be computed as: - -
    -$$ -\frac{\partial f(x,y)}{\partial x} = \frac{f(x+h,y) - f(x,y)}{h} -$$ -
    - -Where \\( h \\) is small - it's the tweak amount. Also, if you're not very familiar with calculus it is important to note that in the left-hand side of the equation above, the horizontal line does *not* indicate division. The entire symbol \\( \frac{\partial f(x,y)}{\partial x} \\) is a single thing: the derivative of the function \\( f(x,y) \\) with respect to \\( x \\). The horizontal line on the right *is* division. I know it's confusing but it's standard notation. Anyway, I hope it doesn't look too scary because it isn't: The circuit was giving some initial output \\( f(x,y) \\), and then we changed one of the inputs by a tiny amount \\(h \\) and read the new output \\( f(x+h, y) \\). Subtracting those two quantities tells us the change, and the division by \\(h \\) just normalizes this change by the (arbitrary) tweak amount we used. In other words it's expressing exactly what I described above and translates directly to this code: - -```javascript -var x = -2, y = 3; -var out = forwardMultiplyGate(x, y); // -6 -var h = 0.0001; - -// compute derivative with respect to x -var xph = x + h; // -1.9999 -var out2 = forwardMultiplyGate(xph, y); // -5.9997 -var x_derivative = (out2 - out) / h; // 3.0 - -// compute derivative with respect to y -var yph = y + h; // 3.0001 -var out3 = forwardMultiplyGate(x, yph); // -6.0002 -var y_derivative = (out3 - out) / h; // -2.0 -``` - -Lets walk through `x` for example. We turned the knob from `x` to `x + h` and the circuit responded by giving a higher value (note again that yes, `-5.9997` is *higher* than `-6`: `-5.9997 > -6`). The division by `h` is there to normalize the circuit's response by the (arbitrary) value of `h` we chose to use here. Technically, you want the value of `h` to be infinitesimal (the precise mathematical definition of the gradient is defined as the limit of the expression as `h` goes to zero), but in practice `h=0.00001` or so works fine in most cases to get a good approximation. 
Now, we see that the derivative w.r.t. `x` is `+3`. I'm making the positive sign explicit, because it indicates that the circuit is tugging on x to become higher. The actual value, `3` can be interpreted as the *force* of that tug. - -> The derivative with respect to some input can be computed by tweaking that input by a small amount and observing the change on the output value. - -By the way, we usually talk about the *derivative* with respect to a single input, or about a **gradient** with respect to all the inputs. The gradient is just made up of the derivatives of all the inputs concatenated in a vector (i.e. a list). Crucially, notice that if we let the inputs respond to the tug by following the gradient a tiny amount (i.e. we just add the derivative on top of every input), we can see that the value increases, as expected: - -```javascript -var step_size = 0.01; -var out = forwardMultiplyGate(x, y); // before: -6 -x = x + step_size * x_derivative; // x becomes -1.97 -y = y + step_size * y_derivative; // y becomes 2.98 -var out_new = forwardMultiplyGate(x, y); // -5.87! exciting. -``` - -As expected, we changed the inputs by the gradient and the circuit now gives a slightly higher value (`-5.87 > -6.0`). That was much simpler than trying random changes to `x` and `y`, right? A fact to appreciate here is that if you take calculus you can prove that the gradient is, in fact, the direction of the steepest increase of the function. There is no need to monkey around trying out random pertubations as done in Strategy #1. Evaluating the gradient requires just three evaluations of the forward pass of our circuit instead of hundreds, and gives the best tug you can hope for (locally) if you are interested in increasing the value of the output. - -**Bigger step is not always better.** Let me clarify on this point a bit. It is important to note that in this very simple example, using a bigger `step_size` than 0.01 will always work better. 
For example, `step_size = 1.0` gives output `-1` (higer, better!), and indeed infinite step size would give infinitely good results. The crucial thing to realize is that once our circuits get much more complex (e.g. entire neural networks), the function from inputs to the output value will be more chaotic and wiggly. The gradient guarantees that if you have a very small (indeed, infinitesimally small) step size, then you will definitely get a higher number when you follow its direction, and for that infinitesimally small step size there is no other direction that would have worked better. But if you use a bigger step size (e.g. `step_size = 0.01`) all bets are off. The reason we can get away with a larger step size than infinitesimally small is that our functions are usually relatively smooth. But really, we're crossing our fingers and hoping for the best. - -**Hill-climbing analogy.** One analogy I've heard before is that the output value of our circut is like the height of a hill, and we are blindfolded and trying to climb upwards. We can sense the steepness of the hill at our feet (the gradient), so when we shuffle our feet a bit we will go upwards. But if we took a big, overconfident step, we could have stepped right into a hole. - -Great, I hope I've convinced you that the numerical gradient is indeed a very useful thing to evaluate, and that it is cheap. But. It turns out that we can do *even* better. - -#### Strategy #3: Analytic Gradient - -In the previous section we evaluated the gradient by probing the circuit's output value, independently for every input. This procedure gives you what we call a **numerical gradient**. This approach, however, is *still* expensive because we need to compute the circuit's output as we tweak every input value independently a small amount. So the complexity of evaluating the gradient is linear in number of inputs. 
But in practice we will have hundreds, thousands or (for neural networks) even tens to hundreds of millions of inputs, and the circuits aren't just one multiply gate but huge expressions that can be expensive to compute. We need something better. - -Luckily, there is an easier and *much* faster way to compute the gradient: we can use calculus to derive a direct expression for it that will be as simple to evaluate as the circuit's output value. We call this an **analytic gradient** and there will be no need for tweaking anything. You may have seen other people who teach Neural Networks derive the gradient in huge and, frankly, scary and confusing mathematical equations (if you're not well-versed in maths). But it's unnecessary. I've written plenty of Neural Nets code and I rarely have to do mathematical derivation longer than two lines, and 95% of the time it can be done without writing anything at all. That is because we will only ever derive the gradient for very small and simple expressions (think of it as the **base case**) and then I will show you how we can compose these very simply with **chain rule** to evaluate the full gradient (think inductive/recursive case). - -> The analytic derivative requires no tweaking of the inputs. It can be derived using mathematics (calculus). - -If you remember your product rules, power rules, quotient rules, etc. (see e.g. [derivative rules](http://www.mathsisfun.com/calculus/derivatives-rules.html) or [wiki page](http://en.wikipedia.org/wiki/Differentiation_rules)), it's very easy to write down the derivative with respect to both `x` and `y` for a small expression such as `x * y`. But suppose you don't remember your calculus rules. We can go back to the definition. For example, here's the expression for the derivative w.r.t `x`: - -
    -$$ -\frac{\partial f(x,y)}{\partial x} = \frac{f(x+h,y) - f(x,y)}{h} -$$ -
    - -(Technically I'm not writing the limit as `h` goes to zero, forgive me math people). Okay and lets plug in our function ( \\( f(x,y) = x y \\) ) into the expression. Ready for the hardest piece of math of this entire article? Here we go: - -
    -$$ -\frac{\partial f(x,y)}{\partial x} = \frac{f(x+h,y) - f(x,y)}{h} -= \frac{(x+h)y - xy}{h} -= \frac{xy + hy - xy}{h} -= \frac{hy}{h} -= y -$$ -
That's interesting. The derivative with respect to `x` is just equal to `y`. Did you notice the coincidence in the previous section? We tweaked `x` to `x+h` and calculated `x_derivative = 3.0`, which exactly happens to be the value of `y` in that example. It turns out that wasn't a coincidence at all because that's just what the analytic gradient tells us the `x` derivative should be for `f(x,y) = x * y`. The derivative with respect to `y`, by the way, turns out to be `x`, unsurprisingly by symmetry. So there is no need for any tweaking! We invoked powerful mathematics and can now transform our derivative calculation into the following code: - -```javascript -var x = -2, y = 3; -var out = forwardMultiplyGate(x, y); // before: -6 -var x_gradient = y; // by our complex mathematical derivation above -var y_gradient = x; - -var step_size = 0.01; -x += step_size * x_gradient; // -1.97 -y += step_size * y_gradient; // 2.98 -var out_new = forwardMultiplyGate(x, y); // -5.87. Higher output! Nice. -``` - -To compute the gradient we went from forwarding the circuit hundreds of times (Strategy #1) to forwarding it only on order of number of times twice the number of inputs (Strategy #2), to forwarding it a single time! And it gets EVEN better, since the more expensive strategies (#1 and #2) only give an approximation of the gradient, while #3 (the fastest one by far) gives you the *exact* gradient. No approximations. The only downside is that you should be comfortable with some calculus 101. - -Lets recap what we have learned: - -- INPUT: We are given a circuit, some inputs and compute an output value. -- OUTPUT: We are then interested in finding small changes to each input (independently) that would make the output higher. -- Strategy #1: One silly way is to **randomly search** for small perturbations of the inputs and keep track of what gives the highest increase in output. -- Strategy #2: We saw we can do much better by computing the gradient. 
Regardless of how complicated the circuit is, the **numerical gradient** is very simple (but relatively expensive) to compute. We compute it by *probing* the circuit's output value as we tweak the inputs one at a time. -- Strategy #3: In the end, we saw that we can be even more clever and analytically derive a direct expression to get the **analytic gradient**. It is identical to the numerical gradient, it is fastest by far, and there is no need for any tweaking. - -In practice by the way (and we will get to this once again later), all Neural Network libraries always compute the analytic gradient, but the correctness of the implementation is verified by comparing it to the numerical gradient. That's because the numerical gradient is very easy to evaluate (but can be a bit expensive to compute), while the analytic gradient can contain bugs at times, but is usually extremely efficient to compute. As we will see, evaluating the gradient (i.e. while doing *backprop*, or *backward pass*) will turn out to cost about as much as evaluating the *forward pass*. - -### Recursive Case: Circuits with Multiple Gates - -But hold on, you say: *"The analytic gradient was trivial to derive for your super-simple expression. This is useless. What do I do when the expressions are much larger? Don't the equations get huge and complex very fast?"*. Good question. Yes the expressions get much more complex. No, this doesn't make it much harder. As we will see, every gate will be hanging out by itself, completely unaware of any details of the huge and complex circuit that it could be part of. It will only worry about its inputs and it will compute its local derivatives as seen in the previous section, except now there will be a single extra multiplication it will have to do. - -> A single extra multiplication will turn a single (useless gate) into a cog in the complex machine that is an entire neural network. - -I should stop hyping it up now. I hope I've piqued your interest! 
Lets drill down into details and get two gates involved with this next example: - -
    - - - - - x - y - z - - + - q - - - - - - - - - - - * - f - -
The expression we are computing now is \\( f(x,y,z) = (x + y) z \\). Lets structure the code as follows to make the gates explicit as functions: - -```javascript -var forwardMultiplyGate = function(a, b) { - return a * b; -}; -var forwardAddGate = function(a, b) { - return a + b; -}; -var forwardCircuit = function(x,y,z) { - var q = forwardAddGate(x, y); - var f = forwardMultiplyGate(q, z); - return f; -}; - -var x = -2, y = 5, z = -4; -var f = forwardCircuit(x, y, z); // output is -12 -``` - -In the above, I am using `a` and `b` as the local variables in the gate functions so that we don't get these confused with our circuit inputs `x,y,z`. As before, we are interested in finding the derivatives with respect to the three inputs `x,y,z`. But how do we compute it now that there are multiple gates involved? First, lets pretend that the `+` gate is not there and that we only have two variables in the circuit: `q,z` and a single `*` gate. Note that `q` is the output of the `+` gate. If we don't worry about `x` and `y` but only about `q` and `z`, then we are back to having only a single gate, and as far as that single `*` gate is concerned, we know what the (analytic) derivatives are from previous section. We can write them down (except here we're replacing `x,y` with `q,z`): - -$$ -f(q,z) = q z \hspace{0.5in} \implies \hspace{0.5in} \frac{\partial f(q,z)}{\partial q} = z, \hspace{1in} \frac{\partial f(q,z)}{\partial z} = q -$$ - -Simple enough: these are the expressions for the gradient with respect to `q` and `z`. But wait, we don't want gradient with respect to `q`, but with respect to the inputs: `x` and `y`. Luckily, `q` is computed as a function of `x` and `y` (by addition in our example). 
We can write down the gradient for the addition gate as well, it's even simpler: - -$$ -q(x,y) = x + y \hspace{0.5in} \implies \hspace{0.5in} \frac{\partial q(x,y)}{\partial x} = 1, \hspace{1in} \frac{\partial q(x,y)}{\partial y} = 1 -$$ - -That's right, the derivatives are just 1, regardless of the actual values of `x` and `y`. If you think about it, this makes sense because to make the output of a single addition gate higher, we expect a positive tug on both `x` and `y`, regardless of their values. - -#### Backpropagation - -We are finally ready to invoke the **Chain Rule**: We know how to compute the gradient of `q` with respect to `x` and `y` (that's a single gate case with `+` as the gate). And we know how to compute the gradient of our final output with respect to `q`. The chain rule tells us how to combine these to get the gradient of the final output with respect to `x` and `y`, which is what we're ultimately interested in. Best of all, the chain rule very simply states that the right thing to do is to simply multiply the gradients together to chain them. For example, the final derivative for `x` will be: - -$$ -\frac{\partial f(q,z)}{\partial x} = \frac{\partial q(x,y)}{\partial x} \frac{\partial f(q,z)}{\partial q} -$$ - -There are many symbols there so maybe this is confusing again, but it's really just two numbers being multiplied together. 
Here is the code: - -```javascript -// initial conditions -var x = -2, y = 5, z = -4; -var q = forwardAddGate(x, y); // q is 3 -var f = forwardMultiplyGate(q, z); // output is -12 - -// gradient of the MULTIPLY gate with respect to its inputs -// wrt is short for "with respect to" -var derivative_f_wrt_z = q; // 3 -var derivative_f_wrt_q = z; // -4 - -// derivative of the ADD gate with respect to its inputs -var derivative_q_wrt_x = 1.0; -var derivative_q_wrt_y = 1.0; - -// chain rule -var derivative_f_wrt_x = derivative_q_wrt_x * derivative_f_wrt_q; // -4 -var derivative_f_wrt_y = derivative_q_wrt_y * derivative_f_wrt_q; // -4 -``` - -That's it. We computed the gradient (the forces) and now we can let our inputs respond to it by a bit. Lets add the gradients on top of the inputs. The output value of the circuit better increase, up from -12! - -```javascript -// final gradient, from above: [-4, -4, 3] -var gradient_f_wrt_xyz = [derivative_f_wrt_x, derivative_f_wrt_y, derivative_f_wrt_z] - -// let the inputs respond to the force/tug: -var step_size = 0.01; -x = x + step_size * derivative_f_wrt_x; // -2.04 -y = y + step_size * derivative_f_wrt_y; // 4.96 -z = z + step_size * derivative_f_wrt_z; // -3.97 - -// Our circuit now better give higher output: -var q = forwardAddGate(x, y); // q becomes 2.92 -var f = forwardMultiplyGate(q, z); // output is -11.59, up from -12! Nice! - -``` - -Looks like that worked! Lets now try to interpret intuitively what just happened. The circuit wants to output higher values. The last gate saw inputs `q = 3, z = -4` and computed output `-12`. "Pulling" upwards on this output value induced a force on both `q` and `z`: To increase the output value, the circuit "wants" `z` to increase, as can be seen by the positive value of the derivative(`derivative_f_wrt_z = +3`). Again, the size of this derivative can be interpreted as the magnitude of the force. 
On the other hand, `q` felt a stronger and downward force, since `derivative_f_wrt_q = -4`. In other words the circuit wants `q` to decrease, with a force of `4`. - -Now we get to the second, `+` gate which outputs `q`. By default, the `+` gate computes its derivatives which tells us how to change `x` and `y` to make `q` higher. BUT! Here is the **crucial point**: the gradient on `q` was computed as negative (`derivative_f_wrt_q = -4`), so the circuit wants `q` to *decrease*, and with a force of `4`! So if the `+` gate wants to contribute to making the final output value larger, it needs to listen to the gradient signal coming from the top. In this particular case, it needs to apply tugs on `x,y` opposite of what it would normally apply, and with a force of `4`, so to speak. The multiplication by `-4` seen in the chain rule achieves exactly this: instead of applying a positive force of `+1` on both `x` and `y` (the local derivative), the full circuit's gradient on both `x` and `y` becomes `1 x -4 = -4`. This makes sense: the circuit wants both `x` and `y` to get smaller because this will make `q` smaller, which in turn will make `f` larger. - -> If this makes sense, you understand backpropagation. - -Lets **recap** once again what we learned: - -- In the previous chapter we saw that in the case of a single gate (or a single expression), we can derive the analytic gradient using simple calculus. We interpreted the gradient as a force, or a tug on the inputs that pulls them in a direction which would make this gate's output higher. - -- In case of multiple gates everything stays pretty much the same way: every gate is hanging out by itself completely unaware of the circuit it is embedded in. Some inputs come in and the gate computes its output and the derivative with respect to the inputs. The *only* difference now is that suddenly, something can pull on this gate from above. 
That's the gradient of the final circuit output value with respect to the output this gate computed. It is the circuit asking the gate to output higher or lower numbers, and with some force. The gate simply takes this force and multiplies it to all the forces it computed for its inputs before (chain rule). This has the desired effect: - -1. If a gate experiences a strong positive pull from above, it will also pull harder on its own inputs, scaled by the force it is experiencing from above -2. And if it experiences a negative tug, this means that circuit wants its value to decrease not increase, so it will flip the force of the pull on its inputs to make its own output value smaller. - -> A nice picture to have in mind is that as we pull on the circuit's output value at the end, this induces pulls downward through the entire circuit, all the way down to the inputs. - -Isn't it beautiful? The only difference between the case of a single gate and multiple interacting gates that compute arbitrarily complex expressions is this additional multiply operation that now happens in each gate. - -#### Patterns in the "backward" flow - -Lets look again at our example circuit with the numbers filled in. The first circuit shows the raw values, and the second circuit shows the gradients that flow back to the inputs as discussed. Notice that the gradient always starts off with `+1` at the end to start off the chain. This is the (default) pull on the circuit to have its value increased. - -
    - - - (Values) - - - - -2 - 5 - -4 - - + - 3 - - - - - - - - - - - * - -12 - - (Gradients) - - - - -4 - -4 - 3 - - + - -4 - - - - - - - - - - - * - 1 - - -
    - -After a while you start to notice patterns in how the gradients flow backward in the circuits. For example, the `+` gate always takes the gradient on top and simply passes it on to all of its inputs (notice the example with -4 simply passed on to both of the inputs of `+` gate). This is because its own derivative for the inputs is just `+1`, regardless of what the actual values of the inputs are, so in the chain rule, the gradient from above is just multiplied by 1 and stays the same. Similar intuitions apply to, for example, a `max(x,y)` gate. Since the gradient of `max(x,y)` with respect to its input is `+1` for whichever one of `x`, `y` is larger and `0` for the other, this gate is during backprop effectively just a gradient "switch": it will take the gradient from above and "route" it to the input that had a higher value during the forward pass. - -**Numerical Gradient Check.** Before we finish with this section, lets just make sure that the (analytic) gradient we computed by backprop above is correct as a sanity check. Remember that we can do this simply by computing the numerical gradient and making sure that we get `[-4, -4, 3]` for `x,y,z`. Here's the code: - -```javascript -// initial conditions -var x = -2, y = 5, z = -4; - -// numerical gradient check -var h = 0.0001; -var x_derivative = (forwardCircuit(x+h,y,z) - forwardCircuit(x,y,z)) / h; // -4 -var y_derivative = (forwardCircuit(x,y+h,z) - forwardCircuit(x,y,z)) / h; // -4 -var z_derivative = (forwardCircuit(x,y,z+h) - forwardCircuit(x,y,z)) / h; // 3 -``` - -and we get `[-4, -4, 3]`, as computed with backprop. phew! :) - -### Example: Single Neuron - -In the previous section you hopefully got the basic intuition behind backpropagation. Lets now look at an even more complicated and borderline practical example. 
We will consider a 2-dimensional neuron that computes the following function: - -$$ -f(x,y,a,b,c) = \sigma(ax + by + c) -$$ - -In this expression, \\( \sigma \\) is the *sigmoid* function. It's best thought of as a "squashing function", because it takes the input and squashes it to be between zero and one: Very negative values are squashed towards zero and positive values get squashed towards one. For example, we have `sig(-5) = 0.006, sig(0) = 0.5, sig(5) = 0.993`. Sigmoid function is defined as: - -$$ -\sigma(x) = \frac{1}{1 + e^{-x}} -$$ - -The gradient with respect to its single input, as you can check on Wikipedia or derive yourself if you know some calculus is given by this expression: - -$$ -\frac{\partial \sigma(x)}{\partial x} = \sigma(x) (1 - \sigma(x)) -$$ - -For example, if the input to the sigmoid gate is `x = 3`, the gate will compute output `f = 1.0 / (1.0 + Math.exp(-x)) = 0.95`, and then the (local) gradient on its input will simply be `dx = (0.95) * (1 - 0.95) = 0.0475`. - -That's all we need to use this gate: we know how to take an input and *forward* it through the sigmoid gate, and we also have the expression for the gradient with respect to its input, so we can also *backprop* through it. Another thing to note is that technically, the sigmoid function is made up of an entire series of gates in a line that compute more *atomic* functions: an exponentiation gate, an addition gate and a division gate. Treating it so would work perfectly fine but for this example I chose to collapse all of these gates into a single gate that just computes sigmoid in one shot, because the gradient expression turns out to be simple. - -Lets take this opportunity to carefully structure the associated code in a nice and modular way. First, I'd like you to note that every **wire** in our diagrams has two numbers associated with it: - -1. the value it carries during the forward pass -2. 
the gradient (i.e the *pull*) that flows back through it in the backward pass - -Lets create a simple `Unit` structure that will store these two values on every wire. Our gates will now operate over `Unit`s: they will take them as inputs and create them as outputs. - -```javascript -// every Unit corresponds to a wire in the diagrams -var Unit = function(value, grad) { - // value computed in the forward pass - this.value = value; - // the derivative of circuit output w.r.t this unit, computed in backward pass - this.grad = grad; -} -``` - -In addition to Units we also need 3 gates: `+`, `*` and `sig` (sigmoid). Lets start out by implementing a multiply gate. I'm using Javascript here which has a funny way of simulating classes using functions. If you're not a Javascript - familiar person, all that's going on here is that I'm defining a class that has certain properties (accessed with use of `this` keyword), and some methods (which in Javascript are placed into the function's *prototype*). Just think about these as class methods. Also keep in mind that the way we will use these eventually is that we will first `forward` all the gates one by one, and then `backward` all the gates in reverse order. Here is the implementation: - -```javascript - -var multiplyGate = function(){ }; -multiplyGate.prototype = { - forward: function(u0, u1) { - // store pointers to input Units u0 and u1 and output unit utop - this.u0 = u0; - this.u1 = u1; - this.utop = new Unit(u0.value * u1.value, 0.0); - return this.utop; - }, - backward: function() { - // take the gradient in output unit and chain it with the - // local gradients, which we derived for multiply gate before - // then write those gradients to those Units. - this.u0.grad += this.u1.value * this.utop.grad; - this.u1.grad += this.u0.value * this.utop.grad; - } -} -``` - -The multiply gate takes two units that each hold a value and creates a unit that stores its output. The gradient is initialized to zero. 
Then notice that in the `backward` function call we get the gradient from the output unit we produced during the forward pass (which will by now hopefully have its gradient filled in) and multiply it with the local gradient for this gate (chain rule!). This gate computes multiplication (`u0.value * u1.value`) during forward pass, so recall that the gradient w.r.t `u0` is `u1.value` and w.r.t `u1` is `u0.value`. Also note that we are using `+=` to add onto the gradient in the `backward` function. This will allow us to possibly use the output of one gate multiple times (think of it as a wire branching out), since it turns out that the gradients from these different branches just add up when computing the final gradient with respect to the circuit output. The other two gates are defined analogously: - -```javascript -var addGate = function(){ }; -addGate.prototype = { - forward: function(u0, u1) { - this.u0 = u0; - this.u1 = u1; // store pointers to input units - this.utop = new Unit(u0.value + u1.value, 0.0); - return this.utop; - }, - backward: function() { - // add gate. derivative wrt both inputs is 1 - this.u0.grad += 1 * this.utop.grad; - this.u1.grad += 1 * this.utop.grad; - } -} -``` - -```javascript -var sigmoidGate = function() { - // helper function - this.sig = function(x) { return 1 / (1 + Math.exp(-x)); }; -}; -sigmoidGate.prototype = { - forward: function(u0) { - this.u0 = u0; - this.utop = new Unit(this.sig(this.u0.value), 0.0); - return this.utop; - }, - backward: function() { - var s = this.sig(this.u0.value); - this.u0.grad += (s * (1 - s)) * this.utop.grad; - } -} -``` - -Note that, again, the `backward` function in all cases just computes the local derivative with respect to its input and then multiplies on the gradient from the unit above (i.e. chain rule). 
To fully specify everything lets finally write out the forward and backward flow for our 2-dimensional neuron with some example values: - -```javascript -// create input units -var a = new Unit(1.0, 0.0); -var b = new Unit(2.0, 0.0); -var c = new Unit(-3.0, 0.0); -var x = new Unit(-1.0, 0.0); -var y = new Unit(3.0, 0.0); - -// create the gates -var mulg0 = new multiplyGate(); -var mulg1 = new multiplyGate(); -var addg0 = new addGate(); -var addg1 = new addGate(); -var sg0 = new sigmoidGate(); - -// do the forward pass -var forwardNeuron = function() { - ax = mulg0.forward(a, x); // a*x = -1 - by = mulg1.forward(b, y); // b*y = 6 - axpby = addg0.forward(ax, by); // a*x + b*y = 5 - axpbypc = addg1.forward(axpby, c); // a*x + b*y + c = 2 - s = sg0.forward(axpbypc); // sig(a*x + b*y + c) = 0.8808 -}; -forwardNeuron(); - -console.log('circuit output: ' + s.value); // prints 0.8808 -``` - -And now lets compute the gradient: Simply iterate in reverse order and call the `backward` function! Remember that we stored the pointers to the units when we did the forward pass, so every gate has access to its inputs and also the output unit it previously produced. - -```javascript -s.grad = 1.0; -sg0.backward(); // writes gradient into axpbypc -addg1.backward(); // writes gradients into axpby and c -addg0.backward(); // writes gradients into ax and by -mulg1.backward(); // writes gradients into b and y -mulg0.backward(); // writes gradients into a and x -``` - -Note that the first line sets the gradient at the output (very last unit) to be `1.0` to start off the gradient chain. This can be interpreted as tugging on the last gate with a force of `+1`. In other words, we are pulling on the entire circuit to induce the forces that will increase the output value. If we did not set this to 1, all gradients would be computed as zero due to the multiplications in the chain rule. 
Finally, lets make the inputs respond to the computed gradients and check that the function increased: - -```javascript -var step_size = 0.01; -a.value += step_size * a.grad; // a.grad is -0.105 -b.value += step_size * b.grad; // b.grad is 0.315 -c.value += step_size * c.grad; // c.grad is 0.105 -x.value += step_size * x.grad; // x.grad is 0.105 -y.value += step_size * y.grad; // y.grad is 0.210 - -forwardNeuron(); -console.log('circuit output after one backprop: ' + s.value); // prints 0.8825 -``` - -Success! `0.8825` is higher than the previous value, `0.8808`. Finally, lets verify that we implemented the backpropagation correctly by checking the numerical gradient: - -```javascript -var forwardCircuitFast = function(a,b,c,x,y) { - return 1/(1 + Math.exp( - (a*x + b*y + c))); -}; -var a = 1, b = 2, c = -3, x = -1, y = 3; -var h = 0.0001; -var a_grad = (forwardCircuitFast(a+h,b,c,x,y) - forwardCircuitFast(a,b,c,x,y))/h; -var b_grad = (forwardCircuitFast(a,b+h,c,x,y) - forwardCircuitFast(a,b,c,x,y))/h; -var c_grad = (forwardCircuitFast(a,b,c+h,x,y) - forwardCircuitFast(a,b,c,x,y))/h; -var x_grad = (forwardCircuitFast(a,b,c,x+h,y) - forwardCircuitFast(a,b,c,x,y))/h; -var y_grad = (forwardCircuitFast(a,b,c,x,y+h) - forwardCircuitFast(a,b,c,x,y))/h; -``` - -Indeed, these all give the same values as the backpropagated gradients `[-0.105, 0.315, 0.105, 0.105, 0.210]`. Nice! - -I hope it is clear that even though we only looked at an example of a single neuron, the code I gave above generalizes in a very straight-forward way to compute gradients of arbitrary expressions (including very deep expressions #foreshadowing). All you have to do is write small gates that compute local, simple derivatives w.r.t their inputs, wire it up in a graph, do a forward pass to compute the output value and then a backward pass that chains the gradients all the way to the input. 
- -### Becoming a Backprop Ninja - -Over time you will become much more efficient in writing the backward pass, even for complicated circuits and all at once. Lets practice backprop a bit with a few examples. In what follows, lets not worry about Unit, Circuit classes because they obfuscate things a bit, and lets just use variables such as `a,b,c,x`, and refer to their gradients as `da,db,dc,dx` respectively. Again, we think of the variables as the "forward flow" and their gradients as "backward flow" along every wire. Our first example was the `*` gate: - -```javascript -var x = a * b; -// and given gradient on x (dx), we saw that in backprop we would compute: -var da = b * dx; -var db = a * dx; -``` - -In the code above, I'm assuming that the variable `dx` is given, coming from somewhere above us in the circuit while we're doing backprop (or it is +1 by default otherwise). I'm writing it out because I want to explicitly show how the gradients get chained together. Note from the equations that the `*` gate acts as a *switcher* during backward pass, for lack of better word. It remembers what its inputs were, and the gradients on each one will be the value of the other during the forward pass. And then of course we have to multiply with the gradient from above, which is the chain rule. Here's the `+` gate in this condensed form: - -```javascript -var x = a + b; -// -> -var da = 1.0 * dx; -var db = 1.0 * dx; -``` - -Where `1.0` is the local gradient, and the multiplication is our chain rule. What about adding three numbers?: - -```javascript -// lets compute x = a + b + c in two steps: -var q = a + b; // gate 1 -var x = q + c; // gate 2 - -// backward pass: -dc = 1.0 * dx; // backprop gate 2 -dq = 1.0 * dx; -da = 1.0 * dq; // backprop gate 1 -db = 1.0 * dq; -``` - -You can see what's happening, right? 
If you remember the backward flow diagram, the `+` gate simply takes the gradient on top and routes it equally to all of its inputs (because its local gradient is always simply `1.0` for all its inputs, regardless of their actual values). So we can do it much faster: - -```javascript -var x = a + b + c; -var da = 1.0 * dx; var db = 1.0 * dx; var dc = 1.0 * dx; -``` - -Okay, how about combining gates?: - -```javascript -var x = a * b + c; -// given dx, backprop in-one-sweep would be => -da = b * dx; -db = a * dx; -dc = 1.0 * dx; -``` - -If you don't see how the above happened, introduce a temporary variable `q = a * b` and then compute `x = q + c` to convince yourself. And here is our neuron, lets do it in two steps: - -```javascript -// lets do our neuron in two steps: -var q = a*x + b*y + c; -var f = sig(q); // sig is the sigmoid function -// and now backward pass, we are given df, and: -var df = 1; -var dq = (f * (1 - f)) * df; -// and now we chain it to the inputs -var da = x * dq; -var dx = a * dq; -var dy = b * dq; -var db = y * dq; -var dc = 1.0 * dq; -``` - -I hope this is starting to make a little more sense. Now how about this: - -```javascript -var x = a * a; -var da = //??? -``` - -You can think of this as value `a` flowing to the `*` gate, but the wire gets split and becomes both inputs. This is actually simple because the backward flow of gradients always adds up. In other words nothing changes: - -```javascript -var da = a * dx; // gradient into a from first branch -da += a * dx; // and add on the gradient from the second branch - -// short form instead is: -var da = 2 * a * dx; -``` - -In fact, if you know your power rule from calculus you would also know that if you have \\( f(a) = a^2 \\) then \\( \frac{\partial f(a)}{\partial a} = 2a \\), which is exactly what we get if we think of it as wire splitting up and being two inputs to a gate. 
- -Lets do another one: - -```javascript -var x = a*a + b*b + c*c; -// we get: -var da = 2*a*dx; -var db = 2*b*dx; -var dc = 2*c*dx; -``` - -Okay now lets start to get more complex: - -```javascript -var x = Math.pow(((a * b + c) * d), 2); // pow(x,2) squares the input in JS -``` - -When more complex cases like this come up in practice, I like to split the expression into manageable chunks which are almost always composed of simpler expressions and then I chain them together with chain rule: - -```javascript -var x1 = a * b + c; -var x2 = x1 * d; -var x = x2 * x2; // this is identical to the above expression for x -// and now in backprop we go backwards: -var dx2 = 2 * x2 * dx; // backprop into x2 -var dd = x1 * dx2; // backprop into d -var dx1 = d * dx2; // backprop into x1 -var da = b * dx1; -var db = a * dx1; -var dc = 1.0 * dx1; // done! -``` - -That wasn't too difficult! Those are the backprop equations for the entire expression, and we've done them piece by piece and backpropped to all the variables. Notice again how for every variable during forward pass we have an equivalent variable during backward pass that contains its gradient with respect to the circuit's final output. 
Here are a few more useful functions and their local gradients that are useful in practice: - -```javascript -var x = 1.0/a; // division -var da = -1.0/(a*a); -``` - -Here's what division might look like in practice then: - -```javascript -var x = (a + b)/(c + d); -// lets decompose it in steps: -var x1 = a + b; -var x2 = c + d; -var x3 = 1.0 / x2; -var x = x1 * x3; // equivalent to above -// and now backprop, again in reverse order: -var dx1 = x3 * dx; -var dx3 = x1 * dx; -var dx2 = (-1.0/(x2*x2)) * dx3; // local gradient as shown above, and chain rule -var da = 1.0 * dx1; // and finally into the original variables -var db = 1.0 * dx1; -var dc = 1.0 * dx2; -var dd = 1.0 * dx2; -``` - -Hopefully you see that we are breaking down expressions, doing the forward pass, and then for every variable (such as `a`) we derive its gradient `da` as we go backwards, one by one, applying the simple local gradients and chaining them with gradients from above. Here's another one: - -```javascript -var x = Math.max(a, b); -var da = a === x ? 1.0 * dx : 0.0; -var db = b === x ? 1.0 * dx : 0.0; -``` - -Okay this is making a very simple thing hard to read. The `max` function passes on the value of the input that was largest and ignores the other ones. In the backward pass then, the max gate will simply take the gradient on top and route it to the input that actually flowed through it during the forward pass. The gate acts as a simple switch based on which input had the highest value during forward pass. The other inputs will have zero gradient. That's what the `===` is about, since we are testing for which input was the actual max and only routing the gradient to it. - -Finally, lets look at the Rectified Linear Unit non-linearity (or ReLU), which you may have heard of. It is used in Neural Networks in place of the sigmoid function. It is simply thresholding at zero: - -```javascript -var x = Math.max(a, 0) -// backprop through this gate will then be: -var da = a > 0 ? 
1.0 * dx : 0.0; -``` - -In other words this gate simply passes the value through if it's larger than 0, or it stops the flow and sets it to zero. In the backward pass, the gate will pass on the gradient from the top if it was activated during the forward pass, or if the original input was below zero, it will stop the gradient flow. - -I will stop at this point. I hope you got some intuition about how you can compute entire expressions (which are made up of many gates along the way) and how you can compute backprop for every one of them. - -Everything we've done in this chapter comes down to this: We saw that we can feed some input through an arbitrarily complex real-valued circuit, tug at the end of the circuit with some force, and backpropagation distributes that tug through the entire circuit all the way back to the inputs. If the inputs respond slightly along the final direction of their tug, the circuit will "give" a bit along the original pull direction. Maybe this is not immediately obvious, but this machinery is a powerful *hammer* for Machine Learning. - -> "Maybe this is not immediately obvious, but this machinery is a powerful *hammer* for Machine Learning." - -Lets now put this machinery to good use. - -## Chapter 2: Machine Learning - -In the last chapter we were concerned with real-valued circuits that computed possibly complex expressions of their inputs (the forward pass), and also we could compute the gradients of these expressions on the original inputs (backward pass). In this chapter we will see how useful this extremely simple mechanism is in Machine Learning. - -### Binary Classification - -As we did before, lets start out simple. The simplest, common and yet very practical problem in Machine Learning is **binary classification**. A lot of very interesting and important problems can be reduced to it. The setup is as follows: We are given a dataset of `N` vectors and every one of them is labeled with a `+1` or a `-1`. 
For example, in two dimensions our dataset could look as simple as: - -``` -vector -> label ---------------- -[1.2, 0.7] -> +1 -[-0.3, 0.5] -> -1 -[-3, -1] -> +1 -[0.1, 1.0] -> -1 -[3.0, 1.1] -> -1 -[2.1, -3] -> +1 -``` - -Here, we have `N = 6` **datapoints**, where every datapoint has two **features** (`D = 2`). Three of the datapoints have **label** `+1` and the other three label `-1`. This is a silly toy example, but in practice a +1/-1 dataset could be very useful things indeed: For example spam/no spam emails, where the vectors somehow measure various features of the content of the email, such as the number of times certain enhancement drugs are mentioned. - -**Goal**. Our goal in binary classification is to learn a function that takes a 2-dimensional vector and predicts the label. This function is usually parameterized by a certain set of parameters, and we will want to tune the parameters of the function so that its outputs are consistent with the labeling in the provided dataset. In the end we can discard the dataset and use the learned parameters to predict labels for previously unseen vectors. - -#### Training protocol - -We will eventually build up to entire neural networks and complex expressions, but lets start out simple and train a linear classifier very similar to the single neuron we saw at the end of Chapter 1. The only difference is that we'll get rid of the sigmoid because it makes things unnecessarily complicated (I only used it as an example in Chapter 1 because sigmoid neurons are historically popular but modern Neural Networks rarely, if ever, use sigmoid non-linearities). Anyway, lets use a simple linear function: - -$$ -f(x, y) = ax + by + c -$$ - -In this expression we think of `x` and `y` as the inputs (the 2D vectors) and `a,b,c` as the parameters of the function that we will want to learn. 
For example, if `a = 1, b = -2, c = -1`, then the function will take the first datapoint (`[1.2, 0.7]`) and output `1 * 1.2 + (-2) * 0.7 + (-1) = -1.2`. Here is how the training will work: - -1. We select a random datapoint and feed it through the circuit -2. We will interpret the output of the circuit as a confidence that the datapoint has class `+1`. (i.e. very high values = circuit is very certain datapoint has class `+1` and very low values = circuit is certain this datapoint has class `-1`.) -3. We will measure how well the prediction aligns with the provided labels. Intuitively, for example, if a positive example scores very low, we will want to tug in the positive direction on the circuit, demanding that it should output higher value for this datapoint. Note that this is the case for the first datapoint: it is labeled as `+1` but our predictor function only assigns it value `-1.2`. We will therefore tug on the circuit in positive direction; We want the value to be higher. -4. The circuit will take the tug and backpropagate it to compute tugs on the inputs `a,b,c,x,y` -5. Since we think of `x,y` as (fixed) datapoints, we will ignore the pull on `x,y`. If you're a fan of my physical analogies, think of these inputs as pegs, fixed in the ground. -6. On the other hand, we will take the parameters `a,b,c` and make them respond to their tug (i.e. we'll perform what we call a **parameter update**). This, of course, will make it so that the circuit will output a slightly higher score on this particular datapoint in the future. -7. Iterate! Go back to step 1. - -The training scheme I described above is commonly referred to as **Stochastic Gradient Descent**. The interesting part I'd like to reiterate is that `a,b,c,x,y` are all made up of the same *stuff* as far as the circuit is concerned: They are inputs to the circuit and the circuit will tug on all of them in some direction. It doesn't know the difference between parameters and datapoints. 
However, after the backward pass is complete we ignore all tugs on the datapoints (`x,y`) and keep swapping them in and out as we iterate over examples in the dataset. On the other hand, we keep the parameters (`a,b,c`) around and keep tugging on them every time we sample a datapoint. Over time, the pulls on these parameters will tune these values in such a way that the function outputs high scores for positive examples and low scores for negative examples. - -#### Learning a Support Vector Machine - -As a concrete example, lets learn a **Support Vector Machine**. The SVM is a very popular linear classifier; Its functional form is exactly as I've described in the previous section, \\( f(x,y) = ax + by + c\\). At this point, if you've seen an explanation of SVMs you're probably expecting me to define the SVM loss function and plunge into an explanation of slack variables, geometrical intuitions of large margins, kernels, duality, etc. But here, I'd like to take a different approach. Instead of defining loss functions, I would like to base the explanation on the *force specification* (I just made this term up by the way) of a Support Vector Machine, which I personally find much more intuitive. As we will see, talking about the force specification and the loss function are identical ways of seeing the same problem. Anyway, here it is: - -**Support Vector Machine "Force Specification":** - -- If we feed a positive datapoint through the SVM circuit and the output value is less than 1, pull on the circuit with force `+1`. This is a positive example so we want the score to be higher for it. -- Conversely, if we feed a negative datapoint through the SVM and the output is greater than -1, then the circuit is giving this datapoint a dangerously high score: Pull on the circuit downwards with force `-1`. -- In addition to the pulls above, always add a small amount of pull on the parameters `a,b` (notice, not on `c`!) that pulls them towards zero. 
You can think of both `a,b` as being attached to a physical spring that is attached at zero. Just as with a physical spring, this will make the pull proportional to the value of each of `a,b` (Hooke's law in physics, anyone?). For example, if `a` becomes very high it will experience a strong pull of magnitude `|a|` back towards zero. This pull is something we call **regularization**, and it ensures that neither of our parameters `a` or `b` gets disproportionally large. This would be undesirable because both `a,b` get multiplied to the input features `x,y` (remember the equation is `a*x + b*y + c`), so if either of them is too high, our classifier would be overly sensitive to these features. This isn't a nice property because features can often be noisy in practice, so we want our classifier to change relatively smoothly if they wiggle around. - -Lets quickly go through a small but concrete example. Suppose we start out with a random parameter setting, say, `a = 1, b = -2, c = -1`. Then: - -- If we feed the point `[1.2, 0.7]`, the SVM will compute score `1 * 1.2 + (-2) * 0.7 - 1 = -1.2`. This point is labeled as `+1` in the training data, so we want the score to be higher than 1. The gradient on top of the circuit will thus be positive: `+1`, which will backpropagate to `a,b,c`. Additionally, there will also be a regularization pull on `a` of `-1` (to make it smaller) and regularization pull on `b` of `+2` to make it larger, toward zero. -- Suppose instead that we fed the datapoint `[-0.3, 0.5]` to the SVM. It computes `1 * (-0.3) + (-2) * 0.5 - 1 = -2.3`. The label for this point is `-1`, and since `-2.3` is smaller than `-1`, we see that according to our force specification the SVM should be happy: The computed score is very negative, consistent with the negative label of this example. There will be no pull at the end of the circuit (i.e. it's zero), since no changes are necessary. 
However, there will *still* be the regularization pull on `a` of `-1` and on `b` of `+2`. - - -Okay there's been too much text. Lets write the SVM code and take advantage of the circuit machinery we have from Chapter 1: - -```javascript -// A circuit: it takes 5 Units (x,y,a,b,c) and outputs a single Unit -// It can also compute the gradient w.r.t. its inputs -var Circuit = function() { - // create some gates - this.mulg0 = new multiplyGate(); - this.mulg1 = new multiplyGate(); - this.addg0 = new addGate(); - this.addg1 = new addGate(); -}; -Circuit.prototype = { - forward: function(x,y,a,b,c) { - this.ax = this.mulg0.forward(a, x); // a*x - this.by = this.mulg1.forward(b, y); // b*y - this.axpby = this.addg0.forward(this.ax, this.by); // a*x + b*y - this.axpbypc = this.addg1.forward(this.axpby, c); // a*x + b*y + c - return this.axpbypc; - }, - backward: function(gradient_top) { // takes pull from above - this.axpbypc.grad = gradient_top; - this.addg1.backward(); // sets gradient in axpby and c - this.addg0.backward(); // sets gradient in ax and by - this.mulg1.backward(); // sets gradient in b and y - this.mulg0.backward(); // sets gradient in a and x - } -} -``` - -That's a circuit that simply computes `a*x + b*y + c` and can also compute the gradient. It uses the gates code we developed in Chapter 1. Now lets write the SVM, which doesn't care about the actual circuit. It is only concerned with the values that come out of it, and it pulls on the circuit. 
- -```javascript -// SVM class -var SVM = function() { - - // random initial parameter values - this.a = new Unit(1.0, 0.0); - this.b = new Unit(-2.0, 0.0); - this.c = new Unit(-1.0, 0.0); - - this.circuit = new Circuit(); -}; -SVM.prototype = { - forward: function(x, y) { // assume x and y are Units - this.unit_out = this.circuit.forward(x, y, this.a, this.b, this.c); - return this.unit_out; - }, - backward: function(label) { // label is +1 or -1 - - // reset pulls on a,b,c - this.a.grad = 0.0; - this.b.grad = 0.0; - this.c.grad = 0.0; - - // compute the pull based on what the circuit output was - var pull = 0.0; - if(label === 1 && this.unit_out.value < 1) { - pull = 1; // the score was too low: pull up - } - if(label === -1 && this.unit_out.value > -1) { - pull = -1; // the score was too high for a positive example, pull down - } - this.circuit.backward(pull); // writes gradient into x,y,a,b,c - - // add regularization pull for parameters: towards zero and proportional to value - this.a.grad += -this.a.value; - this.b.grad += -this.b.value; - }, - learnFrom: function(x, y, label) { - this.forward(x, y); // forward pass (set .value in all Units) - this.backward(label); // backward pass (set .grad in all Units) - this.parameterUpdate(); // parameters respond to tug - }, - parameterUpdate: function() { - var step_size = 0.01; - this.a.value += step_size * this.a.grad; - this.b.value += step_size * this.b.grad; - this.c.value += step_size * this.c.grad; - } -}; -``` - -Now lets train the SVM with Stochastic Gradient Descent: - -```javascript -var data = []; var labels = []; -data.push([1.2, 0.7]); labels.push(1); -data.push([-0.3, -0.5]); labels.push(-1); -data.push([3.0, 0.1]); labels.push(1); -data.push([-0.1, -1.0]); labels.push(-1); -data.push([-1.0, 1.1]); labels.push(-1); -data.push([2.1, -3]); labels.push(1); -var svm = new SVM(); - -// a function that computes the classification accuracy -var evalTrainingAccuracy = function() { - var num_correct = 0; - 
for(var i = 0; i < data.length; i++) { - var x = new Unit(data[i][0], 0.0); - var y = new Unit(data[i][1], 0.0); - var true_label = labels[i]; - - // see if the prediction matches the provided label - var predicted_label = svm.forward(x, y).value > 0 ? 1 : -1; - if(predicted_label === true_label) { - num_correct++; - } - } - return num_correct / data.length; -}; - -// the learning loop -for(var iter = 0; iter < 400; iter++) { - // pick a random data point - var i = Math.floor(Math.random() * data.length); - var x = new Unit(data[i][0], 0.0); - var y = new Unit(data[i][1], 0.0); - var label = labels[i]; - svm.learnFrom(x, y, label); - - if(iter % 25 == 0) { // every 25 iterations... - console.log('training accuracy at iter ' + iter + ': ' + evalTrainingAccuracy()); - } -} -``` -This code prints the following output: - -``` -training accuracy at iteration 0: 0.3333333333333333 -training accuracy at iteration 25: 0.3333333333333333 -training accuracy at iteration 50: 0.5 -training accuracy at iteration 75: 0.5 -training accuracy at iteration 100: 0.3333333333333333 -training accuracy at iteration 125: 0.5 -training accuracy at iteration 150: 0.5 -training accuracy at iteration 175: 0.5 -training accuracy at iteration 200: 0.5 -training accuracy at iteration 225: 0.6666666666666666 -training accuracy at iteration 250: 0.6666666666666666 -training accuracy at iteration 275: 0.8333333333333334 -training accuracy at iteration 300: 1 -training accuracy at iteration 325: 1 -training accuracy at iteration 350: 1 -training accuracy at iteration 375: 1 -``` - -We see that initially our classifier only had 33% training accuracy, but by the end all training examples are correctly classified as the parameters `a,b,c` adjusted their values according to the pulls we exerted. We just trained an SVM! But please don't use this code anywhere in production :) We will see how we can make things much more efficient once we understand what is going on at the core. 
- -**Number of iterations needed**. With this example data, with this example initialization, and with the setting of step size we used, it took about 300 iterations to train the SVM. In practice, this could be many more or many less depending on how hard or large the problem is, how you're initializing, normalizing your data, what step size you're using, and so on. This is just a toy demonstration, but later we will go over all the best practices for actually training these classifiers in practice. For example, it will turn out that the setting of the step size is very important and tricky. Small step size will make your model slow to train. Large step size will train faster, but if it is too large, it will make your classifier chaotically jump around and not converge to a good final result. We will eventually use withheld validation data to properly tune it to be just in the sweet spot for your particular data. - -One thing I'd like you to appreciate is that the circuit can be an arbitrary expression, not just the linear prediction function we used in this example. For example, it can be an entire neural network. - -By the way, I intentionally structured the code in a modular way, but we could have trained an SVM with a much simpler code. 
Here is really what all of these classes and computations boil down to: - -```javascript -var a = 1, b = -2, c = -1; // initial parameters -for(var iter = 0; iter < 400; iter++) { - // pick a random data point - var i = Math.floor(Math.random() * data.length); - var x = data[i][0]; - var y = data[i][1]; - var label = labels[i]; - - // compute pull - var score = a*x + b*y + c; - var pull = 0.0; - if(label === 1 && score < 1) pull = 1; - if(label === -1 && score > -1) pull = -1; - - // compute gradient and update parameters - var step_size = 0.01; - a += step_size * (x * pull - a); // -a is from the regularization - b += step_size * (y * pull - b); // -b is from the regularization - c += step_size * (1 * pull); -} -``` - -this code gives an identical result. Perhaps by now you can glance at the code and see how these equations came about. - -**Variable pull?** A quick note to make at this point: You may have noticed that the pull is always 1,0, or -1. You could imagine doing other things, for example making this pull proportional to how bad the mistake was. This leads to a variation on the SVM that some people refer to as *squared hinge loss* SVM, for reasons that will later become clear. Depending on various features of your dataset, that may work better or worse. For example, if you have very bad outliers in your data, e.g. a negative data point that gets a score `+100`, its influence will be relatively minor on our classifier because we will only pull with force of `-1` regardless of how bad the mistake was. In practice we refer to this property of a classifier as **robustness** to outliers. - -Lets **recap**. We introduced the **binary classification** problem, where we are given N D-dimensional vectors and a label +1/-1 for each. We saw that we can combine these features with a set of parameters inside a real-valued circuit (such as a **Support Vector Machine** circuit in our example). 
Then, we can repeatedly pass our data through the circuit and each time tweak the parameters so that the circuit's output value is consistent with the provided labels. The tweaking relied, crucially, on our ability to **backpropagate** gradients through the circuit. In the end, the final circuit can be used to predict values for unseen instances! - -#### Generalizing the SVM into a Neural Network - -Of interest is the fact that an SVM is just a particular type of a very simple circuit (circuit that computes `score = a*x + b*y + c` where `a,b,c` are weights and `x,y` are data points). This can be easily extended to more complicated functions. For example, lets write a 2-layer Neural Network that does the binary classification. The forward pass will look like this: - -```javascript -// assume inputs x,y -var n1 = Math.max(0, a1*x + b1*y + c1); // activation of 1st hidden neuron -var n2 = Math.max(0, a2*x + b2*y + c2); // 2nd neuron -var n3 = Math.max(0, a3*x + b3*y + c3); // 3rd neuron -var score = a4*n1 + b4*n2 + c4*n3 + d4; // the score -``` - -The specification above is a 2-layer Neural Network with 3 hidden neurons (n1, n2, n3) that uses Rectified Linear Unit (ReLU) non-linearity on each hidden neuron. As you can see, there are now several parameters involved, which means that our classifier is more complex and can represent more intricate decision boundaries than just a simple linear decision rule such as an SVM. Another way to think about it is that every one of the three hidden neurons is a linear classifier and now we're putting an extra linear classifier on top of that. Now we're starting to go *deeper* :). Okay, lets train this 2-layer Neural Network. The code looks very similar to the SVM example code above, we just have to change the forward pass and the backward pass: - -```javascript -// random initial parameters -var a1 = Math.random() - 0.5; // a random number between -0.5 and 0.5 -// ... 
similarly initialize all other parameters to randoms -for(var iter = 0; iter < 400; iter++) { - // pick a random data point - var i = Math.floor(Math.random() * data.length); - var x = data[i][0]; - var y = data[i][1]; - var label = labels[i]; - - // compute forward pass - var n1 = Math.max(0, a1*x + b1*y + c1); // activation of 1st hidden neuron - var n2 = Math.max(0, a2*x + b2*y + c2); // 2nd neuron - var n3 = Math.max(0, a3*x + b3*y + c3); // 3rd neuron - var score = a4*n1 + b4*n2 + c4*n3 + d4; // the score - - // compute the pull on top - var pull = 0.0; - if(label === 1 && score < 1) pull = 1; // we want higher output! Pull up. - if(label === -1 && score > -1) pull = -1; // we want lower output! Pull down. - - // now compute backward pass to all parameters of the model - - // backprop through the last "score" neuron - var dscore = pull; - var da4 = n1 * dscore; - var dn1 = a4 * dscore; - var db4 = n2 * dscore; - var dn2 = b4 * dscore; - var dc4 = n3 * dscore; - var dn3 = c4 * dscore; - var dd4 = 1.0 * dscore; // phew - - // backprop the ReLU non-linearities, in place - // i.e. just set gradients to zero if the neurons did not "fire" - var dn3 = n3 === 0 ? 0 : dn3; - var dn2 = n2 === 0 ? 0 : dn2; - var dn1 = n1 === 0 ? 0 : dn1; - - // backprop to parameters of neuron 1 - var da1 = x * dn1; - var db1 = y * dn1; - var dc1 = 1.0 * dn1; - - // backprop to parameters of neuron 2 - var da2 = x * dn2; - var db2 = y * dn2; - var dc2 = 1.0 * dn2; - - // backprop to parameters of neuron 3 - var da3 = x * dn3; - var db3 = y * dn3; - var dc3 = 1.0 * dn3; - - // phew! End of backprop! - // note we could have also backpropped into x,y - // but we do not need these gradients. We only use the gradients - // on our parameters in the parameter update, and we discard x,y - - // add the pulls from the regularization, tugging all multiplicative - // parameters (i.e. 
not the biases) downward, proportional to their value - da1 += -a1; da2 += -a2; da3 += -a3; - db1 += -b1; db2 += -b2; db3 += -b3; - da4 += -a4; db4 += -b4; dc4 += -c4; - - // finally, do the parameter update - var step_size = 0.01; - a1 += step_size * da1; - b1 += step_size * db1; - c1 += step_size * dc1; - a2 += step_size * da2; - b2 += step_size * db2; - c2 += step_size * dc2; - a3 += step_size * da3; - b3 += step_size * db3; - c3 += step_size * dc3; - a4 += step_size * da4; - b4 += step_size * db4; - c4 += step_size * dc4; - d4 += step_size * dd4; - // wow this is tedious, please use for loops in prod. - // we're done! -} -``` - -And that's how you train a neural network. Obviously, you want to modularize your code nicely but I expended this example for you in the hope that it makes things much more concrete and simpler to understand. Later, we will look at best practices when implementing these networks and we will structure the code much more neatly in a modular and more sensible way. - -But for now, I hope your takeaway is that a 2-layer Neural Net is really not such a scary thing: we write a forward pass expression, interpret the value at the end as a score, and then we pull on that value in a positive or negative direction depending on what we want that value to be for our current particular example. The parameter update after backprop will ensure that when we see this particular example in the future, the network will be more likely to give us a value we desire, not the one it gave just before the update. - - -### A more Conventional Approach: Loss Functions - -Now that we understand the basics of how these circuits function with data, lets adopt a more conventional approach that you might see elsewhere on the internet and in other tutorials and books. You won't see people talking too much about **force specifications**. Instead, Machine Learning algorithms are specified in terms of **loss functions** (or **cost functions**, or **objectives**). 
- -As I develop this formalism I would also like to start to be a little more careful with how we name our variables and parameters. I'd like these equations to look similar to what you might see in a book or some other tutorial, so let me use more standard naming conventions. - -#### Example: 2-D Support Vector Machine -Lets start with an example of a 2-dimensional SVM. We are given a dataset of \\( N \\) examples \\( (x\_{i0}, x\_{i1}) \\) and their corresponding labels \\( y\_{i} \\) which are allowed to be either \\( +1/-1 \\) for positive or negative example respectively. Most importantly, as you recall we have three parameters \\( (w\_0, w\_1, w\_2) \\). The SVM loss function is then defined as follows: - -$$ -L = [\sum\_{i=1}^N max(0, -y\_{i}( w\_0x\_{i0} + w\_1x\_{i1} + w\_2 ) + 1 )] + \alpha [w\_0^2 + w\_1^2] -$$ - -Notice that this expression is always positive, due to the thresholding at zero in the first expression and the squaring in the regularization. The idea is that we will want this expression to be as small as possible. 
Before we dive into some of its subtleties let me first translate it to code: - -```javascript -var X = [ [1.2, 0.7], [-0.3, 0.5], [3, 2.5] ] // array of 2-dimensional data -var y = [1, -1, 1] // array of labels -var w = [0.1, 0.2, 0.3] // example: random numbers -var alpha = 0.1; // regularization strength - -function cost(X, y, w) { - - var total_cost = 0.0; // L, in SVM loss function above - N = X.length; - for(var i=0;i<N;i++) { - // loop over all data points and compute their score - var xi0 = X[i][0]; - var xi1 = X[i][1]; - var yi = y[i]; - var score = w[0] * xi0 + w[1] * xi1 + w[2]; - - // accumulate cost based on how compatible the score is with the label - var costi = Math.max(0, -yi * score + 1); - console.log('cost for example ' + i + ' is ' + costi.toFixed(3)); - total_cost += costi; - } - - // regularization cost: we want small weights - reg_cost = alpha * (w[0]*w[0] + w[1]*w[1]) - console.log('regularization cost for current model is ' + reg_cost.toFixed(3)); - total_cost += reg_cost; - - console.log('total cost is ' + total_cost.toFixed(3)); - return total_cost; -} -``` - -And here is the output: - -``` -cost for example 0 is 0.440 -cost for example 1 is 1.370 -cost for example 2 is 0.000 -regularization cost for current model is 0.005 -total cost is 1.815 -``` - -Notice how this expression works: It measures how *bad* our SVM classifier is. Lets step through this explicitly: - -- The first datapoint `xi = [1.2, 0.7]` with label `yi = 1` will give score `0.1*1.2 + 0.2*0.7 + 0.3`, which is `0.56`. Notice, this is a positive example so we want the score to be greater than `+1`. `0.56` is not enough. And indeed, the expression for cost for this datapoint will compute: `costi = Math.max(0, -1*0.56 + 1)`, which is `0.44`. You can think of the cost as quantifying the SVM's unhappiness. -- The second datapoint `xi = [-0.3, 0.5]` with label `yi = -1` will give score `0.1*(-0.3) + 0.2*0.5 + 0.3`, which is `0.37`. This isn't looking very good: This score is very high for a negative example. It should be less than -1. Indeed, when we compute the cost: `costi = Math.max(0, 1*0.37 + 1)`, we get `1.37`. That's a very high cost from this example, as it is being misclassified. 
-- The last example `xi = [3, 2.5]` with label `yi = 1` gives score `0.1*3 + 0.2*2.5 + 0.3`, and that is `1.1`. In this case, the SVM will compute `costi = Math.max(0, -1*1.1 + 1)`, which is in fact zero. This datapoint is being classified correctly and there is no cost associated with it. - -> A cost function is an expression that measures how bad your classifier is. When the training set is perfectly classified, the cost (ignoring the regularization) will be zero. - -Notice that the last term in the loss is the regularization cost, which says that our model parameters should be small values. Due to this term the cost will never actually become zero (because this would mean all parameters of the model except the bias are exactly zero), but the closer we get, the better our classifier will become. - -> The majority of cost functions in Machine Learning consist of two parts: 1. A part that measures how well a model fits the data, and 2: Regularization, which measures some notion of how complex or likely a model is. - -I hope I convinced you then, that to get a very good SVM we really want to make the **cost as small as possible**. Sounds familiar? We know exactly what to do: The cost function written above is our circuit. We will forward all examples through the circuit, compute the backward pass and update all parameters such that the circuit will output a *smaller* cost in the future. Specifically, we will compute the *gradient* and then update the parameters in the *opposite direction* of the gradient (since we want to make the cost small, not large). - -> "We know exactly what to do: The cost function written above is our circuit." - -todo: clean up this section and flesh it out a bit... - -## Chapter 3: Backprop in Practice - -### Building up a library - -### Example: Practical Neural Network Classifier - -- Multiclass: Structured SVM -- Multiclass: Logistic Regression, Softmax - -### Example: Regression - -Tiny changes needed to cost function. 
L2 regularization. - -### Example: Structured Prediction - -Basic idea is to train an (unnormalized) energy model - -### Vectorized Implementations - -Writing a Neural Net classifier in Python with numpy.... - -### Backprop in practice: Tips/Tricks - -- Monitoring of Cost function -- Monitoring training/validation performance -- Tweaking initial learning rates, learning rate schedules -- Optimization: Using Momentum -- Optimization: LBFGS, Nesterov accelerated gradient -- Importance of Initialization: weights and biases -- Regularization: L2, L1, Group sparsity, Dropout -- Hyperparameter search, cross-validations -- Common pitfalls: (e.g. dying ReLUs) -- Handling unbalanced datasets -- Approaches to debugging nets when something doesn't work - -## Chapter 4: Networks in the Wild - -Case studies of models that work well in practice and have been deployed in the wild. - -### Case Study: Convolutional Neural Networks for images - -Convolutional layers, pooling, AlexNet, etc. - -### Case Study: Recurrent Neural Networks for Speech and Text - -Vanilla Recurrent nets, bi-directional recurrent nets. Maybe overview of LSTM - -### Case Study: Word2Vec - -Training word vector representations in NLP - -### Case Study: t-SNE - -Training embeddings for visualizing data - -## Acknowledgements - -Thanks a lot to the following people who made this guide better: wodenokoto (HN), zackmorris (HN). - -## Comments - -This guide is a work in progress and I appreciate feedback, especially regarding parts that were unclear or only made half sense. Thank you! - -Some of the Javascript code in this tutorial has been translated to Python by Ajit, find it over on [Github](https://github.com/urwithajit9/HG_NeuralNetwork).