// API callback
showlatestposts({"version":"1.0","encoding":"UTF-8","feed":{"xmlns":"http://www.w3.org/2005/Atom","xmlns$openSearch":"http://a9.com/-/spec/opensearchrss/1.0/","xmlns$blogger":"http://schemas.google.com/blogger/2008","xmlns$georss":"http://www.georss.org/georss","xmlns$gd":"http://schemas.google.com/g/2005","xmlns$thr":"http://purl.org/syndication/thread/1.0","id":{"$t":"tag:blogger.com,1999:blog-15418143"},"updated":{"$t":"2022-02-11T18:04:27.581-05:00"},"category":[{"term":"computer vision"},{"term":"cvpr"},{"term":"google"},{"term":"machine learning"},{"term":"philosophy"},{"term":"deep learning"},{"term":"MIT"},{"term":"papers"},{"term":"research"},{"term":"MATLAB"},{"term":"categorization"},{"term":"code"},{"term":"robotics"},{"term":"CMU"},{"term":"object detection"},{"term":"object recognition"},{"term":"scene understanding"},{"term":"artificial intelligence"},{"term":"github"},{"term":"image understanding"},{"term":"kickstarter"},{"term":"perception"},{"term":"visualization"},{"term":"CVPR 2012"},{"term":"concepts"},{"term":"conference"},{"term":"entrepreneurship"},{"term":"fractals"},{"term":"mathematics"},{"term":"programming"},{"term":"segmentation"},{"term":"VMX"},{"term":"VMX project"},{"term":"berkeley"},{"term":"computer graphics"},{"term":"computer science"},{"term":"exemplars"},{"term":"iccv"},{"term":"image segmentation"},{"term":"kinect"},{"term":"paradigm"},{"term":"psychology"},{"term":"publishing"},{"term":"vision"},{"term":"vision.ai"},{"term":"visual memex"},{"term":"workshop"},{"term":"3d recognition"},{"term":"Microsoft"},{"term":"alyosha efros"},{"term":"antonio torralba"},{"term":"cognitive science"},{"term":"context"},{"term":"cvpr 2013"},{"term":"distance function learning"},{"term":"face detection"},{"term":"newton's method"},{"term":"prototypes"},{"term":"sharing"},{"term":"stanford"},{"term":"startups"},{"term":"svm"},{"term":"torralba"},{"term":"CVPR 2015"},{"term":"HOG"},{"term":"Takeo Kanade"},{"term":"abhinav 
gupta"},{"term":"academia"},{"term":"best paper"},{"term":"big data"},{"term":"blogging"},{"term":"c++"},{"term":"classification"},{"term":"cvpr 2010"},{"term":"cvpr 2011"},{"term":"exemplar-svm"},{"term":"felzenszwalb"},{"term":"future directions"},{"term":"geoff hinton"},{"term":"geometry"},{"term":"inference"},{"term":"knowledge"},{"term":"large dataset"},{"term":"lecun"},{"term":"networking"},{"term":"nips"},{"term":"phd"},{"term":"picasa"},{"term":"rosch"},{"term":"sfm"},{"term":"training"},{"term":"wittgenstein"},{"term":"2.5d"},{"term":"AI"},{"term":"API"},{"term":"CNNs"},{"term":"ConvNets"},{"term":"HOGgles"},{"term":"ICCP 2013"},{"term":"ICCV 2013"},{"term":"ICML"},{"term":"MATLAB code"},{"term":"academics"},{"term":"action recognition"},{"term":"advice"},{"term":"andrew ng"},{"term":"association"},{"term":"aude oliva"},{"term":"beyond categories"},{"term":"blog"},{"term":"books"},{"term":"brain"},{"term":"california"},{"term":"clarifai"},{"term":"complex numbers"},{"term":"data science"},{"term":"data-driven"},{"term":"dato"},{"term":"deepmind"},{"term":"depth"},{"term":"descartes"},{"term":"detection"},{"term":"deva ramanan"},{"term":"embedded computer vision"},{"term":"empiricism"},{"term":"exemplar svms"},{"term":"exemplarsvm"},{"term":"face tracking"},{"term":"facebook"},{"term":"features"},{"term":"gist"},{"term":"google research"},{"term":"gpu"},{"term":"graduate student life"},{"term":"graphical models"},{"term":"graphics"},{"term":"graphs"},{"term":"graphviz"},{"term":"gupta"},{"term":"hacking"},{"term":"hierarchy"},{"term":"hinton"},{"term":"iccv 2011"},{"term":"iccv 2015"},{"term":"image interpretation"},{"term":"indexing"},{"term":"internship"},{"term":"jay yagnik"},{"term":"jianxiong xiao"},{"term":"joint regulariztion"},{"term":"joseph lim"},{"term":"josh tenenbaum"},{"term":"kant"},{"term":"karpathy"},{"term":"kristen grauman"},{"term":"lesson"},{"term":"lessons"},{"term":"linux"},{"term":"martial hebert"},{"term":"metamind"},{"term":"metric 
learning"},{"term":"mid-level patch discovery"},{"term":"multi-task"},{"term":"multiclass sharing"},{"term":"nips 2011"},{"term":"non-parametric"},{"term":"oral presentation"},{"term":"parts"},{"term":"philosophy of science"},{"term":"photobios"},{"term":"physics"},{"term":"pose"},{"term":"presentation"},{"term":"puzzles"},{"term":"pyimagesearch"},{"term":"python"},{"term":"realism"},{"term":"reconstruction"},{"term":"research papers"},{"term":"rgbd"},{"term":"running"},{"term":"segmentation-driven recognition"},{"term":"sift"},{"term":"siggraph asia"},{"term":"snapchat"},{"term":"startup"},{"term":"statcounter"},{"term":"students"},{"term":"summary"},{"term":"svetlana lazebnik"},{"term":"talks"},{"term":"teaching"},{"term":"truth"},{"term":"vision as a service"},{"term":"william james"},{"term":"wordle"},{"term":"yann lecun"},{"term":"youtube"},{"term":"1970s"},{"term":"3D reconstruction"},{"term":"3D vision"},{"term":"3d"},{"term":"3d model"},{"term":"AI agents"},{"term":"Amnon Shashua"},{"term":"CRF"},{"term":"CTO"},{"term":"DTAM"},{"term":"DynamicFusion"},{"term":"GANs"},{"term":"ICLR"},{"term":"IIT-at-MIT"},{"term":"Ira Kemelmacher-Shlizerman"},{"term":"KinectFusion"},{"term":"LSD-SLAM"},{"term":"LSTM"},{"term":"MIT skolkovo tech innovation"},{"term":"Marr prize"},{"term":"PTAM"},{"term":"SLAM"},{"term":"SUN"},{"term":"SUN360"},{"term":"SVMs"},{"term":"VIEW workshop"},{"term":"Yaroslav Bulatov"},{"term":"abbeel"},{"term":"abhinav shrivastava"},{"term":"abstraction"},{"term":"action"},{"term":"active learning"},{"term":"adrian rosebrock"},{"term":"adrien gaidon"},{"term":"affordances"},{"term":"alex berg"},{"term":"algorithm"},{"term":"algorithms"},{"term":"america"},{"term":"analogies"},{"term":"andrew davison"},{"term":"andrew gallagher"},{"term":"angela 
dai"},{"term":"angularjs"},{"term":"annotation"},{"term":"appearance"},{"term":"apple"},{"term":"aristotle"},{"term":"article"},{"term":"arxiv"},{"term":"associations"},{"term":"att"},{"term":"attributes"},{"term":"autonomous cars"},{"term":"autonomous vehicle"},{"term":"average explorer"},{"term":"award"},{"term":"backpropagation"},{"term":"baidu"},{"term":"barcelona"},{"term":"barcode"},{"term":"barrow"},{"term":"bay area"},{"term":"bayesian"},{"term":"bengio"},{"term":"berkely"},{"term":"bias-variance"},{"term":"biconvex"},{"term":"big-data"},{"term":"black friday"},{"term":"blocks world"},{"term":"bolei zhou"},{"term":"boston"},{"term":"browser"},{"term":"bundle adjustment"},{"term":"business"},{"term":"caffe"},{"term":"carl doersch"},{"term":"carl vondrick"},{"term":"carlos guestrin"},{"term":"caroline pantofaru"},{"term":"chile"},{"term":"chips"},{"term":"christmas"},{"term":"chrome"},{"term":"citations"},{"term":"classifier"},{"term":"cloud-based computer vision"},{"term":"cluster"},{"term":"clusters"},{"term":"coding n00bs"},{"term":"colorado"},{"term":"comedy"},{"term":"company"},{"term":"compiled"},{"term":"computer vision blog"},{"term":"computer vision jobs"},{"term":"confidence"},{"term":"context challenge"},{"term":"convolution"},{"term":"convolutions"},{"term":"copernicus"},{"term":"copyright"},{"term":"cosegmentation"},{"term":"coursera"},{"term":"courses"},{"term":"crowdsourcing"},{"term":"cvpapers"},{"term":"dalal"},{"term":"dalal triggs"},{"term":"daniel cremers"},{"term":"datasets"},{"term":"david hume"},{"term":"david marr"},{"term":"day 2"},{"term":"decision forests"},{"term":"deep calculators"},{"term":"deep compression"},{"term":"deep features"},{"term":"deepfake"},{"term":"definition"},{"term":"deformable part model"},{"term":"demo"},{"term":"demos"},{"term":"dennett"},{"term":"dennis strelow"},{"term":"density estimation"},{"term":"derivative work"},{"term":"descriptors"},{"term":"discriminative"},{"term":"distributed 
systems"},{"term":"dpm"},{"term":"dropout"},{"term":"dyson"},{"term":"early event detection"},{"term":"edelman"},{"term":"edinburgh"},{"term":"education"},{"term":"efros"},{"term":"ego"},{"term":"egocentric vision"},{"term":"epistemology"},{"term":"etymology"},{"term":"europe"},{"term":"everything is misc"},{"term":"exemplar-svms"},{"term":"face memex"},{"term":"face recognition"},{"term":"face transfer"},{"term":"face2face"},{"term":"faces"},{"term":"facetime"},{"term":"faculty"},{"term":"fake news"},{"term":"feature engineering"},{"term":"fei-fei li"},{"term":"fernando de la torre"},{"term":"fgvc"},{"term":"filters"},{"term":"firefox"},{"term":"first-person vision"},{"term":"flash"},{"term":"forensics"},{"term":"founding"},{"term":"four steps to the epiphany"},{"term":"fourier"},{"term":"frontal faces"},{"term":"fun"},{"term":"future"},{"term":"ge research"},{"term":"geometry transfer"},{"term":"gibson"},{"term":"girshick"},{"term":"gists"},{"term":"git"},{"term":"gnu screen"},{"term":"gold rush"},{"term":"google glass"},{"term":"google internship"},{"term":"google scholar"},{"term":"google street view"},{"term":"gradslam"},{"term":"grammar"},{"term":"granada"},{"term":"graph cuts"},{"term":"graphlab"},{"term":"great talk"},{"term":"grouping"},{"term":"guestrin"},{"term":"guitar"},{"term":"gunhee kim"},{"term":"hackers"},{"term":"hadoop"},{"term":"hamed pirsiavash"},{"term":"healthcare"},{"term":"hebert"},{"term":"heroku"},{"term":"high school"},{"term":"hiking"},{"term":"history"},{"term":"hockey"},{"term":"horst bichof"},{"term":"hossein mobahi"},{"term":"hugo larochelle"},{"term":"hypercolumns"},{"term":"ibm"},{"term":"iccv 2019"},{"term":"idealism"},{"term":"ideas"},{"term":"identification"},{"term":"image forensics"},{"term":"image matching"},{"term":"image parsing"},{"term":"image retrieval"},{"term":"imagenet"},{"term":"imitation"},{"term":"indoor recognition"},{"term":"induction"},{"term":"innovation"},{"term":"intelligence"},{"term":"intentional 
stance"},{"term":"internet"},{"term":"internet-scale"},{"term":"interns"},{"term":"interpretation"},{"term":"intuition"},{"term":"inverse optics"},{"term":"invited talk"},{"term":"iphone"},{"term":"jakob engel"},{"term":"james hays"},{"term":"jedi"},{"term":"jia deng"},{"term":"jon barron"},{"term":"joseph tighe"},{"term":"jsfeat"},{"term":"justus thies"},{"term":"kernels"},{"term":"keyboard"},{"term":"kiri wagstaff"},{"term":"kitware"},{"term":"knithealth"},{"term":"knol"},{"term":"kornia"},{"term":"labelme"},{"term":"labelme3D"},{"term":"language"},{"term":"laser"},{"term":"lean startup"},{"term":"learning"},{"term":"legal"},{"term":"libraries"},{"term":"linear SVM"},{"term":"linear classifier"},{"term":"local installation"},{"term":"local server"},{"term":"localization"},{"term":"logic"},{"term":"loopy belief propagation"},{"term":"lowe"},{"term":"lucas-kanade"},{"term":"mac"},{"term":"mac os x"},{"term":"machine learning that matters"},{"term":"machine perception"},{"term":"made to stick"},{"term":"marathon"},{"term":"marc levoy"},{"term":"marc pollefeys"},{"term":"marcel proust"},{"term":"marketing"},{"term":"markets"},{"term":"marvin minsky"},{"term":"matching"},{"term":"matthias grundmann"},{"term":"matthias niessner"},{"term":"max-margin"},{"term":"mean face"},{"term":"meaning"},{"term":"meetup"},{"term":"memex"},{"term":"meta-data transfer"},{"term":"millions of images"},{"term":"minds"},{"term":"minh hoai"},{"term":"mirror mirror"},{"term":"mlimpact"},{"term":"mobileye"},{"term":"mooc"},{"term":"moshe bar"},{"term":"mother"},{"term":"motivation"},{"term":"multiple segmentations"},{"term":"nearest neighbor"},{"term":"netflix"},{"term":"new API"},{"term":"nips 2009"},{"term":"nips 2016"},{"term":"nms"},{"term":"nn"},{"term":"noah snavely"},{"term":"non-maximum suppression"},{"term":"novel objects"},{"term":"object interpretation"},{"term":"objectness"},{"term":"ontologies"},{"term":"openai"},{"term":"opencv"},{"term":"optical 
flow"},{"term":"optimization"},{"term":"orals"},{"term":"ordonez"},{"term":"oregon"},{"term":"overview"},{"term":"oxford"},{"term":"paas"},{"term":"paintings"},{"term":"panel"},{"term":"paper"},{"term":"paradigm shift"},{"term":"parameter estimation"},{"term":"parametric"},{"term":"paris"},{"term":"pattern recognition"},{"term":"pbs"},{"term":"pedro"},{"term":"peer review"},{"term":"per-exemplar detectors"},{"term":"personal robotics"},{"term":"personal story"},{"term":"peter norvig"},{"term":"peter tu"},{"term":"phish"},{"term":"photography"},{"term":"pink floyd"},{"term":"pinspeck camera"},{"term":"pittpatt"},{"term":"pivot"},{"term":"plenary"},{"term":"plenoptic function"},{"term":"poggio"},{"term":"pooling ramanan"},{"term":"popper"},{"term":"popular posts"},{"term":"portland"},{"term":"poselets"},{"term":"positive reinforcement"},{"term":"post-processing"},{"term":"postdoc"},{"term":"pragmatism"},{"term":"predictions"},{"term":"presentations"},{"term":"primal"},{"term":"probabilistic graphical models"},{"term":"professors"},{"term":"progress"},{"term":"project glass"},{"term":"providence"},{"term":"quine"},{"term":"ramanan"},{"term":"rant"},{"term":"raphael"},{"term":"rationalism"},{"term":"reading group"},{"term":"real-time"},{"term":"renaissance"},{"term":"resnet"},{"term":"reverse-engineering"},{"term":"review"},{"term":"reward"},{"term":"rgb-d dataset"},{"term":"richard newcombe"},{"term":"risks"},{"term":"rnns"},{"term":"rob fergus"},{"term":"roberts"},{"term":"roboticist"},{"term":"ruslan salakhutdinov"},{"term":"satkin"},{"term":"saurabh singh"},{"term":"saxena"},{"term":"scene parsing"},{"term":"scene recognition"},{"term":"seeing"},{"term":"seeing as"},{"term":"segnet"},{"term":"self-driving car"},{"term":"selfie"},{"term":"service"},{"term":"sfdp"},{"term":"sharing knowledge"},{"term":"shimon ullman"},{"term":"siggraph"},{"term":"silicon valley"},{"term":"silvio savarese"},{"term":"simple"},{"term":"sirfs"},{"term":"siri"},{"term":"sketch 
tokens"},{"term":"sketches"},{"term":"skolkovo tech"},{"term":"smart software"},{"term":"smile detection"},{"term":"socher"},{"term":"software"},{"term":"software engineering"},{"term":"soup of segments"},{"term":"sparse coding"},{"term":"spin image"},{"term":"sponsor"},{"term":"sports"},{"term":"springs"},{"term":"sri"},{"term":"ssh"},{"term":"stem"},{"term":"stereo"},{"term":"steve blank"},{"term":"street view"},{"term":"structural SVM"},{"term":"structure from motion"},{"term":"summer internship"},{"term":"suns 2009"},{"term":"suns 2011"},{"term":"super-resolution"},{"term":"supernatural"},{"term":"superpixel"},{"term":"supervised learning"},{"term":"sven"},{"term":"svr"},{"term":"symposium"},{"term":"synthesis"},{"term":"talk"},{"term":"ted adelson"},{"term":"templates"},{"term":"tenenbaum"},{"term":"tensorflow"},{"term":"text"},{"term":"theano"},{"term":"theories"},{"term":"theory of mind"},{"term":"thesis proposal"},{"term":"time travel"},{"term":"timothy leary"},{"term":"tombone"},{"term":"torque"},{"term":"torralba art"},{"term":"toyota"},{"term":"trace operator"},{"term":"transduction"},{"term":"transfer learning"},{"term":"travel"},{"term":"trending topic"},{"term":"trevor darrell"},{"term":"tricks"},{"term":"tutorial"},{"term":"typicality effects"},{"term":"udacity"},{"term":"uncertainty"},{"term":"unified field theory"},{"term":"unsupervised"},{"term":"up-goer fixe"},{"term":"update"},{"term":"urtasun"},{"term":"ut austin"},{"term":"uw"},{"term":"uwashington"},{"term":"vampires"},{"term":"vatic"},{"term":"venture pitch contest"},{"term":"victor prisacariu"},{"term":"video"},{"term":"video annotation"},{"term":"virtual image"},{"term":"virtualbox"},{"term":"vision products"},{"term":"vision software"},{"term":"visual data"},{"term":"visual forgery"},{"term":"visual slam"},{"term":"visual training GUI"},{"term":"vladlen koltun"},{"term":"vmx credits"},{"term":"vocabulary"},{"term":"vondrick"},{"term":"warp"},{"term":"waterfalls"},{"term":"wearable 
computing"},{"term":"web"},{"term":"weinberger"},{"term":"werewolves"},{"term":"wittgentein"},{"term":"workshops"},{"term":"xiaofeng ren"},{"term":"xkcd"},{"term":"yarin gal"},{"term":"yc"},{"term":"yosemite"},{"term":"yoshua bengio"},{"term":"zeiler"},{"term":"zisserman"}],"title":{"type":"text","$t":"Tombone's Computer Vision Blog"},"subtitle":{"type":"html","$t":"Deep Learning, Computer Vision, and the algorithms that are shaping the future of Artificial Intelligence."},"link":[{"rel":"http://schemas.google.com/g/2005#feed","type":"application/atom+xml","href":"https:\/\/www.computervisionblog.com\/feeds\/posts\/default"},{"rel":"self","type":"application/atom+xml","href":"https:\/\/www.blogger.com\/feeds\/15418143\/posts\/default?alt=json-in-script\u0026orderby=published"},{"rel":"alternate","type":"text/html","href":"https:\/\/www.computervisionblog.com\/"},{"rel":"hub","href":"http://pubsubhubbub.appspot.com/"},{"rel":"next","type":"application/atom+xml","href":"https:\/\/www.blogger.com\/feeds\/15418143\/posts\/default?alt=json-in-script\u0026start-index=26\u0026max-results=25\u0026orderby=published"}],"author":[{"name":{"$t":"Tomasz Malisiewicz"},"uri":{"$t":"http:\/\/www.blogger.com\/profile\/17507234774392358321"},"email":{"$t":"noreply@blogger.com"},"gd$image":{"rel":"http://schemas.google.com/g/2005#thumbnail","width":"16","height":"16","src":"https:\/\/img1.blogblog.com\/img\/b16-rounded.gif"}}],"generator":{"version":"7.00","uri":"http://www.blogger.com","$t":"Blogger"},"openSearch$totalResults":{"$t":"288"},"openSearch$startIndex":{"$t":"1"},"openSearch$itemsPerPage":{"$t":"25"},"entry":[{"id":{"$t":"tag:blogger.com,1999:blog-15418143.post-2934467168970752428"},"published":{"$t":"2019-11-19T05:18:00.001-05:00"},"updated":{"$t":"2019-11-19T22:13:06.249-05:00"},"category":[{"scheme":"http://www.blogger.com/atom/ns#","term":"action"},{"scheme":"http://www.blogger.com/atom/ns#","term":"adrien 
gaidon"},{"scheme":"http://www.blogger.com/atom/ns#","term":"AI agents"},{"scheme":"http://www.blogger.com/atom/ns#","term":"angela dai"},{"scheme":"http://www.blogger.com/atom/ns#","term":"autonomous cars"},{"scheme":"http://www.blogger.com/atom/ns#","term":"computer vision"},{"scheme":"http://www.blogger.com/atom/ns#","term":"conference"},{"scheme":"http://www.blogger.com/atom/ns#","term":"daniel cremers"},{"scheme":"http://www.blogger.com/atom/ns#","term":"deep learning"},{"scheme":"http://www.blogger.com/atom/ns#","term":"gradslam"},{"scheme":"http://www.blogger.com/atom/ns#","term":"iccv 2019"},{"scheme":"http://www.blogger.com/atom/ns#","term":"kornia"},{"scheme":"http://www.blogger.com/atom/ns#","term":"panel"},{"scheme":"http://www.blogger.com/atom/ns#","term":"research"},{"scheme":"http://www.blogger.com/atom/ns#","term":"victor prisacariu"},{"scheme":"http://www.blogger.com/atom/ns#","term":"visual slam"},{"scheme":"http://www.blogger.com/atom/ns#","term":"vladlen koltun"}],"title":{"type":"text","$t":"Computer Vision and Visual SLAM vs. AI Agents"},"content":{"type":"html","$t":"\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cspan data-preserver-spaces=\"true\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003EWith all the recent advancements in end-to-end deep learning, it is now possible to train AI agents to perform many different tasks (some in simulation and some in the real-world). 
End-to-end learning allows one to replace a multi-component, hand-engineered system with a single learning network that can process raw sensor data and output actions for the AI to take in the physical world. I will discuss the implications of these ideas while highlighting some new research trends regarding Deep Learning for Visual SLAM and conclude with some predictions regarding the kinds of spatial reasoning algorithms that we will need in the future.\u0026nbsp;\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan data-preserver-spaces=\"true\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"https:\/\/1.bp.blogspot.com\/-cM2rvEvC2hc\/XdO8N33q7JI\/AAAAAAAAQys\/LBF93qVtv3wHe_wt7yY65h55KHUmpnBiACLcBGAsYHQ\/s1600\/computer-vision-vs-ai-agents-cover.png\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" data-original-height=\"450\" data-original-width=\"800\" height=\"225\" src=\"https:\/\/1.bp.blogspot.com\/-cM2rvEvC2hc\/XdO8N33q7JI\/AAAAAAAAQys\/LBF93qVtv3wHe_wt7yY65h55KHUmpnBiACLcBGAsYHQ\/s400\/computer-vision-vs-ai-agents-cover.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cspan data-preserver-spaces=\"true\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003EIn today's 
article, we will go over three ideas:\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cspan data-preserver-spaces=\"true\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u0026nbsp;I.) Does Computer Vision Matter for Action?\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cspan data-preserver-spaces=\"true\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u0026nbsp;II.) 
Visual SLAM for AI agents\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cspan data-preserver-spaces=\"true\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u0026nbsp;III.) Quō vādis Visual SLAM? Trends and research forecast\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cstrong style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cspan style=\"font-size: large;\"\u003EI. 
Does Computer Vision Matter for Action?\u003C\/span\u003E\u003C\/strong\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cspan data-preserver-spaces=\"true\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003EAt last month's International Conference of Computer Vision (ICCV 2019), I heard the following thought-provoking question,\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cblockquote class=\"tr_bq\"\u003E\u003Cem style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\"What do Artificial Intelligence Agents need (if anything) from the field of Computer Vision?\"\u003C\/em\u003E\u003C\/blockquote\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; 
background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cspan data-preserver-spaces=\"true\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003EThe question was posed by \u003Ca href=\"http:\/\/vladlen.info\/\"\u003EVladlen Koltun\u003C\/a\u003E\u0026nbsp;(from Intel Research) during his talk at the Deep Learning for Visual SLAM Workshop at ICCV 2019 in Seoul. He spoke about building AI agents with and without the aid of computer vision to guide representation learning. While Koltun has worked on classical Visual SLAM (see his \u003Ca href=\"http:\/\/vladlen.info\/publications\/direct-sparse-odometry\/\"\u003EDirect Sparse Odometry (DSO) system\u003C\/a\u003E [2]), at this workshop, he decided to not speak about his older work on geometry, alignment, or 3D point cloud processing. 
His talk included numerous ideas spanning several of his team's research papers, some humor (see video), and plenty of Koltun's philosophical views towards general artificial intelligence.\u0026nbsp;\u003C\/span\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cspan data-preserver-spaces=\"true\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003ERecent techniques show that it is possible to learn actions (the output quantities that we really want) from pixels (raw inputs) directly without any intermediate computer vision processing like object recognition, depth estimation, and segmentation. 
But just because it is possible to solve some AI tasks without intermediate representations (i.e., the computer vision stuff), does that mean that we should abandon computer vision research and let end-to-end learning take care of everything?\u0026nbsp;\u003C\/span\u003E\u003Cstrong style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cem style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003EProbably not.\u003C\/em\u003E\u003C\/strong\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cspan data-preserver-spaces=\"true\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003EFrom a very practical standpoint, let's ask the following question:\u0026nbsp;\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: 
initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cblockquote class=\"tr_bq\"\u003E\u003Cspan data-preserver-spaces=\"true\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\"Is an agent who is aware of computer vision stuff more robust than an agent trained without intermediate representations?\"\u0026nbsp;\u003C\/span\u003E\u003C\/blockquote\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cspan data-preserver-spaces=\"true\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003ERecent research from Koltun's lab [3] indicates that the answer is\u0026nbsp;\u003C\/span\u003E\u003Cstrong style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cem style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003Eyes\u003C\/em\u003E\u003C\/strong\u003E\u003Cspan data-preserver-spaces=\"true\" style=\"background-attachment: initial; background-clip: initial; background-image: 
initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003E: training with intermediate representations, as done by supervision from per-frame computer vision tasks, gives rise to more robust agents that learn faster and are more robust in a variety of performance tasks! The next natural question is: which computer vision tasks matter most for agent robustness? Koltun's research suggests that depth estimation is one particular task that works well as an auxiliary task when training agents that have to move through space (i.e., most video games). A depth estimation network should help an AI agent navigate an unknown environment as depth estimation is one key component in many of today's RGBD Visual SLAM systems. The best way to learn about Koltun's paper, titled \u003Ca href=\"http:\/\/vladlen.info\/publications\/computer-vision-matter-action\/\"\u003EDoes Computer Vision Matter for Action?\u003C\/a\u003E, is to see the video on YouTube.\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cdiv style=\"text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: center;\"\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: center;\"\u003E\u003Ciframe allow=\"accelerometer; autoplay; encrypted-media; gyroscope; picture-in-picture\" allowfullscreen=\"\" frameborder=\"0\" height=\"315\" src=\"https:\/\/www.youtube.com\/embed\/4MfWa2yZ0Jc\" width=\"560\"\u003E\u003C\/iframe\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: center;\"\u003E\u003Cb\u003EVideo describing Koltun's Does Computer Vision Matter for Action? 
[3]\u003C\/b\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cspan data-preserver-spaces=\"true\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003ELet's imagine that you want to deploy a robot into the world sometimes from now until 2025 based on your large-scale AI agent training, and you're debating whether you should avoid intermediate representations or not.\u0026nbsp;\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cspan data-preserver-spaces=\"true\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: 
initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003EIntermediate representations facilitate explainability, debuggability, and testing. Explainability is a key to success when systems require spatial reasoning capabilities in the real-world. If your agents are misbehaving, take a look at their intermediate representations. If you want to improve your AI, you can analyze the computer vision systems to prioritize better your data collection effort. Visualization should be a first-order citizen in your deep learning toolbox.\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cspan data-preserver-spaces=\"true\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003EBut today's computer vision ecosystem offers more than algorithms that process individual images. Visual SLAM systems rapidly process images while updating the camera's trajectory and updating the 3D map of the world. Visual SLAM, or VSLAM, algorithms are the real-time variants of Structure-from-Motion (SfM), which has been around for a while. SfM uses bundle adjustment -- a minimization of reprojection error, usually solved with Levenberg Marquardt. 
If there any kind of robot you see moving around today (2019), it is likely that it is running some variant of SLAM (localization and mapping) and not an end-to-end trained network -- at least not today. So what does Visual SLAM mean for AI agents?\u0026nbsp;\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cstrong style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cspan style=\"font-size: large;\"\u003EII. 
Visual SLAM for AI Agents\u003C\/span\u003E\u003C\/strong\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cspan data-preserver-spaces=\"true\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003EWhile no single per-frame computer vision algorithm is close to sufficient to enable robust action in an environment, there is a class of real-time computer vision systems like Visual SLAM that can be used to guide agents through space. The \u003Ca href=\"http:\/\/visualslam.ai\/\"\u003EWorkshop on Deep Learning for Visual SLAM at ICCV 2019\u003C\/a\u003E\u0026nbsp;showcased a variety of different Visual SLAM approaches and included a discussion panel. 
The workshop featured talks on Visual SLAM on mobile platforms (\u003Ca href=\"http:\/\/www.robots.ox.ac.uk\/~victor\/\"\u003EVictor Prisacariu\u003C\/a\u003E\u0026nbsp;from \u003Ca href=\"http:\/\/6d.ai\/\"\u003E6d.ai\u003C\/a\u003E), autonomous cars (\u003Ca href=\"https:\/\/vision.in.tum.de\/members\/cremers\"\u003EDaniel Cremers\u003C\/a\u003E from TUM and \u003Ca href=\"http:\/\/artisense.ai\/\"\u003EArtiSense.ai\u003C\/a\u003E), high-detail indoor modeling (\u003Ca href=\"https:\/\/angeladai.github.io\/\"\u003EAngela Dai \u003C\/a\u003Efrom TUM), AI Agents (\u003Ca href=\"http:\/\/vladlen.info\/\"\u003EVladlen Koltun\u003C\/a\u003E from Intel Research) and mixed-reality (\u003Ca href=\"http:\/\/tom.ai\/\"\u003ETomasz Malisiewicz\u003C\/a\u003E from Magic Leap).\u0026nbsp;\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan data-preserver-spaces=\"true\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003Cbr \/\u003E\u003Ctable align=\"center\" cellpadding=\"0\" cellspacing=\"0\" class=\"tr-caption-container\" style=\"margin-left: auto; margin-right: auto; text-align: center;\"\u003E\u003Ctbody\u003E\u003Ctr\u003E\u003Ctd style=\"text-align: center;\"\u003E\u003Ca href=\"https:\/\/1.bp.blogspot.com\/-DTGBOTHDobE\/XdOdgi_kp4I\/AAAAAAAAQx0\/i48ux7V_9cwSqFomb7rqTu270ElHxkV8QCLcBGAsYHQ\/s1600\/2nd_workshop_on_visual_slam_iccv_2019.png\" style=\"margin-left: auto; margin-right: auto;\"\u003E\u003Cimg alt=\"2nd Workshop on Deep Learning for Visual SLAM\" border=\"0\" data-original-height=\"523\" data-original-width=\"1141\" height=\"182\" src=\"https:\/\/1.bp.blogspot.com\/-DTGBOTHDobE\/XdOdgi_kp4I\/AAAAAAAAQx0\/i48ux7V_9cwSqFomb7rqTu270ElHxkV8QCLcBGAsYHQ\/s400\/2nd_workshop_on_visual_slam_iccv_2019.png\" title=\"\" width=\"400\" 
\/\u003E\u003C\/a\u003E\u003C\/td\u003E\u003C\/tr\u003E\u003Ctr\u003E\u003Ctd class=\"tr-caption\" style=\"text-align: center;\"\u003E\u003Cb\u003ETeaser Image for the 2nd Workshop on Deep Learning for Visual SLAM from \u003Ca href=\"http:\/\/www.ronnieclark.co.uk\/\"\u003ERonnie Clark\u003C\/a\u003E.\u003Cbr \/\u003ESee info at \u003Ca href=\"http:\/\/visualslam.ai\/\"\u003Ehttp:\/\/visualslam.ai\u003C\/a\u003E\u003C\/b\u003E\u003C\/td\u003E\u003C\/tr\u003E\u003C\/tbody\u003E\u003C\/table\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cspan data-preserver-spaces=\"true\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003EWhen it comes to spatial perception capabilities, Koltun's talk made it clear that we, as computer vision researchers, could think bolder. 
There is a spectrum of spatial perception capabilities that AI agents need that only somewhat overlaps with traditional Visual SLAM (whether deep learning-based or not).\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cspan data-preserver-spaces=\"true\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003EKoltun's work is in favor of using intermediate representations based on computer vision to produce more robust AI agents. However, Koltun is not convinced that 6dof Visual SLAM, as is currently defined, needs to be solved for AI agents. Let's consider ordinary human tasks like walking, washing your hands, and flossing your teeth -- each one requires a different amount of spatial reasoning abilities. 
It is reasonable to assume that AI agents would need varying degrees of spatial localization and mapping capabilities to perform such tasks.\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cspan data-preserver-spaces=\"true\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003EVisual SLAM techniques, like the ones used inside Augmented Reality systems, build metric 3D maps of the environment for the task of high-precision placement of digital content -- but such high-precision systems might never be used directly inside AI agents. When the camera is hand-held (augmented reality) or head-mounted (mixed reality), a human decides where to move.  
AI agents have to make their own movement decisions, and this requires more than feature correspondences and bundle adjustment -- more than what is inside the scope of computer vision.\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cspan data-preserver-spaces=\"true\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003EInside a head-mounted display, you might look at digital content 30 feet away from you, and for everything to look correct geometrically, you must have a decent 3D map of the world (spanning at least 30 feet) and a reasonable estimate of your pose. But for many tasks that AI agents need to perform, metric-level representations of far-away geometry are un-necessary. It is as if proper action requires local, high-quality metric maps and something coarser like topological maps for large-range maps. 
Visual SLAM systems (stereo-based and depth-sensor based) are likely to find numerous applications in industry such as mixed reality and some branches of robotics, where millimeter precision matters.\u0026nbsp;\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cspan data-preserver-spaces=\"true\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003EMore general end-to-end learning for AI agents will show us new kinds of spatial intelligence, automatically learned from data. There is a lot of exciting research to be done to answer questions like the following: What kind of tasks can we train Visual AI Agents for such that map-building and localization capabilities arise? 
Or What type of core spatial reasoning capabilities can we pre-build to enable further self-supervised learning from the 3D world?\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cspan style=\"font-size: large;\"\u003E\u003Cstrong style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003EIII.\u003C\/strong\u003E\u003Cspan data-preserver-spaces=\"true\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u0026nbsp;\u003C\/span\u003E\u003Cstrong style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003EQuō vādis Visual SLAM? 
Trends and research forecast\u003C\/strong\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cspan data-preserver-spaces=\"true\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003EAt the Deep Learning Workshop for Visual SLAM, an interesting question that came up in the panel focused on the convergence of methods in Visual SLAM. 
Or alternatively,\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cblockquote class=\"tr_bq\"\u003E\u003Cem style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\"Will a single Visual SLAM framework rule them all?\"\u003C\/em\u003E\u003C\/blockquote\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cspan data-preserver-spaces=\"true\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003EThe world of applied research is moving towards more deep learning -- by 2019, many of the critical tasks inside computer vision exist as some form of a (convolutional\/graph) neural network. I don't believe that we will see a single SLAM framework\/paradigm dominate all others -- I think we will see a plurality of Visual SLAM systems based on inter-changeable deep learning components. This new generation of deep learning-based components will allow more creative applications of end-to-end learning and be typically useful as modules within other real-world systems. 
We should create tools that will enable others to make better tools.\u0026nbsp;\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cspan data-preserver-spaces=\"true\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003EPyTorch is making it easy to build multiple-view geometry tools like \u003Ca href=\"https:\/\/kornia.github.io\/\"\u003EKornia\u003C\/a\u003E\u0026nbsp;-- such that the right parts of computer vision are brought directly into today's deep learning ecosystem as first-order citizens. And PyTorch is winning over the world of research. 
A dramatic increase in usage happened from 2017 to 2019, with PyTorch now the recommended framework amongst most of my fellow researchers.\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cspan data-preserver-spaces=\"true\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003ETo take a look at what the end goal in terms of end-to-end deep learning for visual SLAM might look like, take a look at \u003Ca href=\"http:\/\/montrealrobotics.ca\/gradSLAM\/\"\u003EgradSLAM\u003C\/a\u003E from \u003Ca href=\"https:\/\/krrish94.github.io\/\"\u003EKrishna\u0026nbsp;\u003C\/a\u003E\u003C\/span\u003E\u003Ca href=\"https:\/\/krrish94.github.io\/\"\u003EMurthy\u003C\/a\u003E, a Ph.D. student in MILA, and collaborators at CMU. Their paper offers a new way of thinking of SLAM as made up of differentiable blocks. 
From the article, \"This amalgamation of dense SLAM with computational graphs enables us to backprop from 3D maps to 2D pixels, opening up new possibilities in gradient-based learning for SLAM.\"\u003Cbr \/\u003E\u003Cspan data-preserver-spaces=\"true\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003Cbr \/\u003E\u003Ctable align=\"center\" cellpadding=\"0\" cellspacing=\"0\" class=\"tr-caption-container\" style=\"margin-left: auto; margin-right: auto; text-align: center;\"\u003E\u003Ctbody\u003E\u003Ctr\u003E\u003Ctd style=\"text-align: center;\"\u003E\u003Ca href=\"https:\/\/1.bp.blogspot.com\/-XCfJDu1p44g\/XdOmBzwclgI\/AAAAAAAAQyU\/fyrzqoXNlA0-IC7H6WUz3-BDqMmVwGoIgCLcBGAsYHQ\/s1600\/gradslam.png\" style=\"margin-left: auto; margin-right: auto;\"\u003E\u003Cimg alt=\"Key Figure from the gradSLAM paper on end-to-end learning for SLAM.\" border=\"0\" data-original-height=\"494\" data-original-width=\"1600\" height=\"122\" src=\"https:\/\/1.bp.blogspot.com\/-XCfJDu1p44g\/XdOmBzwclgI\/AAAAAAAAQyU\/fyrzqoXNlA0-IC7H6WUz3-BDqMmVwGoIgCLcBGAsYHQ\/s400\/gradslam.png\" title=\"\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/td\u003E\u003C\/tr\u003E\u003Ctr\u003E\u003Ctd class=\"tr-caption\" style=\"text-align: center;\"\u003E\u003Cb\u003EKey Figure from the \u003Ca href=\"http:\/\/montrealrobotics.ca\/gradSLAM\/\"\u003EgradSLAM\u003C\/a\u003E paper on end-to-end learning for SLAM. 
[5]\u003C\/b\u003E\u003C\/td\u003E\u003C\/tr\u003E\u003C\/tbody\u003E\u003C\/table\u003E\u003Cspan data-preserver-spaces=\"true\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003EAnother key trend that seems to be on the rise inside the context of Deep Visual SLAM is self-supervised learning. We are seeing more and more practical successes of self-supervised learning for multi-view problems where geometry enables us to get away from strong supervision. Even the ConvNet-based point detector \u003Ca href=\"https:\/\/arxiv.org\/abs\/1712.07629\"\u003ESuperPoint\u003C\/a\u003E [7], which my team and I developed at Magic Leap, uses self-supervision to train more robust interest point detectors. In our case, it was impossible to get ground truth interest points on images, and self-labeling was the only way out. One of my favorite researchers working on self-supervised techniques is \u003Ca href=\"https:\/\/twitter.com\/adnothing\"\u003EAdrien Gaidon\u003C\/a\u003E from TRI, who studies how such methods can be used to make smarter cars. 
Adrien gave some great talks at other ICCV 2019 Workshops related to autonomous vehicles, and his work is closely related to Visual SLAM and useful for anybody working on similar problems.\u003Cbr \/\u003E\u003Cdiv style=\"text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: center;\"\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: center;\"\u003E\u003Ciframe allow=\"accelerometer; autoplay; encrypted-media; gyroscope; picture-in-picture\" allowfullscreen=\"\" frameborder=\"0\" height=\"315\" src=\"https:\/\/www.youtube.com\/embed\/SLEK2vAgjOI\" width=\"560\"\u003E\u003C\/iframe\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: center;\"\u003E\u003Cb\u003EAdrien Gaidon's talk from October 11th, 2019 on Self-Supervised Learning in the context of Autonomous Cars\u003C\/b\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cdiv style=\"text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: left;\"\u003EAnother excellent presentation about this topic from \u003Ca href=\"https:\/\/people.eecs.berkeley.edu\/~efros\/\"\u003EAlyosha Efros\u003C\/a\u003E. 
He does a great job convincing you why you should love self-supervision.\u003C\/div\u003E\u003Cdiv style=\"text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: center;\"\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: center;\"\u003E\u003Ciframe allow=\"accelerometer; autoplay; encrypted-media; gyroscope; picture-in-picture\" allowfullscreen=\"\" frameborder=\"0\" height=\"315\" src=\"https:\/\/www.youtube.com\/embed\/_V-WpE8cmpc\" width=\"560\"\u003E\u003C\/iframe\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: center;\"\u003E\u003Cb\u003EA presentation about self-supervision from Alyosha Efros on May 25th, 2018\u003C\/b\u003E\u003C\/div\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cstrong style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cspan style=\"font-size: large;\"\u003EConclusion\u003C\/span\u003E\u003C\/strong\u003E\u003Cbr \/\u003E\u003Cstrong style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cspan style=\"font-size: large;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/strong\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: 
#1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cspan data-preserver-spaces=\"true\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003EAs more and more spatial reasoning skills get baked into deep networks, we must face two opposing forces. On the one hand, specifying internal representations makes it difficult to scale to new tasks -- it is easier to trick the deep nets into doing all the hard work for you. On the other hand, we want interpretability and some amount of safety when we deploy AI agents into the real world, so some intermediate tasks like object recognition are likely to be involved in today's spatial perception recipe. Lots of exciting work is happening with \u003Ca href=\"https:\/\/openai.com\/blog\/emergent-tool-use\/\"\u003Emulti-agents from OpenAI\u003C\/a\u003E [6], but full end-to-end learning will not give us real-world robots such as autonomous cars anytime soon.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan data-preserver-spaces=\"true\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003Cbr \/\u003E\u003Cdiv style=\"text-align: center;\"\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: center;\"\u003E\u003Ciframe allow=\"accelerometer; autoplay; encrypted-media; gyroscope; picture-in-picture\" allowfullscreen=\"\" frameborder=\"0\" height=\"315\" src=\"https:\/\/www.youtube.com\/embed\/kopoLzvh5jY\" width=\"560\"\u003E\u003C\/iframe\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: center;\"\u003E\u003Cb\u003EVideo from 
OpenAI showing Multi-Agent Hide and Seek. \u003C\/b\u003E[6]\u0026nbsp; \u0026nbsp;\u003C\/div\u003E\u003Cspan data-preserver-spaces=\"true\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cspan data-preserver-spaces=\"true\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003EMore practical Visual SLAM research will focus on differentiable high-level blocks. As more deep learning happens in Visual SLAM, it will create a renaissance in Visual SLAM as sharing entire SLAM systems will be as easy as sharing CNNs today. 
I cannot wait until the following is possible:\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan data-preserver-spaces=\"true\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003Cbr \/\u003E\u003Cblockquote class=\"tr_bq\"\u003E\u003Cspan data-preserver-spaces=\"true\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cb\u003Epip install DeepSLAM\u003C\/b\u003E\u003C\/span\u003E\u003C\/blockquote\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cspan data-preserver-spaces=\"true\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003EI hope you enjoyed learning about the different approaches to Visual SLAM, and that you have found my blog post insightful and educational. 
Until next time!\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cstrong style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003EReferences:\u003C\/strong\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cspan data-preserver-spaces=\"true\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003E[1].\u003C\/span\u003E\u003Cstrong style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: 
initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u0026nbsp;Vladlen Koltun.\u003C\/strong\u003E\u003Cspan data-preserver-spaces=\"true\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u0026nbsp;Chief Scientist for Intelligent Systems at Intel.\u0026nbsp;\u003C\/span\u003E\u003Ca class=\"_e75a791d-denali-editor-page-rtfLink\" href=\"http:\/\/vladlen.info\/\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #4a6ee0; margin-bottom: 0pt; margin-top: 0pt;\" target=\"_blank\"\u003E\u003Cspan data-preserver-spaces=\"true\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003Ehttp:\/\/vladlen.info\/\u003C\/span\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cspan data-preserver-spaces=\"true\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003E[2].\u0026nbsp;\u003C\/span\u003E\u003Cstrong style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: 
initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003EDirect Sparse Odometry.\u003C\/strong\u003E\u003Cspan data-preserver-spaces=\"true\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u0026nbsp;Jakob Engel, Vladlen Koltun, and Daniel Cremers. IEEE Transactions on Pattern Analysis and Machine Intelligence, 40(3), 2018.\u0026nbsp;\u003C\/span\u003E\u003Ca class=\"_e75a791d-denali-editor-page-rtfLink\" href=\"http:\/\/vladlen.info\/publications\/direct-sparse-odometry\/\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #4a6ee0; margin-bottom: 0pt; margin-top: 0pt;\" target=\"_blank\"\u003E\u003Cspan data-preserver-spaces=\"true\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003Ehttp:\/\/vladlen.info\/publications\/direct-sparse-odometry\/\u003C\/span\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cspan data-preserver-spaces=\"true\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; 
margin-top: 0pt;\"\u003E[3].\u0026nbsp;\u003C\/span\u003E\u003Cstrong style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003EDoes Computer Vision Matter for Action?\u003C\/strong\u003E\u003Cspan data-preserver-spaces=\"true\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u0026nbsp;Brady Zhou, Philipp Krähenbühl, and Vladlen Koltun. Science Robotics, 4(30), 2019.\u0026nbsp;\u003C\/span\u003E\u003Ca class=\"_e75a791d-denali-editor-page-rtfLink\" href=\"http:\/\/vladlen.info\/publications\/computer-vision-matter-action\/\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #4a6ee0; margin-bottom: 0pt; margin-top: 0pt;\" target=\"_blank\"\u003E\u003Cspan data-preserver-spaces=\"true\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003Ehttp:\/\/vladlen.info\/publications\/computer-vision-matter-action\/\u003C\/span\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cspan data-preserver-spaces=\"true\" style=\"background-attachment: initial; background-clip: initial; 
background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003E[4].\u0026nbsp;\u003C\/span\u003E\u003Cstrong style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003EKornia: an Open Source Differentiable Computer Vision Library for PyTorch\u003C\/strong\u003E\u003Cspan data-preserver-spaces=\"true\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003E. Edgar Riba, Dmytro Mishkin, Daniel Ponsa, Ethan Rublee, and Gary Bradski. Winter Conference on Applications of Computer Vision, 2019.\u0026nbsp;\u003C\/span\u003E\u003Ca class=\"_e75a791d-denali-editor-page-rtfLink\" href=\"https:\/\/kornia.github.io\/\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #4a6ee0; margin-bottom: 0pt; margin-top: 0pt;\" target=\"_blank\"\u003E\u003Cspan data-preserver-spaces=\"true\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003Ehttps:\/\/kornia.github.io\/\u003C\/span\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: 
#1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cspan data-preserver-spaces=\"true\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003E[5].\u0026nbsp;\u003C\/span\u003E\u003Cstrong style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003EgradSLAM: Dense SLAM meets Automatic Differentiation.\u003C\/strong\u003E\u003Cspan data-preserver-spaces=\"true\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u0026nbsp;Krishna Murthy J., Ganesh\u0026nbsp;\u003C\/span\u003EIyer,\u003Cspan data-preserver-spaces=\"true\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u0026nbsp;and\u0026nbsp;\u003C\/span\u003ELiam\u0026nbsp;\u003Cspan data-preserver-spaces=\"true\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003EPaull. 
In\u0026nbsp;\u003C\/span\u003E\u003Cem style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003EarXiv\u003C\/em\u003E\u003Cspan data-preserver-spaces=\"true\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003E, 2019.\u0026nbsp;\u003C\/span\u003E\u003Ca class=\"_e75a791d-denali-editor-page-rtfLink\" href=\"http:\/\/montrealrobotics.ca\/gradSLAM\/\" style=\"color: #4a6ee0; margin-bottom: 0pt; margin-top: 0pt;\" target=\"_blank\"\u003E\u003Cspan data-preserver-spaces=\"true\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003Ehttp:\/\/montrealrobotics.ca\/gradSLAM\/\u003C\/span\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cspan data-preserver-spaces=\"true\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003E[6]\u0026nbsp;\u003C\/span\u003E\u003Cstrong style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; 
background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003EEmergent tool use from multi-agent autocurricula.\u003C\/strong\u003E\u003Cspan data-preserver-spaces=\"true\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u0026nbsp;Bowen Baker, Ingmar Kanitscheider, Todor Markov, Yi Wu, Glenn Powell, Bob McGrew, and Igor Mordatch. In\u0026nbsp;\u003C\/span\u003E\u003Cem style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003EarXiv\u0026nbsp;\u003C\/em\u003E\u003Cspan data-preserver-spaces=\"true\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; margin-bottom: 0pt; margin-top: 0pt;\"\u003E2019.\u0026nbsp;\u003C\/span\u003E\u003Cspan data-preserver-spaces=\"true\" style=\"color: #4a6ee0; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Ca class=\"_e75a791d-denali-editor-page-rtfLink\" href=\"https:\/\/openai.com\/blog\/emergent-tool-use\/\" style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #4a6ee0; margin-bottom: 0pt; margin-top: 0pt;\" target=\"_blank\"\u003Ehttps:\/\/openai.com\/blog\/emergent-tool-use\/\u003C\/a\u003E\u003C\/span\u003E\u003Cbr \/\u003E[7] \u003Cb\u003ESuperPoint: Self-supervised interest point detection and description.\u003C\/b\u003E Daniel DeTone, Tomasz Malisiewicz, and Andrew Rabinovich. 
Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition Workshops. 2018.\u0026nbsp;\u003Ca href=\"https:\/\/arxiv.org\/abs\/1712.07629\"\u003Ehttps:\/\/arxiv.org\/abs\/1712.07629\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cbr \/\u003E\u003Cdiv style=\"background-attachment: initial; background-clip: initial; background-image: initial; background-origin: initial; background-position: initial; background-repeat: initial; background-size: initial; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003C\/div\u003E\u003Cbr \/\u003E\u003Cdiv style=\"background: transparent; color: #1c1e29; margin-bottom: 0pt; margin-top: 0pt;\"\u003E\u003Cbr 
\/\u003E\u003C\/div\u003E"},"link":[{"rel":"replies","type":"application/atom+xml","href":"https:\/\/www.computervisionblog.com\/feeds\/2934467168970752428\/comments\/default","title":"Post Comments"},{"rel":"replies","type":"text/html","href":"https:\/\/www.computervisionblog.com\/2019\/11\/computer-vision-and-visual-slam-vs-ai.html#comment-form","title":"2 Comments"},{"rel":"edit","type":"application/atom+xml","href":"https:\/\/www.blogger.com\/feeds\/15418143\/posts\/default\/2934467168970752428"},{"rel":"self","type":"application/atom+xml","href":"https:\/\/www.blogger.com\/feeds\/15418143\/posts\/default\/2934467168970752428"},{"rel":"alternate","type":"text/html","href":"https:\/\/www.computervisionblog.com\/2019\/11\/computer-vision-and-visual-slam-vs-ai.html","title":"Computer Vision and Visual SLAM vs. AI Agents"}],"author":[{"name":{"$t":"Tomasz Malisiewicz"},"uri":{"$t":"http:\/\/www.blogger.com\/profile\/17507234774392358321"},"email":{"$t":"noreply@blogger.com"},"gd$image":{"rel":"http://schemas.google.com/g/2005#thumbnail","width":"16","height":"16","src":"https:\/\/img1.blogblog.com\/img\/b16-rounded.gif"}}],"media$thumbnail":{"xmlns$media":"http://search.yahoo.com/mrss/","url":"https:\/\/1.bp.blogspot.com\/-cM2rvEvC2hc\/XdO8N33q7JI\/AAAAAAAAQys\/LBF93qVtv3wHe_wt7yY65h55KHUmpnBiACLcBGAsYHQ\/s72-c\/computer-vision-vs-ai-agents-cover.png","height":"72","width":"72"},"thr$total":{"$t":"2"}},{"id":{"$t":"tag:blogger.com,1999:blog-15418143.post-7872786804655838317"},"published":{"$t":"2018-05-16T14:22:00.001-05:00"},"updated":{"$t":"2018-05-16T14:22:45.975-05:00"},"category":[{"scheme":"http://www.blogger.com/atom/ns#","term":"alyosha efros"},{"scheme":"http://www.blogger.com/atom/ns#","term":"cvpr"},{"scheme":"http://www.blogger.com/atom/ns#","term":"deepfake"},{"scheme":"http://www.blogger.com/atom/ns#","term":"descartes"},{"scheme":"http://www.blogger.com/atom/ns#","term":"face detection"},{"scheme":"http://www.blogger.com/atom/ns#","term":"face 
transfer"},{"scheme":"http://www.blogger.com/atom/ns#","term":"face2face"},{"scheme":"http://www.blogger.com/atom/ns#","term":"fake news"},{"scheme":"http://www.blogger.com/atom/ns#","term":"GANs"},{"scheme":"http://www.blogger.com/atom/ns#","term":"Ira Kemelmacher-Shlizerman"},{"scheme":"http://www.blogger.com/atom/ns#","term":"justus thies"},{"scheme":"http://www.blogger.com/atom/ns#","term":"matthias niessner"},{"scheme":"http://www.blogger.com/atom/ns#","term":"realism"},{"scheme":"http://www.blogger.com/atom/ns#","term":"siggraph"},{"scheme":"http://www.blogger.com/atom/ns#","term":"snapchat"},{"scheme":"http://www.blogger.com/atom/ns#","term":"truth"},{"scheme":"http://www.blogger.com/atom/ns#","term":"visual forgery"}],"title":{"type":"text","$t":"DeepFakes: AI-powered deception machines"},"content":{"type":"html","$t":"Driven by computer vision and deep learning techniques, a new wave of imaging attacks has recently emerged which allows anyone to easily create highly realistic \"fake\" videos. These false videos are known as \u003Cb\u003EDeep Fakes. \u003C\/b\u003EWhile highly entertaining at times, DeepFakes can be used to perturb society and some would argue that the pre-shock has already begun. A rogue DeepFake which goes viral can spread misinformation across the internet like wildfire.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cblockquote class=\"tr_bq\"\u003E\"\u003Ci\u003EThe ability to effortlessly create visually plausible editing of faces in videos has the potential to severely undermine trust in any form of digital communication. \u003C\/i\u003E\"\u003C\/blockquote\u003E\u003Cblockquote class=\"tr_bq\" style=\"text-align: right;\"\u003E--Rössler et al. FaceForensics [3]\u003C\/blockquote\u003E\u003Cbr \/\u003EBecause DeepFakes contain a unique combination of realism and novelty, they are more difficult to detect on social networks as compared to traditional \"bad\" content like pornography and copyrighted movies. 
Video hashing might work for finding duplicates or copyright-infringing content, but not good enough for DeepFakes. To fight face-manipulating DeepFake AI, one needs an even stronger AI.\u003Cbr \/\u003E\u003Cbr \/\u003EAs today's DeepFakes are based on Deep Learning, and Deep Learning tools like TensorFlow and PyTorch are accessible to anybody with a modern GPU, such face manipulation tools are particularly disruptive. The democratization of Artificial Intelligence has brought us near infinite use-cases. \u003Cb\u003EFrom the DeepDream phenomenon of 2015 to the Deep Style Transfer Art apps of 2016, 2018 is the year of the DeepFake. \u003C\/b\u003EToday's computer vision technology allows a hobbyist to create a Deep Fake video of just about any person they want performing any action they want, in a matter of hours, using commodity computer hardware.\u003Cbr \/\u003E\u003Ctable align=\"center\" cellpadding=\"0\" cellspacing=\"0\" class=\"tr-caption-container\" style=\"margin-left: auto; margin-right: auto; text-align: center;\"\u003E\u003Ctbody\u003E\u003Ctr\u003E\u003Ctd style=\"text-align: center;\"\u003E\u003Ca href=\"https:\/\/2.bp.blogspot.com\/-HYA207hkaf4\/WvvXnzXXTyI\/AAAAAAAAP1o\/O1MGUy4EOtARBrocGmsu2-r_7UtE5vtzQCLcBGAs\/s1600\/mind.png\" imageanchor=\"1\" style=\"margin-left: auto; margin-right: auto;\"\u003E\u003Cimg border=\"0\" data-original-height=\"350\" data-original-width=\"730\" height=\"190\" src=\"https:\/\/2.bp.blogspot.com\/-HYA207hkaf4\/WvvXnzXXTyI\/AAAAAAAAP1o\/O1MGUy4EOtARBrocGmsu2-r_7UtE5vtzQCLcBGAs\/s400\/mind.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/td\u003E\u003C\/tr\u003E\u003Ctr\u003E\u003Ctd class=\"tr-caption\" style=\"text-align: center;\"\u003E\u003Cspan style=\"font-size: small;\"\u003EFig 1. 
DeepFakes generate \"false impressions\" which are attacks on the human mind.\u003C\/span\u003E\u003C\/td\u003E\u003C\/tr\u003E\u003C\/tbody\u003E\u003C\/table\u003E\u003Cb\u003E\u003Cbr \/\u003EWhat is a Deep Fake?\u003C\/b\u003E\u003Cbr \/\u003EA deep fake is a video generated from a modern computer vision puppeteering face-swap algorithm which can be used to generate a video of target person X performing target action A, usually given a video of another person Y performing action A. The underlying system learns two face models, one of target person X, and one for person Y, the person in the original video. It then learns a mapping between the two faces, which can be used to create the resulting \"fake\" video. Techniques for facial reenactment have been pioneered by movie studios for driving character animations from real actors' faces, but these techniques are now emerging as deep learning-based software packages, letting the deep convolutional neural networks do most of the work during model training.\u003Cbr \/\u003E\u003Cbr \/\u003EConsider the following collage of faces. Can you guess which ones are real and which ones are DeepFakes?\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"https:\/\/1.bp.blogspot.com\/-yOv9HEZSkZo\/Wvvd_o8kCsI\/AAAAAAAAP2A\/8pliA-DpPdgmz6Ncgp-GhSojmZG-cP12wCLcBGAs\/s1600\/faceforensics.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" data-original-height=\"652\" data-original-width=\"1600\" height=\"162\" src=\"https:\/\/1.bp.blogspot.com\/-yOv9HEZSkZo\/Wvvd_o8kCsI\/AAAAAAAAP2A\/8pliA-DpPdgmz6Ncgp-GhSojmZG-cP12wCLcBGAs\/s400\/faceforensics.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003EFig 2. 
Can you tell which faces are real and which ones are fake?\u0026nbsp;\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003EFigure from Face Forensics[3]\u003C\/div\u003E\u003Cbr \/\u003EIt is not so easy to tell which image is modified and which one is unadulterated. And if you do a little bit of searching for DeepFakes (warning, unless you are careful, you will encounter lots of pornographic content) you notice that the faces in those videos look very realistic.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cb\u003EHow are Deep Fakes made?\u003C\/b\u003E\u003Cbr \/\u003EWhile there are conceptually many different ways to make Deep Fakes, today we'll focus on two key underlying techniques: face detection from videos, and deep learning for creating frame alignments between source face X and target face Y.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv style=\"text-align: left;\"\u003EA lot of this research started with the Face2face work [1] presented at CVPR 2016. This paper was a modernization of the group's earlier SIGGRAPH paper and focused a lot more on the computer vision details. At this time the tools were good enough to create SIGGRAPH-quality videos, but it took a lot of work to put together a facial reenactment rig. 
In addition, the underlying algorithms did not use any deep learning, so a lot of domain-knowledge (i.e., face modeling expertise) went into making these algorithms work robustly.\u0026nbsp;\u003Cspan style=\"text-align: center;\"\u003EThe TUM\/Stanford guys filed their Real-time facial reenactment patent in 2016 [4], and have more recently worked on FaceForensics[3] to detect such manipulated imagery.\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: left;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: center;\"\u003E\u003Ciframe allow=\"autoplay; encrypted-media\" allowfullscreen=\"\" frameborder=\"0\" height=\"315\" src=\"https:\/\/www.youtube.com\/embed\/ohmajJTcpNk\" width=\"460\"\u003E\u003C\/iframe\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: center;\"\u003EFig3. \u003Cb\u003EFace2Face technique from 2016\u003C\/b\u003E. It is 2018 now, so just imagine how much better this works now!\u003C\/div\u003E\u003Cb\u003E\u003Cbr \/\u003E\u003C\/b\u003E In addition to the Face2face guys (who have now a handful of similarly themed papers), it is interesting to note that a lot of key early ideas in face puppeteering were pioneered by \u003Ca href=\"https:\/\/homes.cs.washington.edu\/~kemelmi\/\"\u003EIra Kemelmacher-Shlizerman\u003C\/a\u003E who is now a computer vision and graphics assistant professor at University of Washington. She worked on early face puppeteering technology for the 2010 paper Being John Malkovich, continued with the Photobios work, and later founded Dreambit (based on a SIGGRAPH 2016 paper), which was acquired by Facebook. 
:-)\u003Cbr \/\u003E\u003Cb\u003E\u003Cbr \/\u003E\u003C\/b\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"https:\/\/4.bp.blogspot.com\/-ZSwRTGeWHj8\/WvvbI_W2lNI\/AAAAAAAAP10\/a_sptQou7asj7oG3Q-zVcsfLelyDm326wCLcBGAs\/s1600\/ira_early_deep_fake.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" data-original-height=\"475\" data-original-width=\"1600\" height=\"118\" src=\"https:\/\/4.bp.blogspot.com\/-ZSwRTGeWHj8\/WvvbI_W2lNI\/AAAAAAAAP10\/a_sptQou7asj7oG3Q-zVcsfLelyDm326wCLcBGAs\/s400\/ira_early_deep_fake.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003EFig 4. \u003Cb\u003EIra's early work on face swapping in 2010.\u003C\/b\u003E\u0026nbsp;See the Being John Malkovich paper[2].\u003C\/div\u003E\u003Cb\u003E\u003Cbr \/\u003E\u003C\/b\u003ETake a look at Ira's Dreambit video, which shows some high-quality \"entertainment\" value out of rapidly produced non-malicious DeepFakes!\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv style=\"text-align: center;\"\u003E\u003Ciframe allow=\"autoplay; encrypted-media\" allowfullscreen=\"\" frameborder=\"0\" height=\"315\" src=\"https:\/\/www.youtube.com\/embed\/mILLFK1Rwhk\" width=\"460\"\u003E\u003C\/iframe\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: center;\"\u003EFig 5. \u003Cb\u003EIra's Dreambit system\u003C\/b\u003E. Lets her imagine herself in different eras, with different hairstyles, etc.\u003C\/div\u003E\u003Cdiv style=\"font-weight: bold;\"\u003E\u003Cb\u003E\u003Cbr \/\u003E\u003C\/b\u003E\u003C\/div\u003EThe origin of Ira's\u0026nbsp;Dreambit system is the\u0026nbsp;Transfiguring Portraits SIGGRAPH 2016 paper[6]. What's important to note is that this is 2016 and we're starting to see some use of Deep Learning. 
The transfiguring portraits work used a big mix of features, using some CNN features computed from early Caffe networks. It is not an entirely easy-to-use system at this point, but good enough to make SIGGRAPH videos, take a one minute to generate other cool outputs, and definitely cool enough for Facebook to acquire.\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; font-weight: bold; text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv style=\"font-weight: bold;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; font-weight: bold; text-align: center;\"\u003E\u003Ca href=\"https:\/\/2.bp.blogspot.com\/-pljf2NHUUWs\/WvvuOynn5YI\/AAAAAAAAP20\/d0m63YDr0ZgT1YNVZadRft8YIjFl23J0gCLcBGAs\/s1600\/transfiguring_portraits_deepfake.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" data-original-height=\"514\" data-original-width=\"1384\" height=\"147\" src=\"https:\/\/2.bp.blogspot.com\/-pljf2NHUUWs\/WvvuOynn5YI\/AAAAAAAAP20\/d0m63YDr0ZgT1YNVZadRft8YIjFl23J0gCLcBGAs\/s400\/transfiguring_portraits_deepfake.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003EFig 6. \u003Cb\u003ETransfiguring Portraits\u003C\/b\u003E. The system used lots of features, but Deep Learning-based CNN features are starting to show up.\u003C\/div\u003E\u003Cdiv style=\"font-weight: bold;\"\u003E\u003Cb\u003E\u003Cbr \/\u003E\u003C\/b\u003E\u003C\/div\u003E\u003Cb\u003EFighting against DeepFakes\u003C\/b\u003E\u003Cbr \/\u003EThere are now published algorithms which try to battle DeepFakes by determining if faces\/videos are fake or not. FaceForensics[3] introduces a large DeepFake dataset based on their earlier Face2face work. This dataset contains both real and \"fake\" Face2face output videos. 
More importantly, the new dataset is big enough to train a deep learning system to determine if an image is counterfeit. In addition, they are able to both 1.) determine which pixels have likely been manipulated, and 2.) perform a deep cleanup stage to make even better DeepFakes.\u003Cbr \/\u003E\u003Cdiv\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"https:\/\/4.bp.blogspot.com\/-uZCHrqaOI5A\/WvwaGqmurPI\/AAAAAAAAP3U\/1WXHG6gBZSUbrSuvCLngsdJ6RkqGdSW5ACLcBGAs\/s1600\/deepmask_deepfake_detection.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" data-original-height=\"613\" data-original-width=\"1387\" height=\"176\" src=\"https:\/\/4.bp.blogspot.com\/-uZCHrqaOI5A\/WvwaGqmurPI\/AAAAAAAAP3U\/1WXHG6gBZSUbrSuvCLngsdJ6RkqGdSW5ACLcBGAs\/s400\/deepmask_deepfake_detection.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003EFig 7. \u003Cb\u003EThe \"fakeness\" masks in FaceForensics[3] are based on XceptionNet\u003C\/b\u003E\u003C\/div\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv\u003EAnother fake detection approach, this time from a Berkeley AI Research group called Image Splice Detection, focuses on detecting where an image was spliced to create a fake composite image. This allows them to determine which part of the image was likely \"photoshopped\" and the technique is not specific to faces. 
And because this is a 2018 paper, it should not be a surprise that this kind of work is all based on deep learning techniques.\u003Cb\u003E\u003Cbr \/\u003E\u003C\/b\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"https:\/\/2.bp.blogspot.com\/--5baWKzAjDQ\/WvviZIa5XdI\/AAAAAAAAP2Y\/4O2IbDtKtfgAjH3UD76YYRyJNMaBzku1QCLcBGAs\/s1600\/efros_fake_news.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" data-original-height=\"649\" data-original-width=\"1388\" height=\"186\" src=\"https:\/\/2.bp.blogspot.com\/--5baWKzAjDQ\/WvviZIa5XdI\/AAAAAAAAP2Y\/4O2IbDtKtfgAjH3UD76YYRyJNMaBzku1QCLcBGAs\/s400\/efros_fake_news.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003EFig 8. \u003Cb\u003EFighting Fake News: Image Splice Detection\u003C\/b\u003E[5]\u003Cb\u003E. \u003C\/b\u003EResponse maps are aggregated to determine the combined probability mask.[5]\u003C\/div\u003E\u003Cbr \/\u003EFrom the Fighting Fake News paper,\u003Cbr \/\u003E\u003Cblockquote class=\"tr_bq\"\u003E\"\u003Ci\u003EAs new advances in computer vision and image-editing emerge, there is an increasingly urgent need for effective visual forensics methods. We see our approach, which successfully detects manipulations without seeing examples of manipulated images, as being an initial step toward building general-purpose forensics tools\u003C\/i\u003E.\"\u003C\/blockquote\u003E\u003Cbr \/\u003E\u003Cb\u003EConcluding Remarks\u003C\/b\u003E\u003Cbr \/\u003EThe early DeepFake tools were pioneered in the early 2010s and were producing SIGGRAPH-quality results by 2015. It was only a matter of years until DeepFake generators became publicly available. 2018's DeepFake generators, being written on top of open-source Deep Learning libraries, are much easier to use than the researchy systems from only a few years back. 
Today, just about any hobbyist with minimal computer programming knowledge and a GPU can build their own DeepFakes.\u003Cbr \/\u003E\u003Cbr \/\u003EJust as Deep Fakes are getting better, Generative Adversarial Networks are showing more promise for photorealistic image generation. It is likely that we will soon see lots of exciting new work on both the generative side (deep fake generation) and the discriminative side (deep fake detection and image forensics) which incorporate more and more ideas from the machine learning community.\u003Cbr \/\u003E\u003Cb\u003E\u003Cbr \/\u003E\u003C\/b\u003E \u003Cb\u003E\u003Cbr \/\u003E\u003C\/b\u003E \u003Cb\u003EReferences\u003C\/b\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E[1] Justus Thies, Michael Zollhöfer, Marc Stamminger, Christian Theobalt, and Matthias Nießner. \"\u003Ca href=\"https:\/\/web.stanford.edu\/~zollhoef\/papers\/CVPR2016_Face2Face\/paper.pdf\"\u003EFace2face: Real-time face capture and reenactment of rgb videos\u003C\/a\u003E.\" In Computer Vision and Pattern Recognition (CVPR), 2016 IEEE Conference on, pp. 2387-2395. IEEE, 2016.\u003Cbr \/\u003E\u003Cbr \/\u003E[2] Ira Kemelmacher-Shlizerman, Aditya Sankar, Eli Shechtman, and Steven M. Seitz. \"\u003Ca href=\"http:\/\/grail.cs.washington.edu\/projects\/malkovich\/\"\u003EBeing john malkovich\u003C\/a\u003E.\" In European Conference on Computer Vision, pp. 341-353. Springer, 2010.\u003Cbr \/\u003E\u003Cbr \/\u003E[3] Andreas Rössler, Davide Cozzolino, Luisa Verdoliva, Christian Riess, Justus Thies, and Matthias Nießner. \"\u003Ca href=\"https:\/\/arxiv.org\/abs\/1803.09179\"\u003EFaceForensics: A Large-scale Video Dataset for Forgery Detection in Human Faces\u003C\/a\u003E.\" arXiv preprint arXiv:1803.09179, 2018.\u003Cbr \/\u003E\u003Cbr \/\u003E[4] Christian Theobalt, Michael Zollhöfer, Marc Stamminger, Justus Thies, Matthias Nießner. Real-time Expression Transfer for Facial Reenactment Invention. 2018\/3\/8. 
Application Number\u0026nbsp;15256710\u003Cbr \/\u003E\u003Cbr \/\u003E[5]\u0026nbsp;Minyoung Huh, Andrew Liu, Andrew Owens, Alexei A. Efros, \"\u003Ca href=\"https:\/\/arxiv.org\/abs\/1805.04096\"\u003EFighting Fake News: Image Splice Detection via Learned Self-Consistency.\u003C\/a\u003E\" arXiv preprint arXiv:1805.04096, 2018\u003Cbr \/\u003E\u003Cbr \/\u003E[6] Ira Kemelmacher-Shlizerman, \"\u003Ca href=\"https:\/\/homes.cs.washington.edu\/~kemelmi\/Transfiguring_Portraits_Kemelmacher_SIGGRAPH2016.pdf\"\u003ETransfiguring portraits\u003C\/a\u003E.\" ACM Transactions on Graphics (TOG), 35(4), p.94. 2016\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003C\/div\u003E"},"link":[{"rel":"replies","type":"application/atom+xml","href":"https:\/\/www.computervisionblog.com\/feeds\/7872786804655838317\/comments\/default","title":"Post Comments"},{"rel":"replies","type":"text/html","href":"https:\/\/www.computervisionblog.com\/2018\/05\/deepfakes-ai-powered-deception-machines.html#comment-form","title":"0 Comments"},{"rel":"edit","type":"application/atom+xml","href":"https:\/\/www.blogger.com\/feeds\/15418143\/posts\/default\/7872786804655838317"},{"rel":"self","type":"application/atom+xml","href":"https:\/\/www.blogger.com\/feeds\/15418143\/posts\/default\/7872786804655838317"},{"rel":"alternate","type":"text/html","href":"https:\/\/www.computervisionblog.com\/2018\/05\/deepfakes-ai-powered-deception-machines.html","title":"DeepFakes: AI-powered deception machines"}],"author":[{"name":{"$t":"Tomasz 
Malisiewicz"},"uri":{"$t":"http:\/\/www.blogger.com\/profile\/17507234774392358321"},"email":{"$t":"noreply@blogger.com"},"gd$image":{"rel":"http://schemas.google.com/g/2005#thumbnail","width":"16","height":"16","src":"https:\/\/img1.blogblog.com\/img\/b16-rounded.gif"}}],"media$thumbnail":{"xmlns$media":"http://search.yahoo.com/mrss/","url":"https:\/\/2.bp.blogspot.com\/-HYA207hkaf4\/WvvXnzXXTyI\/AAAAAAAAP1o\/O1MGUy4EOtARBrocGmsu2-r_7UtE5vtzQCLcBGAs\/s72-c\/mind.png","height":"72","width":"72"},"thr$total":{"$t":"0"}},{"id":{"$t":"tag:blogger.com,1999:blog-15418143.post-6547994887448818346"},"published":{"$t":"2016-12-16T00:13:00.000-05:00"},"updated":{"$t":"2016-12-16T03:55:19.368-05:00"},"category":[{"scheme":"http://www.blogger.com/atom/ns#","term":"advice"},{"scheme":"http://www.blogger.com/atom/ns#","term":"andrew ng"},{"scheme":"http://www.blogger.com/atom/ns#","term":"bias-variance"},{"scheme":"http://www.blogger.com/atom/ns#","term":"deep learning"},{"scheme":"http://www.blogger.com/atom/ns#","term":"google"},{"scheme":"http://www.blogger.com/atom/ns#","term":"machine learning"},{"scheme":"http://www.blogger.com/atom/ns#","term":"nips 2016"},{"scheme":"http://www.blogger.com/atom/ns#","term":"research"},{"scheme":"http://www.blogger.com/atom/ns#","term":"supervised learning"},{"scheme":"http://www.blogger.com/atom/ns#","term":"synthesis"}],"title":{"type":"text","$t":"Nuts and Bolts of Building Deep Learning Applications: Ng @ NIPS2016"},"content":{"type":"html","$t":"You might go to a cutting-edge machine learning research conference like NIPS hoping to find some mathematical insight that will help you take your deep learning system's performance to the next level. Unfortunately, as Andrew Ng reiterated to a live crowd of 1,000+ attendees this past Monday, there is no secret AI equation that will let you escape your machine learning woes. 
All you need is some \u003Cb\u003E\u003Ci\u003Erigor\u003C\/i\u003E\u003C\/b\u003E, and much of what Ng covered is his remarkable NIPS 2016 presentation titled \"\u003Ci\u003EThe Nuts and Bolts of Building Applications using Deep Learning\u003C\/i\u003E\" is not rocket science. Today we'll dissect the lecture and Ng's key takeaways. Let's begin.\u003Cbr \/\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"https:\/\/3.bp.blogspot.com\/-uA-shnpwkcQ\/WFNia2-WVGI\/AAAAAAAAPSM\/L4K304x-7dwhNEhJ7RpLxNRXKczap6PSQCLcB\/s1600\/nuts_and_bolts_andrew_ng.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"258\" src=\"https:\/\/3.bp.blogspot.com\/-uA-shnpwkcQ\/WFNia2-WVGI\/AAAAAAAAPSM\/L4K304x-7dwhNEhJ7RpLxNRXKczap6PSQCLcB\/s400\/nuts_and_bolts_andrew_ng.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cb\u003EFigure 1.\u003C\/b\u003E Andrew Ng delivers a powerful message at NIPS 2016.\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv\u003E\u003Cb\u003E\u003Cbr \/\u003E\u003C\/b\u003E\u003C\/div\u003E\u003Cdiv\u003E\u003Cb\u003EAndrew Ng and the Lecture\u003C\/b\u003E\u003C\/div\u003E\u003Cdiv\u003EAndrew Ng's lecture at NIPS 2016 in Barcelona was phenomenal -- truly one of the best presentations I have seen in a long time. In a juxtaposition of two influential presentation styles, the \u003Ci\u003ECEO-style\u003C\/i\u003E and the \u003Ci\u003EProfessor-style\u003C\/i\u003E, Andrew Ng mesmerized the audience for two hours. Andrew Ng's wisdom from managing large scale AI projects at Baidu, Google, and Stanford really shows. 
In his talk, Ng spoke to the audience and discussed one of the key challenges facing most of the NIPS audience -- \u003Ci\u003Ehow do you make your deep learning systems better\u003C\/i\u003E? Rather than showing off new research findings from his cutting-edge projects, Andrew Ng presented a simple recipe for analyzing and debugging today's large scale systems. With no need for equations, a handful of diagrams, and several checklists, Andrew Ng delivered a two-whiteboards-in-front-of-a-video-camera lecture, something you would expect at a group research meeting. However, Ng made sure to not delve into Research-y areas, likely to make your brain fire on all cylinders, but making you and your company very little dollars in the foreseeable future.\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv\u003E\u003Cdiv\u003E\u003Cb\u003EMoney-making deep learning vs Idea-generating deep learning\u003C\/b\u003E\u003C\/div\u003E\u003Cdiv\u003EAndrew Ng highlighted the fact that while NIPS is a research conference, many of the newly generated ideas are simply ideas, not yet battle-tested vehicles for converting mathematical acumen into dollars. The bread and butter of money-making deep learning\u0026nbsp;is\u0026nbsp;supervised learning with recurrent neural networks such as LSTMs in second place. Research areas such as Generative Adversarial Networks (GANs), Deep Reinforcement Learning (Deep RL), and just about anything branding itself as unsupervised learning, are simply Research, with a capital R. These ideas are likely to influence the next 10 years of Deep Learning research, so it is wise to focus on publishing and tinkering if you really love such open-ended Research endeavours. 
Applied deep learning research is much more about\u0026nbsp;taming\u0026nbsp;your problem (understanding the inputs and outputs), casting the problem as a supervised learning problem, and hammering it with ample data and ample experiments.\u003C\/div\u003E\u003C\/div\u003E\u003Cdiv\u003E\u003Cb\u003E\u003Cbr \/\u003E\u003C\/b\u003E\u003C\/div\u003E\u003Cdiv\u003E\u003Cblockquote class=\"tr_bq\"\u003E\u003Cb\u003E\"It takes surprisingly long time to grok bias and variance deeply, but people that understand bias and variance deeply are often able to drive very rapid progress.\"\u0026nbsp;\u003C\/b\u003E\u003C\/blockquote\u003E\u003Cblockquote class=\"tr_bq\"\u003E\u003Ci\u003E--Andrew Ng\u0026nbsp;\u003C\/i\u003E\u003C\/blockquote\u003E\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv\u003E\u003Cb\u003E\u003Cbr \/\u003E\u003C\/b\u003E\u003C\/div\u003E\u003Cdiv\u003E\u003Cb\u003EThe 5-step method of building better systems\u003C\/b\u003E\u003C\/div\u003E\u003Cdiv\u003EMost issues in applied deep learning come from a training-data \/ testing-data mismatch. In some scenarios this issue just doesn't come up, but you'd be surprised how often applied machine learning projects use training data (which is easy to collect and annotate) that is different from the target application. Andrew Ng's discussion is centered around the basic idea of bias-variance tradeoff. You want a classifier with a good ability to fit the data (low bias is good) that also generalizes to unseen examples (low variance is good). Too often, applied machine learning projects running at scale forget this critical dichotomy. 
Here are the four numbers you should always report:\u003C\/div\u003E\u003Cdiv\u003E\u003Cul\u003E\u003Cli\u003ETraining set error\u003C\/li\u003E\u003Cli\u003ETesting set error\u003C\/li\u003E\u003Cli\u003EDev (aka Validation) set error\u003C\/li\u003E\u003Cli\u003ETrain-Dev (aka Train-Val) set error\u003C\/li\u003E\u003C\/ul\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cdiv\u003EAndrew Ng suggests following the following recipe:\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"https:\/\/3.bp.blogspot.com\/-duzBNDYdDGA\/WFNtNi0DcNI\/AAAAAAAAPSc\/AHuvDXl6EhAgweD6IxGAbqOBK5qM_W05QCLcB\/s1600\/nuts-and-bolts-checklist.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"300\" src=\"https:\/\/3.bp.blogspot.com\/-duzBNDYdDGA\/WFNtNi0DcNI\/AAAAAAAAPSc\/AHuvDXl6EhAgweD6IxGAbqOBK5qM_W05QCLcB\/s400\/nuts-and-bolts-checklist.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cb\u003EFigure 2. \u003C\/b\u003EAndrew Ng's \"Applied Bias-Variance for Deep Learning Flowchart\"\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003Efor building better deep learning systems.\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv\u003ETake all of your data, split it into 60% for training and 40% for testing. Use half of the test set for evaluation purposes only, and the other half for development (aka validation). Now take the training set, leave out a little chunk, and call it the training-dev data. 
This 4-way split isn't always necessary, but consider the worse case where you start with two separate sets of data, and not just one: a large set of training data and a smaller set of test data. You'll still want to split the testing into validation and testing, but also consider leaving out a small chunk of the training data for the training-validation. By reporting the data on the training set vs the training-validation set, you measure the \"variance.\"\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"https:\/\/2.bp.blogspot.com\/-kGCFHEtvSCc\/WFNydTfCWwI\/AAAAAAAAPSs\/Zt-8Hxy0b6AhSKQLW8eVC2YmH7LOL2pyQCLcB\/s1600\/bias-variance-andrew-ng.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"267\" src=\"https:\/\/2.bp.blogspot.com\/-kGCFHEtvSCc\/WFNydTfCWwI\/AAAAAAAAPSs\/Zt-8Hxy0b6AhSKQLW8eVC2YmH7LOL2pyQCLcB\/s400\/bias-variance-andrew-ng.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cb\u003EFigure 3. \u003C\/b\u003EHuman-level vs Training vs Training-dev vs Dev vs Test.\u0026nbsp;\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003ETaken from Andrew Ng's 2016 talk.\u003C\/div\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv\u003EIn addition to these four accuracies, you might want to report the human-level accuracy, for a total of 5 quantities to report. The difference between human-level and training set performance is the Bias. The difference between the training set and the training-dev set is the Variance. The difference between the training-dev and dev sets is the train-test mismatch, which is much more common in real-world applications that you'd think. 
And finally, the difference between the dev and test sets measures how much you are overfitting.\u003Cbr \/\u003E\u003Cbr \/\u003ENowhere in Andrew Ng's presentation does he mention how to use unsupervised learning, but he does include a brief discussion about \"Synthesis.\" Such synthesis ideas are all about blending pre-existing data or using a rendering engine to augment your training set.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cb\u003EConclusion\u003C\/b\u003E\u003Cbr \/\u003EIf you want to lose weight, gain muscle, and improve your overall physical appearance, there is no magical protein shake and no magical bicep-building exercise. The fundamentals such as reduced caloric intake, getting adequate sleep, cardiovascular exercise, and core strength exercises like squats and bench presses will get you there. In this sense, fitness is just like machine learning -- there is no secret sauce. I guess that makes \u003Ci\u003EAndrew Ng the Arnold Schwarzenegger of Machine Learning\u003C\/i\u003E.\u003Cbr \/\u003E\u003Cbr \/\u003EWhat you are most likely missing in your life is the rigor of reporting a handful of useful numbers such as performance on the 4 main data splits (see Figure 3). Analyzing these numbers will let you know if you need more data or better models, and will ultimately let you hone in your expertise on the conceptual bottleneck in your system (see Figure 2).\u003Cbr \/\u003E\u003Cbr \/\u003EWith a prolific research track record that never ceases to amaze, we all know Andrew Ng as one hell of an applied machine learning researcher. But the new Andrew Ng is not just another data-nerd. His personality is bigger than ever -- more confident, more entertaining, and his experience with a large number of academic and industrial projects makes him much wiser. 
With enlightening lectures as \"The Nuts and Bolts of Building Applications with Deep Learning\" Andrew Ng is likely to be an individual whose future keynotes you might not want to miss.\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003Cb\u003EAppendix\u003C\/b\u003E\u003Cbr \/\u003EYou can watch a September 27th, 2016 version of the \u003Ca href=\"https:\/\/www.youtube.com\/watch?v=F1ka6a13S9I\"\u003EAndrew Ng Nuts and Bolts of Applying Deep Learning Lecture on YouTube\u003C\/a\u003E, which he delivered at the Deep Learning School. If you are working on machine learning problems in a startup, then definitely give the video a watch. I will update the video link once\/if the newer NIPS 2016 version shows up online.\u003Cbr \/\u003E\u003Cbr \/\u003EYou can also check out \u003Ca href=\"https:\/\/kevinzakka.github.io\/2016\/09\/26\/applying-deep-learning\/\"\u003EKevin Zakka's blog post\u003C\/a\u003E for ample illustrations and writeup corresponding to Andrew Ng's entire talk.\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv\u003E\u003Cdiv\u003E\u003C\/div\u003E\u003C\/div\u003E"},"link":[{"rel":"replies","type":"application/atom+xml","href":"https:\/\/www.computervisionblog.com\/feeds\/6547994887448818346\/comments\/default","title":"Post Comments"},{"rel":"replies","type":"text/html","href":"https:\/\/www.computervisionblog.com\/2016\/12\/nuts-and-bolts-of-building-deep.html#comment-form","title":"0 
Comments"},{"rel":"edit","type":"application/atom+xml","href":"https:\/\/www.blogger.com\/feeds\/15418143\/posts\/default\/6547994887448818346"},{"rel":"self","type":"application/atom+xml","href":"https:\/\/www.blogger.com\/feeds\/15418143\/posts\/default\/6547994887448818346"},{"rel":"alternate","type":"text/html","href":"https:\/\/www.computervisionblog.com\/2016\/12\/nuts-and-bolts-of-building-deep.html","title":"Nuts and Bolts of Building Deep Learning Applications: Ng @ NIPS2016"}],"author":[{"name":{"$t":"Tomasz Malisiewicz"},"uri":{"$t":"http:\/\/www.blogger.com\/profile\/17507234774392358321"},"email":{"$t":"noreply@blogger.com"},"gd$image":{"rel":"http://schemas.google.com/g/2005#thumbnail","width":"16","height":"16","src":"https:\/\/img1.blogblog.com\/img\/b16-rounded.gif"}}],"media$thumbnail":{"xmlns$media":"http://search.yahoo.com/mrss/","url":"https:\/\/3.bp.blogspot.com\/-uA-shnpwkcQ\/WFNia2-WVGI\/AAAAAAAAPSM\/L4K304x-7dwhNEhJ7RpLxNRXKczap6PSQCLcB\/s72-c\/nuts_and_bolts_andrew_ng.png","height":"72","width":"72"},"thr$total":{"$t":"0"}},{"id":{"$t":"tag:blogger.com,1999:blog-15418143.post-8839595873640006183"},"published":{"$t":"2016-06-17T06:24:00.000-05:00"},"updated":{"$t":"2016-06-17T06:31:00.089-05:00"},"category":[{"scheme":"http://www.blogger.com/atom/ns#","term":"arxiv"},{"scheme":"http://www.blogger.com/atom/ns#","term":"bayesian"},{"scheme":"http://www.blogger.com/atom/ns#","term":"confidence"},{"scheme":"http://www.blogger.com/atom/ns#","term":"deep learning"},{"scheme":"http://www.blogger.com/atom/ns#","term":"dropout"},{"scheme":"http://www.blogger.com/atom/ns#","term":"geoff hinton"},{"scheme":"http://www.blogger.com/atom/ns#","term":"hugo 
larochelle"},{"scheme":"http://www.blogger.com/atom/ns#","term":"ICML"},{"scheme":"http://www.blogger.com/atom/ns#","term":"papers"},{"scheme":"http://www.blogger.com/atom/ns#","term":"segnet"},{"scheme":"http://www.blogger.com/atom/ns#","term":"uncertainty"},{"scheme":"http://www.blogger.com/atom/ns#","term":"yarin gal"}],"title":{"type":"text","$t":"Making Deep Networks Probabilistic via Test-time Dropout"},"content":{"type":"html","$t":"In Quantum Mechanics, Heisenberg's Uncertainty Principle states that there is a fundamental limit to how well one can measure a particle's \u003Cb\u003Eposition\u003C\/b\u003E and \u003Cb\u003Emomentum\u003C\/b\u003E. In the context of machine learning systems, a similar principle has emerged, but relating \u003Cb\u003Einterpretability\u003C\/b\u003E and \u003Cb\u003Eperformance\u003C\/b\u003E. By using a manually wired or shallow machine learning model, you'll have no problem understanding the moving pieces, but you will seldom be happy with the results. Or you can use a black-box deep neural network and enjoy the model's exceptional performance. Today we'll see one simple and effective trick to make our deep black boxes a bit more intelligible. 
The trick allows us to convert neural network outputs into probabilities, with no cost to performance, and minimal computational overhead.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ctable cellpadding=\"0\" cellspacing=\"0\" class=\"tr-caption-container\" style=\"float: left; text-align: center;\"\u003E\u003Ctbody\u003E\u003Ctr\u003E\u003Ctd class=\"tr-caption\" style=\"font-size: 12.8px;\"\u003E\u003C\/td\u003E\u003C\/tr\u003E\u003Ctr\u003E\u003Ctd style=\"text-align: center;\"\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"https:\/\/3.bp.blogspot.com\/-NGrIzxbkvR8\/V2PfIKuNdFI\/AAAAAAAAOzc\/k91SxOfkDSkQvDGaLtmCUYjCKH33pbSJACLcB\/s1600\/interpretable_vs_deep_neural_networks.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"112\" src=\"https:\/\/3.bp.blogspot.com\/-NGrIzxbkvR8\/V2PfIKuNdFI\/AAAAAAAAOzc\/k91SxOfkDSkQvDGaLtmCUYjCKH33pbSJACLcB\/s400\/interpretable_vs_deep_neural_networks.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003C\/td\u003E\u003C\/tr\u003E\u003Ctr\u003E\u003Ctd class=\"tr-caption\" style=\"font-size: 12.8px;\"\u003E\u003Cb\u003EInterpretability vs Performance: \u003C\/b\u003EDeep Neural Networks perform well on most computer vision tasks, yet they are notoriously difficult to interpret.\u003C\/td\u003E\u003C\/tr\u003E\u003C\/tbody\u003E\u003C\/table\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003EThe desire to understand deep neural networks has triggered a flurry of research into Neural Network Visualization, but in practice we are often forced to treat deep learning systems as black-boxes. 
(See my recent \u003Ca href=\"http:\/\/www.computervisionblog.com\/2016\/06\/deep-learning-trends-iclr-2016.html\"\u003EDeep Learning Trends @ ICLR 2016\u003C\/a\u003E\u0026nbsp;post for an overview of recent neural network visualization techniques.) But just because we can't grok the inner-workings of our favorite deep models, it doesn't mean we can't ask more out of our deep learning systems.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cblockquote class=\"tr_bq\"\u003E\u003Cspan style=\"font-size: large;\"\u003EThere exists a simple trick for upgrading black-box neural network outputs into probability distributions.\u003C\/span\u003E\u0026nbsp;\u003C\/blockquote\u003E\u003Cbr \/\u003EThe probabilistic approach provides confidences, or \"uncertainty\" measures, alongside predictions and can make almost any deep learning systems into a smarter one. For robotic applications or any kind of software that must make decisions based on the output of a deep learning system, being able to provide meaningful uncertainties is a true game-changer.\u003Cbr \/\u003E\u003Ctable align=\"center\" cellpadding=\"0\" cellspacing=\"0\" class=\"tr-caption-container\" style=\"clear: left; margin-bottom: 1em; margin-left: auto; margin-right: auto; text-align: right;\"\u003E\u003Ctbody\u003E\u003Ctr\u003E\u003Ctd style=\"text-align: center;\"\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ca href=\"https:\/\/2.bp.blogspot.com\/-t4hhbhJbvFE\/V2Jmj2NJ_gI\/AAAAAAAAOy0\/_X5QrKOSx447h-CMBsc1ChX1nhb2CcItQCLcB\/s1600\/brain_zap_neural_network_dropout.jpg\" imageanchor=\"1\" style=\"clear: left; margin-bottom: 1em; margin-left: auto; margin-right: auto;\"\u003E\u003Cimg border=\"0\" height=\"274\" src=\"https:\/\/2.bp.blogspot.com\/-t4hhbhJbvFE\/V2Jmj2NJ_gI\/AAAAAAAAOy0\/_X5QrKOSx447h-CMBsc1ChX1nhb2CcItQCLcB\/s320\/brain_zap_neural_network_dropout.jpg\" width=\"320\" \/\u003E\u003C\/a\u003E\u003C\/td\u003E\u003C\/tr\u003E\u003Ctr\u003E\u003Ctd class=\"tr-caption\" style=\"font-size: 12.8px; text-align: 
center;\"\u003EApplying\u003Cb\u003E Dropout\u003C\/b\u003E\u0026nbsp;to your Deep Neural Network is like occasionally zapping your brain\u003C\/td\u003E\u003C\/tr\u003E\u003C\/tbody\u003E\u003C\/table\u003E\u003Cspan style=\"background-color: white;\"\u003EThe key ingredient is \u003Cb\u003Edropout\u003C\/b\u003E, an anti-overfitting deep learning trick handed down from Hinton himself (Krizhevsky's pioneering 2012 paper). Dropout sets some of the weights to zero during training, reducing feature co-adaptation, thus improving generalization.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cblockquote class=\"tr_bq\"\u003E\u003Cspan style=\"font-size: large;\"\u003EWithout dropout, it is too easy to make a moderately deep network attain 100% accuracy on the training set.\u0026nbsp;\u003C\/span\u003E\u003C\/blockquote\u003EThe accepted knowledge is that an un-regularized network (one without dropout) is too good at memorizing the training set. For a great introductory machine learning video lecture on dropout, I highly recommend you watch Hugo Larochelle's lecture on Dropout for Deep learning.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ccenter\u003E \u003Ciframe allowfullscreen=\"\" frameborder=\"0\" height=\"315\" src=\"https:\/\/www.youtube.com\/embed\/UcKPdAM8cnI\" width=\"420\"\u003E\u003C\/iframe\u003E \u003C\/center\u003E\u003Cbr \/\u003E\u003Cspan style=\"background-color: white;\"\u003EGeoff Hinton's dropout lecture, also a great introduction, focuses on interpreting dropout as an ensemble method. 
If you're looking for new research ideas in the dropout space, a thorough understanding of Hinton's interpretation is a must.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ccenter\u003E \u003Ciframe allowfullscreen=\"\" frameborder=\"0\" height=\"315\" src=\"https:\/\/www.youtube.com\/embed\/G3KUvHx9GDY\" width=\"420\"\u003E\u003C\/iframe\u003E \u003C\/center\u003E\u003Cbr \/\u003E\u003Cspan style=\"background-color: white;\"\u003EBut while dropout is typically used at\u0026nbsp;\u003C\/span\u003Etraining-time\u003Cspan style=\"background-color: white;\"\u003E, today we'll highlight the keen observation that\u0026nbsp;\u003C\/span\u003E\u003Cb style=\"background-color: white;\"\u003Edropout used at\u0026nbsp;test-time\u0026nbsp;is one of the simplest ways to turn raw neural network outputs into probability distributions\u003C\/b\u003E\u003Cspan data-mce-style=\"font-family: Times; font-size: medium; line-height: normal;\" style=\"background-color: white;\"\u003E. Not only does this probabilistic \"free upgrade\" often improve classification results, it provides a meaningful notion of uncertainty, something typically\u0026nbsp;\u003Cspan data-mce-style=\"font-family: Times; font-size: medium; line-height: normal;\"\u003Emissing\u003C\/span\u003E\u0026nbsp;in Deep Learning systems.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cblockquote class=\"tr_bq\"\u003E\u003Cspan style=\"font-size: large;\"\u003EThe idea is quite simple: t\u003C\/span\u003E\u003Cspan style=\"text-align: center;\"\u003E\u003Cspan style=\"font-size: large;\"\u003Eo estimate the predictive mean and predictive uncertainty, simply collect the results of stochastic forward passes through the model using dropout.\u003C\/span\u003E\u0026nbsp;\u003C\/span\u003E\u003C\/blockquote\u003E\u003Ch3\u003E\u003Cb\u003EHow to use dropout: 2016 edition\u003C\/b\u003E\u003C\/h3\u003E\u003Col\u003E\u003Cli\u003EStart with a moderately sized network\u003C\/li\u003E\u003Cli\u003EIncrease your network size with 
dropout turned off until you perfectly fit your data\u003C\/li\u003E\u003Cli\u003EThen, train with dropout turned on\u003C\/li\u003E\u003Cli\u003EAt test-time, turn on dropout and run the network T times to get T samples\u003C\/li\u003E\u003Cli\u003EThe mean of the samples is your output and the variance is your measure of uncertainty\u003C\/li\u003E\u003C\/ol\u003E\u003Ca href=\"https:\/\/arxiv.org\/abs\/1506.02142\"\u003E\u003C\/a\u003E\u003Cbr \/\u003E\u003Cdiv\u003ERemember that drawing more samples will increase computation time during testing unless you're clever about re-using partial computations in the network. Please note that if you're only using dropout near the end of your network, you can reuse most of the computations. If you're not happy with the uncertainty estimates, consider adding more layers of dropout at test-time. Since you'll already have a pre-trained network, experimenting with test-time dropout layers is easy.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Ch3\u003E\u003Cb\u003EBayesian Convolutional Neural Networks\u003C\/b\u003E\u003C\/h3\u003ETo be truly Bayesian about a deep network's parameters, we wouldn't learn a single set of parameters\u0026nbsp;\u003Cb\u003Ew\u003C\/b\u003E, we would infer a distribution over weights given the data, p(\u003Cb\u003Ew\u003C\/b\u003E|\u003Cb\u003EX\u003C\/b\u003E,\u003Cb\u003EY\u003C\/b\u003E). 
Training is already quite expensive, requiring large datasets and expensive GPUs.\u003Cbr \/\u003E\u003Cblockquote class=\"tr_bq\"\u003E\u003Cspan style=\"font-size: large;\"\u003EBayesian learning algorithms can in theory provide much better parameter estimates for ConvNets and I'm sure some of our friends at Google are working on this already.\u0026nbsp;\u003C\/span\u003E\u003C\/blockquote\u003EBut today we aren't going to talk about such full Bayesian Deep Learning systems, only systems that \"upgrade\" the model prediction\u0026nbsp;\u003Cb\u003Ey\u003C\/b\u003E\u0026nbsp;to p(\u003Cb\u003Ey\u003C\/b\u003E|\u003Cb\u003Ex\u003C\/b\u003E,\u003Cb\u003Ew\u003C\/b\u003E). In other words, only the network outputs gain a probabilistic interpretation.\u003Cbr \/\u003E\u003Cbr \/\u003EAn excellent deep learning computer vision system which uses test-time dropout comes from a recent University of Cambridge technique called SegNet. The SegNet approach introduced an Encoder-Decoder framework for dense semantic segmentation. More recently, SegNet includes a Bayesian extension that uses dropout at test-time for providing uncertainty estimates. Because the system provides a dense per-pixel labeling, the confidences can be visualized as per-pixel heatmaps. Segmentation system is not performing well? 
Just look at the confidence heatmaps!\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ctable align=\"center\" cellpadding=\"0\" cellspacing=\"0\" class=\"tr-caption-container\" style=\"margin-left: auto; margin-right: auto; text-align: center;\"\u003E\u003Ctbody\u003E\u003Ctr\u003E\u003Ctd style=\"text-align: center;\"\u003E\u003Ca href=\"https:\/\/1.bp.blogspot.com\/-DlcqJ0a4h9I\/V2CJFJiKGtI\/AAAAAAAAOwQ\/tmGgcBUKd7sy1lEqsXFx6llhMqMb8lJpACLcB\/s1600\/bayesian_segnet_uncertainty_dropout.png\" imageanchor=\"1\" style=\"margin-left: auto; margin-right: auto;\"\u003E\u003Cimg border=\"0\" height=\"110\" src=\"https:\/\/1.bp.blogspot.com\/-DlcqJ0a4h9I\/V2CJFJiKGtI\/AAAAAAAAOwQ\/tmGgcBUKd7sy1lEqsXFx6llhMqMb8lJpACLcB\/s400\/bayesian_segnet_uncertainty_dropout.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/td\u003E\u003C\/tr\u003E\u003Ctr\u003E\u003Ctd class=\"tr-caption\" style=\"font-size: 12.8px;\"\u003E\u003Cdiv style=\"font-size: 12.8px;\"\u003E\u003Cb\u003EBayesian SegNet.\u003C\/b\u003E\u0026nbsp;A fully convolutional neural network architecture which provides\u0026nbsp;\u003C\/div\u003E\u003Cdiv style=\"font-size: 12.8px;\"\u003Eper-pixel class uncertainty estimates using dropout.\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003C\/td\u003E\u003C\/tr\u003E\u003C\/tbody\u003E\u003C\/table\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003EThe Bayesian SegNet authors tested different strategies for dropout placement and determined that a handful of dropout layers near the encoder-decoder bottleneck is better than simply using dropout near the output layer. Interestingly, Bayesian SegNet improves the accuracy over vanilla SegNet. 
Their confidence maps show high uncertainty near object boundaries, but different test-time dropout schemes could provide a more diverse set of uncertainty estimates.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ca href=\"https:\/\/arxiv.org\/abs\/1511.02680\"\u003EBayesian SegNet: Model Uncertainty in Deep Convolutional Encoder-Decoder Architectures for Scene Understanding\u003C\/a\u003E\u0026nbsp;Alex Kendall, Vijay Badrinarayanan, Roberto Cipolla, in arXiv:1511.02680, November 2015. [\u003Ca href=\"http:\/\/mi.eng.cam.ac.uk\/projects\/segnet\/\"\u003Eproject page with videos\u003C\/a\u003E]\u003Cbr \/\u003E\u003Cdiv\u003E\u003Cspan style=\"font-size: x-small;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cbr \/\u003EConfidences are quite useful for evaluation purposes, because instead of providing a single average result across all pixels in all images, we can sort the pixels and\/or images by the overall confidence in prediction. When evaluating the top 10% most confident pixels, we should expect significantly higher performance. For example, the Bayesian SegNet approach achieves 75.4% global accuracy on the SUN RGBD dataset, and an astonishing 97.6% on the most confident 10% of the test-set [personal communication with Bayesian SegNet authors]. This kind of sort-by-confidence evaluation was popularized by the PASCAL VOC Object Detection Challenge, where precision\/recall curves were the norm. Unfortunately, as the research community moved towards large-scale classification, the notion of confidence was pushed aside. 
Until now.\u003Cbr \/\u003E\u003Ctable align=\"center\" cellpadding=\"0\" cellspacing=\"0\" class=\"tr-caption-container\" style=\"margin-left: auto; margin-right: auto; text-align: center;\"\u003E\u003Ctbody\u003E\u003C\/tbody\u003E\u003C\/table\u003E\u003Ch3\u003E\u003Cb\u003ETheoretical Bayesian Deep Learning\u003C\/b\u003E\u003C\/h3\u003E\u003Cspan style=\"background-color: white;\"\u003EDeep networks that model uncertainty are truly meaningful machine learning systems. It ends up that we don't really have to understand how a deep network's neurons process image features\u0026nbsp;\u003C\/span\u003E\u003Cspan data-mce-style=\"font-family: Times; font-size: medium; line-height: normal;\" style=\"background-color: white;\"\u003Eto\u003C\/span\u003E\u003Cspan style=\"background-color: white;\"\u003E\u0026nbsp;trust the system to make decisions. As long as the model provides uncertainty estimates, we'll know when the model is struggling. This is particularly important when your network is given\u0026nbsp;\u003C\/span\u003E\u003Cspan data-mce-style=\"font-family: Times; font-size: medium; line-height: normal;\" style=\"background-color: white;\"\u003E\u003Cspan data-mce-style=\"font-family: Times; font-size: medium; line-height: normal;\"\u003E\u003Cspan data-mce-style=\"font-family: Times; font-size: medium; line-height: normal;\"\u003Einputs\u003C\/span\u003E\u003C\/span\u003E\u003C\/span\u003E\u003Cspan style=\"background-color: white;\"\u003E\u0026nbsp;that are far from the training data.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ch3\u003E\u003Ctable align=\"center\" cellpadding=\"0\" cellspacing=\"0\" class=\"tr-caption-container\" style=\"margin-left: auto; margin-right: auto; text-align: center;\"\u003E\u003Ctbody\u003E\u003Ctr\u003E\u003Ctd style=\"text-align: center;\"\u003E\u003Ca href=\"https:\/\/2.bp.blogspot.com\/-y_QxwwRRcUg\/V2ERx6CArWI\/AAAAAAAAOxk\/uxsSlL_SssU8TxODMLI7q_Rno0EHFT9AwCLcB\/s1600\/gaussian_process_confidence_values.png\" 
imageanchor=\"1\" style=\"margin-left: auto; margin-right: auto;\"\u003E\u003Cimg border=\"0\" height=\"132\" src=\"https:\/\/2.bp.blogspot.com\/-y_QxwwRRcUg\/V2ERx6CArWI\/AAAAAAAAOxk\/uxsSlL_SssU8TxODMLI7q_Rno0EHFT9AwCLcB\/s400\/gaussian_process_confidence_values.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/td\u003E\u003C\/tr\u003E\u003Ctr\u003E\u003Ctd class=\"tr-caption\" style=\"font-size: 12.8px;\"\u003E\u003Cb\u003EThe Gaussian Process:\u003C\/b\u003E A machine learning approach with built-in uncertainty modeling\u003Cbr \/\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003C\/td\u003E\u003C\/tr\u003E\u003C\/tbody\u003E\u003C\/table\u003E\u003C\/h3\u003EIn a recent ICML 2016 paper, \u003Ca href=\"http:\/\/mlg.eng.cam.ac.uk\/yarin\/\"\u003EYarin Gal\u003C\/a\u003E and \u003Ca href=\"http:\/\/mlg.eng.cam.ac.uk\/zoubin\/\"\u003EZoubin Ghahramani\u003C\/a\u003E\u0026nbsp;develop\u0026nbsp;\u003Cspan style=\"background-color: white;\"\u003Ea new theoretical framework casting dropout training in deep neural networks as approximate Bayesian inference in deep Gaussian processes. Gal's paper gives a complete theoretical treatment of the link between Gaussian processes and dropout, and develops the tools necessary to represent uncertainty in deep learning. They show that a neural network with arbitrary depth and non-linearities, with dropout applied before every weight layer, is mathematically equivalent to an approximation to the probabilistic deep Gaussian process. I have yet to see researchers use dropout between every layer, so the discrepancy between theory and practice suggests that more research is necessary.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ca href=\"https:\/\/arxiv.org\/abs\/1506.02142\"\u003EDropout as a Bayesian Approximation: Representing Model Uncertainty in Deep Learning\u003C\/a\u003E\u0026nbsp;Yarin Gal, Zoubin Ghahramani, in ICML. June 2016. 
[\u003Ca href=\"https:\/\/arxiv.org\/abs\/1506.02157\"\u003EAppendix\u003C\/a\u003E\u0026nbsp;with relationship to Gaussian Processes]\u003Cbr \/\u003E\u003Ca href=\"https:\/\/arxiv.org\/abs\/1512.05287\"\u003EA Theoretically Grounded Application of Dropout in Recurrent Neural Networks\u003C\/a\u003E\u0026nbsp;Yarin Gal, in\u0026nbsp;arXiv:1512.05287. May 2016.\u003Cbr \/\u003E\u003Cdiv\u003E\u003Ca href=\"http:\/\/mlg.eng.cam.ac.uk\/yarin\/blog_3d801aa532c1ce.html\"\u003EWhat My Deep Model Doesn't Know\u003C\/a\u003E. Yarin Gal. Blog Post. July 2015\u0026nbsp;\u003C\/div\u003E\u003Cdiv\u003E\u003Ca href=\"https:\/\/github.com\/yaringal\/HeteroscedasticDropoutUncertainty\"\u003EHomoscedastic and Heteroscedastic Regression with Dropout Uncertainty\u003C\/a\u003E. Yarin Gal. Blog Post. February 2016.\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Ctable align=\"center\" cellpadding=\"0\" cellspacing=\"0\" class=\"tr-caption-container\" style=\"margin-left: auto; margin-right: auto; text-align: center;\"\u003E\u003Ctbody\u003E\u003Ctr\u003E\u003Ctd style=\"text-align: center;\"\u003E\u003C\/td\u003E\u003C\/tr\u003E\u003C\/tbody\u003E\u003C\/table\u003E\u003Ctable align=\"center\" cellpadding=\"0\" cellspacing=\"0\" class=\"tr-caption-container\" style=\"margin-left: auto; margin-right: auto; text-align: right;\"\u003E\u003Ctbody\u003E\u003Ctr\u003E\u003Ctd style=\"text-align: center;\"\u003E\u003Ca href=\"https:\/\/4.bp.blogspot.com\/-vwollbVk8dA\/V2J4Ekoi9AI\/AAAAAAAAOzE\/Y4aYoKkYZN0FvWJbssXZ-fpnaHtLPIAvACLcB\/s1600\/black_box.png\" imageanchor=\"1\" style=\"clear: right; margin-bottom: 1em; margin-left: auto; margin-right: auto;\"\u003E\u003Cimg border=\"0\" height=\"228\" src=\"https:\/\/4.bp.blogspot.com\/-vwollbVk8dA\/V2J4Ekoi9AI\/AAAAAAAAOzE\/Y4aYoKkYZN0FvWJbssXZ-fpnaHtLPIAvACLcB\/s400\/black_box.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/td\u003E\u003C\/tr\u003E\u003Ctr\u003E\u003Ctd class=\"tr-caption\" style=\"text-align: 
center;\"\u003ETest-time dropout is used to provide uncertainty estimates for deep learning systems.\u003C\/td\u003E\u003C\/tr\u003E\u003C\/tbody\u003E\u003C\/table\u003E\u003Cbr \/\u003EIn conclusion, maybe we can never get both interpretability and performance when it comes to deep learning systems. But, we can all agree that providing confidences, or uncertainty estimates, alongside predictions is \u003Ci\u003Ealways\u003C\/i\u003E a good idea. Dropout, the very single regularization trick used to battle overfitting in deep models, shows up, yet again. Sometimes all you need is to add some random variations to your input, and average the results over many trials. Dropout lets you not only wiggle the network inputs but the entire architecture.\u003Cbr \/\u003E\u003Cbr \/\u003EI do wonder what Yann LeCun thinks about Bayesian ConvNets... Last I heard, he was allergic to sampling.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cb\u003ERelated Posts\u0026nbsp;\u003C\/b\u003E\u003Cbr \/\u003E\u003Ca href=\"http:\/\/www.computervisionblog.com\/2015\/04\/deep-learning-vs-probabilistic.html\"\u003EDeep Learning vs Probabilistic Graphical Models vs Logic\u003C\/a\u003E April 2015\u003Cbr \/\u003E\u003Ca href=\"http:\/\/www.computervisionblog.com\/2016\/06\/deep-learning-trends-iclr-2016.html\"\u003EDeep Learning Trends\u0026nbsp;@ ICLR 2016\u003C\/a\u003E June 2016\u003Cbr \/\u003E\u003Cbr \/\u003E"},"link":[{"rel":"replies","type":"application/atom+xml","href":"https:\/\/www.computervisionblog.com\/feeds\/8839595873640006183\/comments\/default","title":"Post Comments"},{"rel":"replies","type":"text/html","href":"https:\/\/www.computervisionblog.com\/2016\/06\/making-deep-networks-probabilistic-via.html#comment-form","title":"0 
Comments"},{"rel":"edit","type":"application/atom+xml","href":"https:\/\/www.blogger.com\/feeds\/15418143\/posts\/default\/8839595873640006183"},{"rel":"self","type":"application/atom+xml","href":"https:\/\/www.blogger.com\/feeds\/15418143\/posts\/default\/8839595873640006183"},{"rel":"alternate","type":"text/html","href":"https:\/\/www.computervisionblog.com\/2016\/06\/making-deep-networks-probabilistic-via.html","title":"Making Deep Networks Probabilistic via Test-time Dropout"}],"author":[{"name":{"$t":"Tomasz Malisiewicz"},"uri":{"$t":"http:\/\/www.blogger.com\/profile\/17507234774392358321"},"email":{"$t":"noreply@blogger.com"},"gd$image":{"rel":"http://schemas.google.com/g/2005#thumbnail","width":"16","height":"16","src":"https:\/\/img1.blogblog.com\/img\/b16-rounded.gif"}}],"media$thumbnail":{"xmlns$media":"http://search.yahoo.com/mrss/","url":"https:\/\/3.bp.blogspot.com\/-NGrIzxbkvR8\/V2PfIKuNdFI\/AAAAAAAAOzc\/k91SxOfkDSkQvDGaLtmCUYjCKH33pbSJACLcB\/s72-c\/interpretable_vs_deep_neural_networks.png","height":"72","width":"72"},"thr$total":{"$t":"0"}},{"id":{"$t":"tag:blogger.com,1999:blog-15418143.post-5529343758137673151"},"published":{"$t":"2016-06-01T04:21:00.000-05:00"},"updated":{"$t":"2016-06-14T03:38:45.285-05:00"},"category":[{"scheme":"http://www.blogger.com/atom/ns#","term":"deep calculators"},{"scheme":"http://www.blogger.com/atom/ns#","term":"deep compression"},{"scheme":"http://www.blogger.com/atom/ns#","term":"deep learning"},{"scheme":"http://www.blogger.com/atom/ns#","term":"deepmind"},{"scheme":"http://www.blogger.com/atom/ns#","term":"facebook"},{"scheme":"http://www.blogger.com/atom/ns#","term":"google"},{"scheme":"http://www.blogger.com/atom/ns#","term":"ICLR"},{"scheme":"http://www.blogger.com/atom/ns#","term":"karpathy"},{"scheme":"http://www.blogger.com/atom/ns#","term":"LSTM"},{"scheme":"http://www.blogger.com/atom/ns#","term":"metric 
learning"},{"scheme":"http://www.blogger.com/atom/ns#","term":"openai"},{"scheme":"http://www.blogger.com/atom/ns#","term":"resnet"},{"scheme":"http://www.blogger.com/atom/ns#","term":"rnns"},{"scheme":"http://www.blogger.com/atom/ns#","term":"robotics"},{"scheme":"http://www.blogger.com/atom/ns#","term":"tensorflow"},{"scheme":"http://www.blogger.com/atom/ns#","term":"urtasun"},{"scheme":"http://www.blogger.com/atom/ns#","term":"visualization"},{"scheme":"http://www.blogger.com/atom/ns#","term":"yann lecun"},{"scheme":"http://www.blogger.com/atom/ns#","term":"yoshua bengio"}],"title":{"type":"text","$t":"Deep Learning Trends @ ICLR 2016"},"content":{"type":"html","$t":"Started by the youngest members of the Deep Learning Mafia [1], namely\u0026nbsp;\u003Ca href=\"http:\/\/yann.lecun.com\/\"\u003EYann LeCun\u003C\/a\u003E and \u003Ca href=\"http:\/\/www.iro.umontreal.ca\/~bengioy\/yoshua_en\/index.html\"\u003EYoshua Bengio\u003C\/a\u003E,\u0026nbsp;the ICLR conference is quickly becoming a strong contender for the single \u003Ci\u003Emost important venue in the Deep Learning space\u003C\/i\u003E. More intimate than NIPS and less benchmark-driven than CVPR, the world of ICLR is arXiv-based and moves fast.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"https:\/\/3.bp.blogspot.com\/-kVgTFEwiQ-0\/V06d4M-6i7I\/AAAAAAAAOqQ\/TxxSFstFdpw8Y3H9Q3SiRBBRvhXRY5VfwCLcB\/s1600\/deep_learning_machine_learning_conference_iclr_2016.png\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"261\" src=\"https:\/\/3.bp.blogspot.com\/-kVgTFEwiQ-0\/V06d4M-6i7I\/AAAAAAAAOqQ\/TxxSFstFdpw8Y3H9Q3SiRBBRvhXRY5VfwCLcB\/s400\/deep_learning_machine_learning_conference_iclr_2016.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cbr \/\u003E\u003Cbr \/\u003EToday's post is all about ICLR 2016. 
I’ll highlight new strategies for building deeper and more powerful neural networks, ideas for compressing big networks into smaller ones, as well as techniques for building “deep learning calculators.” A host of new artificial intelligence problems is being hit hard with the newest wave of deep learning techniques, and from a computer vision point of view, there's no doubt that \u003Ci\u003Edeep convolutional neural networks are today's \"master algorithm\" for dealing with perceptual data\u003C\/i\u003E.\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cb\u003EDeep Powwow in Paradise?\u003C\/b\u003E \u003Ca href=\"http:\/\/www.iclr.cc\/doku.php?id=iclr2016:main\"\u003EICLR 2016\u003C\/a\u003E was held in Puerto Rico.\u0026nbsp;\u003C\/div\u003E\u003Cbr \/\u003EWhether you're working in Robotics, Augmented Reality, or dealing with a computer vision-related problem, the following summary of ICLR research trends will give you a taste of what's possible on top of today's Deep Learning stack. Consider today's blog post a reading group conversation-starter.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cb\u003EPart I: ICLR vs CVPR\u003C\/b\u003E\u003Cbr \/\u003E\u003Cb\u003EPart II: ICLR 2016 Deep Learning Trends\u003C\/b\u003E\u003Cbr \/\u003E\u003Cb\u003EPart III: Quo Vadis Deep Learning?\u003C\/b\u003E\u003Cbr \/\u003E\u003Ch3\u003E\u003Cbr \/\u003EPart I: ICLR vs CVPR\u003C\/h3\u003ELast month's International Conference of Learning Representations, known briefly as ICLR 2016, and commonly pronounced as “eye-clear,” could more appropriately be called the \u003Ci\u003EInternational Conference on Deep Learning\u003C\/i\u003E. The ICLR 2016 conference was held May 2nd-4th 2016 in lovely Puerto Rico. 
This year was the 4th installment of the conference -- the first was in 2013 and it was initially so small that it had to be co-located with another conference. Because it was started by none other than the Deep Learning Mafia, it should be no surprise that just about everybody at the conference was studying and\/or applying Deep Learning Methods. Convolutional Neural Networks (which dominate image recognition tasks) were all over the place, with LSTMs and other Recurrent Neural Networks (used to model sequences and build \"deep learning calculators\") in second place. Most of my own research conference experiences come from CVPR (Computer Vision and Pattern Recognition), and I've been a regular CVPR attendee since 2004. Compared to ICLR, CVPR has a somewhat colder, more-empirical feel. To describe the difference between ICLR and CVPR, Yann LeCun, quoting \u003Ca href=\"http:\/\/www.cs.toronto.edu\/~urtasun\/\"\u003ERaquel Urtasun\u003C\/a\u003E (who got the original saying from \u003Ca href=\"http:\/\/www.cs.utoronto.ca\/~fidler\/\"\u003ESanja Fidler\u003C\/a\u003E), put it best on Facebook.\u003Cbr \/\u003E\u003Cdiv style=\"text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: center;\"\u003E\u003Cspan style=\"font-size: large;\"\u003E\u003Cb\u003ECVPR:\u003C\/b\u003E What can Deep Nets do for me?\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: center;\"\u003E\u003Cspan style=\"font-size: large;\"\u003E\u003Cb\u003EICLR:\u003C\/b\u003E What can I do for Deep Nets?\u003C\/span\u003E\u003C\/div\u003E\u003Cbr \/\u003EThe ICLR 2016 conference was my first official powwow that truly felt like a close-knit \"let's share knowledge\" event. 3 days of the main conference, plenty of evening networking events, and no workshops. With a total attendance of about 500, ICLR is about 1\/4 the size of CVPR. In fact, CVPR 2004 in D.C. 
was my first conference ever, and CVPRs are infamous for their packed poster sessions, multiple sessions, and enough workshops\/tutorials to make CVPRs last an entire week. At the end of CVPR, you'll have a research hangover and will need a few days to recuperate. I prefer the size and length of ICLR.\u003Cbr \/\u003E\u003Cbr \/\u003ECVPR and NIPS, like many other top-tier conferences heavily utilizing machine learning techniques, have grown to gargantuan sizes, and paper acceptance rates at these mega conferences are close to 20%. It is not necessarily true that the research papers at ICLR were any more half-baked than some CVPR papers, but the amount of experimental validation for an ICLR paper makes it a different kind of beast than CVPR. CVPR’s main focus is to produce papers that are ‘state-of-the-art’ and this essentially means you have to run your algorithm on a benchmark and beat last season’s leading technique. ICLR’s main focus is to highlight new and promising techniques in the analysis and design of deep convolutional neural networks, initialization schemes for such models, and the training algorithms to learn such models from raw data.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cb\u003EDeep Learning is Learning Representations\u003C\/b\u003E\u003Cbr \/\u003EYann LeCun and Yoshua Bengio started this conference in 2013 because there was a need for a new, small, high-quality venue with an explicit focus on deep methods. Why is the conference called “Learning Representations?” Because the typical deep neural networks that are trained in an end-to-end fashion actually learn such intermediate representations. Traditional shallow methods are based on manually-engineered features on top of a trainable classifier, but deep methods learn a network of layers which learns those highly-desired features as well as the classifier. So what do you get when you blur the line between features and classifiers? You get representation learning. 
And this is what Deep Learning is all about.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cb\u003EICLR Publishing Model: arXiv or bust\u003C\/b\u003E\u003Cbr \/\u003EAt ICLR, papers get posted on arXiv directly. And if you had any doubts that arXiv is just about the single awesomest thing to hit the research publication model since the Gutenberg press, let the success of ICLR be one more data point towards enlightenment. ICLR has essentially bypassed the old-fashioned publishing model where some third party like Elsevier says “you can publish with us and we’ll put our logo on your papers and then charge regular people $30 for each paper they want to read.” Sorry Elsevier, research doesn’t work that way. Most research papers aren’t good enough to be worth $30 for a copy. It is \u003Cb\u003Ethe entire body of academic research that provides true value, for which a single paper is just a mere door\u003C\/b\u003E. You see, \u003Ci\u003EElsevier\u003C\/i\u003E, if you actually gave the world an exceptional research paper search engine, together with the ability to have 10-20 papers printed on decent quality paper for a $30\/month subscription, then you would make a killing on researchers and I would endorse such a subscription. So ICLR, rightfully so, just said fuck it, we’ll use arXiv as the method for disseminating our ideas. \u003Cb\u003EAll future research conferences should use arXiv to disseminate papers\u003C\/b\u003E. Anybody can download the papers, see when newer versions with corrections are posted, and they can print their own physical copies. But be warned: \u003Cb\u003EDeep Learning moves so fast, that you’ve gotta be hitting refresh on arXiv on a weekly basis or you’ll be schooled by some grad students in Canada.\u003C\/b\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cb\u003EAttendees of ICLR\u003C\/b\u003E\u003Cbr \/\u003EGoogle DeepMind and Facebook’s FAIR constituted a large portion of the attendees. 
A lot of startups, researchers from the Googleplex, Twitter, NVIDIA, and startups such as Clarifai and Magic Leap. Overall a very young and vibrant crowd, and a very solid representation by super-smart 28-35 year olds.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ch3\u003EPart II: Deep Learning Themes @ ICLR 2016\u003C\/h3\u003E\u003Cb\u003EIncorporating Structure into Deep Learning\u003C\/b\u003E\u003Cbr \/\u003E\u003Ca href=\"http:\/\/www.cs.toronto.edu\/~urtasun\/\"\u003ERaquel Urtasun\u003C\/a\u003E\u0026nbsp;from the University of Toronto gave a talk about Incorporating Structure in Deep Learning. See \u003Ca href=\"http:\/\/videolectures.net\/iclr2016_urtasun_incoporating_structure\/\"\u003ERaquel's Keynote video\u003C\/a\u003E here. Many ideas from structure learning and graphical models were presented in her keynote. Raquel’s computer vision focus makes her work stand out, and she additionally showed some recent research snapshots from her upcoming CVPR 2016 work.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/www.cs.toronto.edu\/~fidler\/courses\/tutorialCVPR15.jpg\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" src=\"http:\/\/www.cs.toronto.edu\/~fidler\/courses\/tutorialCVPR15.jpg\" height=\"55\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003ERaquel gave a wonderful\u0026nbsp;\u003Ca href=\"http:\/\/www.cs.toronto.edu\/~fidler\/3DsceneTutorialCVPR15.html\"\u003E3D Indoor Understanding Tutorial\u003C\/a\u003E\u0026nbsp;at last year's CVPR 2015.\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003EOne of Raquel's strengths is her strong command of geometry, and her work covers both 
learning-based methods as well as multiple-view geometry. I strongly recommend keeping a close look at her upcoming research ideas. Below are two bleeding edge papers from Raquel's group -- the first one focuses on soccer field localization from a broadcast of such a game using\u0026nbsp;branch and bound inference in a MRF.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"https:\/\/4.bp.blogspot.com\/-tnT9Yh6Q1Fk\/Vz4RWM1UsDI\/AAAAAAAAOpk\/NZzwDimA0I8QQXMi8MZO69_y1De03j55wCLcB\/s1600\/soccer_field.png\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"126\" src=\"https:\/\/4.bp.blogspot.com\/-tnT9Yh6Q1Fk\/Vz4RWM1UsDI\/AAAAAAAAOpk\/NZzwDimA0I8QQXMi8MZO69_y1De03j55wCLcB\/s400\/soccer_field.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003ERaquel's new work. Soccer Field Localization from Single Image. Homayounfar et al, 2016.\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Ca href=\"http:\/\/arxiv.org\/abs\/1604.02715\"\u003ESoccer Field Localization from a Single Image\u003C\/a\u003E. \u003Ca href=\"http:\/\/www.cs.toronto.edu\/~namdar\/\"\u003ENamdar Homayounfar\u003C\/a\u003E, Sanja Fidler, Raquel Urtasun. 
in arXiv:1604.02715.\u003Cbr \/\u003E\u003Cbr \/\u003EThe second upcoming paper from Raquel's group is on using Deep Learning for Dense Optical Flow, in the spirit of \u003Ca href=\"http:\/\/arxiv.org\/abs\/1504.06852\"\u003EFlowNet\u003C\/a\u003E, which I discussed in my \u003Ca href=\"http:\/\/www.computervisionblog.com\/2015\/12\/iccv-2015-twenty-one-hottest-research.html\"\u003EICCV 2015 hottest papers blog post\u003C\/a\u003E.\u0026nbsp;The technique is built on the observation that the scene is typically composed of a static background, as well as a relatively small number of traffic participants which move rigidly in 3D. The dense optical flow technique is applied to autonomous driving.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"https:\/\/3.bp.blogspot.com\/-d2rhi4k0TkQ\/Vz4STflNjhI\/AAAAAAAAOps\/R03T3ZfjrOIo_sl6ojUpb0gV9Jw6fVJQwCLcB\/s1600\/optical_flow_raquel.png\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"230\" src=\"https:\/\/3.bp.blogspot.com\/-d2rhi4k0TkQ\/Vz4STflNjhI\/AAAAAAAAOps\/R03T3ZfjrOIo_sl6ojUpb0gV9Jw6fVJQwCLcB\/s400\/optical_flow_raquel.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cbr \/\u003E\u003Ca href=\"http:\/\/arxiv.org\/abs\/1604.01827\"\u003EDeep Semantic Matching for Optical Flow\u003C\/a\u003E.\u0026nbsp;Min Bai, Wenjie Luo, Kaustav Kundu, Raquel Urtasun. In arXiv:1604.01827.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cb\u003EReinforcement Learning\u003C\/b\u003E\u003Cbr \/\u003E\u003Ca href=\"http:\/\/homes.cs.washington.edu\/~svlevine\/\"\u003ESergey Levine\u003C\/a\u003E\u0026nbsp;gave an excellent Keynote on deep reinforcement learning and its application to Robotics[3]. See \u003Ca href=\"http:\/\/videolectures.net\/iclr2016_levine_deep_learning\/\"\u003ESergey's Keynote video\u003C\/a\u003E here. 
This kind of work is still the future, and there was very little robotics-related research in the main conference. It might not be surprising, because having an assembly of robotic arms is not cheap, and such gear is simply not present in most grad student research labs. Most ICLR work is pure software and some math theory, so a single GPU is all that is needed to start with a typical Deep Learning pipeline.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"https:\/\/1.bp.blogspot.com\/-q3ZcSqgSTQY\/VzuSF9yDc0I\/AAAAAAAAOos\/CAvlFLniqsg2m7JqHpCG9mQV3erpuHytgCLcB\/s1600\/robotarms.png\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"197\" src=\"https:\/\/1.bp.blogspot.com\/-q3ZcSqgSTQY\/VzuSF9yDc0I\/AAAAAAAAOos\/CAvlFLniqsg2m7JqHpCG9mQV3erpuHytgCLcB\/s400\/robotarms.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003EAn army of robot arms jointly learning to grasp somewhere inside Google.\u003C\/div\u003E\u003Cbr \/\u003ETake a look at the following interesting work which shows what Alex\u0026nbsp;\u003Ca href=\"https:\/\/www.cs.toronto.edu\/~kriz\/\"\u003EKrizhevsky\u003C\/a\u003E, the author of the legendary 2012 AlexNet paper which rocked the world of object recognition, is currently doing. And it has to do with Deep Learning for Robotics, currently at Google.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ca href=\"http:\/\/arxiv.org\/abs\/1603.02199\"\u003ELearning Hand-Eye Coordination for Robotic Grasping with Deep Learning and Large-Scale Data Collection\u003C\/a\u003E\u0026nbsp;Sergey Levine, Peter Pastor, Alex Krizhevsky, Deirdre Quillen. 
In arXiv:1603.02199.\u003Cbr \/\u003E\u003Cb\u003E\u003Cbr \/\u003E\u003C\/b\u003E For those of you who want to learn more about Reinforcement Learning, perhaps it is time to check out \u003Ca href=\"http:\/\/karpathy.github.io\/2016\/05\/31\/rl\/\"\u003EAndrej Karpathy's Deep Reinforcement Learning: Pong From Pixels\u003C\/a\u003E tutorial. One thing is for sure: when it comes to deep reinforcement learning, OpenAI is all-in.\u003Cbr \/\u003E\u003Cb\u003E\u003Cbr \/\u003E\u003C\/b\u003E \u003Cb\u003ECompressing Networks\u003C\/b\u003E\u003Cbr \/\u003E\u003Ctable cellpadding=\"0\" cellspacing=\"0\" class=\"tr-caption-container\" style=\"float: right; margin-left: 1em; text-align: right;\"\u003E\u003Ctbody\u003E\u003Ctr\u003E\u003Ctd style=\"text-align: center;\"\u003E\u003Ca href=\"https:\/\/3.bp.blogspot.com\/hyhK-UtSvzNbxzc_ub-RqRgyoghf4IA5DL4cuxspUGWOyHxdD4YO9Ckxsym7mDFxLGqg=w300\" style=\"clear: right; margin-bottom: 1em; margin-left: auto; margin-right: auto;\"\u003E\u003Cimg border=\"0\" height=\"200\" src=\"https:\/\/3.bp.blogspot.com\/hyhK-UtSvzNbxzc_ub-RqRgyoghf4IA5DL4cuxspUGWOyHxdD4YO9Ckxsym7mDFxLGqg=w300\" width=\"200\" \/\u003E\u003C\/a\u003E\u003C\/td\u003E\u003C\/tr\u003E\u003Ctr\u003E\u003Ctd class=\"tr-caption\" style=\"font-size: 12.8px; text-align: center;\"\u003EModel Compression: The WinZip of\u003Cbr \/\u003ENeural Nets?\u003C\/td\u003E\u003C\/tr\u003E\u003C\/tbody\u003E\u003C\/table\u003EWhile NVIDIA might be today’s king of Deep Learning Hardware, I can’t help the feeling that there is a new player lurking in the shadows. You see, GPU-based mining of bitcoin didn’t last very long once people realized the economic value of owning bitcoins. Bitcoin very quickly transitioned into specialized FPGA hardware for running the underlying bitcoin computations, and the FPGAs of Deep Learning are right around the corner. Will NVIDIA remain the King? I see a fork in NVIDIA's future. 
You can continue producing hardware which pleases both gamers and machine learning researchers, or you can specialize. There is a plethora of interesting companies like Nervana Systems, Movidius, and most importantly Google, that don’t want to rely on power-hungry heatboxes known as GPUs, especially when it comes to scaling already trained deep learning models. Just take a look at \u003Ca href=\"http:\/\/www.movidius.com\/solutions\/machine-vision-algorithms\/machine-learning\"\u003EFathom by Movidius\u003C\/a\u003E or the \u003Ca href=\"https:\/\/cloudplatform.googleblog.com\/2016\/05\/Google-supercharges-machine-learning-tasks-with-custom-chip.html\"\u003EGoogle TPU\u003C\/a\u003E.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003EBut the world has already seen the economic value of Deep Nets, and the “software” side of deep nets isn't waiting for the FPGAs of neural nets. \u003Cb\u003EThe software version of compressing neural networks is a very trendy topic.\u003C\/b\u003E You basically want to take a beefy neural network and compress it down into smaller, more efficient model. Binarizing the weights is one such strategy. Student-Teacher networks where a smaller network is trained to mimic the larger network are already here. 
And don’t be surprised if within the next year we’ll see 1MB sized networks performing at the level of Oxford’s VGGNet on the ImageNet 1000-way classification task.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"https:\/\/4.bp.blogspot.com\/-h73q0MJPdZQ\/VzuH6aglbtI\/AAAAAAAAOnQ\/Df8Uqn343ZwrE_VPWFNL0Ccb_eQTZMMEgCLcB\/s1600\/deep_compress.png\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"283\" src=\"https:\/\/4.bp.blogspot.com\/-h73q0MJPdZQ\/VzuH6aglbtI\/AAAAAAAAOnQ\/Df8Uqn343ZwrE_VPWFNL0Ccb_eQTZMMEgCLcB\/s400\/deep_compress.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003ESummary from ICLR 2016's Deep Compression paper by Han et al.\u003C\/div\u003E\u003Cbr \/\u003EThis year's ICLR brought a slew of Compression papers, the three which stood out are listed below.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ca href=\"http:\/\/arxiv.org\/abs\/1510.00149\"\u003EDeep Compression: Compressing Deep Neural Networks with Pruning, Trained Quantization and Huffman Coding\u003C\/a\u003E.\u0026nbsp;Song Han, Huizi Mao, and Bill Dally. In ICLR 2016. This paper won the Best Paper Award. See Han give the \u003Ca href=\"http:\/\/videolectures.net\/iclr2016_han_deep_compression\/\"\u003EDeep Compression\u003C\/a\u003E\u0026nbsp;talk.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ca href=\"https:\/\/arxiv.org\/abs\/1510.03009\"\u003ENeural Networks with Few Multiplications\u003C\/a\u003E. Zhouhan Lin, Matthieu Courbariaux, Roland Memisevic, Yoshua Bengio. In ICLR 2016.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ca href=\"http:\/\/arxiv.org\/abs\/1511.04561\"\u003E8-Bit Approximations for Parallelism in Deep Learning\u003C\/a\u003E. Tim Dettmers. 
In ICLR 2016.\u003Cbr \/\u003E\u003Cb\u003E\u003Cbr \/\u003E\u003C\/b\u003E \u003Cb\u003EUnsupervised Learning\u003C\/b\u003E\u003Cbr \/\u003EPhilip Isola presented a very Efrosian paper on using Siamese Networks defined on patches to learn a patch similarity function in an unsupervised way. This patch-patch similarity function was used to create a local similarity graph defined over an image which can be used to discover the extent of objects. This reminds me of the Object Discovery line of research started by Alyosha Efros and the MIT group, where the basic idea is to abstain from using class labels in learning a similarity function.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"https:\/\/3.bp.blogspot.com\/-8a73VKcm2WI\/VzuEfhXpKWI\/AAAAAAAAOmk\/iu50QDSoGx8hejfW_bUmpDwdhsyyBAOqQCLcB\/s1600\/isola.png\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"118\" src=\"https:\/\/3.bp.blogspot.com\/-8a73VKcm2WI\/VzuEfhXpKWI\/AAAAAAAAOmk\/iu50QDSoGx8hejfW_bUmpDwdhsyyBAOqQCLcB\/s400\/isola.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003EIsola et al: A Siamese network has shared weights and can be used for learning embeddings or \"similarity functions.\"\u003C\/div\u003E\u003Cbr \/\u003E\u003Ca href=\"http:\/\/arxiv.org\/pdf\/1511.06811.pdf\"\u003E\u003Cbr \/\u003E\u003C\/a\u003E \u003Ca href=\"http:\/\/arxiv.org\/pdf\/1511.06811.pdf\"\u003ELearning visual groups from co-occurrences in space and time\u003C\/a\u003E \u003Ca href=\"http:\/\/web.mit.edu\/phillipi\/\"\u003EPhillip Isola\u003C\/a\u003E, Daniel Zoran, Dilip Krishnan, Edward H. Adelson. 
In ICLR 2016.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"https:\/\/1.bp.blogspot.com\/-ALMDlo_icyA\/VzuFbnYbjVI\/AAAAAAAAOms\/ND7_YlEBIwEW22pPDFO1cUpzIL0PuUAugCLcB\/s1600\/isola2.png\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"211\" src=\"https:\/\/1.bp.blogspot.com\/-ALMDlo_icyA\/VzuFbnYbjVI\/AAAAAAAAOms\/ND7_YlEBIwEW22pPDFO1cUpzIL0PuUAugCLcB\/s400\/isola2.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003EIsola et al: Visual groupings applied to image patches, frames of a video, and a large scene dataset.\u003C\/div\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cb\u003EInitializing Networks: And why BatchNorm matters\u0026nbsp;\u003C\/b\u003E\u003Cbr \/\u003EGetting a neural network up and running is more difficult than it seems. Several papers in ICLR 2016 suggested new ways of initializing networks. But practically speaking, deep net initialization is “essentially solved.” Initialization seems to be\u0026nbsp;an area of research that truly became more of a “science” than an “art” once researchers introduced \u003Ca href=\"http:\/\/arxiv.org\/abs\/1502.03167\"\u003EBatchNorm\u003C\/a\u003E into their neural networks. \u003Ci\u003EBatchNorm is the butter of Deep Learning -- add it to everything and everything will taste better.\u0026nbsp;\u003C\/i\u003EBut this wasn’t always the case!\u003Cbr \/\u003E\u003Cbr \/\u003EIn the early days, researchers had lots of problems with constructing an initial set of weights of a deep neural network such that the back propagation could learn anything. 
In fact, one of the reasons why the Neural Networks of the 90s died as a research program, is precisely because it was well-known that a handful of top researchers knew how to tune their networks so that they could start automatically learning from data, but the other researchers didn’t know all of the right initialization tricks. It was as if the “black magic” inside the 90s NNs was just too intense. At some point, convex methods and kernel SVMs became the tools of choice — with no need to initialize in a convex optimization setting, for almost a decade (1995 to 2005) researchers just ran away from deep methods. Once 2006 hit, Deep Architectures were working again with Hinton’s magical deep Boltzmann Machines and unsupervised pretraining. Unsupervised pretraining didn’t last long, as researchers got GPUs and found that once your data set is large enough (think ~2 million images in ImageNet), that simple discriminative back-propagation does work. Random weight initialization strategies and cleverly tuned learning rates were quickly shared amongst researchers once 100s of them jumped on the ImageNet dataset. People started sharing code, and wonderful things happened!\u003Cbr \/\u003E\u003Cbr \/\u003EBut designing new neural networks for new problems was still problematic -- one wouldn't know exactly the best way to set multiple learning rates and random initialization magnitudes. But researchers got to work, and a handful of solid hackers from Google found out that the key problem was that poorly initialized networks were having a hard time flowing information through the networks. It’s as if layer N was producing activations in one range and the subsequent layers were expecting information to be of another order of magnitude. So Szegedy and Ioffe from Google proposed a simple “trick” to whiten the flow of data as it passes through the network. 
Their trick, called “BatchNorm” involves using a normalization layer after each convolutional and\/or fully-connected layer in a deep network. This normalization layer whitens the data by subtracting a mean and dividing by a standard deviation, thus producing roughly gaussian numbers as information flows through the network. So simple, yet so sweet. \u003Ci\u003EThe idea of whitening data is so prevalent in all of machine learning, that it’s silly that it took deep learning researchers so long to re-discover the trick in the context of deep nets.\u003C\/i\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ca href=\"https:\/\/arxiv.org\/abs\/1511.06856\"\u003EData-dependent Initializations of Convolutional Neural Networks\u003C\/a\u003E Philipp Krähenbühl, Carl Doersch, Jeff Donahue, Trevor Darrell. In ICLR 2016. Carl Doersch, a fellow CMU PhD, is going to DeepMind, so there goes another point for DeepMind.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cb\u003EBackprop Tricks\u003C\/b\u003E\u003Cbr \/\u003EInjecting noise into the gradient seems to work. And this reminds me of the common grad student dilemma where you fix a bug in your gradient calculation, and your learning algorithm does worse. You see, when you were computing the derivative on the white board, you probably made a silly mistake like messing up a coefficient that balances two terms or forgetting an additive \/ multiplicative term somewhere. \u0026nbsp;However, with a high probability, your “buggy gradient” was actually correlated with the true “gradient”. And in many scenarios, a quantity correlated with the true gradient is better than the true gradient. \u0026nbsp;It is a certain form of regularization that hasn’t been adequately addressed in the research community. \u003Cb\u003EWhat kinds of “buggy gradients” are actually good for learning?\u003C\/b\u003E And is there a space of “buggy gradients” that are cheaper to compute than “true gradients”? 
These “FastGrad” methods could speed up training deep networks, at least for the first several epochs. Maybe by ICLR 2017 somebody will decide to pursue this research track.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"https:\/\/3.bp.blogspot.com\/-aNwn04qZ7Go\/VzuHQd9OyaI\/AAAAAAAAOnE\/wDc-Pg1xrCQPn_BmvXalZx-f8y_MEzqpgCLcB\/s1600\/noise.png\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"205\" src=\"https:\/\/3.bp.blogspot.com\/-aNwn04qZ7Go\/VzuHQd9OyaI\/AAAAAAAAOnE\/wDc-Pg1xrCQPn_BmvXalZx-f8y_MEzqpgCLcB\/s400\/noise.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cbr \/\u003E\u003Ca href=\"http:\/\/arxiv.org\/abs\/1511.06807\"\u003EAdding Gradient Noise Improves Learning for Very Deep Networks\u003C\/a\u003E. Arvind Neelakantan, Luke Vilnis, Quoc V. Le, Ilya Sutskever, Lukasz Kaiser, Karol Kurach, James Martens. In ICLR 2016.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ca href=\"http:\/\/arxiv.org\/abs\/1511.06306\"\u003ERobust Convolutional Neural Networks under Adversarial Noise\u003C\/a\u003E Jonghoon Jin, Aysegul Dundar, Eugenio Culurciello. In ICLR 2016.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cb\u003EAttention: Focusing Computations\u003C\/b\u003E\u003Cbr \/\u003EAttention-based methods are all about treating different \"interesting\" areas with more care than the \"boring\" areas. Not all pixels are equal, and people are able to quickly focus on the interesting bits of a static picture. ICLR 2016's most interesting \"attention\" paper was the Dynamic Capacity Networks paper from \u003Ca href=\"https:\/\/aaroncourville.wordpress.com\/\"\u003EAaron Courville\u003C\/a\u003E's group at the University of Montreal. 
\u003Ca href=\"http:\/\/www.dmi.usherb.ca\/~larocheh\/index_en.html\"\u003EHugo Larochelle\u003C\/a\u003E, another key researcher with strong ties to the Deep Learning mafia, is now a Research Scientist at Twitter.\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"https:\/\/4.bp.blogspot.com\/-x8A-5ffHjAk\/Vz4P6Mugm7I\/AAAAAAAAOpY\/dhgutS6Alv8Ra8bDaF7uIe96GQ-7fQ80ACLcB\/s1600\/dcn.png\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"167\" src=\"https:\/\/4.bp.blogspot.com\/-x8A-5ffHjAk\/Vz4P6Mugm7I\/AAAAAAAAOpY\/dhgutS6Alv8Ra8bDaF7uIe96GQ-7fQ80ACLcB\/s400\/dcn.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Ca href=\"http:\/\/arxiv.org\/abs\/1511.07838\"\u003EDynamic Capacity Networks\u003C\/a\u003E Amjad Almahairi, Nicolas Ballas, Tim Cooijmans, Yin Zheng, Hugo Larochelle, Aaron Courville. In ICLR 2016.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cb\u003EThe “ResNet trick”: Going Mega Deep because it's Mega Fun\u003C\/b\u003E\u003Cbr \/\u003EWe saw some new papers on the new “ResNet” trick which emerged within the last few months in the Deep Learning Community. The ResNet trick is the “Residual Net” trick that gives us a rule for creating a deep stack of layers. Because each residual layer essentially learns to either pass the raw data through or mix in some combination of a non-linear transformation, the flow of information is much smoother. This “control of flow” that comes with residual blocks, lets you build VGG-style networks that are quite deep.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ca href=\"http:\/\/arxiv.org\/abs\/1602.07261\"\u003EInception-v4, Inception-ResNet and the Impact of Residual Connections on Learning\u003C\/a\u003E\u0026nbsp;Christian Szegedy, Sergey Ioffe, Vincent Vanhoucke. 
In ICLR 2016.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"https:\/\/4.bp.blogspot.com\/-t5YKN7kWXUk\/VzuPH1020KI\/AAAAAAAAOoU\/aGQSTQJoZFMPzWNC0t0Le5zgEHZbDQD4QCLcB\/s1600\/rir.png\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"147\" src=\"https:\/\/4.bp.blogspot.com\/-t5YKN7kWXUk\/VzuPH1020KI\/AAAAAAAAOoU\/aGQSTQJoZFMPzWNC0t0Le5zgEHZbDQD4QCLcB\/s400\/rir.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cbr \/\u003E\u003Ca href=\"https:\/\/arxiv.org\/abs\/1603.08029\"\u003EResnet in Resnet: Generalizing Residual Architectures\u003C\/a\u003E\u0026nbsp;Sasha Targ, Diogo Almeida, Kevin Lyman. In ICLR 2016.\u003Cbr \/\u003E\u003Cb\u003E\u003Cbr \/\u003E\u003C\/b\u003E\u003Cb\u003EDeep Metric Learning and Learning Subcategories\u003C\/b\u003E\u003Cbr \/\u003EA great paper, presented by Manohar Paluri of Facebook, focused on a new way to think about deep metric learning. The paper is “Metric Learning with Adaptive Density Discrimination” and reminds me of my own research from CMU. Their key idea can be distilled to the “anti-category” argument. Basically, you build into your algorithm the intuition that not all elements of a category C1 should collapse into a single unique representation. Due to the visual variety within a category, you only make the assumption that an element X of category C is going to be similar to a subset of other Cs, and not all of them. In their paper, they make the assumption that all members of category C belong to a set of latent subcategories, and EM-like learning alternates between finding subcategory assignments and updating the distance metric. 
During my PhD, we took this idea even further and built Exemplar-SVMs which were the smallest possible subcategories with a single positive “exemplar” member.\u003Cbr \/\u003E\u003Cbr \/\u003EManohar started his research as a member of the FAIR team, which focuses more on R\u0026amp;D work, but metric learning ideas are very product-focused, and the paper is a great example of a technology that seems to be \"product-ready.\" I envision dozens of Facebook products that can benefit from such data-derived adaptive deep distance metrics.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"https:\/\/3.bp.blogspot.com\/-gTAhKIRASiM\/VzuOHfb5YiI\/AAAAAAAAOoM\/lQEaQntSn9sx1gk89oL4ItZb0FXCplRhACLcB\/s1600\/magnet.png\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"171\" src=\"https:\/\/3.bp.blogspot.com\/-gTAhKIRASiM\/VzuOHfb5YiI\/AAAAAAAAOoM\/lQEaQntSn9sx1gk89oL4ItZb0FXCplRhACLcB\/s400\/magnet.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Ca href=\"http:\/\/arxiv.org\/abs\/1511.05939\"\u003EMetric Learning with Adaptive Density Discrimination\u003C\/a\u003E. Oren Rippel, Manohar Paluri, Piotr Dollar, Lubomir Bourdev. In ICLR 2016.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cb\u003EDeep Learning Calculators\u003C\/b\u003E\u003Cbr \/\u003ELSTMs, Deep Neural Turing Machines, and what I call “Deep Learning Calculators” were big at the conference. Some people say, “Just because you can use deep learning to build a calculator, it doesn’t mean you should.\" And for some people, Deep Learning is the Holy-Grail-Titan-Power-Hammer, and everything that can be described with words should be built using deep learning components. Nevertheless, it's an exciting time for Deep Turing Machines.\u003Cbr \/\u003E\u003Cbr \/\u003EThe winner of the Best Paper Award was the paper, Neural Programmer-Interpreters by Scott Reed and Nando de Freitas. 
An interesting way to blend deep learning with the theory of computation. If you’re wondering what it would look like to use Deep Learning to learn quicksort, then check out their paper. And it seems like Scott Reed is going to Google DeepMind, so you can tell where they’re placing their bets.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"https:\/\/1.bp.blogspot.com\/-0LB4ZLszSOo\/VzuI9FcyjOI\/AAAAAAAAOnY\/2FLtPToVyNstjBdPbLoIavWiNU5B7G7RACLcB\/s1600\/npi.png\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"167\" src=\"https:\/\/1.bp.blogspot.com\/-0LB4ZLszSOo\/VzuI9FcyjOI\/AAAAAAAAOnY\/2FLtPToVyNstjBdPbLoIavWiNU5B7G7RACLcB\/s400\/npi.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Ca href=\"http:\/\/arxiv.org\/abs\/1511.06279\"\u003ENeural Programmer-Interpreters\u003C\/a\u003E. Scott Reed, Nando de Freitas. In ICLR 2016.\u003Cbr \/\u003E\u003Cbr \/\u003EAnother interesting paper by some OpenAI guys is “Neural Random-Access Machines” which is going to be another fan favorite for those who love Deep Learning Calculators.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"https:\/\/2.bp.blogspot.com\/-rPhlusUS8BQ\/VzuJcpzPGII\/AAAAAAAAOnc\/PQh4xKXy9_42k9UkiaWh6ig13Cuot58BQCLcB\/s1600\/nram.png\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"165\" src=\"https:\/\/2.bp.blogspot.com\/-rPhlusUS8BQ\/VzuJcpzPGII\/AAAAAAAAOnc\/PQh4xKXy9_42k9UkiaWh6ig13Cuot58BQCLcB\/s400\/nram.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Ca href=\"http:\/\/arxiv.org\/abs\/1511.06392\"\u003ENeural Random-Access Machines\u003C\/a\u003E. Karol Kurach, Marcin Andrychowicz, Ilya Sutskever. 
In ICLR 2016.\u003Cbr \/\u003E\u003Cb\u003E\u003Cbr \/\u003E\u003C\/b\u003E \u003Cb\u003EComputer Vision Applications\u003C\/b\u003E\u003Cbr \/\u003EBoundary detection is a common computer vision task, where the goal is to predict boundaries between objects. CV folks have been using image pyramids, or multi-level processing, for quite some time. Check out the following Deep Boundary paper which aggregates information across multiple spatial resolutions.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"https:\/\/2.bp.blogspot.com\/-ueMFLkPpDWg\/Vz4Nr6MCqBI\/AAAAAAAAOpA\/nUDIxovuT6QRsYfNazo1DPfmHCvD30MjgCLcB\/s1600\/segmentation.png\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"268\" src=\"https:\/\/2.bp.blogspot.com\/-ueMFLkPpDWg\/Vz4Nr6MCqBI\/AAAAAAAAOpA\/nUDIxovuT6QRsYfNazo1DPfmHCvD30MjgCLcB\/s400\/segmentation.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Ca href=\"http:\/\/arxiv.org\/abs\/1511.07386\"\u003EPushing the Boundaries of Boundary Detection using Deep Learning\u003C\/a\u003E Iasonas Kokkinos, In ICLR 2016.\u003Cbr \/\u003E\u003Cbr \/\u003EA great application for RNNs is to \"unfold\" an image into multiple layers. In the context of object detection, the goal is to decompose an image into its parts. 
The following figure explains it best, but if you've been wondering where to use RNNs in your computer vision pipeline, check out their paper.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"https:\/\/2.bp.blogspot.com\/-yRIrE3-QqJk\/Vz4OHzSgxUI\/AAAAAAAAOpE\/AhGwpsQdQE8vswQ59Kss2r1xb8bbhcVEgCLcB\/s1600\/decompnet.png\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"225\" src=\"https:\/\/2.bp.blogspot.com\/-yRIrE3-QqJk\/Vz4OHzSgxUI\/AAAAAAAAOpE\/AhGwpsQdQE8vswQ59Kss2r1xb8bbhcVEgCLcB\/s400\/decompnet.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Ca href=\"http:\/\/arxiv.org\/abs\/1511.06449\"\u003ELearning to decompose for object detection and instance segmentation\u003C\/a\u003E Eunbyung Park, Alexander C. Berg. In ICLR 2016.\u003Cbr \/\u003E\u003Cbr \/\u003EDilated convolutions are a \"trick\" which allows you to increase your network's receptive field size and scene segmentation is one of the best application domains for such dilations.\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"https:\/\/4.bp.blogspot.com\/-2LqF23x7i7w\/Vz4OpSVHzzI\/AAAAAAAAOpM\/tnydeP_1zLQ7BmuW7_ndoNUcJ0JjrQLOgCLcB\/s1600\/dilated_convolutions.png\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"211\" src=\"https:\/\/4.bp.blogspot.com\/-2LqF23x7i7w\/Vz4OpSVHzzI\/AAAAAAAAOpM\/tnydeP_1zLQ7BmuW7_ndoNUcJ0JjrQLOgCLcB\/s400\/dilated_convolutions.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cbr \/\u003E\u003Ca href=\"https:\/\/arxiv.org\/abs\/1511.07122\"\u003EMulti-Scale Context Aggregation by Dilated Convolutions\u003C\/a\u003E Fisher Yu, Vladlen Koltun. 
In ICLR 2016.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cb\u003E\u003Cbr \/\u003E\u003C\/b\u003E \u003Cb\u003E\u003Cbr \/\u003E\u003C\/b\u003E \u003Cb\u003EVisualizing Networks\u003C\/b\u003E\u003Cbr \/\u003ETwo of the best “visualization” papers were “Do Neural Networks Learn the same thing?” by\u003Cbr \/\u003E\u003Ca href=\"http:\/\/yosinski.com\/\"\u003EJason Yosinski\u003C\/a\u003E (now going to \u003Ca href=\"http:\/\/www.geometricintelligence.com\/\"\u003EGeometric Intelligence, Inc.\u003C\/a\u003E) and “Visualizing and Understanding Recurrent Networks” presented by Andrej Karpathy (now going to \u003Ca href=\"https:\/\/openai.com\/\"\u003EOpenAI\u003C\/a\u003E). Yosinski presented his work on studying what happens when you learn two different networks using different initializations. Do the nets learn the same thing? I remember a great conversation with Jason about figuring out if the neurons in network A can be represented as linear combinations of network B, and his visualizations helped make the case. Andrej’s visualizations of recurrent networks are best consumed in presentation\/blog form[2]. 
For those of you that haven’t yet seen Andrej’s analysis of Recurrent Nets on Hacker News, check it out \u003Ca href=\"http:\/\/karpathy.github.io\/2015\/05\/21\/rnn-effectiveness\/\"\u003Ehere\u003C\/a\u003E.\u003Cbr \/\u003E\u003Ca href=\"http:\/\/arxiv.org\/abs\/1511.07543\"\u003E\u003Cbr \/\u003E\u003C\/a\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"https:\/\/3.bp.blogspot.com\/-xUV5xuzWXss\/VzuJ8xC4n9I\/AAAAAAAAOnk\/Fvqx11nQDPc45cCFJ-aIjfghE2hhubaIgCLcB\/s1600\/jason.png\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"187\" src=\"https:\/\/3.bp.blogspot.com\/-xUV5xuzWXss\/VzuJ8xC4n9I\/AAAAAAAAOnk\/Fvqx11nQDPc45cCFJ-aIjfghE2hhubaIgCLcB\/s400\/jason.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Ca href=\"http:\/\/arxiv.org\/abs\/1511.07543\"\u003EConvergent Learning: Do different neural networks learn the same representations?\u003C\/a\u003E Yixuan Li, Jason Yosinski, Jeff Clune, Hod Lipson, John Hopcroft. In ICLR 2016. See \u003Ca href=\"http:\/\/videolectures.net\/iclr2016_yosinski_convergent_learning\/\"\u003EYosinski's video here.\u003C\/a\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"https:\/\/2.bp.blogspot.com\/-6uqAR3BM4ew\/VzuKdEhrhFI\/AAAAAAAAOns\/Z-MeozwrTDwAVBtw_T-YKquhNezChFOBACLcB\/s1600\/karpathy.png\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"202\" src=\"https:\/\/2.bp.blogspot.com\/-6uqAR3BM4ew\/VzuKdEhrhFI\/AAAAAAAAOns\/Z-MeozwrTDwAVBtw_T-YKquhNezChFOBACLcB\/s400\/karpathy.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Ca href=\"https:\/\/arxiv.org\/abs\/1506.02078\"\u003EVisualizing and Understanding Recurrent Networks\u003C\/a\u003E Andrej Karpathy, Justin Johnson, Li Fei-Fei. 
In ICLR 2016.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cb\u003EDo Deep Convolutional Nets Really Need to be Deep (Or Even Convolutional)?\u0026nbsp;\u003C\/b\u003E\u003Cbr \/\u003E\u003Ctable cellpadding=\"0\" cellspacing=\"0\" class=\"tr-caption-container\" style=\"float: right; margin-left: 1em; text-align: right;\"\u003E\u003Ctbody\u003E\u003Ctr\u003E\u003Ctd style=\"text-align: center;\"\u003E\u003Ca href=\"https:\/\/1.bp.blogspot.com\/-zyh8fC4l0Dg\/VzuLLC5doJI\/AAAAAAAAOn4\/EYCgIR07ZWYfDTBKGTEXjKsqmjUWNc4ywCLcB\/s1600\/caruana.png\" style=\"clear: right; margin-bottom: 1em; margin-left: auto; margin-right: auto;\"\u003E\u003Cimg border=\"0\" height=\"200\" src=\"https:\/\/1.bp.blogspot.com\/-zyh8fC4l0Dg\/VzuLLC5doJI\/AAAAAAAAOn4\/EYCgIR07ZWYfDTBKGTEXjKsqmjUWNc4ywCLcB\/s200\/caruana.png\" width=\"113\" \/\u003E\u003C\/a\u003E\u003C\/td\u003E\u003C\/tr\u003E\u003Ctr\u003E\u003Ctd class=\"tr-caption\" style=\"text-align: center;\"\u003EFigure from Do Nets have to be Deep?\u003C\/td\u003E\u003C\/tr\u003E\u003C\/tbody\u003E\u003C\/table\u003EThis was the key question asked in the paper presented by Rich Caruana. (Dr. Caruana is now at Microsoft, but I remember meeting him at Cornell eleven years ago) Their paper's two key results are quite meaningful if you sit back and think about them. First, there is something truly special about convolutional layers that when applied to images, they are significantly better than using solely fully connected layers -- there’s something about the 2D structure of images and the 2D structures of filters that makes convolutional layers get a lot of value out of their parameters. Secondly, we now have teacher-student training algorithms which you can use to have a shallower network “mimic” the teacher’s responses on a large dataset. These shallower networks are able to learn much better using a teacher and in fact, such shallow networks produce inferior results when they are trained on the teacher’s training set. 
\u0026nbsp;So it seems you can go [Data to MegaDeep], and [MegaDeep to MiniDeep], but you cannot directly go from [Data to MiniDeep].\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ca href=\"https:\/\/arxiv.org\/abs\/1603.05691\"\u003EDo Deep Convolutional Nets Really Need to be Deep (Or Even Convolutional)?\u003C\/a\u003E Gregor Urban, Krzysztof J. Geras, Samira Ebrahimi Kahou, Ozlem Aslan, Shengjie Wang, Rich Caruana, Abdelrahman Mohamed, Matthai Philipose, Matt Richardson. In ICLR 2016.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003EAnother interesting idea on the [MegaDeep to MiniDeep] and [MiniDeep to MegaDeep] front,\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"https:\/\/4.bp.blogspot.com\/-bshX-UYGcBw\/VzuLyYRek6I\/AAAAAAAAOoA\/MGatuBrhEMw5yFVX89O1AdHxCTTpkHldgCLcB\/s1600\/net2deepernet.png\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"97\" src=\"https:\/\/4.bp.blogspot.com\/-bshX-UYGcBw\/VzuLyYRek6I\/AAAAAAAAOoA\/MGatuBrhEMw5yFVX89O1AdHxCTTpkHldgCLcB\/s400\/net2deepernet.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ca href=\"http:\/\/arxiv.org\/abs\/1511.05641\"\u003ENet2Net: Accelerating Learning via Knowledge Transfer\u003C\/a\u003E Tianqi Chen, Ian Goodfellow, Jonathon Shlens. In ICLR 2016.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cb\u003ELanguage Modeling with LSTMs\u003C\/b\u003E\u003Cbr \/\u003EThere was also considerable focus on methods that deal with large bodies of text. 
Chris Dyer (who is supposedly also going to DeepMind), gave a keynote asking the question “Should Model Architecture Reflect Linguistic Structure?” See \u003Ca href=\"http:\/\/videolectures.net\/iclr2016_dyer_model_architecture\/\"\u003EChris Dyer's Keynote video here.\u003C\/a\u003E Some of his key take-aways from comparing word-level embedding vs character-level embeddings\u0026nbsp;is that for different languages, different methods work better. \u0026nbsp;For languages which have a rich syntax, character-level encodings outperform word-level encodings.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ca href=\"http:\/\/arxiv.org\/abs\/1508.00657\"\u003EImproved Transition-Based Parsing by Modeling Characters instead of Words with LSTMs\u003C\/a\u003E Miguel Ballesteros, Chris Dyer, Noah A. Smith.\u0026nbsp;In Proceedings of EMNLP 2015.\u003Cbr \/\u003E\u003Cbr \/\u003EAn interesting approach, with a great presentation by Ivan Vendrov, was “Order-Embeddings of Images and Language\" by Ivan Vendrov, Ryan Kiros, Sanja Fidler, and Raquel Urtasun which showed a great intuitive coordinate-system-y way for thinking about concepts. 
I really love these coordinate system analogies and I’m all for new ways of thinking about classical problems.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"https:\/\/1.bp.blogspot.com\/-MZtgHEPVtgc\/VzuRW9bBTMI\/AAAAAAAAOok\/cUasXjkQArU8RcdN5Sbx1XpZ0XVba6XuwCLcB\/s1600\/order.png\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"320\" src=\"https:\/\/1.bp.blogspot.com\/-MZtgHEPVtgc\/VzuRW9bBTMI\/AAAAAAAAOok\/cUasXjkQArU8RcdN5Sbx1XpZ0XVba6XuwCLcB\/s320\/order.png\" width=\"314\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cbr \/\u003E\u003Ca href=\"http:\/\/arxiv.org\/abs\/1511.06361\"\u003E\u003Cbr \/\u003E\u003C\/a\u003E \u003Ca href=\"http:\/\/arxiv.org\/abs\/1511.06361\"\u003EOrder-Embeddings of Images and Language\u003C\/a\u003E Ivan Vendrov, Ryan Kiros, Sanja Fidler, Raquel Urtasun. In ICLR 2016. \u003Ca href=\"http:\/\/videolectures.net\/iclr2016_vendrov_order_embeddings\/\"\u003ESee Video here.\u003C\/a\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cb\u003ETraining-Free Methods: Brain-dead applications of CNNs to Image Matching\u003C\/b\u003E\u003Cbr \/\u003E\u003Cbr \/\u003EThese techniques use the activation maps of deep neural networks trained on an ImageNet classification task for other important computer vision tasks. 
These techniques employ clever ways of matching image regions and from the following ICLR paper, are applied to smart image retrieval.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"https:\/\/4.bp.blogspot.com\/-klhAxCYUJtU\/VzuGlS1XnrI\/AAAAAAAAOm4\/q4XI90vBGno8jGZdG87IJEzbIkkJRQumgCLcB\/s1600\/mac.png\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"130\" src=\"https:\/\/4.bp.blogspot.com\/-klhAxCYUJtU\/VzuGlS1XnrI\/AAAAAAAAOm4\/q4XI90vBGno8jGZdG87IJEzbIkkJRQumgCLcB\/s400\/mac.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Ca href=\"http:\/\/arxiv.org\/abs\/1511.05879\"\u003EParticular object retrieval with integral max-pooling of CNN activations\u003C\/a\u003E. Giorgos Tolias, Ronan Sicre, Hervé Jégou. In ICLR 2016.\u003Cbr \/\u003E\u003Cbr \/\u003EThis reminds me of the RSS 2015 paper which uses ConvNets to match landmarks for a relocalization-like SLAM task.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"https:\/\/2.bp.blogspot.com\/-_gd5f9tSbv8\/VzuGqXBphKI\/AAAAAAAAOm8\/w0umiPYN8yA6Mhke88kHXhoDZZhF4KUxgCLcB\/s1600\/rss.png\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"71\" src=\"https:\/\/2.bp.blogspot.com\/-_gd5f9tSbv8\/VzuGqXBphKI\/AAAAAAAAOm8\/w0umiPYN8yA6Mhke88kHXhoDZZhF4KUxgCLcB\/s400\/rss.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Ca href=\"http:\/\/eprints.qut.edu.au\/84931\/1\/rss15_placeRec.pdf\"\u003EPlace Recognition with ConvNet Landmarks: Viewpoint-Robust, Condition-Robust, Training-Free\u003C\/a\u003E.\u0026nbsp;Niko Sunderhauf, Sareh Shirazi, Adam Jacobson, Feras Dayoub, Edward Pepperell, Ben Upcroft, and Michael Milford. 
In RSS 2015.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cb\u003E\u003Cbr class=\"Apple-interchange-newline\" \/\u003EGaussian Processes and Auto Encoders\u003C\/b\u003E\u003Cbr \/\u003EGaussian Processes used to be quite popular at NIPS, sometimes used for vision problems, but mostly “forgotten” in the era of Deep Learning. VAEs or Variational Auto Encoders used to be much more popular when pretraining was the only way to train deep neural nets. However, with new techniques like adversarial networks, people keep revisiting Auto Encoders, because we still “hope” that something as simple as an encoder \/ decoder network should give us the unsupervised learning power we all seek, deep down inside. VAEs got quite a lot of action but didn't make the cut for today's blog post.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cb\u003EGeometric Methods\u003C\/b\u003E\u003Cbr \/\u003EOverall, very little content pertaining to the SfM \/ SLAM side of the vision problem was present at ICLR 2016. This kind of work is very common at CVPR, and it's a bit of a surprise that there wasn't a lot of Robotics work at ICLR. It should be noted that the techniques used in SfM\/SLAM are more based on multiple-view geometry and linear algebra than the data-driven deep learning of today.\u003Cbr \/\u003E\u003Cbr \/\u003EPerhaps a better venue for Robotics and Deep Learning will be the June 2016 workshop titled\u0026nbsp;\u003Ca href=\"http:\/\/juxi.net\/workshop\/deep-learning-rss-2016\/\"\u003EAre the Sceptics Right? Limits and Potentials of Deep Learning in Robotics.\u003C\/a\u003E\u0026nbsp;This workshop is being held at RSS 2016, one of the world's leading Robotics conferences.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ch3\u003EPart III: Quo Vadis Deep Learning?\u003C\/h3\u003ENeural Net Compression is going to be big -- real-world applications demand it. The algos guys aren't going to wait for TPU and VPUs to become mainstream. 
Deep Nets which can look at a picture and tell you what’s going on are going to be inside every single device which has a camera. In fact, I don’t see any reason why all cameras by 2020 won’t be able to produce a high-quality RGB image as well as a neural network response vector. New image formats will even have such “deep interpretation vectors” directly saved alongside the image. And it's all going to be a neural net, in one shape or another.\u003Cbr \/\u003E\u003Cbr \/\u003EOpenAI had a strong presence at ICLR 2016, and I feel like every week a new PhD joins OpenAI. Google DeepMind and Facebook FAIR had a large number of papers. Google demoed a real-time version of deep-learning based style transfer using TensorFlow. Microsoft is no longer King of research. Startups were giving out little toys -- Clarifai even gave out free sandals. Graduates with well-tuned Deep Learning skills will continue being in high-demand, but once the next generation of AI-driven startups emerge, it is only those willing to transfer their academic skills into a product world-facing focus, aka the upcoming wave of deep entrepreneurs, that will make serious $$$.\u003Cbr \/\u003E\u003Cbr \/\u003EResearch-wise, arXiv is a big productivity booster. Hopefully, now you know where to place your future deep learning research bets, have enough new insights to breathe some inspiration into your favorite research problem, and you've gotten a taste of where the top researchers are heading. I encourage you to turn off your computer and have a white-board conversation with your colleagues about deep learning. Grab a friend, teach him some tricks.\u003Cbr \/\u003E\u003Cbr \/\u003EI'll see you all at CVPR 2016. 
Until then, keep learning.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ch3\u003ERelated computervisionblog.com Blog Posts\u003C\/h3\u003E\u003Ca href=\"http:\/\/www.computervisionblog.com\/2012\/05\/why-your-vision-lab-needs-reading-group.html\"\u003EWhy your lab needs a reading group\u003C\/a\u003E.\u0026nbsp;May 2012\u003Cbr \/\u003E\u003Ca href=\"http:\/\/www.computervisionblog.com\/2015\/12\/iccv-2015-twenty-one-hottest-research.html\"\u003EICCV 2015: 21 Hottest Research Papers\u003C\/a\u003E\u0026nbsp;December 2015\u003Cbr \/\u003E\u003Ca href=\"http:\/\/www.computervisionblog.com\/2015\/06\/deep-down-rabbit-hole-cvpr-2015-and.html\"\u003EDeep Down the Rabbit Hole: CVPR 2015 and Beyond\u003C\/a\u003E\u0026nbsp;June 2015\u003Cbr \/\u003E\u003Ca href=\"http:\/\/www.computervisionblog.com\/2015\/11\/the-deep-learning-gold-rush-of-2015.html\"\u003EThe Deep Learning Gold Rush of 2015\u003C\/a\u003E\u0026nbsp;November 2015\u003Cbr \/\u003E\u003Ca href=\"http:\/\/www.computervisionblog.com\/2015\/03\/deep-learning-vs-machine-learning-vs.html\"\u003EDeep Learning vs Machine Learning vs Pattern Recognition\u003C\/a\u003E\u0026nbsp;March 2015\u003Cbr \/\u003E\u003Ca href=\"http:\/\/www.computervisionblog.com\/2015\/04\/deep-learning-vs-probabilistic.html\"\u003EDeep Learning vs Probabilistic Graphical Models\u003C\/a\u003E\u0026nbsp;April 2015\u003Cbr \/\u003E\u003Ca href=\"http:\/\/www.computervisionblog.com\/2016\/01\/why-slam-matters-future-of-real-time.html\"\u003EFuture of Real-time SLAM and \"Deep Learning vs SLAM\"\u003C\/a\u003E\u0026nbsp;January 2016\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ch3\u003ERelevant Outside Links\u003C\/h3\u003E[1]\u0026nbsp;\u003Ca href=\"http:\/\/www.recode.net\/2015\/7\/15\/11614684\/ai-conspiracy-the-scientists-behind-deep-learning\"\u003EWelcome to the AI Conspiracy: The 'Canadian Mafia' Behind Tech's Latest Craze\u003C\/a\u003E\u0026nbsp;@ \u0026lt;re\/code\u0026gt;\u003Cbr \/\u003E[2]\u0026nbsp;\u003Ca 
href=\"http:\/\/karpathy.github.io\/2015\/05\/21\/rnn-effectiveness\/\"\u003EThe Unreasonable Effectiveness of Recurrent Neural Networks\u003C\/a\u003E\u0026nbsp;@ Andrej Karpathy's Blog\u003Cbr \/\u003E[3]\u0026nbsp;\u003Ca href=\"http:\/\/googleresearch.blogspot.com\/2016\/03\/deep-learning-for-robots-learning-from.html\"\u003EDeep Learning for Robots: Learning from Large-Scale Interaction\u003C\/a\u003E.\u0026nbsp;@ Google Research Blog\u003Cbr \/\u003E\u003Cbr \/\u003E"},"link":[{"rel":"replies","type":"application/atom+xml","href":"https:\/\/www.computervisionblog.com\/feeds\/5529343758137673151\/comments\/default","title":"Post Comments"},{"rel":"replies","type":"text/html","href":"https:\/\/www.computervisionblog.com\/2016\/06\/deep-learning-trends-iclr-2016.html#comment-form","title":"14 Comments"},{"rel":"edit","type":"application/atom+xml","href":"https:\/\/www.blogger.com\/feeds\/15418143\/posts\/default\/5529343758137673151"},{"rel":"self","type":"application/atom+xml","href":"https:\/\/www.blogger.com\/feeds\/15418143\/posts\/default\/5529343758137673151"},{"rel":"alternate","type":"text/html","href":"https:\/\/www.computervisionblog.com\/2016\/06\/deep-learning-trends-iclr-2016.html","title":"Deep Learning Trends @ ICLR 2016"}],"author":[{"name":{"$t":"Tomasz 
Malisiewicz"},"uri":{"$t":"http:\/\/www.blogger.com\/profile\/17507234774392358321"},"email":{"$t":"noreply@blogger.com"},"gd$image":{"rel":"http://schemas.google.com/g/2005#thumbnail","width":"16","height":"16","src":"https:\/\/img1.blogblog.com\/img\/b16-rounded.gif"}}],"media$thumbnail":{"xmlns$media":"http://search.yahoo.com/mrss/","url":"https:\/\/3.bp.blogspot.com\/-kVgTFEwiQ-0\/V06d4M-6i7I\/AAAAAAAAOqQ\/TxxSFstFdpw8Y3H9Q3SiRBBRvhXRY5VfwCLcB\/s72-c\/deep_learning_machine_learning_conference_iclr_2016.png","height":"72","width":"72"},"thr$total":{"$t":"14"}},{"id":{"$t":"tag:blogger.com,1999:blog-15418143.post-2190638093839413385"},"published":{"$t":"2016-01-13T04:20:00.001-05:00"},"updated":{"$t":"2016-06-14T03:46:36.791-05:00"},"category":[{"scheme":"http://www.blogger.com/atom/ns#","term":"andrew davison"},{"scheme":"http://www.blogger.com/atom/ns#","term":"bundle adjustment"},{"scheme":"http://www.blogger.com/atom/ns#","term":"DTAM"},{"scheme":"http://www.blogger.com/atom/ns#","term":"DynamicFusion"},{"scheme":"http://www.blogger.com/atom/ns#","term":"iccv 2015"},{"scheme":"http://www.blogger.com/atom/ns#","term":"jakob engel"},{"scheme":"http://www.blogger.com/atom/ns#","term":"KinectFusion"},{"scheme":"http://www.blogger.com/atom/ns#","term":"LSD-SLAM"},{"scheme":"http://www.blogger.com/atom/ns#","term":"marc pollefeys"},{"scheme":"http://www.blogger.com/atom/ns#","term":"pose"},{"scheme":"http://www.blogger.com/atom/ns#","term":"PTAM"},{"scheme":"http://www.blogger.com/atom/ns#","term":"real-time"},{"scheme":"http://www.blogger.com/atom/ns#","term":"richard 
newcombe"},{"scheme":"http://www.blogger.com/atom/ns#","term":"robotics"},{"scheme":"http://www.blogger.com/atom/ns#","term":"segmentation"},{"scheme":"http://www.blogger.com/atom/ns#","term":"sfm"},{"scheme":"http://www.blogger.com/atom/ns#","term":"SLAM"},{"scheme":"http://www.blogger.com/atom/ns#","term":"workshop"},{"scheme":"http://www.blogger.com/atom/ns#","term":"zisserman"}],"title":{"type":"text","$t":"The Future of Real-Time SLAM and Deep Learning vs SLAM"},"content":{"type":"html","$t":"\u003Cdiv style=\"background-color: white; color: #222222;\"\u003ELast month's International Conference of Computer Vision (ICCV) was \u003Ca href=\"http:\/\/www.computervisionblog.com\/2015\/12\/iccv-2015-twenty-one-hottest-research.html\"\u003Efull of Deep Learning\u003C\/a\u003E\u0026nbsp;techniques, but before we declare an all-out ConvNet victory, let's see how the other \"non-learning\" geometric side of computer vision is doing. \u0026nbsp;\u003Cb\u003ES\u003C\/b\u003Eimultaneous \u003Cb\u003EL\u003C\/b\u003Eocalization \u003Cb\u003Ea\u003C\/b\u003End \u003Cb\u003EM\u003C\/b\u003Eapping, or \u003Cb\u003ESLAM\u003C\/b\u003E, is arguably one of the most important algorithms in Robotics, with pioneering work done by both computer vision and robotics research communities. 
\u0026nbsp;Today I'll be summarizing my key points from ICCV's\u0026nbsp;\u003Ca href=\"http:\/\/wp.doc.ic.ac.uk\/thefutureofslam\/programme\/\"\u003EFuture of Real-Time SLAM\u003C\/a\u003E\u0026nbsp;Workshop, which was held on the last day of the conference (December 18th, 2015).\u003Cbr \/\u003E\u003Cbr \/\u003EToday's post contains a brief introduction to SLAM,\u0026nbsp;a detailed description of what happened at the workshop (with summaries of all 7 talks),\u0026nbsp;and some take-home messages from the \u003Ci\u003EDeep Learning-focused panel discussion\u003C\/i\u003E at the end of the session.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/1.bp.blogspot.com\/-3WNdePDKHQw\/VpOAwv91xWI\/AAAAAAAAOcY\/Q6oXFwf14Jw\/s1600\/slammies2.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"290\" src=\"https:\/\/1.bp.blogspot.com\/-3WNdePDKHQw\/VpOAwv91xWI\/AAAAAAAAOcY\/Q6oXFwf14Jw\/s400\/slammies2.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cb\u003ESLAM visualizations.\u0026nbsp;\u003C\/b\u003ECan you identify any of these SLAM algorithms?\u003C\/div\u003E\u003Cbr \/\u003E\u003Ch2\u003EPart I: Why SLAM Matters\u003C\/h2\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white; color: #222222;\"\u003E\u003Cspan style=\"font-family: inherit;\"\u003EVisual SLAM algorithms are able to simultaneously build 3D maps of the world while tracking the location and orientation of the camera (hand-held or head-mounted for AR or mounted on a robot).\u0026nbsp;\u003C\/span\u003ESLAM algorithms are complementary to ConvNets and Deep Learning: SLAM focuses on geometric problems and Deep Learning is the master of perception (recognition) problems. If you want a robot to go towards your refrigerator without hitting a wall, use SLAM. 
If you want the robot to identify the items inside your fridge, use ConvNets.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/openmvg.readthedocs.org\/en\/latest\/_images\/structureFromMotion.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em; text-align: center;\"\u003E\u003Cimg border=\"0\" src=\"http:\/\/openmvg.readthedocs.org\/en\/latest\/_images\/structureFromMotion.png\" height=\"220\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cbr \/\u003E\u003Cdiv style=\"text-align: center;\"\u003E\u003Cb\u003EBasics of SfM\/SLAM\u003C\/b\u003E: From point observation and intrinsic camera parameters, the 3D structure of a scene is computed from the estimated motion of the camera. For details, see\u0026nbsp;\u003Ca href=\"http:\/\/openmvg.readthedocs.org\/en\/latest\/\"\u003EopenMVG website\u003C\/a\u003E.\u003C\/div\u003E\u003Cdiv style=\"text-align: center;\"\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cspan style=\"font-family: inherit;\"\u003ESLAM\u0026nbsp;is a real-time version of\u0026nbsp;\u003Cb\u003ES\u003C\/b\u003Etructure\u0026nbsp;\u003Cb\u003Ef\u003C\/b\u003Erom\u0026nbsp;\u003Cb\u003EM\u003C\/b\u003Eotion (SfM). Visual SLAM or vision-based SLAM is a camera-only variant of SLAM which forgoes expensive laser sensors and\u0026nbsp;\u003Cb\u003Ei\u003C\/b\u003Enertial \u003Cb\u003Em\u003C\/b\u003Eeasurement \u003Cb\u003Eu\u003C\/b\u003Enits (IMUs). Monocular SLAM uses a single camera while non-monocular SLAM typically uses a pre-calibrated fixed-baseline stereo camera rig. SLAM is prime example of a what is called a \"Geometric Method\" in Computer Vision. 
In fact, CMU's Robotics Institute splits the graduate level computer vision curriculum into a \u003Ca href=\"http:\/\/graphics.cs.cmu.edu\/courses\/16-824-S15\/index.html\"\u003ELearning-based Methods in Vision\u003C\/a\u003E course and a separate \u003Ca href=\"http:\/\/www.cs.cmu.edu\/~hebert\/geom.html\"\u003EGeometry-Based Methods in Vision\u003C\/a\u003E course.\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white; color: #222222;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white; color: #222222;\"\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cb\u003E\u003C\/b\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white; color: #222222;\"\u003E\u003Cb\u003E\u003Cspan style=\"font-family: inherit;\"\u003EStructure from Motion vs Visual SLAM\u003C\/span\u003E\u003C\/b\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white;\"\u003E\u003Cdiv style=\"color: #222222;\"\u003E\u003Cspan style=\"font-family: inherit;\"\u003EStructure from Motion (SfM) and SLAM are solving a very similar problem, but while SfM is traditionally performed in an offline fashion, SLAM has been slowly moving towards the low-power \/ real-time \/ single RGB camera mode of operation. Many of the today’s top experts in Structure from Motion work for some of the world’s biggest tech companies, helping make maps better. Successful mapping products like Google Maps could not have been built without intimate knowledge of multiple-view geometry, SfM, and SLAM. \u0026nbsp;A typical SfM problem is the following: given a large collection of photos of a single outdoor structure (like the Colliseum), construct a 3D model of the structure and determine the camera's poses. 
The image collection is processed in an offline setting, and large reconstructions can take anywhere between hours and days.\u0026nbsp;\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"color: #222222;\"\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; color: #222222; text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; color: #222222; text-align: center;\"\u003E\u003Ca href=\"http:\/\/www.cs.cornell.edu\/~snavely\/bundler\/images\/Colosseum.jpg\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" src=\"http:\/\/www.cs.cornell.edu\/~snavely\/bundler\/images\/Colosseum.jpg\" height=\"138\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; color: #222222; text-align: center;\"\u003E\u003Cb\u003ESfM Software\u003C\/b\u003E:\u0026nbsp;\u003Ca href=\"http:\/\/www.cs.cornell.edu\/~snavely\/bundler\/\"\u003EBundler\u003C\/a\u003E\u0026nbsp;is\u0026nbsp;one of the most successful SfM open source libraries\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; color: #222222; text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv style=\"color: #222222;\"\u003E\u003Cspan style=\"font-family: inherit;\"\u003EHere are some popular SfM-related software libraries:\u003C\/span\u003E\u003C\/div\u003E\u003Cul\u003E\u003Cli\u003E\u003Ca href=\"http:\/\/www.cs.cornell.edu\/~snavely\/bundler\/\" style=\"font-family: inherit;\"\u003EBundler\u003C\/a\u003E\u003Cspan style=\"color: #222222; font-family: inherit;\"\u003E,\u0026nbsp;an open-source Structure from Motion toolkit\u003C\/span\u003E\u003C\/li\u003E\u003Cli\u003E\u003Ca href=\"http:\/\/ceres-solver.org\/\" style=\"font-family: inherit;\"\u003ELibceres\u003C\/a\u003E\u003Cspan style=\"color: #222222; font-family: inherit;\"\u003E, a 
non-linear least squares minimizer (useful for bundle adjustment problems)\u003C\/span\u003E\u003C\/li\u003E\u003Cli\u003E\u003Cspan style=\"color: #222222; font-family: inherit;\"\u003EAndrew Zisserman's \u003C\/span\u003E\u003Ca href=\"http:\/\/www.robots.ox.ac.uk\/~vgg\/hzbook\/code\/\" style=\"font-family: inherit;\"\u003EMultiple-View Geometry MATLAB Functions\u003C\/a\u003E\u003C\/li\u003E\u003C\/ul\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white; color: #222222;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white; color: #222222;\"\u003E\u003Cb\u003E\u003Cspan style=\"font-family: inherit;\"\u003EVisual SLAM vs Autonomous Driving\u003C\/span\u003E\u003C\/b\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white; color: #222222;\"\u003E\u003Cspan style=\"font-family: inherit;\"\u003EWhile self-driving cars are one of the most important applications of SLAM, according to Andrew Davison, one of the workshop organizers, SLAM for Autonomous Vehicles deserves its own research track. (And as we'll see, none of the workshop presenters talked about self-driving cars). For many years to come it will make sense to continue studying SLAM from a research perspective, independent of any single Holy-Grail application. While there are just too many system-level details and tricks involved with autonomous vehicles, research-grade SLAM systems require very little more than a webcam, knowledge of algorithms, and elbow grease. 
As a research topic, Visual SLAM is much friendlier to thousands of early-stage PhD students who’ll first need years of in-lab experience with SLAM before even starting to think about expensive robotic platforms such as self-driving cars.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/spectrum.ieee.org\/image\/1948541\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" src=\"http:\/\/spectrum.ieee.org\/image\/1948541\" height=\"215\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cb\u003EGoogle's Self-Driving Car's perception system\u003C\/b\u003E. From IEEE Spectrum's \"\u003Ca href=\"http:\/\/spectrum.ieee.org\/automaton\/robotics\/artificial-intelligence\/how-google-self-driving-car-works\"\u003EHow Google's Self-Driving Car Works\u003C\/a\u003E\"\u003C\/div\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white; color: #222222;\"\u003E\u003Cb\u003ERelated\u003C\/b\u003E: March 2015 blog post, \u003Ca href=\"http:\/\/www.computervisionblog.com\/2015\/03\/mobileyes-quest-to-put-deep-learning.html\"\u003EMobileye's quest to put Deep Learning inside every new car\u003C\/a\u003E.\u003C\/div\u003E\u003Cdiv style=\"background-color: white; color: #222222;\"\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cb\u003ERelated:\u003C\/b\u003E \u003Ca href=\"http:\/\/mappingignorance.org\/2014\/04\/07\/one-way-googles-cars-localize\/\"\u003EOne way Google's Cars Localize Themselves\u003C\/a\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv 
style=\"background-color: white; color: #222222;\"\u003E\u003Ch2\u003E\u003Cspan style=\"font-family: inherit;\"\u003EPart II: The Future of Real-time SLAM\u003C\/span\u003E\u003C\/h2\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white;\"\u003E\u003Cspan style=\"color: #222222; font-family: inherit;\"\u003ENow it's time to\u0026nbsp;\u003C\/span\u003E\u003Cspan style=\"color: #222222;\"\u003Eofficially\u003C\/span\u003E\u003Cspan style=\"color: #222222; font-family: inherit;\"\u003E\u0026nbsp;summarize and comment on the presentations from The Future of Real-time SLAM workshop.\u0026nbsp;\u003C\/span\u003E\u003Cspan style=\"color: #222222; font-family: inherit;\"\u003E\u003Ca href=\"http:\/\/www.doc.ic.ac.uk\/~ajd\/index.html\"\u003EAndrew Davison\u003C\/a\u003E started the day with an excellent historical overview of SLAM called \u003Ca href=\"http:\/\/wp.doc.ic.ac.uk\/thefutureofslam\/wp-content\/uploads\/sites\/93\/2015\/12\/slides_ajd.pdf\"\u003E15 years of vision-based SLAM\u003C\/a\u003E, and his slides have good content for an introductory robotics course.\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white; color: #222222;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white; color: #222222;\"\u003E\u003Cspan style=\"font-family: inherit;\"\u003EFor those of you who don’t know Andy, he is the one and only Professor Andrew Davison of Imperial College London. \u0026nbsp;Most known for his 2003 MonoSLAM system, he was one of the first to show how to build SLAM systems from a single “\u003Ci\u003Emonocular”\u003C\/i\u003E\u0026nbsp;camera at a time when just everybody thought you needed a stereo “\u003Ci\u003Ebinocular\u003C\/i\u003E” camera rig. 
More recently, his work has influenced the trajectory of companies such as Dyson and the capabilities of their robotic systems (e.g., \u003Ca href=\"http:\/\/www.computervisionblog.com\/2015\/05\/dyson-360-eye-and-baidu-deep-learning.html\"\u003Ethe brand new Dyson360\u003C\/a\u003E).\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white; color: #222222;\"\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white; color: #222222;\"\u003E\u003Cspan style=\"font-family: inherit;\"\u003EI remember Professor Davison from the Visual SLAM tutorial he gave at the BMVC Conference back in \u003Ca href=\"http:\/\/www.cs.bris.ac.uk\/Research\/Vision\/Realtime\/bmvctutorial\/\"\u003E2007\u003C\/a\u003E. Surprisingly very little has changed in SLAM compared to the rest of the machine-learning heavy work being done at the main vision conferences. In the past 8 years, object recognition has undergone 2-3 mini revolutions, while today's SLAM systems don't look much different than they did 8 years ago. 
The best way to see the progress of SLAM is to take a look at the most successful and memorable systems.\u0026nbsp;\u003C\/span\u003E\u003Cspan style=\"font-family: inherit;\"\u003EIn Davison’s workshop introduction talk, he discussed some of these exemplary systems which were produced by the research community over the last 10-15 years:\u0026nbsp;\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white;\"\u003E\u003Cbr \/\u003E\u003Cul\u003E\u003Cli\u003E\u003Cb style=\"color: #222222; font-family: inherit;\"\u003EMonoSLAM\u003C\/b\u003E\u003C\/li\u003E\u003Cli\u003E\u003Cb style=\"color: #222222; font-family: inherit;\"\u003EPTAM\u003C\/b\u003E\u003C\/li\u003E\u003Cli\u003E\u003Cb style=\"color: #222222; font-family: inherit;\"\u003EFAB-MAP\u003C\/b\u003E\u003C\/li\u003E\u003Cli\u003E\u003Cb style=\"color: #222222; font-family: inherit;\"\u003EDTAM\u003C\/b\u003E\u003C\/li\u003E\u003Cli\u003E\u003Cb style=\"color: #222222; font-family: inherit;\"\u003EKinectFusion\u003C\/b\u003E\u003C\/li\u003E\u003C\/ul\u003E\u003Cdiv class=\"separator\" style=\"clear: both; color: #222222; text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white;\"\u003E\u003Cdiv style=\"color: #222222;\"\u003E\u003Cb\u003E\u003Cspan style=\"font-family: inherit;\"\u003EDavison vs Horn: The next chapter in Robot Vision\u003C\/span\u003E\u003C\/b\u003E\u003Cbr \/\u003E\u003Cspan style=\"font-family: inherit;\"\u003EDavison also mentioned that he is working on a new Robot Vision book, which should be an exciting treat for researchers in computer vision, robotics, and artificial intelligence. The last \u003Ca href=\"https:\/\/mitpress.mit.edu\/books\/robot-vision\"\u003ERobot Vision book\u003C\/a\u003E was written by B.K. 
Horn (1986), and it’s about time for an updated take on Robot Vision.\u0026nbsp;\u003C\/span\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/3.bp.blogspot.com\/-Oh94IZlctLA\/VpWlSwN_WEI\/AAAAAAAAOdw\/fKDBj8KQoGM\/s1600\/robotvision-01.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"117\" src=\"https:\/\/3.bp.blogspot.com\/-Oh94IZlctLA\/VpWlSwN_WEI\/AAAAAAAAOdw\/fKDBj8KQoGM\/s400\/robotvision-01.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cb\u003EA new robot vision book?\u003C\/b\u003E\u003C\/div\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003Cspan style=\"font-family: inherit;\"\u003EWhile I’ll gladly read a tome that focuses on the philosophy of robot vision, personally I would like the book to focus on practical algorithms for robot vision, like the excellent \u003Ca href=\"http:\/\/www.robots.ox.ac.uk\/~vgg\/hzbook\/\"\u003EMultiple View Geometry\u003C\/a\u003E book by Hartley and Zissermann or \u003Ca href=\"http:\/\/www.probabilistic-robotics.org\/\"\u003EProbabilistic Robotics\u003C\/a\u003E by Thrun, Burgard, and Fox. 
A \"cookbook\" of visual SLAM problems would be a welcome addition to any serious vision researcher's collection.\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"color: #222222;\"\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"color: #222222;\"\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cb\u003ERelated\u003C\/b\u003E: Davison's \u003Ca href=\"http:\/\/wp.doc.ic.ac.uk\/thefutureofslam\/wp-content\/uploads\/sites\/93\/2015\/12\/slides_ajd.pdf\"\u003E15-years of vision-based SLAM\u003C\/a\u003E\u0026nbsp;slides\u003C\/span\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white; color: #222222;\"\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white; color: #222222;\"\u003E\u003Cb\u003E\u003Cspan style=\"font-family: inherit;\"\u003ETalk 1: Christian Kerl on Continuous Trajectories in SLAM\u003C\/span\u003E\u003C\/b\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white; color: #222222;\"\u003E\u003Cspan style=\"font-family: inherit;\"\u003EThe first talk, by \u003Ca href=\"http:\/\/vision.in.tum.de\/members\/kerl\"\u003EChristian Kerl\u003C\/a\u003E, presented a dense tracking method to estimate a continuous-time trajectory. 
The key observation is that most SLAM systems estimate camera poses at a discrete number of time steps (either the key frames which are spaced several seconds apart, or the individual frames which are spaced approximately 1\/25s apart).\u0026nbsp;\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003Ca href=\"http:\/\/2.bp.blogspot.com\/-vjaOqhDMTBg\/VpODSBMvZ5I\/AAAAAAAAOck\/lbF_FQh5_EM\/s1600\/kerl.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em; text-align: center;\"\u003E\u003Cimg border=\"0\" height=\"115\" src=\"https:\/\/2.bp.blogspot.com\/-vjaOqhDMTBg\/VpODSBMvZ5I\/AAAAAAAAOck\/lbF_FQh5_EM\/s400\/kerl.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cb\u003EContinuous Trajectories vs Discrete Time Points. \u003C\/b\u003ESLAM\/SfM usually uses discrete time points, but why not go continuous?\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cspan style=\"font-family: inherit;\"\u003EMuch of Kerl’s talk was focused on undoing the damage of rolling shutter cameras, and the system demo’ed by Kerl paid meticulous attention to modeling and removing these adverse rolling shutter effects.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/1.bp.blogspot.com\/-gXML1JkPLGs\/VpOEM6IOsCI\/AAAAAAAAOcs\/yzLDs-WEMqM\/s1600\/shutter.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"245\" src=\"https:\/\/1.bp.blogspot.com\/-gXML1JkPLGs\/VpOEM6IOsCI\/AAAAAAAAOcs\/yzLDs-WEMqM\/s400\/shutter.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cb\u003EUndoing the damage 
of rolling shutter in Visual SLAM.\u003C\/b\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cbr \/\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cb\u003ERelated:\u003C\/b\u003E\u003C\/span\u003E\u003Cspan style=\"text-align: center;\"\u003E\u0026nbsp;Kerl's\u0026nbsp;\u003C\/span\u003E\u003Ca href=\"http:\/\/wp.doc.ic.ac.uk\/thefutureofslam\/wp-content\/uploads\/sites\/93\/2015\/12\/kerl_etal_iccv2015_futureofslam_talk.pdf\" style=\"text-align: center;\"\u003EDense continous-time tracking and mapping\u003C\/a\u003E\u003Cspan style=\"text-align: center;\"\u003E\u0026nbsp;slides.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan style=\"text-align: center;\"\u003E\u003Cb\u003ERelated:\u0026nbsp;\u003C\/b\u003EDense Continuous-Time Tracking and Mapping with Rolling Shutter RGB-D Cameras (C. Kerl, J. Stueckler, D. Cremers), In IEEE International Conference on Computer Vision (ICCV), 2015. [\u003Ca href=\"http:\/\/vision.in.tum.de\/_media\/spezial\/bib\/kerl15iccv.pdf\"\u003Epdf\u003C\/a\u003E]\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white; color: #222222;\"\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white; color: #222222;\"\u003E\u003Cb\u003E\u003Cspan style=\"font-family: inherit;\"\u003ETalk 2: Semi-Dense Direct SLAM by Jakob Engel\u003C\/span\u003E\u003C\/b\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white; color: #222222;\"\u003E\u003Cspan style=\"font-family: inherit;\"\u003ELSD-SLAM came out at ECCV 2014 and is one of my favorite SLAM systems today! \u003Ca href=\"http:\/\/vision.in.tum.de\/members\/engelj\"\u003EJakob Engel\u003C\/a\u003E was there to present his system and show the crowd some of the coolest SLAM visualizations in town. LSD-SLAM is an acronym for Large-Scale Direct Monocular SLAM. 
LSD-SLAM is an important system for SLAM researchers because it does not use corners or any other local features. \u003C\/span\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cb\u003EDirect tracking is performed by image-to-image alignment\u003C\/b\u003E using a coarse-to-fine algorithm with a robust Huber loss. This is quite different than the feature-based systems out there. Depth estimation uses an inverse depth parametrization (like many other SLAM systems) and uses a large number of relatively small baseline image pairs. Rather than relying on image features, the algorithm is effectively performing “texture tracking”. Global mapping is performed by creating and solving a pose graph \"bundle adjustment\" optimization problem, and all of this works in real-time. The method is semi-dense because it only estimates depth at pixels solely near image boundaries. LSD-SLAM output is denser than traditional features, but not fully dense like Kinect-style RGBD SLAM.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003Cbr \/\u003E\u003Ctable align=\"center\" cellpadding=\"0\" cellspacing=\"0\" class=\"tr-caption-container\" style=\"margin-left: auto; margin-right: auto; text-align: center;\"\u003E\u003Ctbody\u003E\u003Ctr\u003E\u003Ctd\u003E\u003Ca href=\"http:\/\/4.bp.blogspot.com\/-VH3GehiSfKY\/Vnl44gfSt3I\/AAAAAAAAObw\/MYun2V6_C4M\/s1600\/lsd-slam.png\" imageanchor=\"1\" style=\"margin-left: auto; margin-right: auto;\"\u003E\u003Cimg border=\"0\" height=\"225\" src=\"https:\/\/4.bp.blogspot.com\/-VH3GehiSfKY\/Vnl44gfSt3I\/AAAAAAAAObw\/MYun2V6_C4M\/s400\/lsd-slam.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/td\u003E\u003C\/tr\u003E\u003Ctr\u003E\u003Ctd class=\"tr-caption\"\u003E\u003Cspan style=\"font-size: small;\"\u003E\u003Cb\u003ELSD-SLAM in Action:\u0026nbsp;\u003C\/b\u003E\u003Ca 
href=\"http:\/\/vision.in.tum.de\/research\/vslam\/lsdslam\"\u003ELSD-SLAM\u003C\/a\u003E\u0026nbsp;generates both a camera trajectory and a semi-dense 3D scene reconstruction. This approach works in real-time, does not use feature points as primitives, and performs direct image-to-image alignment.\u003C\/span\u003E\u003C\/td\u003E\u003C\/tr\u003E\u003C\/tbody\u003E\u003C\/table\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white; color: #222222;\"\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003Cspan style=\"font-family: inherit;\"\u003EEngel gave us an overview of the original LSD-SLAM system as well as a handful of new results, extending their initial system to more creative applications and to more interesting deployments. (See paper citations below)\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cb\u003ERelated:\u003C\/b\u003E\u0026nbsp;\u003Ca href=\"https:\/\/github.com\/tum-vision\/lsd_slam\"\u003ELSD-SLAM Open-Source Code on github\u003C\/a\u003E\u0026nbsp;\u003C\/span\u003E\u003Ca href=\"http:\/\/vision.in.tum.de\/research\/vslam\/lsdslam\"\u003ELSD-SLAM project webpage\u003C\/a\u003E\u003Cbr \/\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cb\u003ERelated:\u0026nbsp;\u003C\/b\u003E\u003C\/span\u003ELSD-SLAM: Large-Scale Direct Monocular SLAM\u0026nbsp;(J. Engel, T. Schöps, D. Cremers), In European Conference on Computer Vision (ECCV), 2014. 
[\u003Ca href=\"http:\/\/vision.in.tum.de\/_media\/spezial\/bib\/engel14eccv.pdf\"\u003Epdf\u003C\/a\u003E] [youtube\u0026nbsp;\u003Ca href=\"https:\/\/youtu.be\/GnuQzP3gty4\"\u003Evideo\u003C\/a\u003E]\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white; color: #222222;\"\u003E\u003Cspan style=\"font-family: inherit;\"\u003EAn extension to LSD-SLAM, \u003Cb\u003EOmni LSD-SLAM\u003C\/b\u003E was created by the observation that the pinhole model does not allow for a large field of view. This work was presented at IROS 2015 (Caruso is first author) and allows a large field of view (ideally more than 180 degrees). From Engel’s presentation it was pretty clear that you can perform ballerina-like motions (extreme rotations) while walking around your office and holding the camera. This is one of those worst-case scenarios for narrow field of view SLAM, yet works quite well in Omni LSD-SLAM.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cdiv style=\"-webkit-text-stroke-width: 0px; background-color: white; color: #222222; font-family: Times; font-size: medium; font-style: normal; font-variant: normal; font-weight: normal; letter-spacing: normal; line-height: normal; orphans: auto; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; widows: 1; word-spacing: 0px;\"\u003E\u003Cdiv class=\"separator\" style=\"clear: both; margin: 0px; text-align: center;\"\u003E\u003Cbr class=\"Apple-interchange-newline\" \/\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; margin: 0px; text-align: center;\"\u003E\u003Ca href=\"http:\/\/4.bp.blogspot.com\/-WUowVgUNRuk\/VpOE_93Z_gI\/AAAAAAAAOc4\/4kudUERtd80\/s1600\/omni.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"251\" 
src=\"https:\/\/4.bp.blogspot.com\/-WUowVgUNRuk\/VpOE_93Z_gI\/AAAAAAAAOc4\/4kudUERtd80\/s400\/omni.png\" style=\"cursor: move;\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; margin: 0px; text-align: center;\"\u003E\u003Cb\u003EOmnidirectional LSD-SLAM Model.\u003C\/b\u003E See Engel's\u0026nbsp;\u003Ca href=\"http:\/\/wp.doc.ic.ac.uk\/thefutureofslam\/wp-content\/uploads\/sites\/93\/2015\/12\/ICCV-SLAM-Workshop_JakobEngel.pdf\"\u003ESemi-Dense Direct SLAM\u003C\/a\u003E\u0026nbsp;presentation slides.\u003C\/div\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white; color: #222222;\"\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cb\u003ERelated:\u0026nbsp;\u003C\/b\u003E\u003C\/span\u003ELarge-Scale Direct SLAM for Omnidirectional Cameras (D. Caruso, J. Engel, D. Cremers), In International Conference on Intelligent Robots and Systems (IROS), 2015. \u0026nbsp;[\u003Ca href=\"http:\/\/vision.in.tum.de\/_media\/spezial\/bib\/caruso2015_omni_lsdslam.pdf\"\u003Epdf\u003C\/a\u003E] [youtube\u0026nbsp;\u003Ca href=\"https:\/\/youtu.be\/v0NqMm7Q6S8\"\u003Evideo\u003C\/a\u003E]\u003Cbr \/\u003E\u003Cspan style=\"color: #222222;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cb\u003EStereo LSD-SLAM\u003C\/b\u003E is an extension of LSD-SLAM to a binocular camera rig. This helps in getting the\u0026nbsp;\u003C\/span\u003E\u003Cspan style=\"font-family: inherit;\"\u003Eabsolute scale,\u0026nbsp;\u003C\/span\u003E\u003Cspan style=\"font-family: inherit;\"\u003Einitialization is instantaneous, and there are \u003C\/span\u003E\u003Cspan style=\"font-family: inherit;\"\u003Eno issues with strong rotation. 
While monocular SLAM is very exciting from an academic point of view, if your robot is a 30,000$ car or 10,000$ drone prototype, you should have a good reason to not use a two+ camera rig. Stereo LSD-SLAM performs quite competitively on SLAM benchmarks.\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white; color: #222222;\"\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/2.bp.blogspot.com\/-076ah06nqNo\/VpWpi1ty0BI\/AAAAAAAAOd8\/nVhyKjMFoXU\/s1600\/stereo-lsd.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"149\" src=\"https:\/\/2.bp.blogspot.com\/-076ah06nqNo\/VpWpi1ty0BI\/AAAAAAAAOd8\/nVhyKjMFoXU\/s400\/stereo-lsd.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cb\u003EStereo LSD-SLAM.\u003C\/b\u003E\u0026nbsp;Excellent results on KITTI vehicle-SLAM dataset.\u003C\/div\u003E\u003Cdiv\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white; color: #222222;\"\u003EStereo LSD-SLAM is quite practical, optimizes a\u0026nbsp;pose graph in SE(3), and includes a correction for auto exposure. The goal of auto-exposure correcting is to make the error function invariant to affine lighting changes. The underlying parameters of the color-space affine transform are estimated during matching, but thrown away to estimate the image-to-image error. 
From Engel's talk, outliers (often caused by over-exposed image pixels) tend to be a problem, and much care needs to be taken to take care of their effects.\u003Cbr \/\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003Cb\u003ERelated: \u003C\/b\u003ELarge-Scale Direct SLAM with Stereo Cameras\u0026nbsp;(J. Engel, J. Stueckler, D. Cremers), In International Conference on Intelligent Robots and Systems (IROS), 2015. \u0026nbsp;[\u003Ca href=\"http:\/\/vision.in.tum.de\/_media\/spezial\/bib\/engel2015_stereo_lsdslam.pdf\"\u003Epdf\u003C\/a\u003E] [youtube\u0026nbsp;\u003Ca href=\"https:\/\/youtu.be\/oJt3Ln8H03s\"\u003Evideo\u003C\/a\u003E]\u003Cbr \/\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white; color: #222222;\"\u003E\u003Cspan style=\"font-family: inherit;\"\u003ELater in his presentation, Engel gave us a sneak peek at new research about \u003Ci\u003Eintegrating both stereo and inertial sensors\u003C\/i\u003E. For details, you’ll have to keep hitting refresh on Arxiv or talk to Usenko\/Engel in person.\u0026nbsp;\u003C\/span\u003EOn the applications side, Engel's presentation included updated videos of an Autonomous Quadrotor driven by LSD-SLAM. The flight starts with an up-down motion to get the scale estimate and a free-space octomap is used to estimate the free-space so that the quadrotor can navigate space on its own. 
Stay tuned for an official publication...\u003C\/div\u003E\u003Cdiv style=\"background-color: white; color: #222222;\"\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/2.bp.blogspot.com\/-E7FYRfwgEZE\/VpWrU97D4OI\/AAAAAAAAOeI\/Wf4toRYot88\/s1600\/quadrotor.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"256\" src=\"https:\/\/2.bp.blogspot.com\/-E7FYRfwgEZE\/VpWrU97D4OI\/AAAAAAAAOeI\/Wf4toRYot88\/s400\/quadrotor.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cb\u003EQuadrotor running Stereo LSD-SLAM.\u003C\/b\u003E\u0026nbsp;\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003ESee\u003Ca href=\"https:\/\/youtu.be\/eznMokFQmpc\"\u003E Engel's quadrotor youtube video\u003C\/a\u003E from 2012.\u0026nbsp;\u003C\/div\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white; color: #222222;\"\u003EThe story of LSD-SLAM is also the story of \u003Cb\u003Efeature-based vs direct-methods\u003C\/b\u003E and Engel gave both sides of the debate a fair treatment.\u0026nbsp;\u003Cspan style=\"font-family: inherit;\"\u003EFeature-based methods are engineered to work on top of Harris-like corners, while direct methods use the entire image for alignment.\u0026nbsp;\u003C\/span\u003E\u003Cspan style=\"font-family: inherit;\"\u003EFeature-based methods are faster (as of 2015), but direct methods are good for parallelism. Outliers can be retroactively removed from feature-based systems, while direct methods are less flexible w.r.t. outliers. Rolling shutter is a bigger problem for direct methods and it makes sense to use a global shutter or a rolling shutter model (see Kerl’s work). Feature-based methods require making decisions using incomplete information, but direct methods can use much more information. 
Feature-based methods have no need for good initialization and direct-based methods need some clever tricks for initialization. There is only about 4 years of research on direct methods and 20+ on sparse methods. Engel is optimistic that direct methods will one day rise to the top, and so am I.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/1.bp.blogspot.com\/-i2l2PgDiovE\/VpOFbRPLrhI\/AAAAAAAAOdA\/mzE1KpKil4M\/s1600\/feature-vs-direct.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"300\" src=\"https:\/\/1.bp.blogspot.com\/-i2l2PgDiovE\/VpOFbRPLrhI\/AAAAAAAAOdA\/mzE1KpKil4M\/s400\/feature-vs-direct.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cb\u003EFeature-based vs direct methods of building SLAM systems.\u003C\/b\u003E Slide from Engel's talk.\u003C\/div\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white;\"\u003E\u003Cdiv style=\"color: #222222;\"\u003E\u003Cspan style=\"font-family: inherit;\"\u003EAt the end of Engel's presentation, Davison asked about semantic segmentation and Engel wondered whether semantic segmentation can be performed directly on semi-dense \"near-image-boundary\" data.\u0026nbsp; However, my personal opinion is that there are better ways to apply semantic segmentation to LSD-like SLAM systems. 
Semi-dense SLAM can focus on geometric information near boundaries, while object recognition can focus on reliable semantics away from the same boundaries, potentially creating a hybrid geometric\/semantic interpretation of the image.\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"color: #222222;\"\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"color: #222222;\"\u003E\u003Cb\u003ERelated\u003C\/b\u003E: Engel's \u003Ca href=\"http:\/\/wp.doc.ic.ac.uk\/thefutureofslam\/wp-content\/uploads\/sites\/93\/2015\/12\/ICCV-SLAM-Workshop_JakobEngel.pdf\"\u003ESemi-Dense Direct SLAM presentation\u003C\/a\u003E slides\u003C\/div\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white; color: #222222;\"\u003E\u003Cb\u003E\u003Cspan style=\"font-family: inherit;\"\u003ETalk 3: Sattler on The challenges of Large-Scale Localization and Mapping\u003C\/span\u003E\u003C\/b\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white;\"\u003E\u003Cspan style=\"color: #222222; font-family: inherit;\"\u003E\u003Ca href=\"https:\/\/www.graphics.rwth-aachen.de\/person\/21\/\"\u003ETorsten Sattler\u003C\/a\u003E gave a talk on large-scale localization and mapping.\u0026nbsp;\u003C\/span\u003E\u003Cspan style=\"color: #222222; font-family: inherit;\"\u003EThe motivation for this work is to perform 6-dof localization inside an existing map, especially for mobile localization. One of the key points in the talk was that when you are using traditional feature-based methods, storing your descriptors soon becomes very costly. Techniques such as visual vocabularies (remember product quantization?) 
can significantly reduce memory overhead, and with clever optimization at some point storing descriptors no longer becomes the memory bottleneck.\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white; color: #222222;\"\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white; color: #222222;\"\u003E\u003Cspan style=\"font-family: inherit;\"\u003EAnother important take-home message from Sattler’s talk is that the number of inliers is not actually a good confidence measure for camera pose estimation.\u0026nbsp; When the feature point are all concentrated in a single part of the image, camera localization can be kilometers away! A better measure of confidence is the “effective inlier count” which looks at the area spanned by the inliers as a fraction of total image area.\u0026nbsp; What you really want is feature matches from all over the image — if the information is spread out across the image you get a much better pose estimate.\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white; color: #222222;\"\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white; color: #222222;\"\u003E\u003Cspan style=\"font-family: inherit;\"\u003ESattler’s take on the future of real-time slam is the following: we should focus on compact map representations, we should get better at understanding camera pose estimate confidences (like down-weighing features from trees), we should work on more challenging scenes (such as worlds with planar structures and nighttime localization against daytime maps).\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca 
href=\"http:\/\/4.bp.blogspot.com\/-zXNg4No_GME\/VpOFsvOweII\/AAAAAAAAOdI\/l1pj_aH1UA4\/s1600\/mobileloc.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"218\" src=\"https:\/\/4.bp.blogspot.com\/-zXNg4No_GME\/VpOFsvOweII\/AAAAAAAAOdI\/l1pj_aH1UA4\/s400\/mobileloc.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cb\u003EMobile Localisation: \u003C\/b\u003ESattler's key problem is localizing yourself inside a large city with a single smartphone picture\u003C\/div\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white;\"\u003E\u003Cdiv style=\"color: #222222;\"\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cspan style=\"color: #222222;\"\u003E\u003Cb\u003ERelated: \u003C\/b\u003EScalable 6-DOF Localization on Mobile Devices.\u0026nbsp;Sven Middelberg, Torsten Sattler, Ole Untzelmann, Leif Kobbelt. In ECCV 2014. 
[\u003Ca href=\"https:\/\/www.graphics.rwth-aachen.de\/publication\/213\/ECCV14_preprint.pdf\"\u003Epdf\u003C\/a\u003E]\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan style=\"color: #222222;\"\u003E\u003Cb\u003ERelated:\u0026nbsp;\u003C\/b\u003ETorsten Sattler 's \u003Ca href=\"http:\/\/wp.doc.ic.ac.uk\/thefutureofslam\/wp-content\/uploads\/sites\/93\/2015\/12\/Sattler_challenges_large_scale_loc_and_mapping.pdf\"\u003EThe challenges of large-scale localisation and mapping\u003C\/a\u003E\u0026nbsp;slides\u003C\/span\u003E\u003Cbr \/\u003E\u003Cdiv style=\"color: #222222;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white; color: #222222;\"\u003E\u003Cb\u003E\u003Cspan style=\"font-family: inherit;\"\u003ETalk 4: Mur-Artal on Feature-based vs Direct-Methods\u003C\/span\u003E\u003C\/b\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white;\"\u003E\u003Cspan style=\"color: #222222;\"\u003ERaúl Mur-\u003C\/span\u003E\u003Cspan style=\"color: #222222; font-family: inherit;\"\u003EArtal, the creator of ORB-SLAM, dedicated his entire presentation to the Feature-based vs Direct-method debate in SLAM and he's definitely on the feature-based side. ORB-SLAM is available as an open-source SLAM package and it is hard to beat. During his evaluation of ORB-SLAM vs PTAM it seems that PTAM actually fails quite often (at least on the TUM RGB-D benchmark). 
LSD-SLAM errors are also much higher on the TUM RGB-D benchmark than expected.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cdiv style=\"color: #222222;\"\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/4.bp.blogspot.com\/-Vk0x1Y6ZFRU\/VpX4gxX7TpI\/AAAAAAAAOe8\/8OLOkL7iDcw\/s1600\/types-of-slam.jpg\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"205\" src=\"https:\/\/4.bp.blogspot.com\/-Vk0x1Y6ZFRU\/VpX4gxX7TpI\/AAAAAAAAOe8\/8OLOkL7iDcw\/s400\/types-of-slam.jpg\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; color: #222222; text-align: center;\"\u003E\u003Cb\u003EFeature-Based SLAM vs Direct SLAM. \u003C\/b\u003ESee Mur-Artal's\u0026nbsp;\u003Ca href=\"http:\/\/wp.doc.ic.ac.uk\/thefutureofslam\/wp-content\/uploads\/sites\/93\/2015\/12\/ICCV15_SLAMWS_RaulMur.pdf\"\u003EShould we still do sparse feature based SLAM?\u003C\/a\u003E presentation slides\u003C\/div\u003E\u003Cdiv style=\"color: #222222;\"\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white;\"\u003E\u003Cspan style=\"color: #222222; font-family: inherit;\"\u003E\u003Cb\u003ERelated:\u003C\/b\u003E\u0026nbsp;Mur-Artal's\u0026nbsp;\u003C\/span\u003E\u003Cspan style=\"color: #222222;\"\u003E\u003Ca href=\"http:\/\/wp.doc.ic.ac.uk\/thefutureofslam\/wp-content\/uploads\/sites\/93\/2015\/12\/ICCV15_SLAMWS_RaulMur.pdf\"\u003EShould we still do sparse-feature based SLAM?\u003C\/a\u003E\u0026nbsp;slides\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan style=\"color: #222222;\"\u003E\u003Cb\u003ERelated: \u003C\/b\u003EMonocular ORB-SLAM R. Mur-Artal, J. M. M. Montiel and J. D. Tardos. A versatile and Accurate Monocular SLAM System. 
IEEE Transactions on Robotics. 2015 [\u003Ca href=\"http:\/\/webdiis.unizar.es\/~raulmur\/MurMontielTardosTRO15.pdf\"\u003Epdf\u003C\/a\u003E]\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan style=\"color: #222222;\"\u003E\u003Cb\u003ERelated:\u003C\/b\u003E\u0026nbsp;\u003Ca href=\"http:\/\/github.com\/raulmur\/ORB_SLAM\"\u003EORB-SLAM Open-source code on github\u003C\/a\u003E, \u003Ca href=\"http:\/\/webdiis.unizar.es\/~raulmur\/orbslam\/\"\u003EProject Website\u003C\/a\u003E\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan style=\"color: #222222;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white;\"\u003E\u003Cdiv style=\"color: #222222;\"\u003E\u003Cb\u003E\u003Cspan style=\"font-family: inherit;\"\u003ETalk 5: Project Tango and Visual loop-closure for image-2-image constraints\u003C\/span\u003E\u003C\/b\u003E\u003C\/div\u003E\u003Cspan style=\"color: #222222; font-family: inherit;\"\u003ESimply put, \u003Ca href=\"https:\/\/www.google.com\/atap\/project-tango\/\"\u003EGoogle's Project Tango\u003C\/a\u003E is the world's first attempt at commercializing SLAM.\u0026nbsp;\u003C\/span\u003E\u003Cspan style=\"color: #222222;\"\u003ESimon Lynen from Google Zurich (formerly ETH Zurich) came to the workshop with a Tango live demo (on a tablet) and a presentation on what's new in the world of Tango. 
In case you don't already know, Google wants to put SLAM capabilities into the next generation of Android Devices.\u0026nbsp;\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan style=\"color: #222222; font-family: inherit;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; color: #222222; text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"https:\/\/3.bp.blogspot.com\/-cYVkHspdkXg\/V1q0jg-pvlI\/AAAAAAAAOrk\/gUKjTTU-irsiQWwGaOq5ZfghTk6WlxsiQCLcB\/s1600\/Google-project-tango-3D-mapping-video.jpeg\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" src=\"https:\/\/3.bp.blogspot.com\/-cYVkHspdkXg\/V1q0jg-pvlI\/AAAAAAAAOrk\/gUKjTTU-irsiQWwGaOq5ZfghTk6WlxsiQCLcB\/s1600\/Google-project-tango-3D-mapping-video.jpeg\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; color: #222222; text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; color: #222222; text-align: center;\"\u003EGoogle's Project Tango needs no introduction.\u003C\/div\u003E\u003Cspan style=\"color: #222222; font-family: inherit;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003Cspan style=\"color: #222222; font-family: inherit;\"\u003EThe Project Tango presentation discussed a new way of doing loop closure by finding certain patterns in the image-to-image matching matrix. This comes from the “Placeless Place Recognition” work. 
They also do online bundle adjustment w\/ vision-based loop closure.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan style=\"color: #222222; font-family: inherit;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/2.bp.blogspot.com\/-5Q_EOliJgwM\/VpWuMVDPcGI\/AAAAAAAAOeU\/ONEAYjX8f58\/s1600\/placeless.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"101\" src=\"https:\/\/2.bp.blogspot.com\/-5Q_EOliJgwM\/VpWuMVDPcGI\/AAAAAAAAOeU\/ONEAYjX8f58\/s400\/placeless.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cb\u003ELoop Closure inside a Project Tango?\u003C\/b\u003E Lynen et al's \u003Ca href=\"https:\/\/3234f89137bccf2ede29cc86e315c75116020d70.googledrive.com\/host\/0B64GJ60h3Ai1MVVwWTZwekhtcFU\/publications\/bib\/lynen_3dv14.pdf\"\u003EPlaceless Place Recognition\u003C\/a\u003E. The image-to-image matrix reveals a new way to look for loop-closure. 
See the algorithm in action in this \u003Ca href=\"https:\/\/www.youtube.com\/watch?v=HfWvWQrCwwA\"\u003Eyoutube video\u003C\/a\u003E.\u003C\/div\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white; color: #222222;\"\u003E\u003Cspan style=\"font-family: inherit;\"\u003EThe Project Tango folks are also working on combining multiple crowd-sourced maps at Google, where the goal is to combine multiple mini-maps created by different people using Tango-equipped devices.\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white; color: #222222;\"\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white; color: #222222;\"\u003E\u003Cspan style=\"font-family: inherit;\"\u003ESimon showed a video of mountain bike trail tracking which is actually quite difficult in practice. The idea is to go down a mountain bike trail using a Tango device and create a map, then the follow-up goal is to have a separate person go down the trail. This currently “semi-works” when there are a few hours between the map building and the tracking step, but won’t work across weeks\/months\/etc.\u0026nbsp;\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003Cspan style=\"font-family: inherit;\"\u003EDuring the Tango-related discussion, Richard Newcombe pointed out that the “features” used by Project Tango are quite primitive w.r.t. getting a deeper understanding of the environment, and it appears that Project Tango-like methods won't work on outdoor scenes where the world is plagued by non-rigidity, massive illumination changes, etc. 
\u0026nbsp;So are we to expect different systems being designed for outdoor systems or will Project Tango be an indoor mapping device?\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white;\"\u003E\u003Cdiv style=\"color: #222222;\"\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"color: #222222;\"\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cb\u003ERelated:\u0026nbsp;\u003C\/b\u003E\u003C\/span\u003E\u003Cspan style=\"color: #222222; font-family: inherit;\"\u003E\u003Ca href=\"https:\/\/3234f89137bccf2ede29cc86e315c75116020d70.googledrive.com\/host\/0B64GJ60h3Ai1MVVwWTZwekhtcFU\/publications\/bib\/lynen_3dv14.pdf\"\u003EPlaceless Place Recognition.\u003C\/a\u003E\u0026nbsp;\u003C\/span\u003E\u003Cspan style=\"color: #222222;\"\u003ELynen, S. ; Bosse, M. ; Furgale, P. ; Siegwart, R. In 3DV 2014.\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"color: #222222;\"\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cb\u003ERelated:\u0026nbsp;\u003C\/b\u003E\u003Ca href=\"https:\/\/www.youtube.com\/watch?v=iP9m9a2KEN4\"\u003EGoogle I\/O talk from May 29, 2015 about Tango\u003C\/a\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"color: #222222;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white; color: #222222;\"\u003E\u003Cb\u003E\u003Cspan style=\"font-family: inherit;\"\u003ETalk 6: ElasticFusion is DenseSLAM without a pose-graph\u003C\/span\u003E\u003C\/b\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white; color: #222222;\"\u003E\u003Cspan style=\"font-family: inherit;\"\u003EElasticFusion is a dense SLAM technique which requires a RGBD sensor like the Kinect. 2-3 minutes to obtain a high-quality 3D scan of a single room is pretty cool. A pose-graph is used behind the scenes of many (if not most) SLAM systems, and this technique has a different (map-centric) approach. 
The approach focuses on building a map, but the trick is that the map is deformable, hence the name ElasticFusion. The “Fusion” part of the algorithm is in homage to KinectFusion which was one of the first high quality kinect-based reconstruction pipelines. Also surfels are used as the underlying primitives.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/2.bp.blogspot.com\/-W6WF-6uN-Vs\/VpOGviRVS9I\/AAAAAAAAOdY\/a6G-E0aRoSM\/s1600\/kintinuous.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"212\" src=\"https:\/\/2.bp.blogspot.com\/-W6WF-6uN-Vs\/VpOGviRVS9I\/AAAAAAAAOdY\/a6G-E0aRoSM\/s400\/kintinuous.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003EImage from Kintinuous, an early version of Whelan's Elastic Fusion.\u003C\/div\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white; color: #222222;\"\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white;\"\u003E\u003Cdiv style=\"color: #222222;\"\u003E\u003Cspan style=\"font-family: inherit;\"\u003ERecovering light sources: we were given a sneak peek at new unpublished work from Imperial College London \/ Dyson Robotics Lab. The idea is that by detecting the light source direction and detecting specularities, you can improve 3D reconstruction results. 
Cool videos of recovering light source locations which work for up to 4 separate lights.\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"color: #222222;\"\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cspan style=\"color: #222222; font-family: inherit;\"\u003E\u003Cb\u003ERelated:\u0026nbsp;\u003C\/b\u003E\u003C\/span\u003E\u003Cspan style=\"color: #222222;\"\u003E\u003Ca href=\"http:\/\/wp.doc.ic.ac.uk\/thefutureofslam\/wp-content\/uploads\/sites\/93\/2015\/12\/ElasticFusion.pdf\"\u003EMap-centric SLAM with ElasticFusion\u003C\/a\u003E presentation slides\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan style=\"color: #222222;\"\u003E\u003Cb\u003ERelated:\u003C\/b\u003E\u0026nbsp;\u003Ca href=\"http:\/\/www.doc.ic.ac.uk\/~bglocker\/pdfs\/whelan2015rss.pdf\"\u003EElasticFusion: Dense SLAM Without A Pose Graph.\u0026nbsp;\u003C\/a\u003EWhelan, Thomas and Leutenegger, Stefan and Salas-Moreno, Renato F and Glocker, Ben and Davison, Andrew J. In RSS 2015.\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white; color: #222222;\"\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white; color: #222222;\"\u003E\u003Cb\u003E\u003Cspan style=\"font-family: inherit;\"\u003ETalk 7: Richard Newcombe’s DynamicFusion\u003C\/span\u003E\u003C\/b\u003E\u003Cbr \/\u003E\u003Cspan style=\"font-family: inherit;\"\u003ERichard Newcombe's (whose recently formed company was acquired by Oculus), was the last presenter. 
\u0026nbsp;It's really cool to see the person behind \u003Ca href=\"http:\/\/homes.cs.washington.edu\/~newcombe\/papers\/newcombe_etal_iccv2011.pdf\"\u003EDTAM\u003C\/a\u003E, \u003Ca href=\"http:\/\/homes.cs.washington.edu\/~newcombe\/papers\/newcombe_etal_ismar2011.pdf\"\u003EKinectFusion\u003C\/a\u003E, and\u0026nbsp;\u003Ca href=\"http:\/\/grail.cs.washington.edu\/projects\/dynamicfusion\/papers\/DynamicFusion.pdf\"\u003EDynamicFusion\u003C\/a\u003E now working in the VR space.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cb\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/b\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/4.bp.blogspot.com\/-S3mMC77oNMM\/VpWw25SfslI\/AAAAAAAAOeg\/16-YJs3a-sc\/s1600\/dynamicfusion.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"160\" src=\"https:\/\/4.bp.blogspot.com\/-S3mMC77oNMM\/VpWw25SfslI\/AAAAAAAAOeg\/16-YJs3a-sc\/s400\/dynamicfusion.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003ENewcombe's \u003Ca href=\"http:\/\/grail.cs.washington.edu\/projects\/dynamicfusion\/papers\/DynamicFusion.pdf\"\u003EDynamic Fusion\u003C\/a\u003E algorithm. 
The technique won the prestigious CVPR 2015 best paper award, and to see it in action just take a look at the authors' \u003Ca href=\"https:\/\/www.youtube.com\/watch?v=i1eZekcc_lM\"\u003EDynamicFusion Youtube video\u003C\/a\u003E.\u003C\/div\u003E\u003Cb\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/b\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white;\"\u003E\u003Cdiv style=\"color: #222222;\"\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"color: #222222;\"\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cb\u003ERelated\u003C\/b\u003E:\u0026nbsp;\u003C\/span\u003E\u003Ca href=\"http:\/\/grail.cs.washington.edu\/projects\/dynamicfusion\/papers\/DynamicFusion.pdf\"\u003EDynamicFusion: Reconstruction and Tracking of Non-rigid Scenes in Real-Time\u003C\/a\u003E, Richard A. Newcombe, Dieter Fox, Steven M. Seitz. In CVPR 2015. [\u003Ca href=\"http:\/\/grail.cs.washington.edu\/projects\/dynamicfusion\/papers\/DynamicFusion.pdf\"\u003Epdf\u003C\/a\u003E] [Best-Paper winner]\u003C\/div\u003E\u003Cspan style=\"color: #222222;\"\u003E\u003Cb\u003ERelated:\u003C\/b\u003E\u0026nbsp;\u003Ca href=\"http:\/\/homes.cs.washington.edu\/~newcombe\/papers\/Salas-Moreno_etal_cvpr2013.pdf\"\u003ESLAM++: Simultaneous Localisation and Mapping at the Level of Objects\u003C\/a\u003E Renato F. Salas-Moreno, Richard A. Newcombe, Hauke Strasdat, Paul H. J. Kelly and Andrew J. Davison (CVPR 2013)\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan style=\"color: #222222;\"\u003E\u003Cb\u003ERelated:\u003C\/b\u003E\u0026nbsp;\u003C\/span\u003E\u003Cspan style=\"color: #222222;\"\u003E\u003Ca href=\"http:\/\/homes.cs.washington.edu\/~newcombe\/papers\/newcombe_etal_ismar2011.pdf\"\u003EKinectFusion: Real-Time Dense Surface Mapping and Tracking\u003C\/a\u003E Richard A. Newcombe Shahram Izadi,Otmar Hilliges, David Molyneaux, David Kim, Andrew J. 
Davison, Pushmeet Kohli, Jamie Shotton, Steve Hodges, Andrew Fitzgibbon (ISMAR 2011, Best paper award!)\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan style=\"color: #222222;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003Cbr \/\u003E\u003Cdiv style=\"color: #222222;\"\u003E\u003Cb\u003EWorkshop Demos\u003C\/b\u003E\u003C\/div\u003E\u003Cdiv style=\"color: #222222;\"\u003E\u003Cspan style=\"font-family: inherit;\"\u003EDuring the demo sessions (held in the middle of the workshop), many of the presenters showed off their SLAM systems in action. Many of these systems are available as open-source (free for non-commercial use?) packages, so if you’re interested in real-time SLAM, downloading the code is worth a shot. However,\u003Cb\u003E the one demo which stood out was Andrew Davison’s showcase of his MonoSLAM system from 2004\u003C\/b\u003E. Andy had to revive his 15-year old laptop (which was running Redhat Linux) to show off his original system, running on the original hardware. If the computer vision community is going to one day decide on a “retro-vision” demo session, I’m just going to go ahead and nominate Andy for the best-paper prize, right now.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/3.bp.blogspot.com\/-AwQoCYKPPQY\/VpYSrlbUf8I\/AAAAAAAAOfk\/MBKnnJh_Yss\/s1600\/IMG_0500.JPG\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"400\" src=\"https:\/\/3.bp.blogspot.com\/-AwQoCYKPPQY\/VpYSrlbUf8I\/AAAAAAAAOfk\/MBKnnJh_Yss\/s400\/IMG_0500.JPG\" width=\"300\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003EAndy's Retro-Vision SLAM Setup (Pictured on December 18th, 2015)\u003C\/div\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cbr 
\/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"color: #222222;\"\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"color: #222222;\"\u003E\u003Cspan style=\"font-family: inherit;\"\u003EIt was interesting to watch the SLAM system experts wave their USB cameras around, showing their systems build 3D maps of the desk-sized area around their laptops.\u0026nbsp; If you carefully look at the way these experts move the camera around (i.e., smooth circular motions), you can almost tell how long a person has been working with SLAM. When the non-experts hold the camera, probability of tracking failure is significantly higher.\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"color: #222222;\"\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"color: #222222;\"\u003E\u003Cspan style=\"font-family: inherit;\"\u003EI had the pleasure of speaking with Andy during the demo session, and I was curious which line of work (in the past 15 years) surprised him the most. His reply was that PTAM, which showed how to perform real-time bundle adjustment, surprised him the most. The PTAM system was essentially a MonoSLAM++ system, but the significantly improved tracking results were due to taking a heavyweight algorithm (bundle adjustment) and making it real-time — something which Andy did not believe was possible in the early 2000s.\u003C\/span\u003E\u003C\/div\u003E\u003Ch2 style=\"color: #222222;\"\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cb\u003EPart III: Deep Learning vs SLAM\u003C\/b\u003E\u003C\/span\u003E\u003C\/h2\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white; color: #222222;\"\u003E\u003Cspan style=\"font-family: inherit;\"\u003EThe SLAM panel discussion was a lot of fun. 
Before we jump to the important Deep Learning vs SLAM discussion, I should mention that each of the workshop presenters agreed that \u003Cb\u003Esemantics are necessary to build bigger and better SLAM systems\u003C\/b\u003E. There were lots of interesting mini-conversations about future directions. During the debates, \u003Ca href=\"https:\/\/www.inf.ethz.ch\/personal\/marc.pollefeys\/\"\u003EMarc Pollefeys\u003C\/a\u003E\u0026nbsp;(a well-known researcher in SfM and Multiple-View Geometry) reminded everybody that \u003Cb\u003ERobotics is the killer application of SLAM\u003C\/b\u003E\u0026nbsp;and suggested we keep an eye on the prize. This is quite surprising since SLAM was traditionally applied to Robotics problems, but the lack of Robotics success in the last few decades (Google Robotics?) has shifted the focus of SLAM away from Robots and towards large-scale map building (ala Google Maps) and Augmented Reality. Nobody at this workshop talked about Robots.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white; color: #222222;\"\u003E\u003Cb\u003E\u003Cspan style=\"font-family: inherit;\"\u003EIntegrating semantic information into SLAM\u003C\/span\u003E\u003C\/b\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white; color: #222222;\"\u003E\u003Cspan style=\"font-family: inherit;\"\u003EThere was a lot of interest in incorporating semantics into today’s top-performing SLAM systems. When it comes to semantics, the \u003Cb\u003ESLAM community is unfortunately stuck in the world of bags-of-visual-words\u003C\/b\u003E, and doesn't have new ideas on how to integrate semantic information into their systems. 
On the other end, we’re now seeing real-time semantic segmentation demos (based on ConvNets) popping up at CVPR\/ICCV\/ECCV, and in my opinion SLAM needs Deep Learning as much as the other way around.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/1.bp.blogspot.com\/-gOJoLF_DWKQ\/VpOHU7r_O4I\/AAAAAAAAOdg\/vq85sOEnlBU\/s1600\/semantics.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"281\" src=\"https:\/\/1.bp.blogspot.com\/-gOJoLF_DWKQ\/VpOHU7r_O4I\/AAAAAAAAOdg\/vq85sOEnlBU\/s400\/semantics.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003EIntegrating semantics into SLAM is often talked about, but it is easier said than done.\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003EFigure 6.9 (page 142) from Moreno's PhD thesis:\u0026nbsp;\u003Ca href=\"https:\/\/www.doc.ic.ac.uk\/~rfs09\/docs\/Salas-Moreno-R-2014-PhD-Thesis.pdf\"\u003EDense Semantic SLAM\u003C\/a\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white; color: #222222;\"\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white; color: #222222;\"\u003E\u003Cb\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\"Will end-to-end learning dominate SLAM?\"\u003C\/span\u003E\u003C\/b\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white; color: #222222;\"\u003E\u003Cspan style=\"font-family: inherit;\"\u003ETowards the end of the SLAM workshop panel, \u003Ca href=\"http:\/\/www.zeeshanzia.com\/\"\u003EDr. 
Zeeshan Zia\u003C\/a\u003E asked a question which \u003Ci\u003Estartled\u003C\/i\u003E the entire room and led to a memorable, energy-filled discussion. You should have seen the look on the panel’s faces.\u003C\/span\u003E\u0026nbsp;It was a bunch of geometers being thrown a fireball of deep learning.\u003Cspan style=\"font-family: inherit;\"\u003E\u0026nbsp;Their facial expressions suggest both bewilderment, anger, and disgust. \"\u003Ci\u003EHow dare you question us?\" \u003C\/i\u003Ethey were thinking.\u0026nbsp;And it is only during these fleeting moments that we can truly appreciate the conference experience. Zia's question was essentially: \u003Cb\u003EWill end-to-end learning soon replace the mostly manual labor involved in building today’s SLAM systems?\u003C\/b\u003E.\u0026nbsp;\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003Cspan style=\"font-family: inherit;\"\u003EZia's question is very important because end-to-end trainable systems have been slowly creeping up on many advanced computer science problems, and there's no reason to believe SLAM will be an exception. A handful of the presenters pointed out that current SLAM systems rely on too much geometry for a pure deep-learning based SLAM system to make sense -- we should use learning to make the point descriptors better, but leave the geometry alone. 
\u003Ci\u003EJust because you can use deep learning to make a calculator, it doesn't mean you should.\u003C\/i\u003E\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/3.bp.blogspot.com\/-sOdinuQ3bBg\/VpYGCkrwjcI\/AAAAAAAAOfM\/2JbO1Sny3A0\/s1600\/convnet_lecun_stereo.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"85\" src=\"https:\/\/3.bp.blogspot.com\/-sOdinuQ3bBg\/VpYGCkrwjcI\/AAAAAAAAOfM\/2JbO1Sny3A0\/s400\/convnet_lecun_stereo.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/arxiv.org\/abs\/1409.4326\"\u003ELearning Stereo Similarity Functions\u003C\/a\u003E via ConvNets, by Yan LeCun and collaborators.\u003C\/div\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan style=\"font-family: inherit;\"\u003EWhile many of the panel speakers responded with a somewhat affirmative \"no\", it was Newcombe which surprisingly championed what the marriage of Deep Learning and SLAM might look like.\u0026nbsp;\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003Cb\u003ENewcombe's Proposal:\u0026nbsp;\u003C\/b\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cb\u003EUse SLAM to fuel Deep Learning\u003C\/b\u003E\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan style=\"font-family: inherit;\"\u003EAlthough Newcombe didn’t provide much evidence or ideas on how Deep Learning might help SLAM, he provided \u003Cb\u003Ea clear path on how SLAM might help Deep Learning\u003C\/b\u003E.\u0026nbsp; Think of all those maps that we've built using large-scale SLAM and all those correspondences that 
these systems provide — isn’t that a clear path for building terascale image-image \"association\" datasets which should be able to help deep learning? The basic idea is that today's SLAM systems are large-scale \"correspondence engines\" which can be used to generate large-scale datasets, precisely what needs to be fed into a deep ConvNet.\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white; color: #222222;\"\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cb\u003E\u003C\/b\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003Cspan style=\"font-family: inherit;\"\u003E\u003Cb\u003EConcluding Remarks\u003C\/b\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white; color: #222222;\"\u003E\u003Cspan style=\"font-family: inherit;\"\u003EThere is quite a large disconnect between the kind of work done at the mainstream ICCV conference (heavy on machine learning) and the kind of work presented at the real-time SLAM workshop (heavy on geometric methods like bundle adjustment). The mainstream Computer Vision community has witnessed several mini-revolutions within the past decade (e.g., Dalal-Triggs, DPM, ImageNet, ConvNets, R-CNN) while the SLAM systems of today don’t look very different than they did 8 years ago. 
The Kinect sensor has probably been the single largest game changer in SLAM, but the fundamental algorithms remain intact.\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white; color: #222222;\"\u003E\u003Cdiv style=\"-webkit-text-stroke-width: 0px; background-color: white; color: #222222; font-family: Times; font-size: medium; font-style: normal; font-variant: normal; font-weight: normal; letter-spacing: normal; line-height: normal; orphans: auto; text-align: start; text-indent: 0px; text-transform: none; white-space: normal; widows: 1; word-spacing: 0px;\"\u003E\u003Cdiv class=\"separator\" style=\"clear: both; margin: 0px; text-align: center;\"\u003E\u003Ca href=\"http:\/\/wordpress.viu.ca\/ciel\/files\/2013\/01\/134992626.jpg\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" src=\"http:\/\/wordpress.viu.ca\/ciel\/files\/2013\/01\/134992626.jpg\" height=\"296\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; margin: 0px; text-align: center;\"\u003E\u003Cb\u003EIntegrating semantic information: The next frontier in Visual SLAM.\u003C\/b\u003E\u0026nbsp;\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; margin: 0px; text-align: center;\"\u003EBrain image from\u0026nbsp;\u003Ca href=\"http:\/\/wordpress.viu.ca\/ciel\/2013\/01\/23\/gaming-and-student-disengagement\/\"\u003EArwen Wallington\u003C\/a\u003E's blog post.\u003C\/div\u003E\u003Cdiv style=\"margin: 0px;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv style=\"margin: 0px;\"\u003EToday’s SLAM systems help machines geometrically understand the immediate world (i.e., build associations in a local coordinate system) while today’s Deep Learning systems help machines reason categorically (i.e., build associations across distinct object instances). 
In conclusion, I share Newcombe and Davison's excitement in Visual SLAM, as vision-based algorithms are going to turn Augmented and Virtual Reality into billion dollar markets. However, we should not forget to keep our eyes on the \"trillion-dollar\" market, the one that's going to redefine what it means to \"work\" -- namely \u003Ci\u003ERobotics\u003C\/i\u003E. The day of Robot SLAM will come soon.\u003C\/div\u003E\u003C\/div\u003E\u003C\/div\u003E"},"link":[{"rel":"replies","type":"application/atom+xml","href":"https:\/\/www.computervisionblog.com\/feeds\/2190638093839413385\/comments\/default","title":"Post Comments"},{"rel":"replies","type":"text/html","href":"https:\/\/www.computervisionblog.com\/2016\/01\/why-slam-matters-future-of-real-time.html#comment-form","title":"27 Comments"},{"rel":"edit","type":"application/atom+xml","href":"https:\/\/www.blogger.com\/feeds\/15418143\/posts\/default\/2190638093839413385"},{"rel":"self","type":"application/atom+xml","href":"https:\/\/www.blogger.com\/feeds\/15418143\/posts\/default\/2190638093839413385"},{"rel":"alternate","type":"text/html","href":"https:\/\/www.computervisionblog.com\/2016\/01\/why-slam-matters-future-of-real-time.html","title":"The Future of Real-Time SLAM and Deep Learning vs SLAM"}],"author":[{"name":{"$t":"Tomasz 
Malisiewicz"},"uri":{"$t":"http:\/\/www.blogger.com\/profile\/17507234774392358321"},"email":{"$t":"noreply@blogger.com"},"gd$image":{"rel":"http://schemas.google.com/g/2005#thumbnail","width":"16","height":"16","src":"https:\/\/img1.blogblog.com\/img\/b16-rounded.gif"}}],"media$thumbnail":{"xmlns$media":"http://search.yahoo.com/mrss/","url":"https:\/\/1.bp.blogspot.com\/-3WNdePDKHQw\/VpOAwv91xWI\/AAAAAAAAOcY\/Q6oXFwf14Jw\/s72-c\/slammies2.png","height":"72","width":"72"},"thr$total":{"$t":"27"}},{"id":{"$t":"tag:blogger.com,1999:blog-15418143.post-3255634544070567159"},"published":{"$t":"2015-12-08T23:57:00.000-05:00"},"updated":{"$t":"2016-06-13T07:40:03.573-05:00"},"category":[{"scheme":"http://www.blogger.com/atom/ns#","term":"chile"},{"scheme":"http://www.blogger.com/atom/ns#","term":"conference"},{"scheme":"http://www.blogger.com/atom/ns#","term":"ConvNets"},{"scheme":"http://www.blogger.com/atom/ns#","term":"decision forests"},{"scheme":"http://www.blogger.com/atom/ns#","term":"deep features"},{"scheme":"http://www.blogger.com/atom/ns#","term":"future directions"},{"scheme":"http://www.blogger.com/atom/ns#","term":"geometry"},{"scheme":"http://www.blogger.com/atom/ns#","term":"hierarchy"},{"scheme":"http://www.blogger.com/atom/ns#","term":"iccv 2015"},{"scheme":"http://www.blogger.com/atom/ns#","term":"objectness"},{"scheme":"http://www.blogger.com/atom/ns#","term":"optical flow"},{"scheme":"http://www.blogger.com/atom/ns#","term":"papers"},{"scheme":"http://www.blogger.com/atom/ns#","term":"research"},{"scheme":"http://www.blogger.com/atom/ns#","term":"stereo"},{"scheme":"http://www.blogger.com/atom/ns#","term":"super-resolution"},{"scheme":"http://www.blogger.com/atom/ns#","term":"unsupervised"}],"title":{"type":"text","$t":"ICCV 2015: Twenty one hottest research papers"},"content":{"type":"html","$t":"\u003Ch3\u003E\"Geometry vs Recognition\" becomes ConvNet-for-X\u003C\/h3\u003EComputer Vision used to be cleanly separated into two schools: 
\u003Cb\u003Egeometry\u003C\/b\u003E and \u003Cb\u003Erecognition\u003C\/b\u003E.\u0026nbsp;Geometric methods like structure from motion and optical flow usually focus on measuring objective real-world quantities like 3D \"real-world\" distances directly from images and recognition techniques like support vector machines and probabilistic graphical models traditionally focus on perceiving high-level semantic information (i.e., is this a dog or a table) directly from images.\u003Cbr \/\u003E\u003Cbr \/\u003EThe world of computer vision \u003Cstrike\u003Eis changing fast\u003C\/strike\u003E has changed. We now have powerful convolutional neural networks that are able to extract \u003Ci\u003Ejust about anything\u003C\/i\u003E directly from images. So if your input is an image (or set of images), then there's probably a ConvNet for your problem. \u0026nbsp;While you do need a large labeled dataset, believe me when I say that collecting a large dataset is much easier than manually tweaking knobs inside your 100K-line codebase. 
As we're about to see, the separation between geometric methods and learning-based methods is no longer easily discernible.\u003Cbr \/\u003E\u003Cbr \/\u003EBy 2016 just about everybody in the computer vision community will have tasted the power of ConvNets, so let's take a look at some of the hottest new research directions in computer vision.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ch3\u003EICCV 2015's Twenty One Hottest Research Papers\u003C\/h3\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/2.bp.blogspot.com\/-OItMW6lzZ9A\/VmepmMJ2COI\/AAAAAAAAOa8\/MB1unZ7hr8Y\/s1600\/onfire-01.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"233\" src=\"https:\/\/2.bp.blogspot.com\/-OItMW6lzZ9A\/VmepmMJ2COI\/AAAAAAAAOa8\/MB1unZ7hr8Y\/s320\/onfire-01.png\" width=\"320\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cbr \/\u003EThis December in Santiago, Chile, the \u003Ca href=\"http:\/\/pamitc.org\/iccv15\/\"\u003EInternational Conference of Computer Vision 2015\u003C\/a\u003E is going to bring together the world's leading researchers in Computer Vision, Machine Learning, and Computer Graphics.\u003Cbr \/\u003E\u003Cbr \/\u003ETo no surprise, this year's ICCV is filled with lots of ConvNets, but this time the applications of these Deep Learning tools are being applied to much much more creative tasks. 
Let's take a look at the following \u003Cb\u003Etwenty one ICCV 2015 research papers\u003C\/b\u003E, which will hopefully give you a taste of where the field is going.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E1.\u0026nbsp;\u003Ca href=\"https:\/\/www.d2.mpi-inf.mpg.de\/sites\/default\/files\/iccv15-neural_qa.pdf\"\u003EAsk Your Neurons: A Neural-Based Approach to Answering Questions About Images\u003C\/a\u003E Mateusz Malinowski, Marcus Rohrbach, Mario Fritz\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/2.bp.blogspot.com\/-YZ8eabNTeI0\/VmOay_v1zlI\/AAAAAAAAOW4\/05yONZ7wws4\/s1600\/malinowski.png\" imageanchor=\"1\" style=\"clear: right; float: right; margin-bottom: 1em; margin-left: 1em;\"\u003E\u003Cimg border=\"0\" height=\"155\" src=\"https:\/\/2.bp.blogspot.com\/-YZ8eabNTeI0\/VmOay_v1zlI\/AAAAAAAAOW4\/05yONZ7wws4\/s320\/malinowski.png\" width=\"320\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cbr \/\u003E\"We propose a novel approach based on recurrent neural networks for the challenging task of answering of questions about images. 
It combines a CNN with a LSTM into an end-to-end architecture that predict answers conditioning on a question and an image.\"\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E2.\u0026nbsp;\u003Ca href=\"http:\/\/arxiv.org\/pdf\/1506.06724v1.pdf\"\u003EAligning Books and Movies: Towards Story-Like Visual Explanations by Watching Movies and Reading Books\u003C\/a\u003E\u0026nbsp;Yukun Zhu, Ryan Kiros, Rich Zemel, Ruslan Salakhutdinov, Raquel Urtasun, Antonio Torralba, Sanja Fidler\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ca href=\"http:\/\/2.bp.blogspot.com\/-mamWdhY2KL4\/VmOcUrBCZfI\/AAAAAAAAOXE\/H1tJf7cIYE0\/s1600\/torralba.png\" imageanchor=\"1\" style=\"clear: left; float: left; margin-bottom: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"196\" src=\"https:\/\/2.bp.blogspot.com\/-mamWdhY2KL4\/VmOcUrBCZfI\/AAAAAAAAOXE\/H1tJf7cIYE0\/s320\/torralba.png\" width=\"320\" \/\u003E\u003C\/a\u003E\u003Cbr \/\u003E\"To align movies and books we exploit a neural sentence embedding that is trained in an unsupervised way from a large corpus of books, as well as a video-text neural embedding for computing similarities between movie clips and sentences in the book.\"\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E3.\u0026nbsp;\u003Ca href=\"http:\/\/arxiv.org\/pdf\/1505.01596.pdf\"\u003ELearning to See by Moving\u003C\/a\u003E Pulkit Agrawal, Joao Carreira, Jitendra Malik\u003Cbr \/\u003E\u003Ca href=\"http:\/\/2.bp.blogspot.com\/-b_tUpJAOD2c\/VmOdcXy74DI\/AAAAAAAAOXQ\/YlH43tryOYY\/s1600\/pulkit.png\" imageanchor=\"1\" style=\"clear: right; float: right; margin-bottom: 1em; margin-left: 1em;\"\u003E\u003Cimg border=\"0\" height=\"196\" src=\"https:\/\/2.bp.blogspot.com\/-b_tUpJAOD2c\/VmOdcXy74DI\/AAAAAAAAOXQ\/YlH43tryOYY\/s320\/pulkit.png\" width=\"320\" \/\u003E\u003C\/a\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\"We show that 
using the same number of training images, features learnt using egomotion as supervision compare favourably to features learnt using class-label as supervision on the tasks of scene recognition, object recognition, visual odometry and keypoint matching.\"\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E4.\u0026nbsp;\u003Ca href=\"https:\/\/hal.inria.fr\/hal-01207966\/document\"\u003ELocal Convolutional Features With Unsupervised Training for Image Retrieval \u003C\/a\u003EMattis Paulin, Matthijs Douze, Zaid Harchaoui, Julien Mairal, Florent Perronin, Cordelia Schmid\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ca href=\"http:\/\/3.bp.blogspot.com\/-EhW5uNu3grc\/VmOeApfDUoI\/AAAAAAAAOXc\/rREG2irhh3w\/s1600\/paulin.png\" imageanchor=\"1\" style=\"clear: left; float: left; margin-bottom: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"152\" src=\"https:\/\/3.bp.blogspot.com\/-EhW5uNu3grc\/VmOeApfDUoI\/AAAAAAAAOXc\/rREG2irhh3w\/s320\/paulin.png\" width=\"320\" \/\u003E\u003C\/a\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\"We introduce a deep convolutional architecture that yields patch-level descriptors, as an alternative to the popular SIFT descriptor for image retrieval.\"\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E5.\u0026nbsp;\u003Ca href=\"http:\/\/arxiv.org\/pdf\/1507.08905v4.pdf\"\u003EDeep Networks for Image Super-Resolution With Sparse Prior\u003C\/a\u003E Zhaowen Wang, Ding Liu, Jianchao Yang, Wei Han, Thomas Huang\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ca href=\"http:\/\/3.bp.blogspot.com\/-zi-b1m717LU\/VmOfwOVlYjI\/AAAAAAAAOXo\/hojgfeeMRQ4\/s1600\/huang.png\" imageanchor=\"1\" style=\"clear: right; float: right; margin-bottom: 1em; margin-left: 1em;\"\u003E\u003Cimg border=\"0\" height=\"100\" 
src=\"https:\/\/3.bp.blogspot.com\/-zi-b1m717LU\/VmOfwOVlYjI\/AAAAAAAAOXo\/hojgfeeMRQ4\/s320\/huang.png\" width=\"320\" \/\u003E\u003C\/a\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\"We show that a sparse coding model particularly designed for super-resolution can be incarnated as a neural network, and trained in a cascaded structure from end to end.\"\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E6.\u0026nbsp;\u003Ca href=\"http:\/\/arxiv.org\/pdf\/1504.06201v3.pdf\"\u003EHigh-for-Low and Low-for-High: Efficient Boundary Detection From Deep Object Features and its Applications to High-Level Vision\u003C\/a\u003E Gedas Bertasius, Jianbo Shi, Lorenzo Torresani\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ca href=\"http:\/\/1.bp.blogspot.com\/-nfEiHlEE2r4\/VmOghOhDpBI\/AAAAAAAAOX0\/xu6q_xwUalE\/s1600\/lorenzo.png\" imageanchor=\"1\" style=\"clear: left; float: left; margin-bottom: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"306\" src=\"https:\/\/1.bp.blogspot.com\/-nfEiHlEE2r4\/VmOghOhDpBI\/AAAAAAAAOX0\/xu6q_xwUalE\/s320\/lorenzo.png\" width=\"320\" \/\u003E\u003C\/a\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\"In this work we show how to predict boundaries by exploiting object level features from a pretrained object-classification network.\"\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E7. 
A\u003Ca href=\"http:\/\/research.baidu.com\/wp-content\/uploads\/2015\/11\/A-Deep-Visual-Correspondence-Embedding-Model-for-Stereo-Matching-Costs.pdf\"\u003E Deep Visual Correspondence Embedding Model for Stereo Matching Costs\u003C\/a\u003E Zhuoyuan Chen, Xun Sun, Liang Wang, Yinan Yu, Chang Huang\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ca href=\"http:\/\/1.bp.blogspot.com\/-ZymsUgepamg\/VmOhORou1TI\/AAAAAAAAOYA\/4p_AK2hh35Q\/s1600\/baidu.png\" imageanchor=\"1\" style=\"clear: right; float: right; margin-bottom: 1em; margin-left: 1em;\"\u003E\u003Cimg border=\"0\" height=\"190\" src=\"https:\/\/1.bp.blogspot.com\/-ZymsUgepamg\/VmOhORou1TI\/AAAAAAAAOYA\/4p_AK2hh35Q\/s320\/baidu.png\" width=\"320\" \/\u003E\u003C\/a\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\"A novel deep visual correspondence embedding model is trained via Convolutional Neural Network on a large set of stereo images with ground truth disparities. This deep embedding model leverages appearance data to learn visual similarity relationships between corresponding image patches, and explicitly maps intensity values into an embedding feature space to measure pixel dissimilarities.\"\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E8.\u0026nbsp;\u003Ca href=\"http:\/\/www.cs.ubc.ca\/~murphyk\/Papers\/im2calories_iccv15.pdf\"\u003EIm2Calories: Towards an Automated Mobile Vision Food Diary\u003C\/a\u003E Austin Meyers, Nick Johnston, Vivek Rathod, Anoop Korattikara, Alex Gorban, Nathan Silberman, Sergio Guadarrama, George Papandreou, Jonathan Huang, Kevin P. 
Murphy\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ca href=\"http:\/\/1.bp.blogspot.com\/-EE90tdIWdSk\/VmOiWNe32DI\/AAAAAAAAOYM\/Ulu9qXokp8Y\/s1600\/im2calories.png\" imageanchor=\"1\" style=\"clear: left; float: left; margin-bottom: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"194\" src=\"https:\/\/1.bp.blogspot.com\/-EE90tdIWdSk\/VmOiWNe32DI\/AAAAAAAAOYM\/Ulu9qXokp8Y\/s320\/im2calories.png\" width=\"320\" \/\u003E\u003C\/a\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\"We present a system which can recognize the contents of your meal from a single image, and then predict its nutritional contents, such as calories.\"\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E9.\u0026nbsp;\u003Ca href=\"http:\/\/arxiv.org\/pdf\/1505.05192v2.pdf\"\u003EUnsupervised Visual Representation Learning by Context Prediction\u003C\/a\u003E Carl Doersch, Abhinav Gupta, Alexei A. Efros\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ca href=\"http:\/\/2.bp.blogspot.com\/-9ew7pwXd6x4\/VmOkS9xZaCI\/AAAAAAAAOYY\/tuL9fS-pP-I\/s1600\/carl.png\" imageanchor=\"1\" style=\"clear: right; float: right; margin-bottom: 1em; margin-left: 1em;\"\u003E\u003Cimg border=\"0\" height=\"300\" src=\"https:\/\/2.bp.blogspot.com\/-9ew7pwXd6x4\/VmOkS9xZaCI\/AAAAAAAAOYY\/tuL9fS-pP-I\/s320\/carl.png\" width=\"320\" \/\u003E\u003C\/a\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\"How can one write an objective function to encourage a representation to capture, for example, objects, if none of the objects are labeled?\"\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E10.\u0026nbsp;\u003Ca 
href=\"http:\/\/research.microsoft.com\/pubs\/255952\/ICCV15_DeepNDF_main.pdf\"\u003EDeep Neural Decision Forests\u003C\/a\u003E Peter Kontschieder, Madalina Fiterau, Antonio Criminisi, Samuel Rota Bulò\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ca href=\"http:\/\/1.bp.blogspot.com\/-tnHOhE9j0gM\/VmOmpqBdWUI\/AAAAAAAAOYk\/QY3rsHYeQRs\/s1600\/neural_forests.png\" imageanchor=\"1\" style=\"clear: left; float: left; margin-bottom: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"137\" src=\"https:\/\/1.bp.blogspot.com\/-tnHOhE9j0gM\/VmOmpqBdWUI\/AAAAAAAAOYk\/QY3rsHYeQRs\/s320\/neural_forests.png\" width=\"320\" \/\u003E\u003C\/a\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\"We introduce a stochastic and differentiable decision tree model, which steers the representation learning usually conducted in the initial layers of a (deep) convolutional network.\"\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E11.\u0026nbsp;\u003Ca href=\"http:\/\/www.robots.ox.ac.uk\/~szheng\/papers\/CRFasRNN.pdf\"\u003EConditional Random Fields as Recurrent Neural Networks\u003C\/a\u003E Shuai Zheng, Sadeep Jayasumana, Bernardino Romera-Paredes, Vibhav Vineet, Zhizhong Su, Dalong Du, Chang Huang, Philip H. S. 
Torr\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ca href=\"http:\/\/4.bp.blogspot.com\/-akVEQ-3jL5I\/VmOncOQeG4I\/AAAAAAAAOYw\/VLQ9QQShjL4\/s1600\/crfrnn.png\" imageanchor=\"1\" style=\"clear: right; float: right; margin-bottom: 1em; margin-left: 1em;\"\u003E\u003Cimg border=\"0\" height=\"134\" src=\"https:\/\/4.bp.blogspot.com\/-akVEQ-3jL5I\/VmOncOQeG4I\/AAAAAAAAOYw\/VLQ9QQShjL4\/s320\/crfrnn.png\" width=\"320\" \/\u003E\u003C\/a\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\"We formulate mean-field approximate inference for the Conditional Random Fields with Gaussian pairwise potentials as Recurrent Neural Networks.\"\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E12.\u0026nbsp;\u003Ca href=\"https:\/\/www.robots.ox.ac.uk\/~vgg\/publications\/2015\/Pfister15a\/pfister15a.pdf\"\u003EFlowing ConvNets for Human Pose Estimation in Videos\u003C\/a\u003E Tomas Pfister, James Charles, Andrew Zisserman\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ca href=\"http:\/\/3.bp.blogspot.com\/-sx8eJ1gyB_w\/VmOo2N9548I\/AAAAAAAAOY8\/YT2zl9pSQgw\/s1600\/pose.png\" imageanchor=\"1\" style=\"clear: left; float: left; margin-bottom: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"128\" src=\"https:\/\/3.bp.blogspot.com\/-sx8eJ1gyB_w\/VmOo2N9548I\/AAAAAAAAOY8\/YT2zl9pSQgw\/s320\/pose.png\" width=\"320\" \/\u003E\u003C\/a\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\"We investigate a ConvNet architecture that is able to benefit from temporal context by combining information across the multiple frames using optical flow.\"\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E13.\u0026nbsp;\u003Ca href=\"http:\/\/arxiv.org\/pdf\/1505.00295v1.pdf\"\u003EDense Optical Flow Prediction From a Static Image\u003C\/a\u003E Jacob Walker, Abhinav Gupta, Martial Hebert\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ca 
href=\"http:\/\/2.bp.blogspot.com\/-Dpb7ON1dnqU\/VmOqG3qZ_GI\/AAAAAAAAOZI\/n8uOPdCUCjY\/s1600\/walker.png\" imageanchor=\"1\" style=\"clear: right; float: right; margin-bottom: 1em; margin-left: 1em;\"\u003E\u003Cimg border=\"0\" height=\"120\" src=\"https:\/\/2.bp.blogspot.com\/-Dpb7ON1dnqU\/VmOqG3qZ_GI\/AAAAAAAAOZI\/n8uOPdCUCjY\/s320\/walker.png\" width=\"320\" \/\u003E\u003C\/a\u003E\u003Cbr \/\u003E\"Given a static image, P-CNN predicts the future motion of each and every pixel in the image in terms of optical flow. Our P-CNN model leverages the data in tens of thousands of realistic videos to train our model. Our method relies on absolutely no human labeling and is able to predict motion based on the context of the scene.\"\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E14.\u0026nbsp;\u003Ca href=\"http:\/\/www.cs.berkeley.edu\/~wckuo\/KuoICCV2015.pdf\"\u003EDeepBox: Learning Objectness With Convolutional Networks\u003C\/a\u003E Weicheng Kuo, Bharath Hariharan, Jitendra Malik\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ca href=\"http:\/\/2.bp.blogspot.com\/-9jYNKfnL9cY\/VmOrcQz_P6I\/AAAAAAAAOZU\/bXAirMjpnwc\/s1600\/deepbox.png\" imageanchor=\"1\" style=\"clear: left; float: left; margin-bottom: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"179\" src=\"https:\/\/2.bp.blogspot.com\/-9jYNKfnL9cY\/VmOrcQz_P6I\/AAAAAAAAOZU\/bXAirMjpnwc\/s320\/deepbox.png\" width=\"320\" \/\u003E\u003C\/a\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\"Our framework, which we call DeepBox, uses convolutional neural networks (CNNs) to rerank proposals from a bottom-up method.\"\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E15.\u0026nbsp;\u003Ca href=\"http:\/\/web.engr.illinois.edu\/~slazebni\/publications\/iccv15_active.pdf\"\u003EActive Object Localization With Deep Reinforcement Learning\u003C\/a\u003E Juan C. 
Caicedo, Svetlana Lazebnik\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ca href=\"http:\/\/2.bp.blogspot.com\/--aE0740Mvxs\/VmOsREiQ0GI\/AAAAAAAAOZg\/F1AL0yUMQ68\/s1600\/reinf.png\" imageanchor=\"1\" style=\"clear: right; float: right; margin-bottom: 1em; margin-left: 1em;\"\u003E\u003Cimg border=\"0\" height=\"122\" src=\"https:\/\/2.bp.blogspot.com\/--aE0740Mvxs\/VmOsREiQ0GI\/AAAAAAAAOZg\/F1AL0yUMQ68\/s320\/reinf.png\" width=\"320\" \/\u003E\u003C\/a\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\"This agent learns to deform a bounding box using simple transformation actions, with the goal of determining the most specific location of target objects following top-down reasoning.\"\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E16.\u0026nbsp;\u003Ca href=\"http:\/\/arxiv.org\/pdf\/1411.4734v3.pdf\"\u003EPredicting Depth, Surface Normals and Semantic Labels With a Common Multi-Scale Convolutional Architecture\u003C\/a\u003E David Eigen, Rob Fergus\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ca href=\"http:\/\/2.bp.blogspot.com\/-O7D9kuYg1Po\/VmebD1TBFdI\/AAAAAAAAOZ0\/r_GAL23Hx9U\/s1600\/eigen.png\" imageanchor=\"1\" style=\"clear: left; float: left; margin-bottom: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"320\" src=\"https:\/\/2.bp.blogspot.com\/-O7D9kuYg1Po\/VmebD1TBFdI\/AAAAAAAAOZ0\/r_GAL23Hx9U\/s320\/eigen.png\" width=\"288\" \/\u003E\u003C\/a\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\"We address three different computer vision tasks using a single multiscale convolutional network architecture: depth prediction, surface normal estimation, and semantic labeling.\"\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E17.\u0026nbsp;\u003Ca 
href=\"https:\/\/dl.dropboxusercontent.com\/u\/44884434\/2015-HDCNN\/hdcnn-iccv15-CD.pdf\"\u003EHD-CNN: Hierarchical Deep Convolutional Neural Networks for Large Scale Visual Recognition\u003C\/a\u003E Zhicheng Yan, Hao Zhang, Robinson Piramuthu, Vignesh Jagadeesh, Dennis DeCoste, Wei Di, Yizhou Yu\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ca href=\"http:\/\/1.bp.blogspot.com\/-71RqjVid3w4\/VmebjSYyNPI\/AAAAAAAAOaA\/Cp5K9QKZ0So\/s1600\/hdcnn_imagenet_vgg16layer.png\" imageanchor=\"1\" style=\"clear: right; float: right; margin-bottom: 1em; margin-left: 1em;\"\u003E\u003Cimg border=\"0\" height=\"164\" src=\"https:\/\/1.bp.blogspot.com\/-71RqjVid3w4\/VmebjSYyNPI\/AAAAAAAAOaA\/Cp5K9QKZ0So\/s320\/hdcnn_imagenet_vgg16layer.png\" width=\"320\" \/\u003E\u003C\/a\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\"We introduce hierarchical deep CNNs (HD-CNNs) by embedding deep CNNs into a category hierarchy. An HD-CNN separates easy classes using a coarse category classifier while distinguishing difficult classes using fine category classifiers.\"\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E18.\u0026nbsp;\u003Ca href=\"http:\/\/lmb.informatik.uni-freiburg.de\/Publications\/2015\/DFIB15\/flownet.pdf\"\u003EFlowNet: Learning Optical Flow With Convolutional Networks\u003C\/a\u003E Alexey Dosovitskiy, Philipp Fischer, Eddy Ilg, Philip Häusser, Caner Hazırbaş, Vladimir Golkov, Patrick van der Smagt, Daniel Cremers, Thomas Brox\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ca href=\"http:\/\/3.bp.blogspot.com\/-bSwoFVVrptI\/VmecdGKlBvI\/AAAAAAAAOaM\/RH8llRLM9YA\/s1600\/flownet.png\" imageanchor=\"1\" style=\"clear: left; float: left; margin-bottom: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"168\" src=\"https:\/\/3.bp.blogspot.com\/-bSwoFVVrptI\/VmecdGKlBvI\/AAAAAAAAOaM\/RH8llRLM9YA\/s320\/flownet.png\" width=\"320\" \/\u003E\u003C\/a\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\"We construct appropriate CNNs which are capable of 
solving the optical flow estimation problem as a supervised learning task.\"\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E19.\u0026nbsp;\u003Ca href=\"http:\/\/imagine.enpc.fr\/~aubrym\/projects\/features_analysis\/texts\/understanding_deep_features_with_CG.pdf\"\u003EUnderstanding Deep Features With Computer-Generated Imagery\u003C\/a\u003E Mathieu Aubry, Bryan C. Russell\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ca href=\"http:\/\/4.bp.blogspot.com\/-B-WqC6T614c\/VmedW-WnQ2I\/AAAAAAAAOaY\/JjXaFysD4Cw\/s1600\/russell.png\" imageanchor=\"1\" style=\"clear: right; float: right; margin-bottom: 1em; margin-left: 1em;\"\u003E\u003Cimg border=\"0\" height=\"116\" src=\"https:\/\/4.bp.blogspot.com\/-B-WqC6T614c\/VmedW-WnQ2I\/AAAAAAAAOaY\/JjXaFysD4Cw\/s320\/russell.png\" width=\"320\" \/\u003E\u003C\/a\u003E\u003Cbr \/\u003E\"Rendered images are presented to a trained CNN and responses for different layers are studied with respect to the input scene factors.\"\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E20.\u0026nbsp;\u003Ca href=\"http:\/\/arxiv.org\/pdf\/1505.07427v3.pdf\"\u003EPoseNet: A Convolutional Network for Real-Time 6-DOF Camera Relocalization\u003C\/a\u003E Alex Kendall, Matthew Grimes, Roberto Cipolla\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ca href=\"http:\/\/4.bp.blogspot.com\/-NiHeDknbdOA\/VmedusLsKAI\/AAAAAAAAOak\/6X_qO6P196g\/s1600\/posenet.png\" imageanchor=\"1\" style=\"clear: left; float: left; margin-bottom: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"134\" src=\"https:\/\/4.bp.blogspot.com\/-NiHeDknbdOA\/VmedusLsKAI\/AAAAAAAAOak\/6X_qO6P196g\/s320\/posenet.png\" width=\"320\" \/\u003E\u003C\/a\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\"Our system trains a convolutional neural network to regress the 6-DOF camera pose from a single RGB image in an end-to-end manner 
with no need of additional engineering or graph optimisation.\"\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E21.\u0026nbsp;\u003Ca href=\"http:\/\/scott89.github.io\/FCNT\/\"\u003EVisual Tracking With Fully Convolutional Networks\u003C\/a\u003E Lijun Wang, Wanli Ouyang, Xiaogang Wang, Huchuan Lu\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ca href=\"http:\/\/2.bp.blogspot.com\/-fQD2V_1EUqM\/VmeeG1BNZkI\/AAAAAAAAOaw\/NcDLLrViaE8\/s1600\/fcnt.png\" imageanchor=\"1\" style=\"clear: right; float: right; margin-bottom: 1em; margin-left: 1em;\"\u003E\u003Cimg border=\"0\" height=\"90\" src=\"https:\/\/2.bp.blogspot.com\/-fQD2V_1EUqM\/VmeeG1BNZkI\/AAAAAAAAOaw\/NcDLLrViaE8\/s320\/fcnt.png\" width=\"320\" \/\u003E\u003C\/a\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\"A new approach for general object tracking with fully convolutional neural network.\"\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ch3\u003EConclusion\u003C\/h3\u003EWhile some can argue that the great convergence upon ConvNets is making the field less diverse, it is actually making the techniques easier to comprehend. It is easier to \"borrow breakthrough thinking\" from one research direction when the core computations are cast in the language of ConvNets. Using ConvNets, properly trained (and motivated!) 
21 year old graduate students are actually able to compete on benchmarks, where previously it would take an entire 6-year PhD cycle to compete on a non-trivial benchmark.\u003Cbr \/\u003E\u003Cbr \/\u003ESee you next week in Chile!\u003Cbr \/\u003E\u003Ch3\u003E\u003Cb\u003E\u003Cbr \/\u003E\u003C\/b\u003E\u003Cb\u003EUpdate (January 13th, 2016)\u003C\/b\u003E\u003C\/h3\u003E\u003Cdiv style=\"background-color: white; color: #333333; font-size: 18px; line-height: 23px; margin-bottom: 11.5px;\"\u003E\u003Cspan style=\"font-family: \u0026quot;times\u0026quot; , \u0026quot;times new roman\u0026quot; , serif;\"\u003EThe following awards were given at ICCV 2015.\u003C\/span\u003E\u003C\/div\u003E\u003Ch3 style=\"background-color: white; color: #333333; line-height: 34.5px; margin: 0px; text-rendering: optimizeLegibility;\"\u003E\u003Cspan style=\"font-family: \u0026quot;times\u0026quot; , \u0026quot;times new roman\u0026quot; , serif; font-size: small;\"\u003EAchievement awards\u003C\/span\u003E\u003C\/h3\u003E\u003Cul style=\"background-color: white; color: #333333; line-height: 23px; margin: 0px 0px 11.5px 25px; padding: 0px;\"\u003E\u003Cli\u003E\u003Cspan style=\"font-family: \u0026quot;times\u0026quot; , \u0026quot;times new roman\u0026quot; , serif;\"\u003EPAMI Distinguished Researcher Award (1):\u0026nbsp;\u003Cstrong\u003EYann LeCun\u003C\/strong\u003E\u003C\/span\u003E\u003C\/li\u003E\u003Cli\u003E\u003Cspan style=\"font-family: \u0026quot;times\u0026quot; , \u0026quot;times new roman\u0026quot; , serif;\"\u003EPAMI Distinguished Researcher Award (2):\u0026nbsp;\u003Cstrong\u003EDavid Lowe\u003C\/strong\u003E\u003C\/span\u003E\u003C\/li\u003E\u003Cli\u003E\u003Cspan style=\"font-family: \u0026quot;times\u0026quot; , \u0026quot;times new roman\u0026quot; , serif;\"\u003EPAMI Everingham Prize Winner (1):\u0026nbsp;\u003Cstrong\u003EAndrea Vedaldi\u003C\/strong\u003E\u0026nbsp;for\u0026nbsp;\u003Ca href=\"http:\/\/www.vlfeat.org\/\" style=\"color: #0088cc; 
text-decoration: none;\"\u003EVLFeat\u003C\/a\u003E\u003C\/span\u003E\u003C\/li\u003E\u003Cli\u003E\u003Cspan style=\"font-family: \u0026quot;times\u0026quot; , \u0026quot;times new roman\u0026quot; , serif;\"\u003EPAMI Everingham Prize Winner (2):\u0026nbsp;\u003Cstrong\u003EDaniel Scharstein\u003C\/strong\u003E\u0026nbsp;and\u0026nbsp;\u003Cstrong\u003ERick Szeliski\u0026nbsp;\u003C\/strong\u003Efor the\u0026nbsp;\u003Ca href=\"http:\/\/vision.middlebury.edu\/stereo\/data\/\" style=\"color: #0088cc; text-decoration: none;\"\u003EMiddlebury Datasets\u003C\/a\u003E\u003C\/span\u003E\u003C\/li\u003E\u003C\/ul\u003E\u003Ch2 style=\"background-color: white; color: #333333; line-height: 46px; margin: 0px; text-rendering: optimizeLegibility;\"\u003E\u003Cspan style=\"font-family: \u0026quot;times\u0026quot; , \u0026quot;times new roman\u0026quot; , serif; font-size: small;\"\u003EPaper awards\u003C\/span\u003E\u003C\/h2\u003E\u003Cul style=\"background-color: white; color: #333333; line-height: 23px; margin: 0px 0px 11.5px 25px; padding: 0px;\"\u003E\u003Cli\u003E\u003Cspan style=\"font-family: \u0026quot;times\u0026quot; , \u0026quot;times new roman\u0026quot; , serif;\"\u003EPAMI Helmholtz Prize (1):\u0026nbsp;\u003Cstrong\u003EDavid Martin\u003C\/strong\u003E,\u0026nbsp;\u003Cstrong\u003ECharles Fowlkes\u003C\/strong\u003E,\u0026nbsp;\u003Cstrong\u003EDoron Tal\u003C\/strong\u003E, and\u0026nbsp;\u003Cstrong\u003EJitendra Malik\u003C\/strong\u003E\u0026nbsp;for their ICCV 2001 paper \"A database of human segmented natural images and its application to evaluating segmentation algorithms and measuring ecological statistics\".\u003C\/span\u003E\u003C\/li\u003E\u003Cli\u003E\u003Cspan style=\"font-family: \u0026quot;times\u0026quot; , \u0026quot;times new roman\u0026quot; , serif;\"\u003EPAMI Helmholtz Prize (2):\u0026nbsp;\u003Cstrong\u003ESerge Belongie\u003C\/strong\u003E,\u0026nbsp;\u003Cstrong\u003EJitendra Malik\u003C\/strong\u003E, 
and\u0026nbsp;\u003Cstrong\u003EJan Puzicha\u003C\/strong\u003E, for their ICCV 2001 paper \"Matching Shapes\".\u003C\/span\u003E\u003C\/li\u003E\u003Cli\u003E\u003Cspan style=\"font-family: \u0026quot;times\u0026quot; , \u0026quot;times new roman\u0026quot; , serif;\"\u003EMarr Prize:\u0026nbsp;\u003Cstrong\u003EPeter Kontschieder\u003C\/strong\u003E,\u0026nbsp;\u003Cstrong\u003EMadalina Fiterau\u003C\/strong\u003E,\u0026nbsp;\u003Cstrong\u003EAntonio Criminisi\u003C\/strong\u003E, and\u0026nbsp;\u003Cstrong\u003ESamuel Rota Bulò\u003C\/strong\u003E, for\u0026nbsp;\u003Ca href=\"http:\/\/www.cv-foundation.org\/openaccess\/content_iccv_2015\/papers\/Kontschieder_Deep_Neural_Decision_ICCV_2015_paper.pdf\" style=\"color: #0088cc; text-decoration: none;\"\u003E\"Deep Neural Decision Forests\"\u003C\/a\u003E.\u003C\/span\u003E\u003C\/li\u003E\u003Cli\u003E\u003Cspan style=\"font-family: \u0026quot;times\u0026quot; , \u0026quot;times new roman\u0026quot; , serif;\"\u003EMarr Prize honorable mention:\u0026nbsp;\u003Cstrong\u003ESaining Xie\u003C\/strong\u003E\u0026nbsp;and\u0026nbsp;\u003Cstrong\u003EZhuowen Tu\u003C\/strong\u003E\u0026nbsp;for\u0026nbsp;\u003Ca href=\"http:\/\/www.cv-foundation.org\/openaccess\/content_iccv_2015\/papers\/Xie_Holistically-Nested_Edge_Detection_ICCV_2015_paper.pdf\" style=\"color: #0088cc; text-decoration: none;\"\u003E\"Holistically-Nested Edge Detection\"\u003C\/a\u003E.\u003C\/span\u003E\u003C\/li\u003E\u003C\/ul\u003E\u003Cdiv\u003E\u003Cspan style=\"color: #333333; font-family: \u0026quot;times\u0026quot; , \u0026quot;times new roman\u0026quot; , serif;\"\u003E\u003Cspan style=\"line-height: 23px;\"\u003EFor more information about awards, see \u003Ca href=\"http:\/\/www.nowozin.net\/sebastian\/blog\/iccv-2015-day-2.html\"\u003ESebastian Nowozin's ICCV-day-2 blog post\u003C\/a\u003E.\u003C\/span\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv\u003E\u003Cspan style=\"color: #333333; font-family: \u0026quot;times\u0026quot; , \u0026quot;times 
new roman\u0026quot; , serif;\"\u003E\u003Cspan style=\"line-height: 23px;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/span\u003E\u003C\/div\u003EI also wrote another ICCV-related blog post (January 13, 2016) about the \u003Ca href=\"http:\/\/www.computervisionblog.com\/2016\/01\/why-slam-matters-future-of-real-time.html\"\u003EFuture of Real-Time SLAM\u003C\/a\u003E."},"link":[{"rel":"replies","type":"application/atom+xml","href":"https:\/\/www.computervisionblog.com\/feeds\/3255634544070567159\/comments\/default","title":"Post Comments"},{"rel":"replies","type":"text/html","href":"https:\/\/www.computervisionblog.com\/2015\/12\/iccv-2015-twenty-one-hottest-research.html#comment-form","title":"7 Comments"},{"rel":"edit","type":"application/atom+xml","href":"https:\/\/www.blogger.com\/feeds\/15418143\/posts\/default\/3255634544070567159"},{"rel":"self","type":"application/atom+xml","href":"https:\/\/www.blogger.com\/feeds\/15418143\/posts\/default\/3255634544070567159"},{"rel":"alternate","type":"text/html","href":"https:\/\/www.computervisionblog.com\/2015\/12\/iccv-2015-twenty-one-hottest-research.html","title":"ICCV 2015: Twenty one hottest research papers"}],"author":[{"name":{"$t":"Tomasz 
Malisiewicz"},"uri":{"$t":"http:\/\/www.blogger.com\/profile\/17507234774392358321"},"email":{"$t":"noreply@blogger.com"},"gd$image":{"rel":"http://schemas.google.com/g/2005#thumbnail","width":"16","height":"16","src":"https:\/\/img1.blogblog.com\/img\/b16-rounded.gif"}}],"media$thumbnail":{"xmlns$media":"http://search.yahoo.com/mrss/","url":"https:\/\/2.bp.blogspot.com\/-OItMW6lzZ9A\/VmepmMJ2COI\/AAAAAAAAOa8\/MB1unZ7hr8Y\/s72-c\/onfire-01.png","height":"72","width":"72"},"thr$total":{"$t":"7"}},{"id":{"$t":"tag:blogger.com,1999:blog-15418143.post-3843217399785041099"},"published":{"$t":"2015-11-07T03:35:00.001-05:00"},"updated":{"$t":"2016-06-13T07:41:20.597-05:00"},"category":[{"scheme":"http://www.blogger.com/atom/ns#","term":"abbeel"},{"scheme":"http://www.blogger.com/atom/ns#","term":"clarifai"},{"scheme":"http://www.blogger.com/atom/ns#","term":"dato"},{"scheme":"http://www.blogger.com/atom/ns#","term":"deep learning"},{"scheme":"http://www.blogger.com/atom/ns#","term":"gold rush"},{"scheme":"http://www.blogger.com/atom/ns#","term":"google"},{"scheme":"http://www.blogger.com/atom/ns#","term":"guestrin"},{"scheme":"http://www.blogger.com/atom/ns#","term":"gupta"},{"scheme":"http://www.blogger.com/atom/ns#","term":"hinton"},{"scheme":"http://www.blogger.com/atom/ns#","term":"lecun"},{"scheme":"http://www.blogger.com/atom/ns#","term":"machine learning"},{"scheme":"http://www.blogger.com/atom/ns#","term":"metamind"},{"scheme":"http://www.blogger.com/atom/ns#","term":"Microsoft"},{"scheme":"http://www.blogger.com/atom/ns#","term":"saxena"},{"scheme":"http://www.blogger.com/atom/ns#","term":"silicon valley"},{"scheme":"http://www.blogger.com/atom/ns#","term":"socher"},{"scheme":"http://www.blogger.com/atom/ns#","term":"startups"},{"scheme":"http://www.blogger.com/atom/ns#","term":"toyota"},{"scheme":"http://www.blogger.com/atom/ns#","term":"yc"},{"scheme":"http://www.blogger.com/atom/ns#","term":"zeiler"}],"title":{"type":"text","$t":"The Deep Learning Gold Rush of 
2015"},"content":{"type":"html","$t":"In the last few decades, we have witnessed major technological innovations such as personal computers and the internet finally reach the mainstream. And with mobile devices and social networks on the rise, we're now more connected than ever. \u003Ci\u003ESo what's next? When is it coming? And how will it change our lives?\u003C\/i\u003E Today I'll tell you that the next big advance is well underway and it's being fueled by a recent technique in the field of\u0026nbsp;Artificial Intelligence\u003Cb\u003E \u003C\/b\u003Eknown as\u003Cb\u003E\u0026nbsp;Deep Learning\u003C\/b\u003E.\u003Cbr \/\u003E\u003Cdiv style=\"text-align: center;\"\u003E\u003Cb\u003E\u003Cbr class=\"Apple-interchange-newline\" \/\u003E\u003C\/b\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/2.bp.blogspot.com\/-KLOHUJmgbR4\/VjRs4tdbZ8I\/AAAAAAAAOU8\/QzvI87MUF18\/s1600\/california_gold_rush-01.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"256\" src=\"https:\/\/2.bp.blogspot.com\/-KLOHUJmgbR4\/VjRs4tdbZ8I\/AAAAAAAAOU8\/QzvI87MUF18\/s400\/california_gold_rush-01.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ci\u003EThe California Gold Rush of 2015 is all about Deep Learning.\u0026nbsp;\u003C\/i\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ci\u003EIt's everywhere, you just don't know how to look.\u003C\/i\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cbr \/\u003EAll of today's excitement in Artificial Intelligence and Machine Learning stems from ground-breaking results in 
speech and visual object recognition using Deep Learning[1]. These algorithms are being applied to all sorts of data, and the learned deep neural networks outperform traditional expert systems carefully designed by scientists and engineers. End-to-end learning of deep representations from raw data is now possible due to a handful of well-performing deep learning recipes (\u003Ca href=\"http:\/\/www.cs.nyu.edu\/~yann\/talks\/lecun-ranzato-icml2013.pdf\"\u003EConvNets\u003C\/a\u003E, \u003Ca href=\"https:\/\/www.cs.toronto.edu\/~hinton\/absps\/JMLRdropout.pdf\"\u003EDropout\u003C\/a\u003E, \u003Ca href=\"http:\/\/www.cs.toronto.edu\/~fritz\/absps\/reluICML.pdf\"\u003EReLUs\u003C\/a\u003E, \u003Ca href=\"http:\/\/deeplearning.cs.cmu.edu\/pdfs\/Hochreiter97_lstm.pdf\"\u003ELSTM\u003C\/a\u003E, \u003Ca href=\"https:\/\/www.cs.toronto.edu\/~vmnih\/docs\/dqn.pdf\"\u003EDQN\u003C\/a\u003E,\u0026nbsp;\u003Ca href=\"http:\/\/www.image-net.org\/\"\u003EImageNet\u003C\/a\u003E).\u0026nbsp;But if there's one final takeaway that we can extract from decades of machine learning research, it is that for many problems going deep isn't a choice, it's often a \u003Ci\u003Erequirement\u003C\/i\u003E.\u003Cbr \/\u003E\u003Cbr \/\u003EMost of the apps and services you're already using (AirBnB, Snapchat, Twitch.tv, Uber, Yelp, LinkedIn, etc) are quite data-hungry and before you know it, they're all going to go mega-deep. So whether you need to revitalize your data science team with deep learning or you're starting an AI-from-day-one operation, it's pretty clear that everybody is rushing to get some of this\u0026nbsp;\u003Cb\u003ESilicon Valley Gold.\u003C\/b\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ch3\u003EFrom Titans to Gold Miners: Your atypical Gold Rush\u003C\/h3\u003ELike all great gold rushes, this movement is led by new faces, which are pouring into Silicon Valley in droves. 
But these aren't your typical unskilled immigrants willing to pick up a hammer, nor your fresh computer science grads with some app-writing skills. The key deep learning players of today (known as the Titans of Deep Learning) are computer science professors and researchers (seldom born in the USA) leaving their academic posts and bringing their students and ideas straight into Silicon Valley.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv style=\"text-align: center;\"\u003E\"\u003Ci\u003ETurn on, Tune in, Dropout\u003C\/i\u003E\" -- Timothy Leary\u003C\/div\u003E\u003Cbr \/\u003ERecently, Google and Facebook announced that their operations are now being powered by Deep Learning [2,3]. And with most Deep Learning Titans representing the tech giants (\u003Ca href=\"http:\/\/yann.lecun.com\/\"\u003EYann LeCun\u003C\/a\u003E at Facebook Research, \u003Ca href=\"http:\/\/www.cs.toronto.edu\/~hinton\/\"\u003EGeoffrey Hinton\u003C\/a\u003E at Google, \u003Ca href=\"http:\/\/www.andrewng.org\/\"\u003EAndrew Ng\u003C\/a\u003E at Baidu), \u003Cb\u003EDeep Learning is likely to become one of the most sought after tech skills\u003C\/b\u003E. 
With\u0026nbsp;\u003Ca href=\"http:\/\/fortune.com\/2015\/11\/06\/toyota-ai-silicon-valley-robotics\/\"\u003EToyota to invest in $1 Billion in Robotics and Artificial Intelligence Research\u003C\/a\u003E\u0026nbsp;(November 6, 2015), the announcement of\u0026nbsp;\u003Ca href=\"http:\/\/blog.ycombinator.com\/yc-research\"\u003EYC Research\u003C\/a\u003E\u0026nbsp;(October 7, 2015), and the new\u0026nbsp;\u003Ca href=\"https:\/\/www.google.com\/about\/careers\/search#!t=jo\u0026amp;jid=147545001\u0026amp;\"\u003EGoogle Brain Residency Program\u003C\/a\u003E\u0026nbsp;\"Pre-doc\" AI jobs (October 26, 2015), Silicon Valley just got a whole lot more interesting.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ch3\u003E\u003Cb\u003ESilicon Valley re-defines itself, yet again\u0026nbsp;\u003C\/b\u003E\u003C\/h3\u003ETo understand why it took so long for Deep Learning to take-off, let's take a brief look at the key technologies which defined Silicon Valley over the last 50 years. \u0026nbsp;The following timeline gives an overview of where Silicon Valley has been and where it's going.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/2.bp.blogspot.com\/-8GD4qeDAvow\/VjRx4jVpD8I\/AAAAAAAAOVM\/59xpx5CH1jk\/s1600\/mi-01.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"255\" src=\"https:\/\/2.bp.blogspot.com\/-8GD4qeDAvow\/VjRx4jVpD8I\/AAAAAAAAOVM\/59xpx5CH1jk\/s400\/mi-01.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cspan style=\"font-size: xx-small;\"\u003EFigure Adapted from \u003Ca href=\"http:\/\/steveblank.com\/secret-history\/\"\u003ESteve Blank's Secret History of Silicon Valley\u003C\/a\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cb\u003E1970s: Semiconductors\u0026nbsp;\u003C\/b\u003E\u003Cbr \/\u003EThe 
story of the digital-era starts with semiconductors. \"Silicon\" in \"Silicon Valley\" originally referred to the silicon chip or integrated circuit innovations as well as the location (close to Stanford) of much tech-related activity. The dominant firm from that time period was Fairchild Semiconductor International and it eventually gave rise to more recognizable companies like Intel. For a more detailed discussion of this birthing era, take a look at Steve Blank's \u003Ca href=\"http:\/\/steveblank.com\/secret-history\/\"\u003ESecret History of Silicon Valley\u003C\/a\u003E[4].\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/tctechcrunch2011.files.wordpress.com\/2014\/07\/endeavor-insight-sv-2-retina.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" src=\"http:\/\/tctechcrunch2011.files.wordpress.com\/2014\/07\/endeavor-insight-sv-2-retina.png\" height=\"400\" width=\"395\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cspan style=\"font-size: x-small;\"\u003ERead more about Fairchild at\u0026nbsp;TechCrunch's \u003Ca href=\"http:\/\/techcrunch.com\/2014\/07\/26\/the-first-trillion-dollar-startup\/\"\u003EFirst Trillion-Dollar Startup\u0026nbsp;\u003C\/a\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cbr \/\u003E\u003Cb\u003E1980s: Personal Computers\u003C\/b\u003E\u003Cbr \/\u003EInitially computers were quite large and used solely by research labs, government, and big businesses. But it was the personal computer which turned computer programming from a hobby into a vital skill. You no longer needed to be an MIT student to program on one of these badboys. While both Microsoft and Apple were founded in 1975 and 1976, respectively, they persevered due to their pioneering work in graphical user interfaces. This was the birth of the modern user-friendly Operating System. 
IBM approached Microsoft in 1980, regarding its upcoming personal computer, and from then on Microsoft would be King for a very long time.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/www.mac-history.net\/wp-content\/uploads\/2011\/01\/PC_IBM_03_10_1983_BW_939.jpg\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" src=\"http:\/\/www.mac-history.net\/wp-content\/uploads\/2011\/01\/PC_IBM_03_10_1983_BW_939.jpg\" height=\"400\" width=\"298\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cspan style=\"font-size: x-small;\"\u003ESee Mac-history's article on \u003Ca href=\"http:\/\/www.mac-history.net\/apple\/2011-01-30\/microsofts-relationship-with-apple\"\u003EMicrosoft's relationship with Apple\u003C\/a\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cb\u003E1990s: Internet\u003C\/b\u003E\u003Cbr \/\u003EWhile the nerds at Universities were posting ascii messages on newsgroups in the 90s, service providers in the 1990s like AOL helped make the internet accessible to everyone. Remember getting all those AOL disks in the mail? Buying a chunk of digital real estate (your own domain name) became possible and anybody with a dial up connection and some primitive text\/HTML skills could start posting online content. 
With a mission statement like \"organize the world's information\", it was eventually Google that got the most out of the late 90s dot-com bubble, and remains a very strong player in all things tech.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cb\u003E2000s:\u0026nbsp;\u003C\/b\u003E\u003Cb\u003EMobile\u003C\/b\u003E\u003Cb\u003E\u0026nbsp;and Social\u003C\/b\u003E\u003Cbr \/\u003EWhile the dot-com bubble was about creating an online presence for startups and established companies, the way we use the internet has dramatically changed since 2001. A ton of new social communities have emerged, and due to Facebook we're now stars in our own reality show. Social and advertising have essentially turned the modern internet into a mainstream TV-like experience. The internet is no longer only for the nerds. The kings of this era (Google and Facebook) are also the biggest players in the Deep Learning space, because they have the largest user bases and in-house apps which can benefit most from machine learning.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cb\u003E2010-2015: Deep Learning comes to the party\u003C\/b\u003E\u003Cbr \/\u003E\u003Cspan style=\"text-align: center;\"\u003ESpend more than a day in Silicon Valley and you'll hear the popular expression, \"Software is eating the world.\"\u0026nbsp;\u003C\/span\u003E\u003Cspan style=\"text-align: center;\"\u003ERampant spreading of software was only possible once the internet (1990s) AND mobile devices (2000s) became essential parts of our lives. 
No longer do we physically mail floppy disks, and social media fuels any app that goes viral.\u0026nbsp;\u003C\/span\u003E\u003Cspan style=\"text-align: center;\"\u003E\u003Cspan style=\"text-align: start;\"\u003EWhat traditional software is missing (or has been missing up until now) is the ability to improve over time from everyday use.\u0026nbsp;\u003C\/span\u003E\u003C\/span\u003E\u003Cspan style=\"text-align: center;\"\u003EIf that same software is able to connect to a large Deep Learning system and start improving, then we have a game-changer on our hands. This is already happening with online advertising, digital assistants like Siri, and smart auto-responders like \u003Ca href=\"http:\/\/googleresearch.blogspot.com\/2015\/11\/computer-respond-to-this-email.html\"\u003EGoogle's new email auto-reply feature\u003C\/a\u003E.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cspan style=\"text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/4.bp.blogspot.com\/-dJHGkYU-o9M\/VjSBqTFLu1I\/AAAAAAAAOVc\/LydkP3tNOZ8\/s1600\/single_layer.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"182\" src=\"https:\/\/4.bp.blogspot.com\/-dJHGkYU-o9M\/VjSBqTFLu1I\/AAAAAAAAOVc\/LydkP3tNOZ8\/s400\/single_layer.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"\" style=\"clear: both; text-align: center;\"\u003E\u003Cspan style=\"font-size: x-small;\"\u003EThe hierarchical award-winning\u0026nbsp;\u003Ci\u003E\"AlexNet\"\u003C\/i\u003E\u0026nbsp;Deep Learning architecture\u0026nbsp;\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"\" style=\"clear: both; text-align: center;\"\u003E\u003Cspan style=\"font-size: x-small;\"\u003EVisualized using MIT's Toolbox for\u0026nbsp;\u003Ca href=\"http:\/\/vision03.csail.mit.edu\/cnn_art\/index.html\"\u003EDeep Learning Neuron 
Visualization\u003C\/a\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cbr \/\u003EMassive hiring of deep learning experts by the leading tech companies has only begun, but we also should be on the lookout for new ventures built on top of Deep Learning, not just a revitalization of last decade's successes. On this front, keep a close look at the following\u0026nbsp;\u003Ci\u003EDeep Learning Cloud Service\u003C\/i\u003E\u0026nbsp;upstarts:\u0026nbsp;\u003Ca href=\"http:\/\/www.socher.org\/\"\u003ERichard Socher\u003C\/a\u003E\u0026nbsp;from\u0026nbsp;\u003Ca href=\"http:\/\/metamind.io\/\"\u003EMetaMind\u003C\/a\u003E,\u0026nbsp;\u003Ca href=\"http:\/\/www.matthewzeiler.com\/\"\u003EMatthew Zeiler\u003C\/a\u003E\u0026nbsp;from\u0026nbsp;\u003Ca href=\"http:\/\/clarifai.com\/\"\u003EClarifai\u003C\/a\u003E, and \u003Ca href=\"http:\/\/homes.cs.washington.edu\/~guestrin\/\"\u003ECarlos Guestrin\u003C\/a\u003E from \u003Ca href=\"https:\/\/dato.com\/\"\u003EDato\u003C\/a\u003E.\u003Cbr \/\u003E\u003Ci\u003E\u003Cbr \/\u003E\u003C\/i\u003E\u003Cb\u003E2015-2020: Deep Learning Revitalizes Robotics\u003C\/b\u003E\u003Cbr \/\u003ERecently it has been shown that Deep Learning can be used to help robots learn tasks involving movement, object manipulation, and decision making[6,7,8,9]. Before Deep Learning, lots of different pieces of robotic software and hardware would have to be developed independently and then hacked together for demo day. 
Today, you can use one of a handful of \"Deep Learning for Robotics recipes\" and start watching your robot learn the task you care about.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/www.technologyreview.com\/sites\/default\/files\/images\/Baxter%20grip.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" src=\"http:\/\/www.technologyreview.com\/sites\/default\/files\/images\/Baxter%20grip.png\" height=\"320\" width=\"257\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cspan style=\"text-align: start;\"\u003ERobots Learns to Grasp using Deep Learning at Carnegie Mellon University.\u0026nbsp;\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cspan style=\"text-align: start;\"\u003E\u003Ca href=\"http:\/\/arxiv.org\/abs\/1509.06825\"\u003ESupersizing Self-supervision: Learning to Grasp from 50K Tries and 700 Robot Hours\u003C\/a\u003E\u0026nbsp;\u003C\/span\u003E[6]\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cspan style=\"text-align: start;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003EWith their 2013 acquisition of Boston Dynamics (a hardware play), 2014 acquisition of DeepMind (a software play), and a serious autonomous car play, Google is definitely early to the Robotics party. But the noteworthy bits are happening at the intersection of deep learning and robotics. 
\u0026nbsp;I suggest taking a closer look at the Robotics research of \u003Ca href=\"http:\/\/www.cs.berkeley.edu\/~pabbeel\/\"\u003EPieter Abbeel\u003C\/a\u003E of Berkeley, \u003Ca href=\"http:\/\/www.cs.cmu.edu\/~abhinavg\/\"\u003EAbhinav Gupta\u003C\/a\u003E of Carnegie Mellon, and \u003Ca href=\"http:\/\/www.cs.stanford.edu\/people\/asaxena\/index.html\"\u003EAshutosh Saxena\u003C\/a\u003E of Stanford -- all likely stars in the next\u003Ci\u003E Deep Learning for Robotics\u003C\/i\u003E race. As long as \u003Ca href=\"http:\/\/people.csail.mit.edu\/brooks\/\"\u003ERodney Brooks\u003C\/a\u003E keeps creating innovative Robotics platforms like Baxter, my expectations for Robotics are off the charts.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ch3\u003EConclusion\u003C\/h3\u003EUnlike in 1849, the Deep Learning Gold Rush of 2015 is \u003Ci\u003Enot\u003C\/i\u003E going to bring some 300,000 gold-seekers in boats to California's mainland. This isn't a bring-your-own-hammer kind of game -- the Titans have already descended from their Ivory Towers and handed us ample mining tools. But it won't hurt to gain some experience with traditional \"shallow\" machine learning techniques so you can appreciate the power of Deep Learning.\u003Cbr \/\u003E\u003Cbr \/\u003EI hope you enjoyed today's read and have a better sense of how Silicon Valley is undergoing a transformation. And remember, today's wave of Deep Learning upstart CEOs have PhDs, but once Deep Learning software becomes more user-friendly (TensorFlow?), maybe\u0026nbsp;you won't have to wait so long to \u003Ci\u003Edropout\u003C\/i\u003E.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cb\u003EReferences\u003C\/b\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cspan style=\"font-size: x-small;\"\u003E[1] Krizhevsky, A., Sutskever, I. and Hinton, G. 
E.\u0026nbsp;\u003Ca href=\"http:\/\/papers.nips.cc\/paper\/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf\"\u003EImageNet Classification with Deep Convolutional Neural Networks.\u003C\/a\u003E\u0026nbsp;In NIPS 2012.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan style=\"font-size: x-small;\"\u003E[2] D'Onfro, J.\u0026nbsp;\u003Ca href=\"http:\/\/www.businessinsider.com\/google-on-machine-learning-2015-10\"\u003EGoogle is 're-thinking' all of its products to include machine learning.\u003C\/a\u003E\u0026nbsp;Business Insider. October 22, 2015.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan style=\"font-size: x-small;\"\u003E[3] D'Onfro, J.\u0026nbsp;\u003Ca href=\"http:\/\/how%20facebook%20will%20use%20artificial%20intelligence%20to%20organize%20insane%20amounts%20of%20data%20into%20the%20perfect%20news%20feed%20and%20a%20personal%20assistant%20with%20superpowers\/\"\u003EHow Facebook will use artificial intelligence to organize insane amounts of data into the perfect News Feed and a personal assistant with superpowers\u003C\/a\u003E. Business Insider. November 3, 2015.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan style=\"font-size: x-small;\"\u003E[4] Blank, S.\u0026nbsp;\u003Ca href=\"http:\/\/steveblank.com\/secret-history\/\"\u003ESecret History of Silicon Valley\u003C\/a\u003E. 2008.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan style=\"font-size: x-small;\"\u003E[5]\u0026nbsp;Donglai Wei, Bolei Zhou, Antonio Torralba, William T. Freeman. \u003Ca href=\"http:\/\/vision03.csail.mit.edu\/cnn_art\/index.html\"\u003EmNeuron: A Matlab Plugin to Visualize Neurons from Deep Models\u003C\/a\u003E. 2015.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan style=\"font-size: x-small;\"\u003E[6] Lerrel Pinto, Abhinav Gupta. \u003Ca href=\"http:\/\/arxiv.org\/abs\/1509.06825\"\u003ESupersizing Self-supervision: Learning to Grasp from 50K Tries and 700 Robot Hours\u003C\/a\u003E. arXiv. 
2015.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan style=\"font-size: x-small;\"\u003E[7] Sergey Levine, Chelsea Finn, Trevor Darrell, Pieter Abbeel.\u0026nbsp;\u003Ca href=\"http:\/\/rll.berkeley.edu\/RSS2015-BlueSky-Shakey\/Levine-ShakeyWS-2015.pdf\"\u003EEnd-to-End Training of Deep Visuomotor Policies\u003C\/a\u003E. In RSS 2015.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan style=\"font-size: x-small;\"\u003E[8]\u0026nbsp;Mnih, Volodymyr, et al. \"\u003Ca href=\"http:\/\/www.nature.com\/nature\/journal\/v518\/n7540\/full\/nature14236.html\"\u003EHuman-level control through deep reinforcement learning\u003C\/a\u003E.\" Nature 518.7540 (2015): 529-533.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan style=\"font-size: x-small;\"\u003E[9] Ian Lenz, Ross Knepper, and Ashutosh Saxena. \u003Ca href=\"http:\/\/deepmpc.cs.cornell.edu\/\"\u003EDeepMPC: Learning Deep Latent Features for Model Predictive Control\u003C\/a\u003E. \u0026nbsp;In Robotics Science and Systems (RSS), 2015\u003C\/span\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E"},"link":[{"rel":"replies","type":"application/atom+xml","href":"https:\/\/www.computervisionblog.com\/feeds\/3843217399785041099\/comments\/default","title":"Post Comments"},{"rel":"replies","type":"text/html","href":"https:\/\/www.computervisionblog.com\/2015\/11\/the-deep-learning-gold-rush-of-2015.html#comment-form","title":"4 Comments"},{"rel":"edit","type":"application/atom+xml","href":"https:\/\/www.blogger.com\/feeds\/15418143\/posts\/default\/3843217399785041099"},{"rel":"self","type":"application/atom+xml","href":"https:\/\/www.blogger.com\/feeds\/15418143\/posts\/default\/3843217399785041099"},{"rel":"alternate","type":"text/html","href":"https:\/\/www.computervisionblog.com\/2015\/11\/the-deep-learning-gold-rush-of-2015.html","title":"The Deep Learning Gold Rush of 
2015"}],"author":[{"name":{"$t":"Tomasz Malisiewicz"},"uri":{"$t":"http:\/\/www.blogger.com\/profile\/17507234774392358321"},"email":{"$t":"noreply@blogger.com"},"gd$image":{"rel":"http://schemas.google.com/g/2005#thumbnail","width":"16","height":"16","src":"https:\/\/img1.blogblog.com\/img\/b16-rounded.gif"}}],"media$thumbnail":{"xmlns$media":"http://search.yahoo.com/mrss/","url":"https:\/\/2.bp.blogspot.com\/-KLOHUJmgbR4\/VjRs4tdbZ8I\/AAAAAAAAOU8\/QzvI87MUF18\/s72-c\/california_gold_rush-01.png","height":"72","width":"72"},"thr$total":{"$t":"4"}},{"id":{"$t":"tag:blogger.com,1999:blog-15418143.post-3339709172492766899"},"published":{"$t":"2015-06-26T21:45:00.000-05:00"},"updated":{"$t":"2016-06-14T04:58:40.614-05:00"},"category":[{"scheme":"http://www.blogger.com/atom/ns#","term":"abhinav gupta"},{"scheme":"http://www.blogger.com/atom/ns#","term":"antonio torralba"},{"scheme":"http://www.blogger.com/atom/ns#","term":"bolei zhou"},{"scheme":"http://www.blogger.com/atom/ns#","term":"CNNs"},{"scheme":"http://www.blogger.com/atom/ns#","term":"ConvNets"},{"scheme":"http://www.blogger.com/atom/ns#","term":"CVPR 2015"},{"scheme":"http://www.blogger.com/atom/ns#","term":"deep learning"},{"scheme":"http://www.blogger.com/atom/ns#","term":"deepmind"},{"scheme":"http://www.blogger.com/atom/ns#","term":"facebook"},{"scheme":"http://www.blogger.com/atom/ns#","term":"geoff hinton"},{"scheme":"http://www.blogger.com/atom/ns#","term":"google"},{"scheme":"http://www.blogger.com/atom/ns#","term":"hypercolumns"},{"scheme":"http://www.blogger.com/atom/ns#","term":"josh tenenbaum"},{"scheme":"http://www.blogger.com/atom/ns#","term":"karpathy"},{"scheme":"http://www.blogger.com/atom/ns#","term":"MIT"},{"scheme":"http://www.blogger.com/atom/ns#","term":"trevor darrell"},{"scheme":"http://www.blogger.com/atom/ns#","term":"yann lecun"}],"title":{"type":"text","$t":"Deep down the rabbit hole: CVPR 2015 and beyond"},"content":{"type":"html","$t":"\u003Cdiv class=\"p1\"\u003ECVPR is the 
premier Computer Vision conference, and it's fair to think of it as \u003Cb\u003Ethe Olympics of Computer Vision Research.\u003C\/b\u003E This year it was held in my own back yard -- less than a mile away from lovely Cambridge, MA! \u0026nbsp;Plenty of my MIT colleagues attended, but I wouldn't be surprised if \u003Ca href=\"http:\/\/googleresearch.blogspot.com\/2015\/06\/google-computer-vision-research-at-cvpr.html\"\u003EGoogle\u003C\/a\u003E had the largest showing at CVPR 2015. I have been going to CVPR almost every year since 2004, so let's take a brief tour at what's new in the exciting world of computer vision research.\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"https:\/\/1.bp.blogspot.com\/-7MdQJ4NJlyo\/V1rAnoDGOPI\/AAAAAAAAOsk\/IgVsXeuTMywDMrrrsvnTPSSa-Pq6yIjAgCLcB\/s1600\/rabbit_hole.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"278\" src=\"https:\/\/1.bp.blogspot.com\/-7MdQJ4NJlyo\/V1rAnoDGOPI\/AAAAAAAAOsk\/IgVsXeuTMywDMrrrsvnTPSSa-Pq6yIjAgCLcB\/s400\/rabbit_hole.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/frostyshadows.deviantart.com\/art\/Down-the-Rabbit-Hole-358090601\"\u003EDown the rabbit hole art by frostyshadows\u003C\/a\u003E\u003C\/div\u003E\u003Cbr \/\u003E\u003Cbr \/\u003EA lot has changed. Nothing has changed. Academics used to be on top, defending their Universities and the awesomeness happening inside their non-industrial research labs. Academics are still on top, but now defending their Google, Facebook, Amazon, and Company X affiliations. 
And with the hiring budget to acquire the best and a heavy publishing-oriented culture, don't be surprised if the massive academia exodus continues for years to come. It's only been two weeks since CVPR, and Google has since then been busy making \u003Ca href=\"http:\/\/googleresearch.blogspot.be\/2015\/06\/inceptionism-going-deeper-into-neural.html\"\u003EConvNet art\u003C\/a\u003E, showing the world that if you want to do the best Deep Learning research, they are King.\u003Cbr \/\u003E\u003Cbr \/\u003EAn army of PhD students and Postdocs simply cannot defeat an army of Software Engineers and Research Scientists. Back in the day, students used to typically depart after a Computer Vision PhD (there used to be few vision research jobs and Wall Street jobs were tempting). Now the former PhD students run research labs at big companies which have been feverishly getting into vision. It seems there aren't enough deep experts to fill the deep demand.\u003Cbr \/\u003E\u003Cbr \/\u003EDatasets used to be the big thing -- please download my data!\u0026nbsp; Datasets are still the big thing -- but we regret to inform you that your university’s computational resources won’t make the cut (but at Company X we’re always hiring, so come join us, and help push the frontier of research together).\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cb\u003ERelated Article:\u003C\/b\u003E \u003Ca href=\"https:\/\/research.facebook.com\/ai\"\u003EUnder LeCun's Leadership, Facebook's AI Research Lab is beefing up their research presence\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"p2\"\u003E\u003Cspan class=\"s1\"\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"p2\"\u003E\u003Cspan class=\"s1\"\u003E\u003C\/span\u003E\u003Cbr \/\u003EIf you want to check out the individual papers, I recommend Andrej Karpathy's\u0026nbsp;\u003Ca href=\"http:\/\/cs.stanford.edu\/people\/karpathy\/cvpr2015papers\/\"\u003Eonline navigation tool for CVPR 2015 papers\u003C\/a\u003E\u0026nbsp;or take a look 
at the vanilla listing of \u003Ca href=\"http:\/\/www.cv-foundation.org\/openaccess\/CVPR2015.py\"\u003ECVPR 2015 papers on the CV foundation website\u003C\/a\u003E.\u0026nbsp;\u003Ca href=\"http:\/\/web.mit.edu\/zoya\/www\/\"\u003EZoya Bylinskii,\u003C\/a\u003E an MIT PhD Candidate, also put together \u003Ca href=\"http:\/\/zoyathinks.blogspot.com\/2015\/06\/cvpr-recap-and-where-were-going.html\"\u003Ea list of interesting CVPR 2015 papers\u003C\/a\u003E.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"p1\"\u003E\u003Ch3\u003E\u003Cb\u003EThe ConvNet Revolution: There's a pre-trained network for that\u003C\/b\u003E\u003C\/h3\u003E\u003Cspan class=\"s1\"\u003EMachine Learning used to be the Queen. Machine Learning is now the King. Machine Learning used to be shallow, but today's learning approaches are \u003Cb\u003Eso deep that the diagrams barely fit on a single slide\u003C\/b\u003E. Grad students used to pass around jokes about Yann LeCun and his insistence that machine learning will one day do the work of the feature engineering stage. Now it seems that the entire vision community gets to ignore you when you insist that “manual feature engineering” is going to save the day. 
\u003Ca href=\"http:\/\/yann.lecun.com\/\"\u003EYann LeCun\u003C\/a\u003E gave a keynote presentation with the intriguing title \"\u003Ca href=\"https:\/\/drive.google.com\/file\/d\/0BxKBnD5y2M8NVHRiVXBnOVpiYUk\/view\"\u003EWhat's wrong with Deep Learning\u003C\/a\u003E\" and it seems that Convolutional Neural Networks (also called CNNs or ConvNets) are everywhere at CVPR.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan class=\"s1\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/3.bp.blogspot.com\/-M5JZccnj1Ec\/VY3zTmGV8YI\/AAAAAAAAOP4\/q0c-QCtmi2o\/s1600\/convnet.jpeg\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"191\" src=\"https:\/\/3.bp.blogspot.com\/-M5JZccnj1Ec\/VY3zTmGV8YI\/AAAAAAAAOP4\/q0c-QCtmi2o\/s400\/convnet.jpeg\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/cs231n.github.io\/convolutional-networks\/\"\u003EFigure from Karpathy's Convolutional Neural Network tutorial\u003C\/a\u003E\u003C\/div\u003E\u003Cspan class=\"s1\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003Cspan class=\"s1\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003Cspan class=\"s1\"\u003EIt used to be hard to publish ConvNet research papers at CVPR, it's now hard to get a CVPR paper if you didn't at least compare against a ConvNet baseline.\u0026nbsp;\u003C\/span\u003E\u003Cb\u003EGot a new cool problem? Oooh, you didn’t try a ConvNet-based baseline? Well, that explains why nobody cares.\u003C\/b\u003E\u003Cbr \/\u003E\u003Cspan class=\"s1\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003Cspan class=\"s1\"\u003EBut it's not like the machines are taking over the job of the vision scientist. 
Today's vision scientist is much more of an applied machine learning hacker than anything else, and because of the strong CNN theme, it is much easier to understand and re-implement today's vision systems. What we're seeing at CVPR is essentially a revisiting of the classic problems like segmentation and motion, using this new machinery. As \u003Ca href=\"http:\/\/people.csail.mit.edu\/samson\/\"\u003ESamson Timoner\u003C\/a\u003E phrased it at the local \u003Ca href=\"http:\/\/www.meetup.com\/bostonimagevision\/\"\u003EBoston Vision Meetup\u003C\/a\u003E, when \u003Ca href=\"https:\/\/en.wikipedia.org\/wiki\/Mutual_information\"\u003EMutual Information\u003C\/a\u003E was popular, the community jumped on that bandwagon -- it's ConvNets this time around. But it's not just a trend, the non-CNN competition is getting crushed.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan class=\"s1\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/3.bp.blogspot.com\/-PMO1RTHdFc0\/VY30k5m5PnI\/AAAAAAAAOQE\/IVxRFXbVSzg\/s1600\/hypercolumns.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"210\" src=\"https:\/\/3.bp.blogspot.com\/-PMO1RTHdFc0\/VY30k5m5PnI\/AAAAAAAAOQE\/IVxRFXbVSzg\/s320\/hypercolumns.png\" width=\"320\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003EFigure from \u003Ca href=\"http:\/\/www.cs.berkeley.edu\/~bharath2\/\"\u003EBharath Hariharan\u003C\/a\u003E's\u0026nbsp;\u003Ca href=\"http:\/\/www.cs.berkeley.edu\/~bharath2\/pubs\/pdfs\/BharathCVPR2015.pdf\"\u003EHypercolumns\u003C\/a\u003E\u0026nbsp;CVPR 2015 paper on segmentation using CNNs\u003C\/div\u003E\u003Cspan class=\"s1\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003Cspan class=\"s1\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003Cspan class=\"s1\"\u003EThere's still plenty to be 
done by a vision scientist, and a solid formal education in mathematics is more important than ever. We used to train via gradient descent. We still train via gradient descent. We used to drink Coffee, now we all drink Caffe. But behind the scenes, it is still mathematics.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cb\u003ERelated Page:\u003C\/b\u003E \u003Ca href=\"https:\/\/github.com\/BVLC\/caffe\/wiki\/Model-Zoo\"\u003ECaffe Model Zoo where you can download lots of pretrained ConvNets\u003C\/a\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ch3\u003EDeep down the rabbit hole\u003C\/h3\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"p2\"\u003E\u003Cspan class=\"s1\"\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"p2\"\u003E\u003Cspan class=\"s1\"\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"p1\"\u003E\u003Cspan class=\"s1\"\u003ECVPR 2015 reminds of the pre-Newtonian days of physics. A lot of smart scientists were able to predict the motions of objects using mathematics once the ingenious \u003Ca href=\"https:\/\/en.wikipedia.org\/wiki\/Ren%C3%A9_Descartes\"\u003EDescartes\u003C\/a\u003E taught us how to embed our physical thinking into a coordinate system. And it's pretty clear that by casting your computer vision problem in the language of ConvNets, you are going to beat just about anybody doing computer vision by hand. \u003Cb\u003EI think of \u003Ca href=\"http:\/\/yann.lecun.com\/\"\u003EYann LeCun\u003C\/a\u003E (one of the fathers of Deep Learning) as a modern day Descartes, \u003C\/b\u003Eonly because I think the ground-breaking work is right around the corner. 
His mental framework of ConvNets is like a much needed coordinate system -- we might not know what the destination looks like, but we now know how to build a map.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan class=\"s1\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003Cspan class=\"s1\"\u003EDeep Networks are performing better every month, but I’m still waiting for Isaac to come in and make our lives even easier. I want a simplification. But I'm not being pessimistic -- there is a flurry of activity in the ConvNet space for a very good reason (in case you didn't get to attend CVPR 2015), so I'll just be blunt:\u0026nbsp;\u003Cb\u003EConvNets fuckin' work\u003C\/b\u003E! I just want the F=ma of deep learning.\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"p1\"\u003E\u003Cspan class=\"s1\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/1.bp.blogspot.com\/-Lu7umrplafc\/VYsIdA-y9EI\/AAAAAAAAOOU\/S-a6qd0ruMU\/s1600\/yann_descartes-01.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"166\" src=\"https:\/\/1.bp.blogspot.com\/-Lu7umrplafc\/VYsIdA-y9EI\/AAAAAAAAOOU\/S-a6qd0ruMU\/s400\/yann_descartes-01.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"p1\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Ch3\u003EOpen Source Deep Learning for Computer Vision: Torch vs Caffe\u003C\/h3\u003E\u003Cdiv class=\"p1\"\u003ECVPR 2015 started off with some excellent software tutorials on day one. \u0026nbsp;There is some great non-alpha deep learning software out there and it has been making everybody's life easier. \u0026nbsp;At CVPR, we had both a\u0026nbsp;\u003Ca href=\"http:\/\/torch.ch\/docs\/cvpr15.html\"\u003ETorch tutorial\u003C\/a\u003E and a Caffe tutorial. 
\u0026nbsp;I attended the \u003Ca href=\"http:\/\/tutorial.caffe.berkeleyvision.org\/\"\u003EDIY Deep Learning Caffe tutorial\u003C\/a\u003E and it was a full house -- standing room only for slackers like me who join the party only 5 minutes before it starts. Caffe is much more popular that Torch, but when talking to some power users of Deep Learning (like \u003Ca class=\"g-profile\" href=\"https:\/\/plus.google.com\/100209651993563042175\" target=\"_blank\"\u003E+Andrej Karpathy\u003C\/a\u003E\u0026nbsp;and other DeepMind scientists), a certain group of experts seems to be migrating from Caffe to Torch.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/3.bp.blogspot.com\/-YIql8zIlTNc\/VY3y8q-pmPI\/AAAAAAAAOPw\/HS78ACtebo4\/s1600\/torch_vs_caffe-01.jpg\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"160\" src=\"https:\/\/3.bp.blogspot.com\/-YIql8zIlTNc\/VY3y8q-pmPI\/AAAAAAAAOPw\/HS78ACtebo4\/s320\/torch_vs_caffe-01.jpg\" width=\"320\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cbr \/\u003E\u003Cbr \/\u003ECaffe is developed at Berkeley, has a vibrant community, Python bindings, and seems to be quite popular among University students. \u003Ca href=\"http:\/\/www.eecs.berkeley.edu\/~trevor\/\"\u003EProf. Trevor Darrell\u003C\/a\u003E\u0026nbsp;at Berkeley is even looking for a Postdoc to help the Caffe effort. If I was a couple of years younger and a fresh PhD, I would definitely apply.\u003Cbr \/\u003E\u003Cbr \/\u003EInstead of following the Python trend, Torch is Lua-based. There is no need for an interpreter like Matlab or Python -- Lua gives you the magic console. Torch is heavily used by Facebook AI Research Labs and Google's DeepMind Lab in London. \u0026nbsp;For those afraid of new languages like Lua, don't worry -- Lua is going to feel \"easy\" if you've dabbled in Python, Javascript, or Matlab. 
And if you don't like editing protocol buffer files by hand, definitely check out Torch.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cb\u003EIt's starting to become clear that the future power of deep learning is going to come with its own self-contained software package like Caffe or Torch, and not from a dying breed of all-around\u0026nbsp;tool-belts\u0026nbsp;like OpenCV or Matlab.\u003C\/b\u003E\u0026nbsp;When you share creations made in OpenCV, you end up sharing source files, but with the Deep Learning toolkits, you end up sharing your pre-trained networks. \u0026nbsp;No longer do you have to think about a combination of 20 \"little\" algorithms for your computer vision pipeline -- you just think about which popular network architecture you want, and then the dataset. \u0026nbsp;If you have the GPUs and ample data, you can do full end-to-end training. \u0026nbsp;And if your dataset is small\/medium, you can fine-tune the last few layers. You can even train a linear classifier on top of the final layer, if you're afraid of getting your hands dirty -- just doing that will beat the SIFTs, the HOGs, the GISTs, and all that was celebrated in the past two decades of computer vision.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cb\u003ERelated Article:\u003C\/b\u003E \u003Ca href=\"http:\/\/fastml.com\/torch-vs-theano\/\"\u003ETorch vs Theano on fastml.com\u003C\/a\u003E\u003Cbr \/\u003E\u003Cb\u003ERelated Code:\u003C\/b\u003E \u003Ca href=\"http:\/\/www.vlfeat.org\/matconvnet\/\"\u003EAndrea Vedaldi's MatConvNet Deep Learning Library for MATLAB users\u003C\/a\u003E\u003Cbr \/\u003E\u003Cbr \/\u003EThe way in which ConvNets are being used at CVPR 2015 makes me feel like we're close to something big. \u0026nbsp;But before we strike gold, ConvNets still feel like a Calculus of Shadows, merely \"hoping\" to get at something bigger, something deeper, and something more meaningful. 
I think the flurry of research which investigates visualization algorithms for ConvNets suggests that even the network architects aren't completely sure what is happening behind the scenes.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ch3\u003EThe Video Game Engine Inside Your Head: A different path towards Machine Intelligence\u003C\/h3\u003E\u003Cbr \/\u003E\u003Ca href=\"http:\/\/web.mit.edu\/cocosci\/josh.html\"\u003EJosh Tenenbaum\u003C\/a\u003E gave an invited talk titled The Video Game Engine Inside Your Head at the \u003Ca href=\"http:\/\/sunw.csail.mit.edu\/\"\u003EScene Understanding Workshop\u003C\/a\u003E on the last day of the CVPR 2015 conference. You can read a summary of his ideas in a \u003Ca href=\"http:\/\/www.scientificamerican.com\/article\/the-video-game-engine-in-your-head\/\"\u003Eshort Scientific American article\u003C\/a\u003E. While his talk might appear to be unconventional by CVPR standards, it is classic Tenenbaum. In his world, there is no benchmark to beat, no curves to fit to shadows, and if you allow my LeCun-Descartes analogy, then in some sense Prof. Tenenbaum might be the modern day Aristotle. As \u003Ca href=\"http:\/\/vision.princeton.edu\/people\/xj\/\"\u003EProf. Jianxiong Xiao\u003C\/a\u003E\u0026nbsp;introduced Josh with a grand intro, he was probably right -- this is one of the most intelligent speakers you can find. \u0026nbsp;He speaks 100 words a second, you can't help but feel your brain enlarge as you listen.\u003Cbr \/\u003E\u003Cbr \/\u003EOne of Josh's main research themes is going beyond the shadows of image-based recognition. \u0026nbsp;Josh's work is all about building mental models of the world, and his work can really be thought of as analysis-by-synthesis. Inside his models is something like a video game engine, and he showed lots of compelling examples of inferences that are easy for people, but nearly impossible for the data-driven ConvNets of today. 
\u0026nbsp;It's not surprising that his student is working at Google's DeepMind this summer.\u003Cbr \/\u003E\u003Cbr \/\u003EA couple of years ago, \u003Ca href=\"http:\/\/pgm.stanford.edu\/\"\u003EProbabilistic Graphical Models\u003C\/a\u003E (the marriage of Graph Theory and Probabilistic Methods) used to be all the rage. \u0026nbsp;Josh gave us a taste of \u003Cb\u003EProbabilistic Programming\u003C\/b\u003E, and while we're not yet seeing these new methods dominate the world of computer vision research, keep your eyes open. He mentioned a recent Nature paper (citation below) from another well respected machine intelligence research, which should keep the trendsetters excited for quite some time. Just take a look at the bad-ass looking \u003Ca href=\"http:\/\/julialang.org\/\"\u003EJulia\u003C\/a\u003E code below:\u003Cbr \/\u003E\u003Cbr \/\u003EProbabilistic machine learning and artificial intelligence. \u003Ca href=\"http:\/\/mlg.eng.cam.ac.uk\/zoubin\/\"\u003EZoubin Ghahramani\u003C\/a\u003E. Nature 521, 452–459 (28 May 2015) doi:10.1038\/nature14541\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/4.bp.blogspot.com\/-KpKw8OQq2CE\/VYs5NIPpjbI\/AAAAAAAAOOs\/kqxc5aJt2TY\/s1600\/nature14541-f2.jpg\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"117\" src=\"https:\/\/4.bp.blogspot.com\/-KpKw8OQq2CE\/VYs5NIPpjbI\/AAAAAAAAOOs\/kqxc5aJt2TY\/s400\/nature14541-f2.jpg\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003ETo see some of Prof. Tenenbaum's ideas in action, take a look at the following CVPR 2015 paper, titled\u0026nbsp;\u003Ca href=\"http:\/\/mrkulk.github.io\/www_cvpr15\/\"\u003EPicture: A Probabilistic Programming Language for Scene Perception\u003C\/a\u003E. Congrats to\u0026nbsp;\u003Ca href=\"http:\/\/tejask.com\/\"\u003ETejas D. 
Kulkarni\u003C\/a\u003E, the first author, an MIT student, who got the Best Paper Honorable Mention prize for this exciting new work. Google DeepMind, you're going to have one fun summer.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/1.bp.blogspot.com\/-oYEe0sbaCao\/VYs4zlLYzLI\/AAAAAAAAOOk\/rzTaG2jtwoI\/s1600\/tenenbaum_cvpr2015.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"240\" src=\"https:\/\/1.bp.blogspot.com\/-oYEe0sbaCao\/VYs4zlLYzLI\/AAAAAAAAOOk\/rzTaG2jtwoI\/s320\/tenenbaum_cvpr2015.png\" width=\"320\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/mrkulk.github.io\/www_cvpr15\/\"\u003EPicture: A Probabilistic Programming Language for Scene Perception\u003C\/a\u003E\u003C\/div\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ch3\u003EObject Detectors Emerge in Deep Scene CNNs\u003C\/h3\u003EThere were lots of great presentation as the Scene Understanding Workshop, and another talk that truly stood out was about a new large-scale dataset (MIT Places) and a thorough investigation of what happens when you train with scenes vs. 
objects.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/3.bp.blogspot.com\/-SMTxAQ_gQ9U\/VY3vega-VNI\/AAAAAAAAOPk\/Zrd5yn4pSiU\/s1600\/emerge.jpg\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"145\" src=\"https:\/\/3.bp.blogspot.com\/-SMTxAQ_gQ9U\/VY3vega-VNI\/AAAAAAAAOPk\/Zrd5yn4pSiU\/s400\/emerge.jpg\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ca href=\"http:\/\/web.mit.edu\/torralba\/www\/\"\u003EAntonio Torralba\u003C\/a\u003E\u0026nbsp;from MIT gave the talk about the Places Database and an in-depth analysis of what is learned when you train on object-centric databases like ImageNet vs. Scene-scentric databases like \u003Ca href=\"http:\/\/places.csail.mit.edu\/\"\u003EMIT Places.\u003C\/a\u003E You can check out \"\u003Ca href=\"http:\/\/places.csail.mit.edu\/slide_iclr2015.pdf\"\u003EObject Detectors Emerge\u003C\/a\u003E\" slides or their \u003Ca href=\"http:\/\/arxiv.org\/pdf\/1412.6856v1.pdf\"\u003EArXiv paper\u003C\/a\u003E to learn more. Great work by an upcoming researcher, \u003Ca href=\"http:\/\/people.csail.mit.edu\/bzhou\/\"\u003EBolei Zhou\u003C\/a\u003E!\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ch3\u003EOverheard at CVPR: ArXiv Publishing Frenzy \u0026amp; Baidu Fiasco\u0026nbsp;\u003C\/h3\u003E\u003Cbr \/\u003EIn the long run, the recent trend of \u003Cb\u003Erapidly pushing preprints\u003C\/b\u003E to \u003Ca href=\"http:\/\/arxiv.org\/\"\u003EArXiv.org\u003C\/a\u003E is great for\u0026nbsp;academic and industry research alike. When you have a large collection of experts exploring ideas at very fast rates, waiting 6 months until the next conference deadline just doesn't make sense. \u0026nbsp;The only downside is that it makes new CVPR papers feel old. It seems like everybody has already perused the good stuff the day it went up on ArXiv. 
But you get your \"idea claim\" without worrying that a naughty reviewer will be influenced by your submission. \u003Cb\u003EDouble blind reviewing, get ready for a serious revamp.\u003C\/b\u003E\u0026nbsp; We now know who's doing what, significantly before publication time. \u0026nbsp;Students, publish-or-perish just got a new name. Whether the ArXiv frenzy is a good or a bad thing, is up to you, and probably more a function of your seniority than anything else. But the CV buzz is definitely getting louder and will continue to do so.\u003Cbr \/\u003E\u003Cbr \/\u003EThe \u003Ca href=\"http:\/\/www.technologyreview.com\/view\/538111\/why-and-how-baidu-cheated-an-artificial-intelligence-test\/\"\u003EBaidu cheating scandal\u003C\/a\u003E might appear to be big news for outsiders just reading the Artificial Intelligence headlines, but overfitting to the testing set is nothing new in Computer Vision. Papers get retracted, grad students often evaluate their algorithms on test sets too many times, and the truth is that nobody's perfect. \u0026nbsp;When it's important to be #1, don't be surprised that your competition is being naughty. But it's important to realize the difference between ground-breaking research and petty percentage chasing. We all make mistakes, and under heavy pressure, we're all likely to show our weaknesses. \u0026nbsp;So let's laugh about it. \u0026nbsp;\u003Cb\u003ELet's hire the best of the best, encourage truly great research, and stop chasing percentages.\u003C\/b\u003E \u0026nbsp;The truth is that a lot of the top performing methods are more similar than different.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cb\u003EConclusion\u003C\/b\u003E\u003Cbr \/\u003ECVPR has been constantly growing in attendance. We now have Phd Students, startups, Professors, recruiters, big companies, and even undergraduates coming to the show. 
Will CVPR become the new SIGGRAPH?\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/4.bp.blogspot.com\/-Mu-G1cfyImE\/VY3tZRiS0kI\/AAAAAAAAOPY\/gf-QZnvFgqk\/s1600\/cvpr_attendance.jpg\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"250\" src=\"https:\/\/4.bp.blogspot.com\/-Mu-G1cfyImE\/VY3tZRiS0kI\/AAAAAAAAOPY\/gf-QZnvFgqk\/s400\/cvpr_attendance.jpg\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003ECVPR attendance plot from\u0026nbsp;\u003Ca href=\"http:\/\/www.cs.cmu.edu\/~changbo\/\"\u003EChangbo Hu\u003C\/a\u003E.\u0026nbsp;\u003C\/div\u003E\u003Cbr \/\u003E\u003Cbr \/\u003EConvNets are here to stay, but if we want ConvNets to be more than than a mere calculus of shadows, there's still ample work do be done. Geoff Hinton's capsules keep popping up during midnight discussions. \"\u003Cb\u003EI want to replace unstructured layers with groups of neurons that I call 'capsules' that are a lot more like cortical columns\u003C\/b\u003E\" -- \u003Ca href=\"http:\/\/www.reddit.com\/r\/MachineLearning\/comments\/2lmo0l\/ama_geoffrey_hinton\"\u003EGeoff Hinton during his Reddit AMA\u003C\/a\u003E. A lot of people (like \u003Ca href=\"http:\/\/www.cs.cmu.edu\/~abhinavg\/\"\u003EProf. Abhinav Gupta\u003C\/a\u003E from CMU) are also talking about unsupervised CNN training, and my prediction is that learning large ConvNets from videos without annotations is going to be big at next year's CVPR.\u003Cbr \/\u003E\u003Cbr \/\u003EMost importantly, when the titans of Deep Learning get to mention what's wrong with their favorite methods, I only expect the best research to follow. 
Happy computing and remember, never stop learning.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003C\/div\u003E"},"link":[{"rel":"replies","type":"application/atom+xml","href":"https:\/\/www.computervisionblog.com\/feeds\/3339709172492766899\/comments\/default","title":"Post Comments"},{"rel":"replies","type":"text/html","href":"https:\/\/www.computervisionblog.com\/2015\/06\/deep-down-rabbit-hole-cvpr-2015-and.html#comment-form","title":"7 Comments"},{"rel":"edit","type":"application/atom+xml","href":"https:\/\/www.blogger.com\/feeds\/15418143\/posts\/default\/3339709172492766899"},{"rel":"self","type":"application/atom+xml","href":"https:\/\/www.blogger.com\/feeds\/15418143\/posts\/default\/3339709172492766899"},{"rel":"alternate","type":"text/html","href":"https:\/\/www.computervisionblog.com\/2015\/06\/deep-down-rabbit-hole-cvpr-2015-and.html","title":"Deep down the rabbit hole: CVPR 2015 and beyond"}],"author":[{"name":{"$t":"Tomasz Malisiewicz"},"uri":{"$t":"http:\/\/www.blogger.com\/profile\/17507234774392358321"},"email":{"$t":"noreply@blogger.com"},"gd$image":{"rel":"http://schemas.google.com/g/2005#thumbnail","width":"16","height":"16","src":"https:\/\/img1.blogblog.com\/img\/b16-rounded.gif"}}],"media$thumbnail":{"xmlns$media":"http://search.yahoo.com/mrss/","url":"https:\/\/1.bp.blogspot.com\/-7MdQJ4NJlyo\/V1rAnoDGOPI\/AAAAAAAAOsk\/IgVsXeuTMywDMrrrsvnTPSSa-Pq6yIjAgCLcB\/s72-c\/rabbit_hole.png","height":"72","width":"72"},"thr$total":{"$t":"7"}},{"id":{"$t":"tag:blogger.com,1999:blog-15418143.post-1716021431567432061"},"published":{"$t":"2015-05-06T12:16:00.000-05:00"},"updated":{"$t":"2016-06-13T07:42:56.512-05:00"},"category":[{"scheme":"http://www.blogger.com/atom/ns#","term":"baidu"},{"scheme":"http://www.blogger.com/atom/ns#","term":"california"},{"scheme":"http://www.blogger.com/atom/ns#","term":"CNNs"},{"scheme":"http://www.blogger.com/atom/ns#","term":"deep 
learning"},{"scheme":"http://www.blogger.com/atom/ns#","term":"dyson"},{"scheme":"http://www.blogger.com/atom/ns#","term":"embedded computer vision"},{"scheme":"http://www.blogger.com/atom/ns#","term":"healthcare"},{"scheme":"http://www.blogger.com/atom/ns#","term":"knithealth"},{"scheme":"http://www.blogger.com/atom/ns#","term":"meetup"},{"scheme":"http://www.blogger.com/atom/ns#","term":"robotics"},{"scheme":"http://www.blogger.com/atom/ns#","term":"startups"},{"scheme":"http://www.blogger.com/atom/ns#","term":"vision as a service"},{"scheme":"http://www.blogger.com/atom/ns#","term":"vision.ai"}],"title":{"type":"text","$t":"Dyson 360 Eye and Baidu Deep Learning at the Embedded Vision Summit in Santa Clara"},"content":{"type":"html","$t":"\u003Ch3\u003E\u003Cb\u003EBringing Computer Vision to the Consumer\u003C\/b\u003E\u003C\/h3\u003E\u003Cdiv\u003E\u003Cspan style=\"font-size: x-small;\"\u003EMike Aldred\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv\u003E\u003Cspan style=\"font-size: x-small;\"\u003EElectronics Lead, Dyson Ltd\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv\u003EWhile vision has been a research priority for decades, the results have often remained out of reach of the consumer. Huge strides have been made, but the final, and perhaps toughest, hurdle is how to integrate vision into real world products. 
It’s a long road from concept to finished machine, and to succeed, companies need clear objectives, a robust test plan, and the ability to adapt when those fail.\u0026nbsp;\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/2.bp.blogspot.com\/-xKfSvJf2BOY\/VUo6zsl2WKI\/AAAAAAAAOGk\/97yv4YuXzdk\/s1600\/dyson-360-eye-front1.jpg\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"197\" src=\"https:\/\/2.bp.blogspot.com\/-xKfSvJf2BOY\/VUo6zsl2WKI\/AAAAAAAAOGk\/97yv4YuXzdk\/s320\/dyson-360-eye-front1.jpg\" width=\"320\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003EImage from ExtremeTech:\u0026nbsp;\u003Ca href=\"http:\/\/www.extremetech.com\/extreme\/189240-dyson-360-eye-dysons-truly-intelligent-robotic-vacuum-cleaner-is-finally-here\"\u003EDyson 360 Eye: Dyson’s ‘truly intelligent’ robotic vacuum cleaner is finally here\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv\u003EThe \u003Ca href=\"https:\/\/www.dyson360eye.com\/\"\u003EDyson 360 Eye robot vacuum cleaner\u003C\/a\u003E uses computer vision as its primary localization technology. 
10 years in the making, it was taken from bleeding edge academic research to a robust, reliable and manufacturable solution by Mike Aldred and his team at Dyson.\u0026nbsp;\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv\u003EMike Aldred’s keynote at next week's \u003Ca href=\"http:\/\/www.embedded-vision.com\/summit\/highlights\/speakers\"\u003EEmbedded Vision Summit\u003C\/a\u003E (May 12th in Santa Clara) will chart some of the high and lows of the project, the challenges of bridging between academia and business, and how to use a diverse team to take an idea from the lab into real homes.\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Ch3\u003E\u003Cb\u003EEnabling Ubiquitous Visual Intelligence Through Deep Learning\u003C\/b\u003E\u003C\/h3\u003E\u003Cdiv\u003E\u003Cspan style=\"font-size: x-small;\"\u003ERen Wu\u0026nbsp;\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv\u003E\u003Cspan style=\"font-size: x-small;\"\u003EDistinguished Scientist, Baidu Institute of Deep Learning\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv\u003EDeep learning techniques have been making headlines lately in computer vision research. Using techniques inspired by the human brain, deep learning employs massive replication of simple algorithms which learn to distinguish objects through training on vast numbers of examples. Neural networks trained in this way are gaining the ability to recognize objects as accurately as humans. Some experts believe that deep learning will transform the field of vision, enabling the widespread deployment of visual intelligence in many types of systems and applications. But there are many practical problems to be solved before this goal can be reached. For example, how can we create the massive sets of real-world images required to train neural networks? 
And given their massive computational requirements, how can we deploy neural networks into applications like mobile and wearable devices with tight cost and power consumption constraints?\u0026nbsp;\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/2.bp.blogspot.com\/-XxXCP2Vf_D8\/VUpKAEqx4II\/AAAAAAAAOHU\/UOJVjMV-b3w\/s1600\/baidu-phone-image.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"319\" src=\"https:\/\/2.bp.blogspot.com\/-XxXCP2Vf_D8\/VUpKAEqx4II\/AAAAAAAAOHU\/UOJVjMV-b3w\/s320\/baidu-phone-image.png\" width=\"320\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv\u003ERen Wu’s morning keynote at next week's\u0026nbsp;\u003Ca href=\"http:\/\/www.embedded-vision.com\/summit\/highlights\/speakers\"\u003EEmbedded Vision Summit\u003C\/a\u003E\u0026nbsp;(May 12th in Santa Clara)\u0026nbsp;will share an insider’s perspective on these and other critical questions related to the practical use of neural networks for vision, based on the pioneering work being conducted by his team at Baidu.\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Ch3\u003E\u003Cb\u003EVision-as-a-Service: Democratization of Vision for Consumers and Businesses\u003C\/b\u003E\u003C\/h3\u003E\u003Cdiv\u003E\u003Cspan style=\"font-size: x-small;\"\u003EHerman Yau\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv\u003E\u003Cspan style=\"font-size: x-small;\"\u003ECo-founder and CEO, Tend\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv\u003EHundreds of millions of video cameras are installed 
around the world—in businesses, homes, and public spaces—but most of them provide limited insights. Installing new, more intelligent cameras requires massive deployments with long time-to-market cycles. Computer vision enables us to extract meaning from video streams generated by existing cameras, creating value for consumers, businesses, and communities in the form of improved safety, quality, security, and health. But how can we bring computer vision to millions of deployed cameras? The answer is through “Vision-as-a-Service” (VaaS), a new business model that leverages the cloud to apply state-of-the-art computer vision techniques to video streams captured by inexpensive cameras. Centralizing vision processing in the cloud offers some compelling advantages, such as the ability to quickly deploy sophisticated new features without requiring upgrades of installed camera hardware. It also brings some tough challenges, such as scaling to bring intelligence to millions of cameras.\u0026nbsp;\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/3.bp.blogspot.com\/-3-JR8IYNxM4\/VUpKVAhXtDI\/AAAAAAAAOHc\/wax8UnHDVT0\/s1600\/300x300_TPO_February_PRM844_Distributed_Computing.jpg\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" src=\"https:\/\/3.bp.blogspot.com\/-3-JR8IYNxM4\/VUpKVAhXtDI\/AAAAAAAAOHc\/wax8UnHDVT0\/s1600\/300x300_TPO_February_PRM844_Distributed_Computing.jpg\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003EImage From\u0026nbsp;\u003Ca href=\"http:\/\/www.techpageone.co.uk\/en\/technology\/distributed-computing-three-best-use-cases\/#.VUpKO9pViko\"\u003EDistributed Computing: Three Best-Use 
Cases\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv\u003EHerman Yau's talk at next week's\u0026nbsp;\u003Ca href=\"http:\/\/www.embedded-vision.com\/summit\/highlights\/speakers\"\u003EEmbedded Vision Summit\u003C\/a\u003E\u0026nbsp;(May 12th in Santa Clara)\u0026nbsp;will explain the architecture and business model behind VaaS, show how it is being deployed in a wide range of real-world use cases, and highlight some of the key challenges and how they can be overcome.\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv\u003E\u003Cb\u003EEmbedded Vision Summit on May 12th, 2015\u003C\/b\u003E\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv\u003EThere will be many more great presentations at the upcoming Embedded Vision Summit. \u0026nbsp;From the range of topics, it looks like any startup with interest in computer vision will be able to benefit from attending. The entire day is filled with talks by great presenters (Gary Bradski will talk about the latest developments in OpenCV). 
You can see the list of speakers:\u0026nbsp;\u003Ca href=\"http:\/\/www.embedded-vision.com\/summit\/highlights\/speakers\"\u003EEmbedded Vision Summit 2015 List of speakers\u003C\/a\u003E\u0026nbsp;or the day's agenda \u003Ca href=\"http:\/\/www.embedded-vision.com\/summit\/attend\/agenda\"\u003EEmbedded Vision Summit 2015 Agenda\u003C\/a\u003E.\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv\u003E\u003Ca href=\"http:\/\/www.embedded-vision.com\/summit\/attend\/register\"\u003EEmbedded Vision Summit 2015 Registration \u003C\/a\u003E(249$ for the one day event\u0026nbsp;+ food)\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv\u003E\u003Cb\u003EDemos during lunch:\u0026nbsp;\u003C\/b\u003EThe Technology Showcase at the Embedded Vision Summit will highlight demonstrations of technology for computer vision-based applications and systems from the following companies.\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/3.bp.blogspot.com\/-HOuzwOMkz-A\/VUpC8QdxplI\/AAAAAAAAOHE\/QKZKvKS5Hb0\/s1600\/tech.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"315\" src=\"https:\/\/3.bp.blogspot.com\/-HOuzwOMkz-A\/VUpC8QdxplI\/AAAAAAAAOHE\/QKZKvKS5Hb0\/s400\/tech.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv\u003EThe vision topics covered will be: Deep Learning, CNNs, Business, Markets, Libraries, Standards, APIs, 3D Vision, and Processors. 
I will be there with my \u003Ca href=\"http:\/\/vision.ai\/\"\u003Evision.ai\u003C\/a\u003E team, together with some computer vision guys from\u0026nbsp;\u003Ca href=\"http:\/\/www.knithealth.com\/\"\u003EKnitHealth, Inc\u003C\/a\u003E, a new SF-based Health Vision Company. If you're interested in meeting with us, let's chat at the Vision Summit.\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv\u003EWhat kind of startups and companies should attend? Definitely robotics. Definitely vision sensors. Definitely those interested in deep learning hardware implementations. Seems like even half of the software engineers at Google could benefit from learning about their favorite deep learning algorithms being optimized for hardware.\u0026nbsp;\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003C\/div\u003E"},"link":[{"rel":"replies","type":"application/atom+xml","href":"https:\/\/www.computervisionblog.com\/feeds\/1716021431567432061\/comments\/default","title":"Post Comments"},{"rel":"replies","type":"text/html","href":"https:\/\/www.computervisionblog.com\/2015\/05\/dyson-360-eye-and-baidu-deep-learning.html#comment-form","title":"1 Comments"},{"rel":"edit","type":"application/atom+xml","href":"https:\/\/www.blogger.com\/feeds\/15418143\/posts\/default\/1716021431567432061"},{"rel":"self","type":"application/atom+xml","href":"https:\/\/www.blogger.com\/feeds\/15418143\/posts\/default\/1716021431567432061"},{"rel":"alternate","type":"text/html","href":"https:\/\/www.computervisionblog.com\/2015\/05\/dyson-360-eye-and-baidu-deep-learning.html","title":"Dyson 360 Eye and Baidu Deep Learning at the Embedded Vision Summit in Santa Clara"}],"author":[{"name":{"$t":"Tomasz 
Malisiewicz"},"uri":{"$t":"http:\/\/www.blogger.com\/profile\/17507234774392358321"},"email":{"$t":"noreply@blogger.com"},"gd$image":{"rel":"http://schemas.google.com/g/2005#thumbnail","width":"16","height":"16","src":"https:\/\/img1.blogblog.com\/img\/b16-rounded.gif"}}],"media$thumbnail":{"xmlns$media":"http://search.yahoo.com/mrss/","url":"https:\/\/2.bp.blogspot.com\/-xKfSvJf2BOY\/VUo6zsl2WKI\/AAAAAAAAOGk\/97yv4YuXzdk\/s72-c\/dyson-360-eye-front1.jpg","height":"72","width":"72"},"thr$total":{"$t":"1"}},{"id":{"$t":"tag:blogger.com,1999:blog-15418143.post-2490030129949040725"},"published":{"$t":"2015-05-05T14:11:00.001-05:00"},"updated":{"$t":"2015-05-05T14:11:45.213-05:00"},"category":[{"scheme":"http://www.blogger.com/atom/ns#","term":"backpropagation"},{"scheme":"http://www.blogger.com/atom/ns#","term":"berkeley"},{"scheme":"http://www.blogger.com/atom/ns#","term":"big data"},{"scheme":"http://www.blogger.com/atom/ns#","term":"caffe"},{"scheme":"http://www.blogger.com/atom/ns#","term":"clarifai"},{"scheme":"http://www.blogger.com/atom/ns#","term":"compiled"},{"scheme":"http://www.blogger.com/atom/ns#","term":"copyright"},{"scheme":"http://www.blogger.com/atom/ns#","term":"CVPR 2015"},{"scheme":"http://www.blogger.com/atom/ns#","term":"data science"},{"scheme":"http://www.blogger.com/atom/ns#","term":"deep learning"},{"scheme":"http://www.blogger.com/atom/ns#","term":"derivative work"},{"scheme":"http://www.blogger.com/atom/ns#","term":"google"},{"scheme":"http://www.blogger.com/atom/ns#","term":"gpu"},{"scheme":"http://www.blogger.com/atom/ns#","term":"imagenet"},{"scheme":"http://www.blogger.com/atom/ns#","term":"legal"},{"scheme":"http://www.blogger.com/atom/ns#","term":"metamind"},{"scheme":"http://www.blogger.com/atom/ns#","term":"nearest 
neighbor"},{"scheme":"http://www.blogger.com/atom/ns#","term":"snapchat"},{"scheme":"http://www.blogger.com/atom/ns#","term":"stanford"},{"scheme":"http://www.blogger.com/atom/ns#","term":"vision.ai"}],"title":{"type":"text","$t":"Deep Learning vs Big Data: Who owns what?"},"content":{"type":"html","$t":"In order to learn anything useful, large-scale multi-layer deep neural networks (aka Deep Learning systems) require a large amount of labeled data. There is clearly a need for big data, but only a few places where \u003Cb\u003Ebig visual data\u003C\/b\u003E is available. Today we'll take a look at one of the most popular sources of big visual data, peek inside a trained neural network, and ask ourselves some data\/model ownership questions. The fundamental question to keep in mind is the following, \"\u003Ci\u003EAre the learned weights of a neural network derivative works of the input images?\u003C\/i\u003E\" In other words, \u003Cb\u003Ewhen deep learning touches your data, who owns what\u003C\/b\u003E?\u003Cbr \/\u003E\u003Cbr class=\"Apple-interchange-newline\" \/\u003E\u003Cimg border=\"0\" height=\"144\" src=\"http:\/\/3.bp.blogspot.com\/-zQlQvmK9U9g\/VT_Hk6yKlmI\/AAAAAAAAODQ\/nNNcpVM4UPM\/s1600\/bg_pipeline-01.png\" style=\"color: #0000ee; text-align: center;\" width=\"640\" \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cb\u003EBackground: The Deep Learning \"Computer Vision Recipe\"\u003C\/b\u003E\u003Cbr \/\u003EOne of today's most successful machine learning techniques is called Deep Learning. The broad interest in Deep Learning is backed by some remarkable results on real-world data interpretation tasks dealing with speech[1], text[2], and images[3]. 
Deep learning and object recognition techniques have been pioneered by academia (University of Toronto, NYU, Stanford, Berkeley, MIT, CMU, etc), picked up by industry (Google, Facebook, Snapchat, etc), and are now fueling a new generation of startups ready to bring visual intelligence to the masses (\u003Ca href=\"http:\/\/clarifai.com\/\"\u003EClarifai.com\u003C\/a\u003E, \u003Ca href=\"http:\/\/metamind.io\/\"\u003EMetamind.io\u003C\/a\u003E, \u003Ca href=\"http:\/\/vision.ai\/\"\u003EVision.ai\u003C\/a\u003E, etc). And while it's still not clear where Artificial Intelligence is going, Deep Learning will be a key player.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cb\u003ERelated blog post\u003C\/b\u003E:\u0026nbsp;\u003Ca href=\"http:\/\/www.computervisionblog.com\/2015\/03\/deep-learning-vs-machine-learning-vs.html\"\u003EDeep Learning vs Machine Learning vs Pattern Recognition\u003C\/a\u003E\u003Cbr \/\u003E\u003Cb\u003ERelated blog post\u003C\/b\u003E:\u0026nbsp;\u003Ca href=\"http:\/\/www.computervisionblog.com\/2015\/04\/deep-learning-vs-probabilistic.html\"\u003EDeep Learning vs Probabilistic Graphical Models vs Logic\u003C\/a\u003E\u003Cbr \/\u003E\u003Cb\u003E\u003Cbr \/\u003E\u003C\/b\u003EFor visual object recognition tasks, the most popular models are \u003Ca href=\"http:\/\/en.wikipedia.org\/wiki\/Convolutional_neural_network\"\u003EConvolutional Neural Networks\u003C\/a\u003E (also known as ConvNets or CNNs). They can be trained end-to-end without manual feature engineering, but this requires a large set of training images (sometimes called big data, or big visual data). These large neural networks start out as a Tabula Rasa (or \"blank slate\") and the full system is trained in an end-to-end fashion using a heavily optimized implementation of \u003Ca href=\"http:\/\/en.wikipedia.org\/wiki\/Backpropagation\"\u003EBackpropagation\u003C\/a\u003E (informally called \"backprop\"). 
Backprop is nothing but the chain rule you learned in Calculus 101 and today's deep neural networks are trained in almost the same way they were trained in the 1980s. But today's highly-optimized implementations of backprop are GPU-based and can process orders of magnitude more data than was approachable in the pre-internet pre-cloud pre-GPU golden years of Neural Networks. The output of the deep learning training procedure is a set of learned weights for the different layers defined in the model architecture -- millions of floating point numbers representing what was learned from the images. So what's so interesting about the weights? It's the relationship between the weights and the original big data, that will be under scrutiny today.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ci\u003E\u003Cspan style=\"font-size: large;\"\u003E\"Are weights of a trained network based on ImageNet a derived work, a cesspool of millions of copyright claims? What about networks trained to approximate another ImageNet network?\"\u003C\/span\u003E\u003C\/i\u003E\u003Cbr \/\u003E\u003Cspan style=\"font-size: x-small;\"\u003E[This question was asked on HackerNews by kastnerkyle in the comments of\u0026nbsp;\u003Ca href=\"https:\/\/news.ycombinator.com\/item?id=8290441\"\u003EA Revolutionary Technique That Changed Machine Vision\u003C\/a\u003E.]\u003C\/span\u003E\u003Cbr \/\u003E\u003Cbr \/\u003EIn the context of computer vision, this question truly piqued my interest, and as we start seeing robots and AI-powered devices enter our homes I expect much more serious versions of this question to arise in the upcoming decade. Let's see how some of these questions are being addressed in 2015.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cb\u003E1. 
ImageNet: Non-commercial Big Visual Data\u003C\/b\u003E\u003Cbr \/\u003E\u003Cbr \/\u003ELet's first take a look at the most common data source for Deep Learning systems designed to recognize a large number of different objects, namely ImageNet[4].\u0026nbsp;\u003Ca href=\"http:\/\/image-net.org\/\"\u003EImageNet\u003C\/a\u003E is the de-facto source of big visual data for computer vision researchers working on large scale object recognition and detection. The dataset debuted in a 2009 CVPR paper by \u003Ca href=\"http:\/\/vision.stanford.edu\/feifeili\/\"\u003EFei-Fei Li\u003C\/a\u003E's research group and was put in place to replace both PASCAL datasets (which lacked size and variety) and LabelMe datasets (which lacked standardization). ImageNet grew out of \u003Ca href=\"http:\/\/www.vision.caltech.edu\/Image_Datasets\/Caltech101\/\"\u003ECaltech101\u003C\/a\u003E (a 2004 dataset focusing on image categorization, also pioneered by Fei-Fei Li) so personally I still think of ImageNet as something like \"Stanford10^N\". 
ImageNet has been a key player in organizing the scale of data that was required to push object recognition to its new frontier, the deep learning phase.\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/1.bp.blogspot.com\/-s3RtVPKoELQ\/VUPHGKDmscI\/AAAAAAAAOEU\/SMriMK94c9E\/s1600\/ca-imagenet.jpg\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"200\" src=\"http:\/\/1.bp.blogspot.com\/-s3RtVPKoELQ\/VUPHGKDmscI\/AAAAAAAAOEU\/SMriMK94c9E\/s400\/ca-imagenet.jpg\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/image-net.org\/\"\u003EImageNet\u003C\/a\u003E has over 15 million images in its database as of May 1st, 2015.\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cbr \/\u003E\u003Cb\u003EProblem\u003C\/b\u003E: Lots of extremely large datasets are mined from internet images, but these images often come with their own copyright. \u0026nbsp;This prevents collecting and selling such images, and from a commercial point of view, when creating such a dataset, some care has to be taken. \u0026nbsp;For research to keep pushing the state-of-the-art on real-world recognition problems, we have to use standard big datasets (representative of what is found in the \u003Cstrike\u003Ereal-world\u003C\/strike\u003E internet), foster a strong sense of community centered around sharing results, and maintain the copyrights of the original sources.\u003Cbr \/\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cb\u003ESolution\u003C\/b\u003E: ImageNet decided to publicly provide links to the dataset images so that they can be downloaded without having to be hosted on an University-owned server. 
The ImageNet website only serves the image thumbnails and provides a copyright infringement clause together with instructions where to file a DMCA takedown notice. The dataset organizers provide the entire dataset only after signing a\u003Ca href=\"http:\/\/image-net.org\/download-faq\"\u003E\u0026nbsp;terms of access\u003C\/a\u003E, prohibiting commercial use. See the ImageNet clause below (taken on May 5th, 2015).\u003Cbr \/\u003E\u003Cbr \/\u003E\"\u003Ci\u003EImageNet does not own the copyright of the images. ImageNet only provides thumbnails and URLs of images, in a way similar to what image search engines do. In other words, ImageNet compiles an accurate list of web images for each synset of WordNet. For researchers and educators who wish to use the images for non-commercial research and\/or educational purposes, we can provide access through our site under certain conditions and terms.\u003C\/i\u003E\"\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cb\u003E2. Caffe: Unrestricted Use Deep Learning Models\u003C\/b\u003E\u003Cbr \/\u003E\u003Cbr \/\u003ENow that we have a good idea where to download big visual data and an understanding of the terms that apply, let's take a look at the other end of the spectrum: the output of the Deep Learning training procedure. We'll take a look at\u0026nbsp;\u003Ca href=\"http:\/\/caffe.berkeleyvision.org\/\"\u003ECaffe\u003C\/a\u003E,\u0026nbsp;one of the most popular Deep Learning libraries, which was engineered to handle ImageNet-like data. 
\u0026nbsp;Caffe provides an ecosystem for sharing models (the Model Zoo), and is becoming an indispensable tool for today's computer vision researcher.\u003Cb\u003E\u0026nbsp;\u003C\/b\u003ECaffe is developed at the\u0026nbsp;Berkeley Vision and Learning Center (BVLC) and by community contributors -- it is open source.\u003Cbr \/\u003E  \u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/1.bp.blogspot.com\/-Kx5DAr-8Bq8\/VT_PtvDxAiI\/AAAAAAAAODg\/ryHhjLQ2Em0\/s1600\/caffe.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"247\" src=\"http:\/\/1.bp.blogspot.com\/-Kx5DAr-8Bq8\/VT_PtvDxAiI\/AAAAAAAAODg\/ryHhjLQ2Em0\/s1600\/caffe.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003ESlide from\u0026nbsp;\u003Ca href=\"https:\/\/docs.google.com\/presentation\/d\/1UeKXVgRvvxg9OUdh_UiC5G71UMscNPlvArsWER41PsU\/edit#slide=id.p\"\u003EDIY Deep Learning for Vision with Caffe\u003C\/a\u003E\u003C\/div\u003E\u003Cb\u003E\u003Cbr \/\u003E\u003C\/b\u003E\u003Cb\u003EProblem: \u003C\/b\u003EAs a project that started at a University, Caffe's goal is to be the de-facto standard for creating, training, and sharing Deep Learning models. The shared models were initially licensed for non-commercial use, but the problem is that a new wave of startups is using these techniques, so there must be a licensing agreement which allows Universities, large companies, and startups to explore the same set of pretrained models.\u003Cbr \/\u003E\u003Cb\u003E\u003Cbr \/\u003E\u003C\/b\u003E\u003Cb\u003ESolution:\u0026nbsp;\u003C\/b\u003EThe current model licensing for Caffe is \u003Cb\u003Eunrestricted use\u003C\/b\u003E. This is really great for a broad range of hackers, scientists, and engineers. \u0026nbsp;The models used to be shared with a non-commercial clause. 
Below is the entire model licensing agreement from the\u0026nbsp;\u003Ca href=\"http:\/\/caffe.berkeleyvision.org\/model_zoo.html#bvlc-model-license\"\u003EModel License\u003C\/a\u003E\u0026nbsp;section of Caffe (taken on May 5th, 2015).\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ci\u003E\"The Caffe models bundled by the BVLC are released for unrestricted use.\u0026nbsp;\u003C\/i\u003E\u003Cbr \/\u003E\u003Ci\u003E\u003Cbr \/\u003E\u003C\/i\u003E\u003Ci\u003EThese models are trained on data from the ImageNet project and training data includes internet photos that may be subject to copyright.\u0026nbsp;\u003C\/i\u003E\u003Cbr \/\u003E\u003Ci\u003E\u003Cbr \/\u003E\u003C\/i\u003E\u003Ci\u003EOur present understanding as researchers is that there is no restriction placed on the open release of these learned model weights, since none of the original images are distributed in whole or in part. To the extent that the interpretation arises that weights are derivative works of the original copyright holder and they assert such a copyright, UC Berkeley makes no representations as to what use is allowed other than to consider our present release in the spirit of fair use in the academic mission of the university to disseminate knowledge and tools as broadly as possible without restriction.\"\u0026nbsp;\u003C\/i\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cb\u003E3. Vision.ai: Dataset generation and training in your home\u0026nbsp;\u003C\/b\u003E\u003Cbr \/\u003E\u003Cb\u003E\u003Cbr \/\u003E\u003C\/b\u003EDeep Learning learns a summary of the input data, but what happens if a different kind of model memorizes bits and pieces of the training data? And more importantly what if there are things inside the memorized bits which you might not want shared with outsiders? 
\u0026nbsp;For this case study, we'll look at \u003Ca href=\"http:\/\/vision.ai\/\"\u003EVision.ai\u003C\/a\u003E, and their real-time computer vision server which is designed to simultaneously create a dataset and learn about an object's appearance.\u0026nbsp;Vision.ai software can be applied to real-time training from videos as well as live webcam streams. \u003Cbr \/\u003E\u003Cbr \/\u003EInstead of starting with big visual data collected from internet images (like ImageNet), the vision.ai training procedure is based on a person waving an object of interest in front of the webcam. The user bootstraps the learning procedure with an initial bounding box, and the algorithm continues learning hands-free. As the algorithm learns, it stores a partial history of what it previously saw, effectively creating its own dataset on the fly. Because the vision.ai convolutional neural networks are designed for detection (where an object only occupies a small portion of the image), there is a large amount of background data presented inside the collected dataset. At the end of the training procedure you get both the Caffe-esque bit (the learned weights) and the ImageNet bit (the collected images). 
So what happens when it's time to share the model?\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/1.bp.blogspot.com\/-vg68waluCtw\/VUjhBS6ypKI\/AAAAAAAAOF4\/ShDabArY55Q\/s1600\/vmx_user.jpg\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"252\" src=\"http:\/\/1.bp.blogspot.com\/-vg68waluCtw\/VUjhBS6ypKI\/AAAAAAAAOF4\/ShDabArY55Q\/s400\/vmx_user.jpg\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003EA user training a cup detector using \u003Ca href=\"http:\/\/vision.ai\/\"\u003Evision.ai\u003C\/a\u003E's real-time detector training interface\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cbr \/\u003E\u003Cb\u003EProblem\u003C\/b\u003E: Training in your home means that potentially private and sensitive information is contained inside the backgrounds of the collected images. If you train in your home and make the resulting object model public, think twice about what you're sharing. Sharing can also be problematic if you have trained an object detector from a copyrighted video\/images and want to share\/sell the resulting model.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cb\u003ESolution\u003C\/b\u003E: When you save a vision.ai model to disk, you get both a compiled model and the full model. The compiled model is the full model sans the images (thus much smaller). This allows you to maintain fully editable models on your local computer, and share the compiled model (essentially only the learned weights), without the chance of anybody else peeking into your living room. Vision.ai's computer vision server called VMX can run both compiled and uncompiled models; however, only uncompiled models can be edited and extended. 
In addition, vision.ai provides their vision server as a standalone install, so that all of the training images and computations can reside on your local computer. In brief, vision.ai's solution is to allow you to choose whether you want to run the computations in the cloud or locally, and whether you want to distribute full models (with background images) or the compiled models (solely what is required for detection). When it comes to sharing the trained models and\/or created datasets, you are \u003Cb\u003Efree to choose your own licensing agreement.\u003C\/b\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cb\u003E4. Open Problems for Licensing Memory-based Machine Learning Models\u003C\/b\u003E\u003Cbr \/\u003E\u003Cb\u003E\u003Cbr \/\u003E\u003C\/b\u003EDeep Learning methods aren't the only techniques applicable to object recognition. What if our model was a\u0026nbsp;\u003Ca href=\"http:\/\/en.wikipedia.org\/wiki\/K-nearest_neighbors_algorithm\"\u003ENearest-Neighbor classifier\u003C\/a\u003E using raw RGB pixels? A Nearest Neighbor Classifier is a memory based classifier which memorizes all of the training data -- the model is the training data. It would be contradictory to license the same set of data differently if one day it was viewed as training data and another day as the output of a learning algorithm. I wonder if there is a way to reconcile the kind of restrictive non-commercial licensing behind ImageNet with the unrestricted licensing use strategy of Caffe Deep Learning Models. Is it possible to have one hacker-friendly data\/model license agreement to rule them all?\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cb\u003EConclusion\u003C\/b\u003E\u003Cbr \/\u003E\u003Cbr \/\u003EDon't be surprised if neural network upgrades come as part of your future operating system. As we transition from a data economy (sharing images) to a knowledge economy (sharing neural networks), legal\/ownership issues will pop up. 
I hope that the three scenarios I covered today (big visual data, sharing deep learning models, and training in your home) will help you think about the future legal issues that might come up when sharing visual knowledge. When AI starts generating its own art (maybe by re-synthesizing old pictures), legal issues will pop up. And when your competitor starts selling \u003Ci\u003Eyour\u003C\/i\u003E models and\/or data, legal issues will resurface. Don't be surprised if the MIT license vs. GPL license vs. Apache License debate resurges in the context of pre-trained deep learning models. Who knows, maybe AI Law will become the next big thing.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cb\u003EReferences\u003C\/b\u003E\u003Cbr \/\u003E[1]\u0026nbsp;\u003Ca href=\"http:\/\/devblogs.nvidia.com\/parallelforall\/deep-speech-accurate-speech-recognition-gpu-accelerated-deep-learning\/\"\u003EDeep Speech: Accurate Speech Recognition with GPU-Accelerated Deep Learning\u003C\/a\u003E: NVIDIA dev blog post about Baidu's work on speech recognition using Deep Learning. \u003Ca href=\"http:\/\/cs.stanford.edu\/people\/ang\/\"\u003EAndrew Ng\u003C\/a\u003E is working with Baidu on Deep Learning.\u003Cbr \/\u003E\u003Cbr \/\u003E[2]\u0026nbsp;\u003Ca href=\"http:\/\/arxiv.org\/abs\/1502.01710\"\u003EText Understanding from Scratch\u003C\/a\u003E: Arxiv paper from Facebook about end-to-end training of text understanding systems using ConvNets. \u003Ca href=\"http:\/\/yann.lecun.com\/\"\u003EYann Lecun\u003C\/a\u003E is working with Facebook on Deep Learning.\u003Cbr \/\u003E\u003Cbr \/\u003E[3]\u0026nbsp;\u003Ca href=\"http:\/\/www.cs.toronto.edu\/~fritz\/absps\/imagenet.pdf\"\u003EImageNet Classification with Deep Convolutional Neural Networks\u003C\/a\u003E. Seminal 2012 paper from the Neural Information and Processing Systems (NIPS) conference which showed breakthrough performance from a deep neural network. 
Paper came out of University of Toronto, but now most of these guys are at Google. \u0026nbsp;\u003Ca href=\"http:\/\/www.cs.toronto.edu\/~hinton\/\"\u003EGeoff Hinton\u003C\/a\u003E is working with Google on Deep Learning.\u003Cbr \/\u003E\u003Cbr \/\u003E[4] J. Deng, W. Dong, R. Socher, L.-J. Li, K. Li and L. Fei-Fei, ImageNet: \u003Ca href=\"http:\/\/www.image-net.org\/papers\/imagenet_cvpr09.pdf\"\u003EA Large-Scale Hierarchical Image Database\u003C\/a\u003E. IEEE Computer Vision and Pattern Recognition (CVPR), 2009.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ca href=\"http:\/\/web.eecs.umich.edu\/~jiadeng\/\"\u003EJia Deng\u003C\/a\u003E is now assistant professor at Michigan University and he is growing his research group. If you're interested in starting a PhD in deep learning and vision, check out his call for prospective students. This might be a younger version of Andrew Ng.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ca href=\"http:\/\/www.socher.org\/\"\u003ERichard Socher\u003C\/a\u003E is the CTO and Co-Founder of \u003Ca href=\"https:\/\/www.metamind.io\/\"\u003EMetaMind\u003C\/a\u003E, a new startup in the Deep Learning space. They are VC-backed and have plenty of room to grow.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ca href=\"http:\/\/vision.stanford.edu\/lijiali\/\"\u003EJia Li \u003C\/a\u003Eis now Head of Research at Snapchat, Inc. I can't say much, but take a look at the recent VentureBeat article:\u0026nbsp;\u003Ca href=\"http:\/\/venturebeat.com\/2015\/04\/08\/snapchat-research\/\"\u003ESnapchat is quietly building a research team to do deep learning on images, videos\u003C\/a\u003E. Jia and I overlapped at Google Research back in 2008.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ca href=\"http:\/\/vision.stanford.edu\/feifeili\/\"\u003EFei-Fei Li\u003C\/a\u003E is currently the Director of the Stanford Artificial Intelligence Lab and the Stanford Vision Lab. 
See the article on Wired: \u003Ca href=\"http:\/\/www.wired.com\/2015\/04\/fei-fei-li-want-machines-think-need-teach-see\/\"\u003EIf we want our machines to think, we need to teach them to see\u003C\/a\u003E. Yann, you have some competition.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ca href=\"http:\/\/daggerfs.com\/\"\u003EYangqing Jia\u003C\/a\u003E\u0026nbsp;created the Caffe project during his PhD at UC Berkeley. He is now a research scientist at Google.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ca href=\"http:\/\/people.csail.mit.edu\/tomasz\/\"\u003ETomasz Malisiewicz\u003C\/a\u003E is the Co-Founder of Vision.ai, which focuses on real-time training of vision systems -- something which is missing in today's Deep Learning systems. Come say hi at CVPR.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E"},"link":[{"rel":"replies","type":"application/atom+xml","href":"https:\/\/www.computervisionblog.com\/feeds\/2490030129949040725\/comments\/default","title":"Post Comments"},{"rel":"replies","type":"text/html","href":"https:\/\/www.computervisionblog.com\/2015\/05\/deep-learning-vs-big-data-who-owns-what.html#comment-form","title":"2 Comments"},{"rel":"edit","type":"application/atom+xml","href":"https:\/\/www.blogger.com\/feeds\/15418143\/posts\/default\/2490030129949040725"},{"rel":"self","type":"application/atom+xml","href":"https:\/\/www.blogger.com\/feeds\/15418143\/posts\/default\/2490030129949040725"},{"rel":"alternate","type":"text/html","href":"https:\/\/www.computervisionblog.com\/2015\/05\/deep-learning-vs-big-data-who-owns-what.html","title":"Deep Learning vs Big Data: Who owns what?"}],"author":[{"name":{"$t":"Tomasz 
Malisiewicz"},"uri":{"$t":"http:\/\/www.blogger.com\/profile\/17507234774392358321"},"email":{"$t":"noreply@blogger.com"},"gd$image":{"rel":"http://schemas.google.com/g/2005#thumbnail","width":"16","height":"16","src":"https:\/\/img1.blogblog.com\/img\/b16-rounded.gif"}}],"media$thumbnail":{"xmlns$media":"http://search.yahoo.com/mrss/","url":"http:\/\/3.bp.blogspot.com\/-zQlQvmK9U9g\/VT_Hk6yKlmI\/AAAAAAAAODQ\/nNNcpVM4UPM\/s72-c\/bg_pipeline-01.png","height":"72","width":"72"},"thr$total":{"$t":"2"}},{"id":{"$t":"tag:blogger.com,1999:blog-15418143.post-6627003067235908523"},"published":{"$t":"2015-04-24T17:13:00.001-05:00"},"updated":{"$t":"2016-06-14T04:32:00.428-05:00"},"category":[{"scheme":"http://www.blogger.com/atom/ns#","term":"alyosha efros"},{"scheme":"http://www.blogger.com/atom/ns#","term":"average explorer"},{"scheme":"http://www.blogger.com/atom/ns#","term":"berkeley"},{"scheme":"http://www.blogger.com/atom/ns#","term":"CMU"},{"scheme":"http://www.blogger.com/atom/ns#","term":"computer graphics"},{"scheme":"http://www.blogger.com/atom/ns#","term":"computer vision"},{"scheme":"http://www.blogger.com/atom/ns#","term":"forensics"},{"scheme":"http://www.blogger.com/atom/ns#","term":"google street view"},{"scheme":"http://www.blogger.com/atom/ns#","term":"machine learning"},{"scheme":"http://www.blogger.com/atom/ns#","term":"mid-level patch discovery"},{"scheme":"http://www.blogger.com/atom/ns#","term":"mirror mirror"},{"scheme":"http://www.blogger.com/atom/ns#","term":"paris"},{"scheme":"http://www.blogger.com/atom/ns#","term":"selfie"},{"scheme":"http://www.blogger.com/atom/ns#","term":"svr"},{"scheme":"http://www.blogger.com/atom/ns#","term":"visual data"}],"title":{"type":"text","$t":"Making Visual Data a First-Class Citizen"},"content":{"type":"html","$t":"\u003Cdiv class=\"\" style=\"clear: both; text-align: left;\"\u003E“\u003Ci\u003EAbove all, don't lie to yourself. 
The man who lies to himself and listens to his own lie comes to a point that he cannot distinguish the truth within him, or around him, and so loses all respect for himself and for others. And having no respect he ceases to love.\u003C\/i\u003E” ― \u003Cb\u003EFyodor Dostoyevsky, The Brothers Karamazov\u003C\/b\u003E\u003C\/div\u003E\u003Cdiv class=\"\" style=\"clear: both; text-align: left;\"\u003E\u003Cb\u003E\u003Cbr \/\u003E\u003C\/b\u003E\u003C\/div\u003E\u003Cdiv class=\"\" style=\"clear: both; text-align: left;\"\u003E\u003Cb\u003E\u003Cbr \/\u003E\u003C\/b\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/3.bp.blogspot.com\/-EzeE9hNvg9Y\/VSYAZLOBfzI\/AAAAAAAAN9A\/vJjfXl61DK4\/s1600\/forensics.jpg\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"120\" src=\"https:\/\/3.bp.blogspot.com\/-EzeE9hNvg9Y\/VSYAZLOBfzI\/AAAAAAAAN9A\/vJjfXl61DK4\/s400\/forensics.jpg\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003ECity Forensics: Using Visual Elements to Predict Non-Visual City Attributes\u003C\/div\u003E\u003Cdiv class=\"\" style=\"clear: both; text-align: left;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"\" style=\"clear: both; text-align: left;\"\u003ETo respect the power and beauty of machine learning algorithms, especially when they are applied to the visual world, let's take a look at three recent applications of learning-based \"computer vision\" to\u0026nbsp;\u003Cb\u003Ecomputer graphics\u003C\/b\u003E. Researchers in computer graphics are known for producing truly captivating illustrations of their results, so this post is going to be very visual. 
Now is your chance to sit back and let the pictures do the talking.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Ch3 style=\"clear: both; text-align: left;\"\u003ECan \u003Ci\u003Eyou\u003C\/i\u003E predict things simply by looking at street-view images?\u003C\/h3\u003E\u003Cdiv class=\"\" style=\"clear: both; text-align: left;\"\u003ELet's say you're going to visit an old-friend in a foreign country for the first time. You've never visited this country before and have no idea what kind of city\/neighborhood your friend lives in. So you decide to get a sneak peak -- you enter your friend's address into Google Street View.\u003Cbr \/\u003E\u003Cbr \/\u003EMost people can look at Google Street View images in a given location and estimate attributes such as \"sketchy,\" \"rural,\" \"slum-like,\" \"noisy\" for the given neighborhood. TLDR; A person is a pretty good visual recommendation engine.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/1.bp.blogspot.com\/-GuuBpcMwTeM\/VTqHTS0Z8oI\/AAAAAAAAN_c\/yXKqe5qYR08\/s1600\/Google_Street_View_Colombia_screenshot.jpg\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"235\" src=\"https:\/\/1.bp.blogspot.com\/-GuuBpcMwTeM\/VTqHTS0Z8oI\/AAAAAAAAN_c\/yXKqe5qYR08\/s400\/Google_Street_View_Colombia_screenshot.jpg\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003ECan you predict if this looks like a safe location?\u0026nbsp;\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E(Screenshot of \u003Ca href=\"http:\/\/en.wikipedia.org\/wiki\/Google_Street_View_in_Colombia\"\u003EStreet view for Manizales, Colombia\u003C\/a\u003E on Google Earth)\u003C\/div\u003E\u003Cbr \/\u003ECan a computer program predict things by looking at images? 
If so, then these kinds of computer programs could be used to automatically generate semantic map layovers (see the crime prediction overlay from the first figure), help organize fast-growing cities (computer vision meets urban planning?), and ultimately bring about a new generation of match-making \"visual recommendation engines\" (a whole suite of new startups).\u003Cbr \/\u003E\u003Cbr \/\u003EBefore I discuss the research paper behind this idea, here are two cool things you could do (in theory) with a non-visual data prediction algorithm. There are plenty of great product ideas in this space -- just be creative.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cb\u003EStartup Idea #1: Avoiding sketchy areas when traveling abroad\u0026nbsp;\u003C\/b\u003E\u003Cbr \/\u003EA Personalized location recommendation engine could be used to find locations in a city that I might find interesting (techie coffee shop for entrepreneurs, a park good for frisbee) subject to my constraints (near my current location, in a low-danger area, low traffic). 
\u0026nbsp;Below is the kind of place you want to avoid if you're looking for a coffee and a place to open up your laptop to do some work.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/1.bp.blogspot.com\/-BFdhyOunp6M\/VTqJqgec_UI\/AAAAAAAAN_o\/momH6l2XEDU\/s1600\/SaoPauloPoorWorld.gif\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"190\" src=\"https:\/\/1.bp.blogspot.com\/-BFdhyOunp6M\/VTqJqgec_UI\/AAAAAAAAN_o\/momH6l2XEDU\/s400\/SaoPauloPoorWorld.gif\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003EGoogle Street Maps, Morumbi São Paulo: slum housing (image from \u003Ca href=\"http:\/\/geographyfieldwork.com\/\"\u003Egeographyfieldwork.com\u003C\/a\u003E)\u003C\/div\u003E\u003Cbr \/\u003E\u003Cb\u003EStartup Idea #2: Apartment Pricing and Marketing from Images\u003C\/b\u003E\u003Cbr \/\u003EVisual recommendation engines could be used to predict the best images to represent an apartment for an Airbnb listing. \u0026nbsp;It would be great if Airbnb had a filter that would let you upload videos of your apartment, and it would predict that set of static images that best depict your apartment to maximize earning potential. I'm sure that Airbnb users would pay extra for this feature if it was available for a small extra charge. 
The same computer vision prediction idea can be applied to home pricing on Zillow, Craigslist, and anywhere else that pictures of for-sale items are shared.\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/4.bp.blogspot.com\/-5zfMwkEd-n0\/VTqaK_chiJI\/AAAAAAAAOAE\/ZBHDIyRoaTE\/s1600\/gapp.jpg\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"230\" src=\"https:\/\/4.bp.blogspot.com\/-5zfMwkEd-n0\/VTqaK_chiJI\/AAAAAAAAOAE\/ZBHDIyRoaTE\/s400\/gapp.jpg\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003EGoogle image search result for \"Good looking apartment\". Can computer vision be used to automatically select pictures that will make your apartment listing successful on Airbnb?\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Ch3\u003EPart I.\u0026nbsp;City Forensics: Using Visual Elements to Predict Non-Visual City Attributes\u003C\/h3\u003E\u003Cbr \/\u003EThe Berkeley Computer Graphics Group has been working on\u0026nbsp;predicting non-visual attributes from images, so before I describe their approach, let me discuss how Berkeley's Visual Elements relate to Deep Learning.\u003Cbr \/\u003E\u003Cdiv class=\"\" style=\"clear: both;\"\u003E\u003Cbr class=\"Apple-interchange-newline\" \/\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/3.bp.blogspot.com\/-S1CDo80rewY\/VTn-HT-xDeI\/AAAAAAAAN-U\/qO-7tI3kBBU\/s1600\/attributes2.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"232\" 
src=\"https:\/\/3.bp.blogspot.com\/-S1CDo80rewY\/VTn-HT-xDeI\/AAAAAAAAN-U\/qO-7tI3kBBU\/s400\/attributes2.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003EPredicting Chicago Thefts from San Francisco data. Predicting Philadelphia Housing Prices from Boston data. From\u0026nbsp;City Forensics\u0026nbsp;paper.\u003C\/div\u003E\u003Cdiv class=\"\" style=\"clear: both;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cb\u003E\u003Cbr \/\u003E\u003C\/b\u003E\u003Cbr \/\u003E\u003Cb\u003EDeep Learning vs Mid-level Patch Discovery (Technical Discussion)\u003C\/b\u003E\u003Cbr \/\u003EYou might think that non-visual data prediction from images (if even possible) will require a deep understanding of the image and thus these approaches must be based on a recent ConvNet deep learning method. Obviously, knowing the locations and categories associated with each object in a scene could benefit \u003Ci\u003Eany\u003C\/i\u003E computer vision algorithm. \u0026nbsp;The problem is that such general purpose CNN recognition systems aren't powerful enough to parse Google Street View images, at least not yet.\u003Cbr \/\u003E\u003Cbr \/\u003EAnother extreme is to train classifiers on entire images. \u0026nbsp;This was initially done when researchers were using GIST, but there are just too many nuisance pixels inside a typical image, so it is better to focus your machine learning on a subset of the image. \u0026nbsp;But how do you choose the subset of the image to focus on?\u003Cbr \/\u003E\u003Cbr \/\u003EThere exist computer vision algorithms that can mine a large dataset of images and \u003Ci\u003Eautomatically\u003C\/i\u003E extract meaningful, repeatable, and detectable mid-level visual patterns. These methods are not label-based and work really well when there is an underlying theme tying together a collection of images. The set of all Google Street View Images from Paris satisfies this criterion. 
\u0026nbsp;Large collections of random images from the internet must be labeled before they can be used to produce the kind of stellar results we all expect out of deep learning. The Berkeley Group uses visual elements automatically mined from images as the core representation. \u0026nbsp;Mid-level visual patterns are simply chunks of the image which correspond to repeatable configurations -- they sometimes contain entire objects, parts of objects, and popular multiple object configurations. (See Figure below) \u0026nbsp;The mid-level visual patterns form a visual dictionary which can be used to represent the set of images. Different sets of images (e.g., images from two different US cities) will have different mid-level dictionaries. These dictionaries are similar to \"Visual Words\" but their creation uses more SVM-like machinery.\u003Cbr \/\u003E\u003Cbr \/\u003EThe patch mining algorithm is known as \u003Cb\u003Emid-level patch discovery.\u003C\/b\u003E You can think of mid-level patch discovery as a visually intelligent K-means clustering algorithm, but for really really large datasets. 
Here's a figure from the ECCV 2012 paper which introduced mid-level discriminative patches.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/2.bp.blogspot.com\/-eOT33fEkxBM\/VTqrwqVG_uI\/AAAAAAAAOAs\/mbvyNSde2UM\/s1600\/patches.jpg\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"197\" src=\"https:\/\/2.bp.blogspot.com\/-eOT33fEkxBM\/VTqrwqVG_uI\/AAAAAAAAOAs\/mbvyNSde2UM\/s400\/patches.jpg\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003EUnsupervised Discovery of Mid-Level Discriminative Patches\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Ca href=\"http:\/\/graphics.cs.cmu.edu\/projects\/discriminativePatches\/\"\u003EUnsupervised Discovery of Mid-Level Discriminative Patches.\u003C\/a\u003E\u0026nbsp;\u003Ca href=\"http:\/\/www.saurabhsingh.info\/\"\u003ESaurabh Singh\u003C\/a\u003E, Abhinav Gupta and Alexei A. Efros. In European Conference on Computer Vision (2012).\u003Cbr \/\u003E\u003Cbr \/\u003EI should also point out that non-final layers in a pre-trained CNN could also be used for representing images, without the need to use a descriptor such as HOG. I would expect the performance to improve, so the question is perhaps: \u003Cb\u003EHow long until somebody publishes an awesome unsupervised CNN-based patch discovery algorithm\u003C\/b\u003E? I'm sure a handful of researchers are already working on it. 
:-)\u003Cbr \/\u003E\u003Cbr \/\u003ERelated Blog Post:\u0026nbsp;\u003Ca href=\"http:\/\/www.computervisionblog.com\/2015\/01\/from-feature-descriptors-to-deep.html\"\u003EFrom feature descriptors to deep learning: 20 years of computer vision\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"\" style=\"clear: both; text-align: left;\"\u003ERelated Blog Post: \u003Ca href=\"http:\/\/www.computervisionblog.com\/2015\/03\/deep-learning-vs-machine-learning-vs.html\"\u003EDeep Learning vs Machine Learning vs Pattern Recognition\u003C\/a\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"\" style=\"clear: both; text-align: left;\"\u003EThe\u0026nbsp;\u003Ca href=\"http:\/\/vis.berkeley.edu\/papers\/cityforensics\/\"\u003ECity Forensics\u003C\/a\u003E\u0026nbsp;paper from Berkeley tries to map the visual appearance of cities (as obtained from Google Street View Images) to non-visual data like crime statistics, housing prices and population density. \u0026nbsp;The basic idea is to 1.) mine discriminative patches from images and 2.) train a predictor which can map these visual primitives to non-visual data. While the underlying technique is that of mid-level patch discovery combined with \u003Ca href=\"http:\/\/en.wikipedia.org\/wiki\/Support_vector_machine#Regression\"\u003ESupport Vector Regression\u003C\/a\u003E\u0026nbsp;(SVR), the result is an attribute-specific distribution over GPS coordinates. \u0026nbsp;Such a distribution should be appreciated for its own aesthetic value. I personally love custom data overlays.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ca href=\"http:\/\/vis.berkeley.edu\/papers\/cityforensics\/\"\u003ECity Forensics: Using Visual Elements to Predict Non-Visual City Attributes\u003C\/a\u003E.\u0026nbsp;\u003Ca href=\"http:\/\/www.eecs.berkeley.edu\/~sarietta\/\"\u003ESean Arietta\u003C\/a\u003E, Alexei A. Efros, Ravi Ramamoorthi, Maneesh Agrawala. 
In IEEE Transactions on Visualization and Computer Graphics (TVCG), 2014.\u003Cbr \/\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cbr \/\u003E\u003Ch3\u003E\u003Cb\u003EPart II. The Selfie 2.0: Computer Vision as a Sidekick\u003C\/b\u003E\u003C\/h3\u003E\u003Cbr \/\u003ESometimes you just want the algorithm to be your sidekick. Let's talk about a new and improved method for using vision algorithms and the wisdom of the crowds to select better pictures of your face. While you might think of an improved selfie as a silly application, you do want to look \"professional\" in your professional photos, sexy in your \"selfies\" and \"friendly\" in your family pictures. An algorithm that helps you get the desired picture is an algorithm the whole world can get behind.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/4.bp.blogspot.com\/-miEURD81g70\/VToO80OsCvI\/AAAAAAAAN-4\/T6-s0-HH3A8\/s1600\/selfie.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"67\" src=\"https:\/\/4.bp.blogspot.com\/-miEURD81g70\/VToO80OsCvI\/AAAAAAAAN-4\/T6-s0-HH3A8\/s400\/selfie.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003EAttractiveness versus Time. From MirrorMirror Paper.\u003C\/div\u003E\u003Cbr \/\u003EThe basic idea is to collect a large video of a single person which spans different emotions, times of day, different days, or whatever condition you would like to vary. \u0026nbsp;Given this video, you can use crowdsourcing to label frames based on a property like attractiveness or seriousness. \u0026nbsp;Given these labeled frames, you can then train a standard HOG detector and predict one of these attributes on new data. 
Below is a figure which shows the 10 best shots of the child (lots of smiling and eye contact) and the worst 10 shots (bad lighting, blur, red-eye, no eye contact).\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/1.bp.blogspot.com\/-eCWy2EZgJNo\/VToPQxkIHhI\/AAAAAAAAN_A\/JufIw5Z-xd8\/s1600\/good_kids.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"81\" src=\"https:\/\/1.bp.blogspot.com\/-eCWy2EZgJNo\/VToPQxkIHhI\/AAAAAAAAN_A\/JufIw5Z-xd8\/s400\/good_kids.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E10 good shots, 10 worst shots. From MirrorMirror Paper.\u003C\/div\u003E\u003Cbr \/\u003EYou can also collect a video of yourself as you go through a sequence of different emotions, get people to label frames, and build a system which can predict an attribute such as \"seriousness\".\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/4.bp.blogspot.com\/-FWltYsB7aOI\/VToQRmlv2TI\/AAAAAAAAN_I\/-I6_S1b_NkI\/s1600\/faceranks.jpg\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"127\" src=\"https:\/\/4.bp.blogspot.com\/-FWltYsB7aOI\/VToQRmlv2TI\/AAAAAAAAN_I\/-I6_S1b_NkI\/s400\/faceranks.jpg\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003EFaces ranked from Most serious to least serious. From MirrorMirror Paper.\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cbr \/\u003EIn this work, labeling was necessary for taking better selfies. 
\u0026nbsp;But if half of the world is taking pictures, while the other half is voting pictures up and down (or Tinder-style swiping left and right), then I think the data collection and data labeling effort won't be a big issue in years to come. Nevertheless, this is a cool way of scoring your photos. Regarding consumer applications, this is something that Google, Snapchat, and Facebook will probably integrate into their products very soon.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ca href=\"http:\/\/www.eecs.berkeley.edu\/~junyanz\/projects\/mirrormirror\/\"\u003EMirror Mirror: Crowdsourcing Better Portraits.\u003C\/a\u003E\u0026nbsp;\u003Ca href=\"http:\/\/www.eecs.berkeley.edu\/~junyanz\/\"\u003EJun-Yan Zhu\u003C\/a\u003E, Aseem Agarwala, Alexei A. Efros, Eli Shechtman and Jue Wang.\u0026nbsp;In ACM Transactions on Graphics (SIGGRAPH Asia), 2014.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ch3\u003E\u003Cb\u003EPart III. What does it all mean? I'm ready for the cat pictures.\u003C\/b\u003E\u003C\/h3\u003E\u003Cbr \/\u003EThis final section revisits an old, simple, and powerful trick in computer vision and graphics. If you know how to compute the average of a sequence of numbers, then you'll have no problem understanding what an average image (or \"mean image\") is all about. And if you've read this far, don't worry, the cat picture is coming soon.\u003Cbr \/\u003E\u003Cbr \/\u003EComputing average images (or \"mean\" images) is one of those tricks that I was introduced to very soon after I started working at CMU. \u0026nbsp;\u003Ca href=\"http:\/\/web.mit.edu\/torralba\/www\/\"\u003EAntonio Torralba\u003C\/a\u003E, who has always had \"a few more visualization tricks\" up his sleeve, started computing average images (in the early 2000s) to analyze scenes as well as datasets collected as part of the LabelMe project at MIT. 
There's really nothing more to the basic idea beyond simply averaging a bunch of pictures.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/4.bp.blogspot.com\/-1c4Y-aRLLqI\/VSYAiMlrxyI\/AAAAAAAAN9I\/UPkrTwJC-P4\/s1600\/averages.jpg\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"248\" src=\"https:\/\/4.bp.blogspot.com\/-1c4Y-aRLLqI\/VSYAiMlrxyI\/AAAAAAAAN9I\/UPkrTwJC-P4\/s400\/averages.jpg\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003ETeaser Image from AverageExplorer paper.\u003C\/div\u003E\u003Cbr \/\u003EUsually this kind of averaging is done informally in research, to make some throwaway graphic, or make cool web-ready renderings. \u0026nbsp;It's great seeing an entire paper dedicated to a system which explores the concept of averaging even further. It took about 15 years of use until somebody was bold enough to write a paper about it. When you perform a little bit of alignment, the mean pictures look really awesome. 
Check out these cats!\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/4.bp.blogspot.com\/-5E5brGuwyiE\/VTqd34MoCsI\/AAAAAAAAOAc\/Gs6T4wGSpn0\/s1600\/cat_aligned.jpg\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"203\" src=\"https:\/\/4.bp.blogspot.com\/-5E5brGuwyiE\/VTqd34MoCsI\/AAAAAAAAOAc\/Gs6T4wGSpn0\/s400\/cat_aligned.jpg\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003EAligned cat images from the AverageExplorer paper.\u0026nbsp;\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ci\u003EI want one! (Both the algorithm and a Platonic cat)\u003C\/i\u003E\u003C\/div\u003E\u003Cbr \/\u003EThe AverageExplorer paper extends simple image average with some new tricks which make the operations much more effective. 
I won't say much about the paper (the link is below), just take a peek at some of the coolest mean cats I've ever seen (visualized above) or a jaw-dropping way to look at community collected landmark photos (Oxford bridge mean image visualized below).\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/2.bp.blogspot.com\/-52g-MXy8kjg\/VTqca_2sm6I\/AAAAAAAAOAQ\/-8wZyEh4uK8\/s1600\/bridges.jpg\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"325\" src=\"https:\/\/2.bp.blogspot.com\/-52g-MXy8kjg\/VTqca_2sm6I\/AAAAAAAAOAQ\/-8wZyEh4uK8\/s400\/bridges.jpg\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003EAligned bridges from AverageExplorer paper.\u0026nbsp;\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ci\u003EI wish Google would make all of Street View look like this.\u003C\/i\u003E\u003C\/div\u003E\u003Ch3\u003E\u003C\/h3\u003E\u003Ch3\u003E\u003C\/h3\u003E\u003Cdiv\u003E\u003Ca href=\"http:\/\/www.eecs.berkeley.edu\/~junyanz\/projects\/averageExplorer\/\"\u003EAverageExplorer: Interactive Exploration and Alignment of Visual Data Collections\u003C\/a\u003E. \u003Ca href=\"http:\/\/www.eecs.berkeley.edu\/~junyanz\/\"\u003EJun-Yan Zhu\u003C\/a\u003E, Yong Jae Lee, and Alexei A. Efros. In SIGGRAPH 2014.\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv\u003EAveraging images is a really powerful idea. \u0026nbsp;Want to know what your magical classifier is tuned to detect? \u0026nbsp;Compute the top detections and average them. 
\u0026nbsp;Soon enough you'll have a good idea of what's going on behind the scenes.\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Ch3\u003E\u003Cb\u003EConclusion\u003C\/b\u003E\u003C\/h3\u003E\u003Cbr \/\u003EAllow me to mention the mastermind that helped bring most of these vision+graphics+learning applications to life. \u0026nbsp;There's an inimitable charm present in all of the works of Prof. Alyosha\u0026nbsp;Efros\u0026nbsp;-- a certain aesthetic that is missing from 2015's overly empirical zeitgeist. \u0026nbsp;He used to be at CMU, but recently moved back to Berkeley.\u003Cbr \/\u003E\u003Cbr \/\u003EBeing able to summarize several years' worth of research into a single computer generated graphic can go a long way to making your work memorable and inspirational. And maybe our lives don't need that much automation. \u0026nbsp;Maybe general purpose object recognition is too much? Maybe all we need is a little art? I want to leave you with a YouTube video from a recent 2015 lecture by\u0026nbsp;\u003Ca href=\"http:\/\/www.eecs.berkeley.edu\/~efros\/\"\u003EProfessor A.A. Efros\u003C\/a\u003E titled \"Making Visual Data a First-Class Citizen.\" If you want to hear the story in the master's own words, grab a drink and enjoy the lecture.\u003Cbr \/\u003E\u003Cbr \/\u003E\"\u003Ci\u003EVisual data is the biggest Big Data there is (Cisco projects that it will soon account for over 90% of internet traffic), but currently, the main way we can access it is via associated keywords. I will talk about some efforts towards indexing, retrieving, and mining visual data directly, without the use of keywords.\u003C\/i\u003E\" ―\u0026nbsp;\u003Cb\u003EA.A. 
Efros,\u0026nbsp;Making Visual Data a First-Class Citizen\u003C\/b\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ccenter\u003E\u003Ciframe allowfullscreen=\"\" frameborder=\"0\" height=\"315\" src=\"https:\/\/www.youtube.com\/embed\/Dhp1trwtI2k\" width=\"400\"\u003E\u003C\/iframe\u003E\u003C\/center\u003E"},"link":[{"rel":"replies","type":"application/atom+xml","href":"https:\/\/www.computervisionblog.com\/feeds\/6627003067235908523\/comments\/default","title":"Post Comments"},{"rel":"replies","type":"text/html","href":"https:\/\/www.computervisionblog.com\/2015\/04\/making-visual-data-first-class-citizen.html#comment-form","title":"3 Comments"},{"rel":"edit","type":"application/atom+xml","href":"https:\/\/www.blogger.com\/feeds\/15418143\/posts\/default\/6627003067235908523"},{"rel":"self","type":"application/atom+xml","href":"https:\/\/www.blogger.com\/feeds\/15418143\/posts\/default\/6627003067235908523"},{"rel":"alternate","type":"text/html","href":"https:\/\/www.computervisionblog.com\/2015\/04\/making-visual-data-first-class-citizen.html","title":"Making Visual Data a First-Class Citizen"}],"author":[{"name":{"$t":"Tomasz Malisiewicz"},"uri":{"$t":"http:\/\/www.blogger.com\/profile\/17507234774392358321"},"email":{"$t":"noreply@blogger.com"},"gd$image":{"rel":"http://schemas.google.com/g/2005#thumbnail","width":"16","height":"16","src":"https:\/\/img1.blogblog.com\/img\/b16-rounded.gif"}}],"media$thumbnail":{"xmlns$media":"http://search.yahoo.com/mrss/","url":"https:\/\/3.bp.blogspot.com\/-EzeE9hNvg9Y\/VSYAZLOBfzI\/AAAAAAAAN9A\/vJjfXl61DK4\/s72-c\/forensics.jpg","height":"72","width":"72"},"thr$total":{"$t":"3"}},{"id":{"$t":"tag:blogger.com,1999:blog-15418143.post-6794382104242630346"},"published":{"$t":"2015-04-08T14:05:00.001-05:00"},"updated":{"$t":"2018-04-17T03:42:50.037-05:00"},"category":[{"scheme":"http://www.blogger.com/atom/ns#","term":"AI"},{"scheme":"http://www.blogger.com/atom/ns#","term":"carlos 
guestrin"},{"scheme":"http://www.blogger.com/atom/ns#","term":"clusters"},{"scheme":"http://www.blogger.com/atom/ns#","term":"CMU"},{"scheme":"http://www.blogger.com/atom/ns#","term":"data science"},{"scheme":"http://www.blogger.com/atom/ns#","term":"dato"},{"scheme":"http://www.blogger.com/atom/ns#","term":"deep learning"},{"scheme":"http://www.blogger.com/atom/ns#","term":"distributed systems"},{"scheme":"http://www.blogger.com/atom/ns#","term":"graphlab"},{"scheme":"http://www.blogger.com/atom/ns#","term":"hadoop"},{"scheme":"http://www.blogger.com/atom/ns#","term":"lecun"},{"scheme":"http://www.blogger.com/atom/ns#","term":"logic"},{"scheme":"http://www.blogger.com/atom/ns#","term":"machine learning"},{"scheme":"http://www.blogger.com/atom/ns#","term":"probabilistic graphical models"},{"scheme":"http://www.blogger.com/atom/ns#","term":"theano"}],"title":{"type":"text","$t":"Deep Learning vs Probabilistic Graphical Models vs Logic"},"content":{"type":"html","$t":"Today, let's take a look at three paradigms\u003Cb\u003E\u0026nbsp;\u003C\/b\u003Ethat have shaped the field of Artificial Intelligence in the last 50 years: \u003Cb\u003ELogic\u003C\/b\u003E, \u003Cb\u003EProbabilistic Methods\u003C\/b\u003E, and \u003Cb\u003EDeep Learning\u003C\/b\u003E. The empirical, \"data-driven\", or big-data \/ deep-learning ideology triumphs today, but that wasn't always the case. Some of the earliest approaches to AI were based on Logic, and the transition from logic to data-driven methods has been heavily influenced by probabilistic thinking, something we will be investigating in this blog post.\u003Cbr \/\u003E\u003Cbr \/\u003ELet's take a look back Logic and Probabilistic Graphical Models and make some predictions on where the field of AI and Machine Learning is likely to go in the near future. 
We will proceed in chronological order.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/3.bp.blogspot.com\/-b_Ux2LXhPyk\/VSVdqYcp6-I\/AAAAAAAAN8M\/eBJ2ln-6nDU\/s1600\/probgraphmods.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"225\" src=\"https:\/\/3.bp.blogspot.com\/-b_Ux2LXhPyk\/VSVdqYcp6-I\/AAAAAAAAN8M\/eBJ2ln-6nDU\/s1600\/probgraphmods.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003EImage from Coursera's \u003Ca href=\"http:\/\/online.stanford.edu\/pgm-fa12\"\u003EProbabilistic Graphical Models\u003C\/a\u003E course\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Ch3\u003E\u003Cb\u003E1. Logic and Algorithms (Common-sense \"Thinking\" Machines)\u003C\/b\u003E\u003C\/h3\u003E\u003Cbr \/\u003EA lot of early work on Artificial Intelligence was concerned with Logic, Automated Theorem Proving, and manipulating symbols. It should not be a surprise that John McCarthy's seminal 1959 paper on AI had the title \"Programs with common sense.\"\u003Cbr \/\u003E\u003Cbr \/\u003EIf we peek inside one of most popular AI textbooks, namely \"Artificial Intelligence: A Modern Approach,\" we immediately notice that the beginning of the book is devoted to \u003Cb\u003Esearch, constraint satisfaction problems, first-order logic, and planning\u003C\/b\u003E. 
The third edition's cover (pictured below) looks like a big chess board (because being good at chess used to be a sign of human intelligence), features a picture of Alan Turing (the father of computing theory) as well as a picture of Aristotle (one of the greatest classical philosophers which had quite a lot to say about intelligence).\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/1.bp.blogspot.com\/-BCOXrp3r-SI\/VSVYsgEm-eI\/AAAAAAAAN74\/J2Lx0ssKvsA\/s1600\/cover2.jpg\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"320\" src=\"https:\/\/1.bp.blogspot.com\/-BCOXrp3r-SI\/VSVYsgEm-eI\/AAAAAAAAN74\/J2Lx0ssKvsA\/s1600\/cover2.jpg\" width=\"246\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003EThe cover of\u0026nbsp;\u003Ca href=\"http:\/\/aima.cs.berkeley.edu\/\"\u003EAIMA\u003C\/a\u003E, the canonical AI text for undergraduate CS students\u003C\/div\u003E\u003Cbr \/\u003EUnfortunately, logic-based AI brushes the perception problem under the rug, and I've argued quite some time ago that understanding how perception works is really the key to unlocking the secrets of intelligence. Perception is one of those things which is easy for humans and immensely difficult for machines. (To read more see my 2011 blog post, \u003Ca href=\"http:\/\/www.computervisionblog.com\/2011\/03\/computer-vision-is-artificial.html\"\u003EComputer Vision is Artificial Intelligence\u003C\/a\u003E). 
Logic is pure and traditional chess-playing bots are very algorithmic and search-y, but the real world is ugly, dirty, and ridden with uncertainty.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cb\u003EI think most contemporary AI researchers agree that Logic-based AI is dead.\u003C\/b\u003E\u0026nbsp;The kind of world where everything can be perfectly observed, a world with no measurement error, is not the world of robotics and big-data. \u0026nbsp;We live in the era of machine learning, and numerical techniques triumph over first-order logic. \u0026nbsp;As of 2015, I pity the fool who prefers Modus Ponens over Gradient Descent.\u003Cbr \/\u003E\u003Cbr \/\u003ELogic is great for the classroom and I suspect that once enough perception problems become \"essentially solved\" that we will see a resurgence in Logic. \u0026nbsp;And while there will be plenty of open perception problems in the future, there will be scenarios where the community can stop worrying about perception and start revisiting these classical ideas. Perhaps in 2020.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cb\u003EFurther reading:\u003C\/b\u003E\u0026nbsp;\u003Ca href=\"http:\/\/plato.stanford.edu\/entries\/logic-ai\/\"\u003ELogic and Artificial Intelligence from the Stanford Encyclopedia of Philosophy\u003C\/a\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ch3\u003E\u003Cb\u003E2. Probability, Statistics, and Graphical Models (\"Measuring\" Machines)\u003C\/b\u003E\u003C\/h3\u003E\u003Cbr \/\u003EProbabilistic methods in Artificial Intelligence came out of the need to deal with uncertainty. The middle part of the Artificial Intelligence a Modern Approach textbook is called \"Uncertain Knowledge and Reasoning\" and is a great introduction to these methods. \u0026nbsp;If you're picking up AIMA for the first time, I recommend you start with this section. 
And if you're a student starting out with AI, do yourself a favor and don't skimp on the math.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/3.bp.blogspot.com\/-TKWy6zvuPo4\/VSVcc67N3YI\/AAAAAAAAN8E\/eSdTsH46L1A\/s1600\/HamburgerDensity4.gif\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"226\" src=\"https:\/\/3.bp.blogspot.com\/-TKWy6zvuPo4\/VSVcc67N3YI\/AAAAAAAAN8E\/eSdTsH46L1A\/s1600\/HamburgerDensity4.gif\" width=\"320\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"https:\/\/onlinecourses.science.psu.edu\/stat414\/node\/97\"\u003EIntro to PDFs\u003C\/a\u003E from Penn State's\u0026nbsp;Probability Theory and Mathematical Statistics course\u003C\/div\u003E\u003Cbr \/\u003EWhen most people think about probabilistic methods they think of counting. \u0026nbsp;In laymen's terms, it's fair to think of probabilistic methods as fancy counting methods. \u0026nbsp;Let's briefly take a look at what used to be the two competing methods for thinking probabilistically.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cb\u003EFrequentist methods\u003C\/b\u003E are very empirical -- these methods are data-driven and make inferences purely from data. \u0026nbsp;\u003Cb\u003EBayesian methods\u003C\/b\u003E are more sophisticated and combine data-driven likelihoods with magical priors. \u0026nbsp;These priors often come from first principles or \"intuitions\" and the Bayesian approach is great for combining heuristics with data to make cleverer algorithms -- a nice mix of the rationalist and empiricist world views.\u003Cbr \/\u003E\u003Cbr \/\u003EWhat is perhaps more exciting than the Frequentist vs. Bayesian flamewar is something known as Probabilistic Graphical Models. 
\u0026nbsp;This class of techniques comes from computer science, and even though Machine Learning is now a strong component of a CS and a Statistics degree, the true power of statistics only comes when it is married with computation.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cb\u003EProbabilistic Graphical Models are a marriage of Graph Theory with Probabilistic Methods\u003C\/b\u003E and they were all the rage among Machine Learning researchers in the mid-2000s. Variational methods, Gibbs Sampling, and Belief Propagation were being pounded into the brains of CMU graduate students when I was in graduate school (2005-2011) and provided us with a superb mental framework for thinking about machine learning problems. I learned most of what I know about Graphical Models from \u003Ca href=\"http:\/\/homes.cs.washington.edu\/~guestrin\/\"\u003ECarlos Guestrin\u003C\/a\u003E and \u003Ca href=\"http:\/\/jonathan-huang.org\/\"\u003EJonathan Huang\u003C\/a\u003E.\u0026nbsp;Carlos Guestrin is now the CEO of GraphLab, Inc (now known as \u003Ca href=\"https:\/\/dato.com\/\"\u003EDato\u003C\/a\u003E) which is a company that builds large-scale products for machine learning on graphs and Jonathan Huang is a senior research scientist at Google.\u003Cbr \/\u003E\u003Cbr \/\u003EThe video below is a high-level overview of GraphLab, but it serves a very nice overview of \"graphical thinking\" and how it fits into the modern data scientist's tool-belt. 
Carlos is an excellent lecturer and his presentation is less about the company's product and more about ways of thinking about next-generation machine learning systems.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ccenter\u003E \u003Ciframe allow=\"autoplay; encrypted-media\" allowfullscreen=\"\" frameborder=\"0\" height=\"252\" src=\"https:\/\/www.youtube.com\/embed\/VliM9-tB2VE\" width=\"448\"\u003E\u003C\/iframe\u003E \u003C\/center\u003E\u003Ccenter\u003E A Computational Introduction to Probabilistic Graphical Models\u003C\/center\u003E\u003Ccenter\u003E by GraphLab, Inc CEO Prof. Carlos Guestrin (Video Link updated 4\/17\/2018)\u003C\/center\u003E\u003Cbr \/\u003EIf you think that deep learning is going to solve all of your machine learning problems, you should really take a look at the above video. \u0026nbsp;If you're building recommender systems, an analytics platform for healthcare data, designing a new trading algorithm, or building the next generation search engine, Graphical Models are the perfect place to start.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cb\u003EFurther reading:\u003C\/b\u003E\u003Cbr \/\u003E\u003Ca href=\"http:\/\/en.wikipedia.org\/wiki\/Belief_propagation\"\u003EBelief Propagation Algorithm Wikipedia Page\u003C\/a\u003E\u003Cbr \/\u003E\u003Ca href=\"http:\/\/www.cs.berkeley.edu\/~jordan\/papers\/variational-intro.pdf\"\u003EAn Introduction to Variational Methods for Graphical Models\u003C\/a\u003E by Michael Jordan et al.\u003Cbr \/\u003E\u003Ca href=\"http:\/\/www.cs.berkeley.edu\/~jordan\/\"\u003EMichael Jordan's webpage\u003C\/a\u003E (one of the titans of inference and graphical models)\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ch3\u003E\u003Cb\u003E3. Deep Learning and Machine Learning (Data-Driven Machines)\u003C\/b\u003E\u003C\/h3\u003EMachine Learning is about learning from examples and today's state-of-the-art recognition techniques require a lot of training data, a deep neural network, and patience. 
\u003Cb\u003EDeep Learning emphasizes the network architecture of today's most successful machine learning approaches.\u003C\/b\u003E \u0026nbsp;These methods are based on \"deep\" multi-layer neural networks with many hidden layers. NOTE: I'd like to emphasize that using deep architectures (as of 2015) is not new. \u0026nbsp;Just check out the following \"deep\" architecture from 1998.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/4.bp.blogspot.com\/-eAFL2rN9cm0\/VRjBiJiecrI\/AAAAAAAAN4E\/2Q7LthaoLEY\/s1600\/lenet.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"155\" src=\"https:\/\/4.bp.blogspot.com\/-eAFL2rN9cm0\/VRjBiJiecrI\/AAAAAAAAN4E\/2Q7LthaoLEY\/s400\/lenet.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003ELeNet-5 Figure From Yann LeCun's seminal \"\u003Cspan style=\"background-color: white;\"\u003EGradient-based learning\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cspan style=\"background-color: white;\"\u003Eapplied to document recognition\"\u003C\/span\u003E\u0026nbsp;paper.\u003C\/div\u003E\u003Cbr \/\u003EWhen you take a look at \u003Ca href=\"http:\/\/deeplearning.net\/tutorial\/lenet.html\"\u003Emodern guide about LeNet\u003C\/a\u003E, it comes with the following disclaimer:\u003Cbr \/\u003E\u003Cbr \/\u003E\"To run this example on a GPU, you need a good GPU. It needs at least 1GB of GPU RAM. More may be required if your monitor is connected to the GPU.\u003Cbr \/\u003E\u003Cbr \/\u003EWhen the GPU is connected to the monitor, there is a limit of a few seconds for each GPU function call. This is needed as current GPUs can’t be used for the monitor while performing computations. 
Without this limit, the screen would freeze for too long and make it look as if the computer froze. This example hits this limit with medium-quality GPUs. When the GPU isn’t connected to a monitor, there is no time limit. You can lower the batch size to fix the timeout problem.\"\u003Cbr \/\u003E\u003Cbr \/\u003EIt really \u003Cb\u003Emakes me wonder how Yann was able to get \u003Ci\u003Eanything \u003C\/i\u003Eout of his deep model back in 1998\u003C\/b\u003E. Perhaps it's not surprising that it took another decade for the rest of us to get the memo.\u003Cbr \/\u003E\u003Cbr \/\u003EUPDATE: Yann pointed out (via a Facebook comment) that the ConvNet work dates back to 1989. \"It had about 400K connections and took about 3 weeks to train on the USPS dataset (8000 training examples) on a SUN4 machine.\" -- LeCun\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/4.bp.blogspot.com\/-O60saG4CJs4\/VSX5HjoXr4I\/AAAAAAAAN8o\/smmT02ecM_k\/s1600\/1989net.jpg\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"367\" src=\"https:\/\/4.bp.blogspot.com\/-O60saG4CJs4\/VSX5HjoXr4I\/AAAAAAAAN8o\/smmT02ecM_k\/s400\/1989net.jpg\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/yann.lecun.com\/exdb\/publis\/pdf\/lecun-89e.pdf\"\u003EA Deep Network from Yann's work at Bell Labs from 1989\u003C\/a\u003E\u003C\/div\u003E\u003Cbr \/\u003E\u003Cbr \/\u003ENOTE: At roughly the same time (~1998) two crazy guys in California were trying to cache the entire internet inside the computers in their garage (they started some funny-sounding company which starts with a G). I don't know how they did it, but I guess sometimes to win big you have to \u003Ca href=\"http:\/\/paulgraham.com\/ds.html\"\u003Edo things that don't scale\u003C\/a\u003E. 
Eventually, the world will catch up.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cb\u003EFurther reading:\u003C\/b\u003E\u003Cbr \/\u003E\u003Cspan style=\"background-color: white;\"\u003EY.\u0026nbsp;LeCun, L.\u0026nbsp;Bottou, Y.\u0026nbsp;Bengio, and P.\u0026nbsp;Haffner. \u003Ca href=\"http:\/\/yann.lecun.com\/exdb\/publis\/pdf\/lecun-98.pdf\"\u003EGradient-based learning applied to document recognition\u003C\/a\u003E.\u0026nbsp;\u003C\/span\u003E\u003Ccite style=\"background-color: white;\"\u003EProceedings of the IEEE\u003C\/cite\u003E\u003Cspan style=\"background-color: white;\"\u003E, November 1998.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan style=\"background-color: white;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E \u003Cspan style=\"background-color: white;\"\u003EY. LeCun, B. Boser, J. S. Denker, D. Henderson, R. E. Howard, W. Hubbard and L. D. Jackel: \u003Ca href=\"http:\/\/yann.lecun.com\/exdb\/publis\/pdf\/lecun-89e.pdf\"\u003EBackpropagation Applied to Handwritten Zip Code Recognition\u003C\/a\u003E, Neural Computation, 1(4):541-551, Winter 1989\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan style=\"background-color: white;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003Cspan style=\"background-color: white;\"\u003E\u003Cb\u003EDeep Learning code:\u003C\/b\u003E Modern \u003Ca href=\"http:\/\/deeplearning.net\/tutorial\/lenet.html\"\u003ELeNet implementation in Theano \u003C\/a\u003Eand docs.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan style=\"background-color: white;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003Cbr \/\u003E\u003Ch3\u003E\u003Cspan style=\"background-color: white;\"\u003E\u003Cb\u003EConclusion\u003C\/b\u003E\u003C\/span\u003E\u003C\/h3\u003E\u003Cspan style=\"background-color: white;\"\u003EI don't see traditional first-order logic making a comeback anytime soon. 
And while there is a lot of hype behind deep learning, distributed systems and \"graphical thinking\" is likely to make a much more profound impact on data science than heavily optimized CNNs. There is no reason why deep learning can't be combined with a GraphLab-style architecture, and some of the new exciting machine learning work in the next decade is likely to be a marriage of these two philosophies.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003EYou can also check out a relevant post from last month:\u003Cbr \/\u003E\u003Ca href=\"http:\/\/www.computervisionblog.com\/2015\/03\/deep-learning-vs-machine-learning-vs.html\"\u003EDeep Learning vs Machine Learning vs Pattern Recognition\u003C\/a\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ca class=\"hn-link\" href=\"javascript:window.location=%22http:\/\/news.ycombinator.com\/submitlink?u=%22+encodeURIComponent(document.location)+%22\u0026amp;t=%22+encodeURIComponent(document.title)\"\u003EDiscuss on Hacker News\u003C\/a\u003E"},"link":[{"rel":"replies","type":"application/atom+xml","href":"https:\/\/www.computervisionblog.com\/feeds\/6794382104242630346\/comments\/default","title":"Post Comments"},{"rel":"replies","type":"text/html","href":"https:\/\/www.computervisionblog.com\/2015\/04\/deep-learning-vs-probabilistic.html#comment-form","title":"9 Comments"},{"rel":"edit","type":"application/atom+xml","href":"https:\/\/www.blogger.com\/feeds\/15418143\/posts\/default\/6794382104242630346"},{"rel":"self","type":"application/atom+xml","href":"https:\/\/www.blogger.com\/feeds\/15418143\/posts\/default\/6794382104242630346"},{"rel":"alternate","type":"text/html","href":"https:\/\/www.computervisionblog.com\/2015\/04\/deep-learning-vs-probabilistic.html","title":"Deep Learning vs Probabilistic Graphical Models vs Logic"}],"author":[{"name":{"$t":"Tomasz 
Malisiewicz"},"uri":{"$t":"http:\/\/www.blogger.com\/profile\/17507234774392358321"},"email":{"$t":"noreply@blogger.com"},"gd$image":{"rel":"http://schemas.google.com/g/2005#thumbnail","width":"16","height":"16","src":"https:\/\/img1.blogblog.com\/img\/b16-rounded.gif"}}],"media$thumbnail":{"xmlns$media":"http://search.yahoo.com/mrss/","url":"https:\/\/3.bp.blogspot.com\/-b_Ux2LXhPyk\/VSVdqYcp6-I\/AAAAAAAAN8M\/eBJ2ln-6nDU\/s72-c\/probgraphmods.png","height":"72","width":"72"},"thr$total":{"$t":"9"}},{"id":{"$t":"tag:blogger.com,1999:blog-15418143.post-8255085887863637133"},"published":{"$t":"2015-04-04T09:08:00.000-05:00"},"updated":{"$t":"2016-06-14T04:57:35.478-05:00"},"category":[{"scheme":"http://www.blogger.com/atom/ns#","term":"andrew ng"},{"scheme":"http://www.blogger.com/atom/ns#","term":"classifier"},{"scheme":"http://www.blogger.com/atom/ns#","term":"computer vision"},{"scheme":"http://www.blogger.com/atom/ns#","term":"coursera"},{"scheme":"http://www.blogger.com/atom/ns#","term":"deep learning"},{"scheme":"http://www.blogger.com/atom/ns#","term":"intuition"},{"scheme":"http://www.blogger.com/atom/ns#","term":"linear classifier"},{"scheme":"http://www.blogger.com/atom/ns#","term":"linear SVM"},{"scheme":"http://www.blogger.com/atom/ns#","term":"mathematics"},{"scheme":"http://www.blogger.com/atom/ns#","term":"mooc"},{"scheme":"http://www.blogger.com/atom/ns#","term":"peter norvig"},{"scheme":"http://www.blogger.com/atom/ns#","term":"phd"},{"scheme":"http://www.blogger.com/atom/ns#","term":"pyimagesearch"},{"scheme":"http://www.blogger.com/atom/ns#","term":"raphael"},{"scheme":"http://www.blogger.com/atom/ns#","term":"springs"},{"scheme":"http://www.blogger.com/atom/ns#","term":"training"},{"scheme":"http://www.blogger.com/atom/ns#","term":"vision.ai"},{"scheme":"http://www.blogger.com/atom/ns#","term":"VMX"}],"title":{"type":"text","$t":"Three Fundamental Dimensions for Thinking About Machine Learning Systems"},"content":{"type":"html","$t":"Today, let's 
set cutting-edge machine learning and computer vision techniques aside. You probably already know that computer vision (or \"machine vision\") is the branch of computer science \/ artificial intelligence concerned with recognizing objects like cars, faces, and hand gestures in images. And you also probably know that Machine Learning algorithms are used to drive state-of-the-art computer vision systems. But what's missing is a birds-eye view of \u003Ci\u003Ehow to think\u003C\/i\u003E about designing new learning-based systems. So instead of focusing on today's trendiest machine learning techniques, let's go all the way back to day 1 and build ourselves \u003Cb\u003Ea strong foundation for thinking about machine learning and computer vision systems\u003C\/b\u003E.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/4.bp.blogspot.com\/-sYorlxwElHA\/VSAlhF21s9I\/AAAAAAAAN7Q\/HH9esGPAjGU\/s1600\/springs-01.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em; text-align: center;\"\u003E\u003Cimg border=\"0\" height=\"173\" src=\"https:\/\/4.bp.blogspot.com\/-sYorlxwElHA\/VSAlhF21s9I\/AAAAAAAAN7Q\/HH9esGPAjGU\/s400\/springs-01.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: left;\"\u003E\u003C\/div\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cbr \/\u003EAllow me to introduce\u0026nbsp;\u003Cb\u003Ethree fundamental dimensions\u003C\/b\u003E\u0026nbsp;which you can follow to obtain computer vision masterdom. The first dimension is \u003Cb\u003Emathematical\u003C\/b\u003E, the second is \u003Cb\u003Everbal\u003C\/b\u003E, and the third is \u003Cb\u003Eintuitive\u003C\/b\u003E.\u003Cbr \/\u003E\u003Cbr \/\u003EOn a personal level, most of my daily computer vision activities directly map onto these dimensions. 
When I'm at a coffee shop, I prefer the mathematical - pen and paper are my weapons of choice. When it's time to get ideas out of my head, there's nothing like a solid founder-founder face-to-face meeting, an occasional MIT visit to brainstorm with my scientist colleagues, or simply rubberducking\u0026nbsp;(\u003Ca href=\"http:\/\/en.wikipedia.org\/wiki\/Rubber_duck_debugging\"\u003Erubber duck debugging\u003C\/a\u003E) with developers.\u0026nbsp;And when it comes to engineering, interacting with a live learning system can help develop the intuition necessary to make a system more powerful, more efficient, and ultimately much more robust.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv\u003E\u003Cdiv\u003E\u003Ch3\u003E\u003Cb\u003EMathematical: Learn to love the linear classifier\u003C\/b\u003E\u003C\/h3\u003E\u003C\/div\u003E\u003Cdiv\u003EAt the core of machine learning is mathematics, so you shouldn't be surprised that I include \u003Ci\u003Emathematical\u003C\/i\u003E as one of the three fundamental dimensions of thinking about computer vision.\u003Cbr \/\u003E\u003Cbr \/\u003EThe single most important concept in all of machine learning which you should master is the idea of the classifier. For some of you, classification is a well-understood problem; however, too many students prematurely jump into more complex algorithms like randomized decision forests and multi-layer neural networks, without first grokking the power of the linear classifier. Plenty of data scientists will agree that \u003Cb\u003Ethe linear classifier is the most fundamental machine learning algorithm.\u003C\/b\u003E\u0026nbsp;In fact, when Peter Norvig, Director of Research at Google, was asked \"Which AI field has surpassed your expectations and surprised you the most?\" in his 2010 interview, he answered with \"machine learning by linear separators.\"\u0026nbsp;\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003EThe illustration below depicts a linear classifier. 
In two dimensions, a linear classifier is a line which separates the positive examples from the negative examples. \u0026nbsp;You should first master the 2D linear classifier, even though in most applications you'll need to explore a higher-dimensional feature space.\u0026nbsp;\u003Cb\u003EMy personal favorite learning algorithm is the linear support vector machine, or linear SVM\u003C\/b\u003E. In a SVM, overly-confident data points do not influence the decision boundary. Or put in another way, learning with these confident points is like they aren't even there! This is a very useful property for large-scale learning problems where you can't fit all data into memory. You're going to want to master the linear SVM (and how it relates to Linear Discriminant Analysis, Linear Regression, and Logistic Regression) if you're going to pass one of my whiteboard data-science interviews.\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/upload.wikimedia.org\/wikipedia\/commons\/2\/2a\/Svm_max_sep_hyperplane_with_margin.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" src=\"http:\/\/upload.wikimedia.org\/wikipedia\/commons\/2\/2a\/Svm_max_sep_hyperplane_with_margin.png\" height=\"400\" width=\"370\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003ELinear Support Vector Machine from the \u003Ca href=\"http:\/\/en.wikipedia.org\/wiki\/Support_vector_machine\"\u003ESVM Wikipedia page\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv\u003EAn intimate understanding of the linear classifier is necessary to understand how deep learning systems work. 
\u0026nbsp;The neurons inside a multi-layer neural network are little linear classifiers, and while the final decision boundary is non-linear, you should understand the underlying primitives very well. Loosely speaking, you can \u003Cb\u003Ethink of the linear classifier as a simple spring system and more complex classifiers as a higher-order assembly of springs.\u003C\/b\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cbr \/\u003EAlso, there are going to be scenarios in your life as a data-scientist where \u003Cb\u003Ea linear classifier should be the first machine learning algorithm you try\u003C\/b\u003E. So don't be afraid to use some pen and paper, get into that hinge loss, and master the fundamentals.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cb\u003EFurther reading:\u003C\/b\u003E\u0026nbsp;Google's Research Director talks about Machine Learning.\u0026nbsp;\u003Ca href=\"https:\/\/www.youtube.com\/watch?v=hE7k0_9k0VA\"\u003EPeter Norvig's Reddit AMA on YouTube from 2010\u003C\/a\u003E.\u003Cbr \/\u003E\u003Cb\u003EFurther reading:\u003C\/b\u003E\u0026nbsp;A demo for playing with linear classifiers in the browser.\u0026nbsp;\u003Ca href=\"http:\/\/vision.stanford.edu\/teaching\/cs231n\/linear-classify-demo\/\"\u003ELinear classifier Javascript demo\u003C\/a\u003E from Stanford's\u0026nbsp;CS231n: Convolutional Neural Networks for Visual Recognition.\u003Cbr \/\u003E\u003Cb\u003EFurther reading:\u003C\/b\u003E\u0026nbsp;My blog post:\u0026nbsp;\u003Ca href=\"http:\/\/www.computervisionblog.com\/2015\/03\/deep-learning-vs-machine-learning-vs.html\"\u003EDeep Learning vs Machine Learning vs Pattern Recognition\u003C\/a\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv\u003E\u003Ch3\u003E\u003Cb\u003EVerbal: Talk about your vision (and join a community)\u003C\/b\u003E\u003C\/h3\u003E\u003C\/div\u003E\u003Cdiv\u003EAs you start acquiring knowledge of machine 
learning concepts, the best way forward is to speak up. \u003Cb\u003ELearn something, then teach a friend.\u003C\/b\u003E As counterintuitive as it sounds, when it comes down to machine learning mastery, human-human interaction is key. This is why getting a ML-heavy Masters or PhD degree is ultimately the best bet for those adamant about becoming pioneers in the field. Daily conversations are necessary to strengthen your ideas. \u0026nbsp;See Raphael's \"The School of Athens\" for a depiction of what I think of as the ideal learning environment. \u0026nbsp;I'm sure half of those guys were thinking about computer vision.\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/3.bp.blogspot.com\/-bHmE-F0j4zY\/VSAl3Juj71I\/AAAAAAAAN7Y\/mXUGUAEwmsY\/s1600\/soa.jpg\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"200\" src=\"https:\/\/3.bp.blogspot.com\/-bHmE-F0j4zY\/VSAl3Juj71I\/AAAAAAAAN7Y\/mXUGUAEwmsY\/s400\/soa.jpg\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/en.wikipedia.org\/wiki\/The_School_of_Athens\"\u003EThe School of Athens\u003C\/a\u003E by Raphael\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003EAn ideal ecosystem for collaboration and learning about computer vision\u003C\/div\u003E\u003Cbr \/\u003E\u003Cbr \/\u003EIf you're not ready for a full-time graduate-level commitment to the field, consider \u003Cb\u003Ea.)\u003C\/b\u003E taking an advanced undergraduate course in vision\/learning from your university, \u003Cb\u003Eb.)\u003C\/b\u003E a machine learning MOOC, or \u003Cb\u003Ec.)\u003C\/b\u003E\u0026nbsp;taking part in a practical and application-focused 
online community\/course focusing on computer vision.\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv\u003EDuring my 12-year academic stint, I made the observation that talking to your peers about computer vision and machine learning is more important than listening to teachers\/supervisors\/mentors. \u0026nbsp;Of course, there's much value in having a great teacher, but don't be surprised if you get 100x more face-to-face time with your friends compared to student-teacher interactions. \u0026nbsp;So if you take an online course like Coursera's Machine Learning MOOC, make sure to take it with friends. \u0026nbsp;Pause the video and discuss. Go to dinner and discuss. Write some code and discuss. Rinse, lather, repeat.\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"https:\/\/d3njjcbhbojbot.cloudfront.net\/api\/utilities\/v1\/imageproxy\/https:\/\/d15cw65ipctsrr.cloudfront.net\/30\/d6ee30352d11e4b07f0965d0c0162f\/large-icon.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"180\" src=\"https:\/\/d3njjcbhbojbot.cloudfront.net\/api\/utilities\/v1\/imageproxy\/https:\/\/d15cw65ipctsrr.cloudfront.net\/30\/d6ee30352d11e4b07f0965d0c0162f\/large-icon.png\" width=\"320\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003ECoursera's \u003Ca href=\"https:\/\/www.coursera.org\/course\/ml\"\u003EMachine Learning MOOC\u003C\/a\u003E taught by \u003Ca href=\"http:\/\/cs.stanford.edu\/people\/ang\/\"\u003EAndrew Ng\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003EAnother great opportunity is to follow Adrian Rosebrock's \u003Ca href=\"http:\/\/www.pyimagesearch.com\/\"\u003Epyimagesearch.com 
blog\u003C\/a\u003E, where he focuses on python and computer vision applications. \u0026nbsp;\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003Cb\u003EFurther reading:\u003C\/b\u003E\u0026nbsp;Old blog post:\u0026nbsp;\u003Ca href=\"http:\/\/www.computervisionblog.com\/2012\/05\/why-your-vision-lab-needs-reading-group.html\"\u003EWhy your vision lab needs a reading group\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv\u003E\u003Cb\u003E\u003Cbr \/\u003E\u003C\/b\u003E\u003Cb\u003EHomework assignment:\u0026nbsp;\u003C\/b\u003EFind somebody on the street and teach them about machine learning.\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv\u003E\u003Ch3\u003E\u003Cb\u003EIntuitive: Play with a real-time machine learning system\u003C\/b\u003E\u003C\/h3\u003E\u003C\/div\u003E\u003Cdiv\u003EThe third and final dimension is centered around intuition. Intuition is\u0026nbsp;the ability to understand something immediately, without the need for conscious reasoning. The following guidelines are directed towards real-time object detection systems, but can also transfer over to other applications like learning-based attribution models for advertisements, high-frequency trading, as well as numerous tasks in robotics.\u003Cbr \/\u003E\u003Cbr \/\u003ETo gain some true insights about object detection, you should \u003Cb\u003Eexperience a real-time object detection system\u003C\/b\u003E. \u0026nbsp;There's something unique about seeing a machine learning system run in real-time, right in front of you. \u0026nbsp;And when you get to control the input to the system, such as when using a webcam, you can learn a lot about how the algorithms work. 
\u0026nbsp;For example, seeing the classification score go down as you occlude the object of interest, and seeing the detection box go away when the object goes out of view is fundamental to building intuition about what works and what elements of a system need to improve.\u003Cbr \/\u003E\u003Cbr \/\u003EI see countless students tweaking an algorithm, applying it to a static large-scale dataset, and then waiting for the precision-recall curve to be generated. I understand that this is the hard and scientific way of doing things, but unless you've already spent a few years making friends with every pixel, you're unlikely to make a lasting contribution this way. And it's not very exciting -- you'll probably fall asleep at your desk.\u003Cbr \/\u003E\u003Cbr \/\u003EUsing a real-time feedback loop (see illustration below), you can learn about the patterns which are intrinsically difficult to classify, as well what environmental variations (lights, clutter, motion) affect your system the most. \u0026nbsp;This is something which really cannot be done with a static dataset. 
\u0026nbsp;So go ahead, mine some intuition and play.\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/3.bp.blogspot.com\/-iWfH_in0I_M\/VSAmGScNeuI\/AAAAAAAAN7g\/fDC_KIE_f_A\/s1600\/pong2.jpg\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"225\" src=\"https:\/\/3.bp.blogspot.com\/-iWfH_in0I_M\/VSAmGScNeuI\/AAAAAAAAN7g\/fDC_KIE_f_A\/s400\/pong2.jpg\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cb\u003EVisual Debugging:\u003C\/b\u003E Designing the\u0026nbsp;\u003Ca href=\"http:\/\/vision.ai\/\"\u003Evision.ai\u003C\/a\u003E real-time gesture-based controller in Fall 2013\u003C\/div\u003E\u003Cbr \/\u003EVisual feedback is where our work at \u003Ca href=\"http:\/\/vision.ai\/\"\u003Evision.ai\u003C\/a\u003E\u0026nbsp;truly stands out. Take a look at the following video, where\u0026nbsp;we show a live example of training and playing with a detector based on vision.ai's VMX object recognition system.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv\u003E\u003Ccenter\u003E\u003Ciframe allowfullscreen=\"\" frameborder=\"0\" height=\"158\" src=\"https:\/\/www.youtube.com\/embed\/u6HPGCefm9I\" width=\"280\"\u003E\u003C\/iframe\u003E\u003C\/center\u003E\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003Cb\u003ENOTE:\u003C\/b\u003E There are a handful of other image recognition systems out there which you can turn into real-time vision systems, but be warned that optimization for real-time applications requires some non-trivial software engineering experience. 
\u0026nbsp;We've put a lot of care into our system so that the detection scores are analogous to a linear SVM scoring strategy.\u0026nbsp;Making the output of a non-trivial learning algorithm backwards-compatible with a linear SVM isn't always easy, but in my opinion, well-worth the effort.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cb\u003EExtra Credit:\u003C\/b\u003E\u0026nbsp;See comments below for some free VMX by vision.ai beta software licenses so you can train some detectors using our visual feedback interface and gain your own machine vision intuition.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ch3\u003E\u003Cb\u003EConclusion\u003C\/b\u003E\u003C\/h3\u003E\u003C\/div\u003E\u003Cdiv\u003EThe three dimensions, namely \u003Cb\u003Emathematical\u003C\/b\u003E, \u003Cb\u003Everbal\u003C\/b\u003E, and \u003Cb\u003Eintuitive\u003C\/b\u003E\u0026nbsp;provide different ways for advancing your knowledge of machine learning and computer vision systems. \u0026nbsp;So remember to love the linear classifier, talk to your friends, and use a real-time feedback loop when designing your machine learning system.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E"},"link":[{"rel":"replies","type":"application/atom+xml","href":"https:\/\/www.computervisionblog.com\/feeds\/8255085887863637133\/comments\/default","title":"Post Comments"},{"rel":"replies","type":"text/html","href":"https:\/\/www.computervisionblog.com\/2015\/04\/three-fundamental-dimensions-for.html#comment-form","title":"4 
Comments"},{"rel":"edit","type":"application/atom+xml","href":"https:\/\/www.blogger.com\/feeds\/15418143\/posts\/default\/8255085887863637133"},{"rel":"self","type":"application/atom+xml","href":"https:\/\/www.blogger.com\/feeds\/15418143\/posts\/default\/8255085887863637133"},{"rel":"alternate","type":"text/html","href":"https:\/\/www.computervisionblog.com\/2015\/04\/three-fundamental-dimensions-for.html","title":"Three Fundamental Dimensions for Thinking About Machine Learning Systems"}],"author":[{"name":{"$t":"Tomasz Malisiewicz"},"uri":{"$t":"http:\/\/www.blogger.com\/profile\/17507234774392358321"},"email":{"$t":"noreply@blogger.com"},"gd$image":{"rel":"http://schemas.google.com/g/2005#thumbnail","width":"16","height":"16","src":"https:\/\/img1.blogblog.com\/img\/b16-rounded.gif"}}],"media$thumbnail":{"xmlns$media":"http://search.yahoo.com/mrss/","url":"https:\/\/4.bp.blogspot.com\/-sYorlxwElHA\/VSAlhF21s9I\/AAAAAAAAN7Q\/HH9esGPAjGU\/s72-c\/springs-01.png","height":"72","width":"72"},"thr$total":{"$t":"4"}},{"id":{"$t":"tag:blogger.com,1999:blog-15418143.post-7265067667792142150"},"published":{"$t":"2015-03-26T18:51:00.000-05:00"},"updated":{"$t":"2015-03-26T18:51:23.574-05:00"},"category":[{"scheme":"http://www.blogger.com/atom/ns#","term":"computer vision jobs"},{"scheme":"http://www.blogger.com/atom/ns#","term":"CVPR 2015"},{"scheme":"http://www.blogger.com/atom/ns#","term":"demos"},{"scheme":"http://www.blogger.com/atom/ns#","term":"startups"},{"scheme":"http://www.blogger.com/atom/ns#","term":"venture pitch contest"},{"scheme":"http://www.blogger.com/atom/ns#","term":"VIEW workshop"}],"title":{"type":"text","$t":"Venture Pitch Contest at CVPR 2015 in Boston, MA"},"content":{"type":"html","$t":"This year's \u003Ca href=\"http:\/\/www.pamitc.org\/cvpr15\/\"\u003ECVPR\u003C\/a\u003E will be in Boston, and as always, I expect it to be the single best venue to meet computer vision experts and see cutting edge research. 
I expect Google and Facebook to show off their best Deep Learning systems, NVIDIA to demo their newest GPUs, and dozens of computer vision startups to be looking for talent to grow their teams.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/cvisioncentral.com\/promotion\/view2015\/\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" src=\"http:\/\/cvisioncentral.com\/wp-content\/uploads\/2014\/12\/view2015.png\" height=\"97\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cbr \/\u003EI expect the entrepreneur\/academic ratio to be much higher, as it is getting easier for PhD students and postdocs to start their own companies. \u0026nbsp;This year's CVPR will even feature a\u0026nbsp;\u003Ca href=\"http:\/\/cvisioncentral.com\/promotion\/venture-pitch-contest\/\"\u003EVenture Pitch Contest\u003C\/a\u003E\u0026nbsp;as part of the Fourth Annual Vision Industry and Entrepreneur (VIEW) Workshop at CVPR. From the \u003Ca href=\"http:\/\/cvisioncentral.com\/promotion\/view2015\/\"\u003EVIEW workshop\u003C\/a\u003E webpage:\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv style=\"background: rgb(255, 255, 255); border: 0px; color: #666666; font-family: 'Helvetica Neue', arial, sans-serif; font-size: 14px; line-height: 21px; margin-bottom: 20px; outline: 0px; padding: 0px; vertical-align: baseline;\"\u003EComputer vision as a technology is penetrating the industry at an extraordinary pace with many computer vision applications directly becoming consumer commodities. Both startups and big companies have contributed to this trend. At the fourth annual Vision Industry and Entrepreneur Workshop, we are organizing a first of its kind Startup Pitch Contest. 
As a computer vision innovator, this is your chance to present the next great computer vision product idea to a\u0026nbsp;\u003Ca href=\"http:\/\/cvisioncentral.com\/promotion\/venture-pitch-contest\/\" style=\"background: transparent; border: 0px; color: #497ea8; margin: 0px; outline: 0px; padding: 0px; text-decoration: none; vertical-align: baseline;\" target=\"_blank\"\u003Edistinguished panel of judges\u003C\/a\u003E\u0026nbsp;which will include Venture Capitalists, Investors and leading Researchers in the field.\u003C\/div\u003E\u003Cdiv style=\"background: rgb(255, 255, 255); border: 0px; color: #666666; font-family: 'Helvetica Neue', arial, sans-serif; font-size: 14px; line-height: 21px; margin-bottom: 20px; outline: 0px; padding: 0px; vertical-align: baseline;\"\u003EApplications should employ novel computer vision technologies towards an innovative product. The best submissions would be selected for an\u0026nbsp;\u003Cstrong style=\"background: transparent; border: 0px; margin: 0px; outline: 0px; padding: 0px; vertical-align: baseline;\"\u003EElevator Pitch\u003C\/strong\u003E\u0026nbsp;presentation in front of the judges. Prizes would be awarded to the winners who would be announced at the end of the workshop. 
The details about the judging criteria will be posted on the website.\u003C\/div\u003E\u003Cdiv style=\"background: rgb(255, 255, 255); border: 0px; color: #666666; font-family: 'Helvetica Neue', arial, sans-serif; font-size: 14px; line-height: 21px; margin-bottom: 20px; outline: 0px; padding: 0px; vertical-align: baseline;\"\u003EThe submission is broken into two phases –\u0026nbsp;\u003Cstrong style=\"background: transparent; border: 0px; margin: 0px; outline: 0px; padding: 0px; vertical-align: baseline;\"\u003EPreliminary submission\u003C\/strong\u003E\u0026nbsp;consisting of a title and an abstract, and,\u0026nbsp;\u003Cstrong style=\"background: transparent; border: 0px; margin: 0px; outline: 0px; padding: 0px; vertical-align: baseline;\"\u003EFinal submission\u003C\/strong\u003E\u0026nbsp;consisting of a one page summary with technology overview, feasibility, outreach (customers and market size) and monetization (business model). The summary should be tailored at soliciting funding from sources such as venture capital to invest in the idea. The applicants should indicate whether they are academic researchers or industry professionals. \u003Cb\u003EOnly non-confidential material may be submitted.\u003C\/b\u003E\u003C\/div\u003E\u003Cdiv style=\"background: rgb(255, 255, 255); border: 0px; color: #666666; font-family: 'Helvetica Neue', arial, sans-serif; font-size: 14px; line-height: 21px; margin-bottom: 20px; outline: 0px; padding: 0px; vertical-align: baseline;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003EEven if you're not ready to pitch, you can submit a poster or demo to the Industry Session part of the \u003Ca href=\"http:\/\/cvisioncentral.com\/promotion\/view2015\/\"\u003EVIEW 2015 Workshop\u003C\/a\u003E. Great place to show off your new computer vision-powered app. \u0026nbsp;One of the organizers, Samson Timoner, told me the deadlines for submission have been extended. 
Here are the new dates:\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cb\u003ESubmission\u003C\/b\u003E: April 3, 2015 (extended)\u003Cbr \/\u003E\u003Cb\u003ENotification\u003C\/b\u003E: April 8, 2015 (extended)\u003Cbr \/\u003E\u003Cb\u003EWorkshop\u003C\/b\u003E: June 11, 2015\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv\u003EThis year's CVPR is going to be a great place to network with startups, share ideas, see cutting-edge research and (NEW in 2015) meet folks from the venture capital world. Who knows, if I'm there, I might be wearing a \u003Ca href=\"http:\/\/vision.ai\/\"\u003Evision.ai\u003C\/a\u003E T-shirt.\u003C\/div\u003E"},"link":[{"rel":"replies","type":"application/atom+xml","href":"https:\/\/www.computervisionblog.com\/feeds\/7265067667792142150\/comments\/default","title":"Post Comments"},{"rel":"replies","type":"text/html","href":"https:\/\/www.computervisionblog.com\/2015\/03\/venture-pitch-contest-at-cvpr-2015-in.html#comment-form","title":"1 Comments"},{"rel":"edit","type":"application/atom+xml","href":"https:\/\/www.blogger.com\/feeds\/15418143\/posts\/default\/7265067667792142150"},{"rel":"self","type":"application/atom+xml","href":"https:\/\/www.blogger.com\/feeds\/15418143\/posts\/default\/7265067667792142150"},{"rel":"alternate","type":"text/html","href":"https:\/\/www.computervisionblog.com\/2015\/03\/venture-pitch-contest-at-cvpr-2015-in.html","title":"Venture Pitch Contest at CVPR 2015 in Boston, MA"}],"author":[{"name":{"$t":"Tomasz 
Malisiewicz"},"uri":{"$t":"http:\/\/www.blogger.com\/profile\/17507234774392358321"},"email":{"$t":"noreply@blogger.com"},"gd$image":{"rel":"http://schemas.google.com/g/2005#thumbnail","width":"16","height":"16","src":"https:\/\/img1.blogblog.com\/img\/b16-rounded.gif"}}],"thr$total":{"$t":"1"}},{"id":{"$t":"tag:blogger.com,1999:blog-15418143.post-5552908452350401895"},"published":{"$t":"2015-03-26T02:25:00.001-05:00"},"updated":{"$t":"2016-06-13T07:44:19.948-05:00"},"category":[{"scheme":"http://www.blogger.com/atom/ns#","term":"AI"},{"scheme":"http://www.blogger.com/atom/ns#","term":"Amnon Shashua"},{"scheme":"http://www.blogger.com/atom/ns#","term":"autonomous vehicle"},{"scheme":"http://www.blogger.com/atom/ns#","term":"chips"},{"scheme":"http://www.blogger.com/atom/ns#","term":"cloud-based computer vision"},{"scheme":"http://www.blogger.com/atom/ns#","term":"computer vision"},{"scheme":"http://www.blogger.com/atom/ns#","term":"CTO"},{"scheme":"http://www.blogger.com/atom/ns#","term":"deep learning"},{"scheme":"http://www.blogger.com/atom/ns#","term":"embedded computer vision"},{"scheme":"http://www.blogger.com/atom/ns#","term":"entrepreneurship"},{"scheme":"http://www.blogger.com/atom/ns#","term":"google"},{"scheme":"http://www.blogger.com/atom/ns#","term":"heroku"},{"scheme":"http://www.blogger.com/atom/ns#","term":"markets"},{"scheme":"http://www.blogger.com/atom/ns#","term":"MIT"},{"scheme":"http://www.blogger.com/atom/ns#","term":"mobileye"},{"scheme":"http://www.blogger.com/atom/ns#","term":"self-driving car"}],"title":{"type":"text","$t":"Mobileye's quest to put Deep Learning inside every new car"},"content":{"type":"html","$t":"\u003Cdiv class=\"p1\"\u003E\u003Cspan class=\"s2\"\u003EIn Amnon Shashua's vision of the future, every car can see\u003C\/span\u003E. 
\u0026nbsp;He's convinced that the key technology behind the imminent driving revolution is going to be \u003Cb\u003Ecomputer vision\u003C\/b\u003E, and to experience this technology,\u0026nbsp;\u003Cb\u003Ewe won't have to wait for fully autonomous cars to become mainstream\u003C\/b\u003E.\u003Cb\u003E\u0026nbsp;\u0026nbsp;\u003C\/b\u003EI had the chance to hear Shashua's vision of the future this past Monday, and from what I'm about to tell you, it looks like there's going to be \u003Cb\u003Ea whole lot of Deep Learning inside tomorrow's car\u003Ci\u003E.\u0026nbsp;\u003C\/i\u003E\u003C\/b\u003ECars equipped with Deep Learning-based pedestrian avoidance systems (See Figure 1) can sense people and dangerous situations while you're behind the wheel. From winning large-scale object recognition competitions like ImageNet, to heavy internal use by Google, Deep Learning is now at the foundation of many hi-tech startups and giants. And when it comes to cars, Deep Learning promises to give us both safer roads\u0026nbsp;and the highly-anticipated hands-free driving experience.\u0026nbsp;\u003C\/div\u003E\u003Cdiv class=\"p1\"\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/4.bp.blogspot.com\/-99sL1hGWDbY\/VRI9gcyf1bI\/AAAAAAAAN2c\/ViA3bMwa4hs\/s1600\/mobileye_technology.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"168\" src=\"https:\/\/4.bp.blogspot.com\/-99sL1hGWDbY\/VRI9gcyf1bI\/AAAAAAAAN2c\/ViA3bMwa4hs\/s1600\/mobileye_technology.png\" width=\"320\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/www.mobileye.com\/technology\/\"\u003EMobileye's\u0026nbsp;\u003C\/a\u003EDeep Learning-based Pedestrian Detector\u003C\/div\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cbr 
\/\u003E\u003C\/div\u003E\u003Cdiv class=\"p4\"\u003E\u003Cspan class=\"s2\"\u003E\u003Cb\u003EMobileye Co-founder Amnon Shashua shares his vision during an invited lecture at MIT\u003C\/b\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"p4\"\u003E\u003Cspan class=\"s2\"\u003E       \u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"p1\"\u003E\u003Cspan class=\"s1\"\u003E\u003Ca href=\"http:\/\/en.wikipedia.org\/wiki\/Amnon_Shashua\"\u003EAmnon Shashua\u003C\/a\u003E\u0026nbsp;is the\u0026nbsp;\u003C\/span\u003E\u003Cspan class=\"s2\"\u003ECo-founder \u0026amp; CTO of \u003Ca href=\"http:\/\/www.mobileye.com\/\"\u003EMobileye\u003C\/a\u003E\u0026nbsp;and\u0026nbsp;\u003C\/span\u003Ethis past Monday (March 23, 2015) he\u0026nbsp;\u003Cspan class=\"s2\"\u003Egave a compelling talk at\u003C\/span\u003E\u003Cspan class=\"s1\"\u003E MIT’s \u003Ca href=\"https:\/\/cbmm.mit.edu\/news-events\/events\/brains-minds-machines-seminar-series-computer-vision-changing-our-lives\"\u003EBrains, Minds \u0026amp; Machines Seminar\u0026nbsp;Series\u003C\/a\u003E titled “\u003C\/span\u003E\u003Cspan class=\"s2\"\u003EComputer Vision that is Changing Our Lives”.\u0026nbsp;Shashua discussed Mobileye’s Deep Learning chips, robots, autonomous driving, as well as introduced his most recent project, a wearable computer vision unit called \u003Ca href=\"http:\/\/www.orcam.com\/\"\u003EOrCam\u003C\/a\u003E.\u0026nbsp;\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan class=\"s2\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/4.bp.blogspot.com\/-cK3SV7jfZ3U\/VRI8yeCk_zI\/AAAAAAAAN2U\/f3bhiQk1KUA\/s1600\/mobileye.jpg\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"212\" src=\"https:\/\/4.bp.blogspot.com\/-cK3SV7jfZ3U\/VRI8yeCk_zI\/AAAAAAAAN2U\/f3bhiQk1KUA\/s1600\/mobileye.jpg\" width=\"320\" 
\/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003EFig 2. Prof Amnon Shashua, CTO of Mobileye\u003C\/div\u003E\u003Cspan class=\"s2\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003Cspan class=\"s2\"\u003ELet's take a deeper look at the man behind Mobileye and his vision. Below is my summary of Shashua's talk as well as some personal insights regarding Mobileye's\u0026nbsp;embedded computer vision technology and how it relates to cloud-based computer vision.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cb\u003EMobileye's academic roots\u003C\/b\u003E\u003Cbr \/\u003E\u003Cspan class=\"s2\"\u003EYou might have heard stories of bold entrepreneurs dropping out of college to form million dollar startups, but this isn't one of them. \u0026nbsp;This is the story of a professor who turned his ideas into a publicly traded company, Mobileye (NYSE:MBLY). Amnon Shashua is a Professor at Hebrew University, and his lifetime achievements suggest that \u003Cb\u003Efor high-tech entrepreneurship, it is pretty cool to stay in school\u003C\/b\u003E. And while Shashua and I never overlapped academically (he is 23 years older than me), both of us spent some time at MIT as postdoctoral researchers.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan class=\"s2\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"p3\"\u003E\u003Cspan class=\"s2\"\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"p1\"\u003E\u003Cspan class=\"s2\"\u003E\u003Cb\u003EDeep Learning's impact on Mobileye\u003C\/b\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"p1\"\u003E\u003Cspan class=\"s2\"\u003EDuring his presentation at MIT, Amnon Shashua showcased a wide array of of computer vision problems that are currently being solved by Mobileye real-time computer vision systems. 
These systems are image-based and do not require expensive 3D sensors such as the ones commonly found on top of self-driving cars. \u0026nbsp;He showed videos of real-time lane detection, pedestrian detection, animal detection, and road surface detection. I have seen many similar visualizations during my academic career; however, Shashua emphasized that \u003Cb\u003Edeep learning is now used to power most of Mobileye's computer vision systems\u003C\/b\u003E.\u0026nbsp;\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan class=\"s2\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003Cspan class=\"s2\"\u003E\u003Cb\u003EQuestion:\u003C\/b\u003E I genuinely wonder how much the shift to Deep methods improved Mobileye's algorithms, or if the move is a strategic technology upgrade to stay relevant in the era where Google and and competition is feverishly pouncing on the landscape of deep learning. There's a lot of competition on the hardware front, and it \u003Cb\u003Eseems like the chase for ASIC-like Deep Learning Miners\/Trainers is on\u003C\/b\u003E.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan class=\"s2\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/1.bp.blogspot.com\/-TE9BUs2F8XA\/VRMo02YyWcI\/AAAAAAAAN3Y\/5v8-Zc8cFy4\/s1600\/alexnet.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"212\" src=\"https:\/\/1.bp.blogspot.com\/-TE9BUs2F8XA\/VRMo02YyWcI\/AAAAAAAAN3Y\/5v8-Zc8cFy4\/s1600\/alexnet.png\" width=\"640\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003EThe AlexNet CNN diagram from the \u003Ca href=\"http:\/\/www.cs.toronto.edu\/~fritz\/absps\/imagenet.pdf\"\u003Epopular\u0026nbsp;Krizhevsky\/Sutskever\/Hinton paper\u003C\/a\u003E. 
Shashua explicitly mentioned the AlexNet model during his MIT talk, and it appears that Mobileye has done their Deep Learning homework.\u003C\/div\u003E\u003Cspan class=\"s2\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003Cspan class=\"s2\"\u003E\u003Cb\u003EThe early Mobileye:\u0026nbsp;\u003C\/b\u003E\u003C\/span\u003EMobileye didn’t wait for the deep learning revolution to happen. They started shipping computer vision technology for vehicles using traditional techniques more than a decade ago. In fact, I attended a Mobileye presentation at CMU almost a full decade ago -- it was given by Andras Ferencz at the\u0026nbsp;\u003Ca href=\"http:\/\/vasc.ri.cmu.edu\/seminar\/old\/F05.html\"\u003E2005 CMU VASC Seminar\u003C\/a\u003E. \u0026nbsp;This week's talk by Shashua suggests that \u003Cb\u003EMobileye was able to successfully modernize their algorithms to use deep learning.\u003C\/b\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cb\u003EFurther reading:\u003C\/b\u003E\u0026nbsp;To learn about object recognition methods in computer vision which were popular before Deep Learning, see my January blog post, titled\u0026nbsp;\u003Ca href=\"http:\/\/www.computervisionblog.com\/2015\/01\/from-feature-descriptors-to-deep.html\"\u003EFrom feature descriptors to deep learning: 20 years of computer vision\u003C\/a\u003E.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"p1\"\u003E\u003Cspan class=\"s2\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/1.bp.blogspot.com\/-DVxBJt9hNJI\/VRJBOmQux0I\/AAAAAAAAN2w\/5-d9P81DH70\/s1600\/amnon%2Bdeep_learning.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"272\" src=\"https:\/\/1.bp.blogspot.com\/-DVxBJt9hNJI\/VRJBOmQux0I\/AAAAAAAAN2w\/5-d9P81DH70\/s1600\/amnon%2Bdeep_learning.png\" width=\"640\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv 
class=\"separator\" style=\"clear: both; text-align: center;\"\u003EFig 3. \"Deep Learning at Mobileye\" presentation at the 2015 Deutsche Bank Global\u0026nbsp;\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003EAuto Industry Conference.\u003C\/div\u003E\u003Cdiv class=\"p1\"\u003E\u003Cspan class=\"s2\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"p3\"\u003E\u003Cb\u003E\u003Cspan class=\"s2\"\u003E\u003C\/span\u003EMobileye's custom Computer Vision hardware\u003C\/b\u003E\u003C\/div\u003E\u003Cdiv class=\"p1\"\u003E\u003Cspan class=\"s2\"\u003EMobileye is not a software computer vision company -- they bake their algorithms into custom computer vision chips. Shashua reported some\u003Cb\u003E impressive computation speeds on what appears to be tiny vision chips\u003C\/b\u003E. Their custom hardware is more specific than GPUs (which are quite common for deep learning, scientific computations, computer graphics, and actually affordable). But Mobileye chips do not need to perform the computationally expensive big-data training stage onboard, so their devices can be much leaner than GPUs. Mobileye has lots of hardware experience, and regarding machine learning, Shashua mentioned that Mobileye has more vehicle-related training data than they know what to do with. 
\u0026nbsp;\u003C\/span\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/4.bp.blogspot.com\/-SjGNQxZMyMw\/VRJARTrOJtI\/AAAAAAAAN2o\/o_uTMoySXl4\/s1600\/Lane_Guidance_Camera_PCB.jpg\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"241\" src=\"https:\/\/4.bp.blogspot.com\/-SjGNQxZMyMw\/VRJARTrOJtI\/AAAAAAAAN2o\/o_uTMoySXl4\/s1600\/Lane_Guidance_Camera_PCB.jpg\" width=\"320\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003EFig 4. The Mobileye Q2 lane detection chip.\u003C\/div\u003E\u003Cdiv class=\"p3\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"p3\"\u003E\u003Cbr \/\u003E\u003Cb\u003EEmbedded vs. Cloud-based computer vision\u003C\/b\u003E\u003Cbr \/\u003EWhile Mobileye makes a strong case for embedded computer vision, there are many scenarios today where the alternative cloud-based computer vision approach triumphs. \u0026nbsp;\u003Cb\u003ECloud-based computer vision is about delivering powerful algorithms as a service, over the web.\u003C\/b\u003E \u0026nbsp;In a cloud-based architecture, the algorithms live in a data center and applications talk to the vision backend via an API layer. \u0026nbsp;And while certain mission-critical applications cannot have a cloud-component (e.g., a drones flying over the desert), cloud-based vision system promise to turn laptops and smartphones into smart devices, without the need to bake algorithms into chips. In-home surveillance apps, home-automation apps, exploratory robotics projects, and even scientific research can benefit from cloud-based computer vision. 
\u0026nbsp;Most importantly, cloud-based deployment means that startups can innovate faster, and entire products can evolve much faster.\u003Cbr \/\u003E\u003Cbr \/\u003EUnlike Mobileye's decade-long journey, I suspect\u0026nbsp;\u003Cb\u003Ecloud-based computer vision platforms are going to make computer vision development much faster, \u003C\/b\u003Egiving developers a Heroku-like button for visual AI. \u0026nbsp;Choosing diverse compilation targets such as a custom chip or Javascript will be handled by the computer vision platform, allowing computer vision developers to work smarter and deploy to more devices.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"p3\"\u003E\u003Cspan class=\"s2\"\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"p1\"\u003E\u003Cspan class=\"s2\"\u003E\u003Cb\u003EConclusion and Predictions\u003C\/b\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"p1\"\u003EEven if you don't believe that today's computer vision-based safety features make cars smart enough to call them robots, driving tomorrow's car is sure going to \u003Ci\u003Efeel\u003C\/i\u003E different. 
\u0026nbsp;I will leave you with one final note: Mobileye's CTO hinted that if you are going to design a car in 2015 on top of computer vision tech, you \u003Cb\u003Emight reconsider traditional safety features such as airbags, and create a leaner, less-expensive AI-enabled vehicle\u003C\/b\u003E.\u003Cbr \/\u003E\u003Cspan style=\"font-size: xx-small; text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/safety.trw.com\/wp-content\/uploads\/2013\/08\/photo-sae-Data-Fusion-jpg.jpg\" imageanchor=\"1\" style=\"display: inline !important; margin-left: 1em; margin-right: 1em; text-align: center;\"\u003E\u003Cimg border=\"0\" src=\"http:\/\/safety.trw.com\/wp-content\/uploads\/2013\/08\/photo-sae-Data-Fusion-jpg.jpg\" height=\"284\" width=\"640\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003EFig 5. 
Mobileye technology illustration [\u003Ca href=\"http:\/\/safety.trw.com\/trws-new-video-camera-supports-global-safety-trends-2\/0910\/\"\u003Esafety.trw.com\u003C\/a\u003E].\u003C\/div\u003E\u003Cbr \/\u003E\u003Cdiv style=\"text-align: left;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003C\/div\u003E\u003Cdiv class=\"p1\"\u003E\u003Cb\u003EWatch the Mobileye presentation on YouTube:\u003C\/b\u003E If you are interested in embedded deep learning, autonomous vehicles, or want to get a taste of how the industry veterans compile their deep networks into chips, you can watch the full 38-minute presentation from Amnon's January 2015 Mobileye presentation.\u0026nbsp;\u003C\/div\u003E\u003Cdiv class=\"p1\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Ccenter\u003E\u003Ciframe allowfullscreen=\"\" frameborder=\"0\" height=\"315\" src=\"https:\/\/www.youtube.com\/embed\/kp3ik5f3-2c\" width=\"560\"\u003E\u003C\/iframe\u003E\u003C\/center\u003E\u003Cdiv class=\"p1\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"p1\"\u003E\u003Cbr \/\u003EI hope you learned a little bit about vehicle computer vision systems, embedded Deep Learning, and got a glimpse of the visual intelligence revolution that is happening today. 
Feel free to comment below, follow me on Twitter (\u003Ca href=\"https:\/\/twitter.com\/quantombone\"\u003E@quantombone\u003C\/a\u003E), or sign-up to the \u003Ca href=\"http:\/\/vision.ai\/\"\u003Evision.ai\u003C\/a\u003E mailing list if you are a developer interested in taking vision.ai's cloud-based computer vision platform for a spin.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Ca class=\"twitter-follow-button\" data-show-count=\"false\" href=\"https:\/\/twitter.com\/quantombone\"\u003EFollow @quantombone\u003C\/a\u003E\u003Cscript\u003E!function(d,s,id){var js,fjs=d.getElementsByTagName(s)[0],p=\/^http:\/.test(d.location)?'http':'https';if(!d.getElementById(id)){js=d.createElement(s);js.id=id;js.src=p+':\/\/platform.twitter.com\/widgets.js';fjs.parentNode.insertBefore(js,fjs);}}(document, 'script', 'twitter-wjs');\u003C\/script\u003E"},"link":[{"rel":"replies","type":"application/atom+xml","href":"https:\/\/www.computervisionblog.com\/feeds\/5552908452350401895\/comments\/default","title":"Post Comments"},{"rel":"replies","type":"text/html","href":"https:\/\/www.computervisionblog.com\/2015\/03\/mobileyes-quest-to-put-deep-learning.html#comment-form","title":"2 Comments"},{"rel":"edit","type":"application/atom+xml","href":"https:\/\/www.blogger.com\/feeds\/15418143\/posts\/default\/5552908452350401895"},{"rel":"self","type":"application/atom+xml","href":"https:\/\/www.blogger.com\/feeds\/15418143\/posts\/default\/5552908452350401895"},{"rel":"alternate","type":"text/html","href":"https:\/\/www.computervisionblog.com\/2015\/03\/mobileyes-quest-to-put-deep-learning.html","title":"Mobileye's quest to put Deep Learning inside every new car"}],"author":[{"name":{"$t":"Tomasz 
Malisiewicz"},"uri":{"$t":"http:\/\/www.blogger.com\/profile\/17507234774392358321"},"email":{"$t":"noreply@blogger.com"},"gd$image":{"rel":"http://schemas.google.com/g/2005#thumbnail","width":"16","height":"16","src":"https:\/\/img1.blogblog.com\/img\/b16-rounded.gif"}}],"media$thumbnail":{"xmlns$media":"http://search.yahoo.com/mrss/","url":"https:\/\/4.bp.blogspot.com\/-99sL1hGWDbY\/VRI9gcyf1bI\/AAAAAAAAN2c\/ViA3bMwa4hs\/s72-c\/mobileye_technology.png","height":"72","width":"72"},"thr$total":{"$t":"2"}},{"id":{"$t":"tag:blogger.com,1999:blog-15418143.post-5986936212375969343"},"published":{"$t":"2015-03-20T19:58:00.000-05:00"},"updated":{"$t":"2016-06-14T03:43:35.565-05:00"},"category":[{"scheme":"http://www.blogger.com/atom/ns#","term":"artificial intelligence"},{"scheme":"http://www.blogger.com/atom/ns#","term":"big-data"},{"scheme":"http://www.blogger.com/atom/ns#","term":"classification"},{"scheme":"http://www.blogger.com/atom/ns#","term":"computer vision"},{"scheme":"http://www.blogger.com/atom/ns#","term":"deep learning"},{"scheme":"http://www.blogger.com/atom/ns#","term":"gpu"},{"scheme":"http://www.blogger.com/atom/ns#","term":"machine learning"},{"scheme":"http://www.blogger.com/atom/ns#","term":"paas"},{"scheme":"http://www.blogger.com/atom/ns#","term":"pattern recognition"},{"scheme":"http://www.blogger.com/atom/ns#","term":"smart software"}],"title":{"type":"text","$t":"Deep Learning vs Machine Learning vs Pattern Recognition"},"content":{"type":"html","$t":"\u003Cdiv class=\"p1\"\u003ELets take a close look at three related terms (Deep Learning vs Machine Learning vs Pattern Recognition), and see how they relate to some of the hottest tech-themes in 2015 (namely Robotics and Artificial Intelligence). 
In our short journey through jargon, you should acquire a better understanding of how computer vision fits in, as well as gain an intuitive feel for how the machine learning zeitgeist has slowly evolved over time.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/1.bp.blogspot.com\/-A8krme3OoTg\/VT3_afoWesI\/AAAAAAAAOCk\/A1CztkiMC2A\/s1600\/unknown.jpeg\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"240\" src=\"https:\/\/1.bp.blogspot.com\/-A8krme3OoTg\/VT3_afoWesI\/AAAAAAAAOCk\/A1CztkiMC2A\/s1600\/unknown.jpeg\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003EFig 1. Putting a human inside a computer is not Artificial Intelligence\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E(Photo from\u0026nbsp;\u003Ca href=\"http:\/\/crowdcomputingblog.com\/2013\/11\/01\/whats-the-true-definition-of-a-platform\/\"\u003EWorkFusion Blog\u003C\/a\u003E)\u003C\/div\u003E\u003Cdiv class=\"p2\"\u003E\u003Cspan class=\"s1\"\u003E\u003C\/span\u003E\u003Cbr \/\u003EIf you look around, you'll see no shortage of jobs at high-tech startups looking for machine learning experts. While only a fraction of them are looking for Deep Learning experts, I bet most of these startups can benefit from even the most elementary kind of data scientist. So how do you spot a future data-scientist? 
You learn how they think.\u0026nbsp;\u003C\/div\u003E\u003Cdiv class=\"p3\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"p3\"\u003E\u003Cb\u003EThe three highly-related \"learning\" buzz words\u003C\/b\u003E\u003C\/div\u003E\u003Cdiv class=\"p3\"\u003E\u003Cb\u003E\u003Cbr \/\u003E\u003C\/b\u003E\u003C\/div\u003E\u003Cdiv class=\"p3\"\u003E“Pattern recognition,” “machine learning,” and “deep learning” represent three different schools of thought.\u0026nbsp; Pattern recognition is the oldest (and as a term is quite outdated). Machine Learning is the most fundamental (one of the hottest areas for startups and research labs as of today, early 2015). And \u003Ci\u003EDeep Learning is the new, the big, the bleeding-edge -- we’re not even close to thinking about the post-deep-learning era\u003C\/i\u003E. \u0026nbsp;Just take a look at the following Google Trends graph. \u0026nbsp;You'll see that a) Machine Learning is rising like a true champion, b) Pattern Recognition started as synonymous with Machine Learning, c) Pattern Recognition is dying, and d) Deep Learning is new and rising fast.\u003C\/div\u003E\u003Cbr \/\u003E\u003Cscript src=\"\/\/www.google.com\/trends\/embed.js?hl=en-US\u0026amp;q=machine+learning,+pattern+recognition,+deep+learning\u0026amp;cmpt=q\u0026amp;tz\u0026amp;tz\u0026amp;content=1\u0026amp;cid=TIMESERIES_GRAPH_0\u0026amp;export=5\u0026amp;w=500\u0026amp;h=330\" type=\"text\/javascript\"\u003E\u003C\/script\u003E \u003Cbr \/\u003E\u003Cdiv class=\"p2\"\u003E\u003Cspan class=\"s1\"\u003E\u003C\/span\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"p3\"\u003E\u003Cspan class=\"s1\"\u003E\u003Cb\u003E1. 
Pattern Recognition: The birth of smart programs\u003C\/b\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"p3\"\u003E\u003Cspan class=\"s1\"\u003E\u003Cb\u003E\u003Cbr \/\u003E\u003C\/b\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"p3\"\u003E\u003Cspan class=\"s1\"\u003EPattern recognition was a term popular in the 70s and 80s. The emphasis was on getting a computer program to do something “smart” like recognize the character \"3\". And it really took a lot of cleverness and intuition to build such a program. Just think of \"3\" vs \"B\" and \"3\" vs \"8\". \u0026nbsp;Back in the day, i\u003C\/span\u003Et didn’t really matter how you did it as long as there was no human-in-a-box pretending to be a machine. (See Figure 1) \u0026nbsp;So if your algorithm would apply some filters to an image, localize some edges, and apply morphological operators, it was definitely of interest to the pattern recognition community.\u0026nbsp; Optical Character Recognition grew out of this community and it is fair to call “Pattern Recognition” as the “Smart\" Signal Processing of the 70s, 80s, and early 90s. Decision trees, heuristics, quadratic discriminant analysis, etc all came out of this era. Pattern Recognition become something CS folks did, and not EE folks. \u0026nbsp;One of the most popular books from that time period is the \u003Cstrike\u003Einfamous\u003C\/strike\u003E invaluable Duda \u0026amp; Hart \"Pattern Classification\" book and is still a great starting point for young researchers. 
\u0026nbsp;But don't get too caught up in the vocabulary, it's a bit dated.\u003C\/div\u003E\u003Cdiv class=\"p3\"\u003E\u003Cspan class=\"s1\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/3.bp.blogspot.com\/-uxnx5mRW1k4\/VR9MI0PK0KI\/AAAAAAAAN6U\/HfjMzLEzK4g\/s1600\/315.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"314\" src=\"https:\/\/3.bp.blogspot.com\/-uxnx5mRW1k4\/VR9MI0PK0KI\/AAAAAAAAN6U\/HfjMzLEzK4g\/s1600\/315.png\" width=\"320\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003EThe character \"3\" partitioned into 16 sub-matrices. Custom rules, custom decisions, and custom \"smart\" programs used to be all the rage.\u0026nbsp;\u003C\/div\u003E\u003Cdiv class=\"p3\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"p3\"\u003E\u003Cspan class=\"s1\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003Cspan class=\"s1\"\u003E\u003Cb\u003EQuiz\u003C\/b\u003E:\u0026nbsp;\u003C\/span\u003EThe most popular Computer Vision conference is called CVPR and the PR stands for Pattern Recognition. \u0026nbsp;Can you guess the year of the first CVPR conference?\u003C\/div\u003E\u003Cdiv class=\"p2\"\u003E\u003Cspan class=\"s1\"\u003E\u003C\/span\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"p3\"\u003E\u003Cspan class=\"s1\"\u003E\u003Cb\u003E2. 
Machine Learning: Smart programs can learn from examples\u003C\/b\u003E\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan class=\"s1\"\u003E\u003Cb\u003E\u003Cbr \/\u003E\u003C\/b\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"p3\"\u003E\u003Cspan class=\"s1\"\u003ESometime in the early 90s people started realizing that a more powerful way to build pattern recognition algorithms is to replace an expert (who probably knows way too much about pixels) with data (which can be mined from cheap laborers).\u0026nbsp; So you collect a bunch of face images and non-face images, choose an algorithm, and wait for the computations to finish.\u0026nbsp; This is the spirit of machine learning. \u0026nbsp;\"Machine Learning\" emphasizes that the computer program (or machine) must do some work after it is given data. \u0026nbsp;The Learning step is made explicit. \u0026nbsp;And believe me, waiting 1 day for your computations to finish scales better than inviting your academic colleagues to your home institution to design some classification rules by hand.\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"p3\"\u003E\u003Cspan class=\"s1\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/1.bp.blogspot.com\/-Br9BfM-VV1s\/VT3_qrSBiiI\/AAAAAAAAOCs\/oirhh9wS0SA\/s1600\/ml-eng.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"255\" src=\"https:\/\/1.bp.blogspot.com\/-Br9BfM-VV1s\/VT3_qrSBiiI\/AAAAAAAAOCs\/oirhh9wS0SA\/s1600\/ml-eng.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\"What is Machine Learning\" from \u003Ca href=\"http:\/\/nkonst.com\/machine-learning-explained-simple-words\/\"\u003EDr Natalia 
Konstantinova's Blog\u003C\/a\u003E. The most important part of this diagram is the \"Gears\" which suggests that crunching\/working\/computing is an important step in the ML pipeline.\u003C\/div\u003E\u003Cdiv class=\"p3\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"p3\"\u003E\u003Cspan class=\"s1\"\u003EAs Machine Learning grew into a major research topic in the mid 2000s, computer scientists began applying these ideas to a wide array of problems.\u0026nbsp; No longer was it only character recognition, cat vs. dog recognition, and other “recognize a pattern inside an array of pixels” problems.\u0026nbsp; Researchers started applying Machine Learning to Robotics (reinforcement learning, manipulation, motion planning, grasping), to genome data, as well as to predict financial markets.\u0026nbsp; Machine Learning was married with Graph Theory under the brand “Graphical Models,” every robotics expert had no choice but to become a Machine Learning Expert, and \u003Cb\u003EMachine Learning quickly became one of the most desired and versatile computing skills\u003C\/b\u003E. \u0026nbsp;However \"Machine Learning\" says nothing about the underlying algorithm. \u0026nbsp;We've seen convex optimization, Kernel-based methods, Support Vector Machines, as well as Boosting have their winning days. \u0026nbsp;Together with some custom manually engineered features, we had lots of recipes, lots of different schools of thought, and it wasn't entirely clear how a newcomer should select features and algorithms. 
\u0026nbsp;But that was all about to change...\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan class=\"s1\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003Cspan class=\"s1\"\u003EFurther reading: To learn more about the kinds of features that were used in Computer Vision research see my blog post: \u003Ca href=\"http:\/\/www.computervisionblog.com\/2015\/01\/from-feature-descriptors-to-deep.html\"\u003EFrom feature descriptors to deep learning: 20 years of computer vision\u003C\/a\u003E.\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"p2\"\u003E\u003Cspan class=\"s1\"\u003E\u003C\/span\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"p3\"\u003E\u003Cspan class=\"s1\"\u003E\u003Cb\u003E3. Deep Learning: one architecture to rule them all\u003C\/b\u003E\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan class=\"s1\"\u003E\u003Cb\u003E\u003Cbr \/\u003E\u003C\/b\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"p3\"\u003E\u003Cspan class=\"s1\"\u003EFast forward to today and what we’re seeing is a large interest in something called Deep Learning. 
The most popular kinds of Deep Learning models, as they are used in large scale image recognition tasks, are known as Convolutional Neural Nets, or simply ConvNets.\u0026nbsp;\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"p3\"\u003E\u003Cspan class=\"s1\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/4.bp.blogspot.com\/-uHgA5CQk22A\/VT3_7lLLXtI\/AAAAAAAAOC0\/YMAyuywwaQ0\/s1600\/convnet.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"138\" src=\"https:\/\/4.bp.blogspot.com\/-uHgA5CQk22A\/VT3_7lLLXtI\/AAAAAAAAOC0\/YMAyuywwaQ0\/s400\/convnet.png\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003EConvNet diagram from \u003Ca href=\"http:\/\/torch.cogbits.com\/doc\/tutorials_supervised\/\"\u003ETorch Tutorial\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"p3\"\u003E\u003Cspan class=\"s1\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"p2\"\u003E\u003Cspan class=\"s1\"\u003E\u003C\/span\u003EDeep Learning emphasizes the kind of model you might want to use (e.g., a deep convolutional multi-layer neural network) and that you can use data to fill in the missing parameters.\u0026nbsp; But with deep-learning comes great responsibility.\u0026nbsp; Because you are starting with a model of the world which has a high dimensionality, you really need a lot of data (big data) and a lot of crunching power (GPUs). Convolutions are used extensively in deep learning (especially computer vision applications), and the architectures are far from shallow.\u003Cbr \/\u003E\u003Cbr \/\u003EIf you're starting out with Deep Learning, simply brush up on some elementary Linear Algebra and start coding. 
\u0026nbsp;I highly recommend Andrej Karpathy's \u003Ca href=\"http:\/\/karpathy.github.io\/neuralnets\/\"\u003EHacker's guide to Neural Networks\u003C\/a\u003E. Implementing your own CPU-based backpropagation algorithm on a non-convolution based problem is a good place to start.\u003C\/div\u003E\u003Cdiv class=\"p2\"\u003E\u003Cbr \/\u003EThere are still lots of unknowns. The theory of why deep learning works is incomplete, and no single guide or book is better than true machine learning experience. \u0026nbsp;There are lots of reasons why Deep Learning is gaining popularity, but Deep Learning is not going to take over the world. \u0026nbsp;As long as you continue brushing up on your machine learning skills, your job is safe. But don't be afraid to chop these networks in half, slice 'n dice at will, and build software architectures that work in tandem with your learning algorithm. \u0026nbsp;The Linux Kernel of tomorrow might run on \u003Ca href=\"http:\/\/caffe.berkeleyvision.org\/\"\u003ECaffe\u003C\/a\u003E (one of the most popular deep learning frameworks), but great products will always need great vision, domain expertise, market development, and most importantly: human creativity.\u003C\/div\u003E\u003Cdiv class=\"p2\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"p3\"\u003E\u003Cspan class=\"s1\"\u003E\u003Cb\u003EOther related buzz-words\u003C\/b\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"p3\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"p3\"\u003E\u003Cspan class=\"s1\"\u003E\u003Cb\u003EBig-data\u003C\/b\u003E is the philosophy of measuring all sorts of things, saving that data, and looking through it for information.\u0026nbsp; For business, this big-data approach can give you actionable insights.\u0026nbsp; In the context of learning algorithms, we’ve only started seeing the marriage of big-data and machine learning within the past few years.\u0026nbsp; \u003Cb\u003ECloud-computing\u003C\/b\u003E, 
\u003Cb\u003EGPUs\u003C\/b\u003E, \u003Cb\u003EDevOps\u003C\/b\u003E, and \u003Cb\u003EPaaS\u003C\/b\u003E providers have made large scale computing within reach of the researcher and ambitious \"everyday\" developer.\u0026nbsp;\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan class=\"s1\"\u003E\u003Cb\u003E\u003Cbr \/\u003E\u003C\/b\u003E\u003C\/span\u003E\u003Cspan class=\"s1\"\u003E\u003Cb\u003EArtificial Intelligence\u003C\/b\u003E is perhaps the oldest term, the most vague, and the one that has gone through the most ups and downs in the past 50 years. When somebody says they work on Artificial Intelligence, you are either going to want to laugh at them or take out a piece of paper and write down everything they say.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan class=\"s1\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003Cspan class=\"s1\"\u003EFurther reading: My 2011 Blog post \u003Ca href=\"http:\/\/www.computervisionblog.com\/2011\/03\/computer-vision-is-artificial.html\"\u003EComputer Vision is Artificial Intelligence\u003C\/a\u003E.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan class=\"s1\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003Cspan class=\"s1\"\u003E\u003Cb\u003EConclusion\u003C\/b\u003E\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan class=\"s1\"\u003E\u003Cb\u003E\u003Cbr \/\u003E\u003C\/b\u003E\u003C\/span\u003EMachine Learning is here to stay. Don't think about it as Pattern Recognition vs Machine Learning vs Deep Learning, just realize that each term emphasizes something a little bit different. \u0026nbsp;But the search continues. \u0026nbsp;Go ahead and explore. Break something. 
We will continue building smarter software and our algorithms will continue to learn, but we've only begun to explore the kinds of architectures that can truly rule-them-all.\u003C\/div\u003E\u003Cdiv class=\"p2\"\u003E\u003Cbr \/\u003E\u003Cspan class=\"s1\"\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"p2\"\u003E\u003Cspan class=\"s1\"\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"p2\"\u003EIf you're interested in real-time vision applications of deep learning, namely those suitable for robotic and home automation applications, then you should check out what we've been building at \u003Ca href=\"http:\/\/vision.ai\/\"\u003Evision.ai\u003C\/a\u003E. Hopefully in a few days, I'll be able to say a little bit more. :-)\u003Cbr \/\u003E\u003Cbr \/\u003EUntil next time.\u003Cbr \/\u003E\u003Cspan class=\"s1\"\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"p3\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"p3\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"p2\"\u003E\u003Cspan class=\"s1\"\u003E\u003C\/span\u003ESee \u003Ca href=\"https:\/\/news.ycombinator.com\/item?id=9247851\"\u003Ediscussion about this blog post on Hacker News\u003C\/a\u003E.\u003C\/div\u003E\u003Cbr \/\u003E\u003Cdiv class=\"p2\"\u003E\u003Cspan class=\"s1\"\u003E\u003C\/span\u003E\u003Cbr \/\u003E\u003C\/div\u003E"},"link":[{"rel":"replies","type":"application/atom+xml","href":"https:\/\/www.computervisionblog.com\/feeds\/5986936212375969343\/comments\/default","title":"Post Comments"},{"rel":"replies","type":"text/html","href":"https:\/\/www.computervisionblog.com\/2015\/03\/deep-learning-vs-machine-learning-vs.html#comment-form","title":"25 
Comments"},{"rel":"edit","type":"application/atom+xml","href":"https:\/\/www.blogger.com\/feeds\/15418143\/posts\/default\/5986936212375969343"},{"rel":"self","type":"application/atom+xml","href":"https:\/\/www.blogger.com\/feeds\/15418143\/posts\/default\/5986936212375969343"},{"rel":"alternate","type":"text/html","href":"https:\/\/www.computervisionblog.com\/2015\/03\/deep-learning-vs-machine-learning-vs.html","title":"Deep Learning vs Machine Learning vs Pattern Recognition"}],"author":[{"name":{"$t":"Tomasz Malisiewicz"},"uri":{"$t":"http:\/\/www.blogger.com\/profile\/17507234774392358321"},"email":{"$t":"noreply@blogger.com"},"gd$image":{"rel":"http://schemas.google.com/g/2005#thumbnail","width":"16","height":"16","src":"https:\/\/img1.blogblog.com\/img\/b16-rounded.gif"}}],"media$thumbnail":{"xmlns$media":"http://search.yahoo.com/mrss/","url":"https:\/\/1.bp.blogspot.com\/-A8krme3OoTg\/VT3_afoWesI\/AAAAAAAAOCk\/A1CztkiMC2A\/s72-c\/unknown.jpeg","height":"72","width":"72"},"thr$total":{"$t":"25"}},{"id":{"$t":"tag:blogger.com,1999:blog-15418143.post-2752539545980854411"},"published":{"$t":"2015-01-20T10:45:00.002-05:00"},"updated":{"$t":"2016-06-13T07:46:33.373-05:00"},"category":[{"scheme":"http://www.blogger.com/atom/ns#","term":"bengio"},{"scheme":"http://www.blogger.com/atom/ns#","term":"big data"},{"scheme":"http://www.blogger.com/atom/ns#","term":"convolution"},{"scheme":"http://www.blogger.com/atom/ns#","term":"dalal"},{"scheme":"http://www.blogger.com/atom/ns#","term":"deep learning"},{"scheme":"http://www.blogger.com/atom/ns#","term":"detection"},{"scheme":"http://www.blogger.com/atom/ns#","term":"feature 
engineering"},{"scheme":"http://www.blogger.com/atom/ns#","term":"features"},{"scheme":"http://www.blogger.com/atom/ns#","term":"google"},{"scheme":"http://www.blogger.com/atom/ns#","term":"hinton"},{"scheme":"http://www.blogger.com/atom/ns#","term":"HOG"},{"scheme":"http://www.blogger.com/atom/ns#","term":"learning"},{"scheme":"http://www.blogger.com/atom/ns#","term":"lecun"},{"scheme":"http://www.blogger.com/atom/ns#","term":"lowe"},{"scheme":"http://www.blogger.com/atom/ns#","term":"machine learning"},{"scheme":"http://www.blogger.com/atom/ns#","term":"object recognition"},{"scheme":"http://www.blogger.com/atom/ns#","term":"sift"},{"scheme":"http://www.blogger.com/atom/ns#","term":"vision"}],"title":{"type":"text","$t":"From feature descriptors to deep learning: 20 years of computer vision"},"content":{"type":"html","$t":"\u003Cdiv class=\"p1\"\u003EWe all know that deep convolutional neural networks have produced some stellar results on object detection and recognition benchmarks in the past two years (2012-2014), so you might wonder: \u003Ci\u003Ewhat did the earlier object recognition techniques look like\u003C\/i\u003E? \u003Ci\u003EHow do the designs of earlier recognition systems relate to the modern multi-layer convolution-based framework\u003C\/i\u003E?\u003Cbr \/\u003E\u003Cbr \/\u003ELet's take a look at some of the big ideas in Computer Vision from the last 20 years.\u003Cbr \/\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cspan class=\"s1\"\u003E\u003Cb\u003EThe rise of the local feature descriptors: ~1995 to ~2000\u003C\/b\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"p1\"\u003E\u003Cspan class=\"s1\"\u003EWhen \u003Cb\u003ESIFT\u003C\/b\u003E (an acronym for Scale Invariant Feature Transform) was introduced by \u003Cb\u003EDavid Lowe\u003C\/b\u003E in 1999, the world of computer vision research changed almost overnight. It was a robust solution to the problem of comparing image patches. 
Before SIFT entered the game, people were just using SSD (sum of squared distances) to compare patches and not giving it much thought.\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/3.bp.blogspot.com\/-2Lw3DxApZrw\/VKBKMeAwTnI\/AAAAAAAANyU\/7IQxfszsclc\/s1600\/sift_pic.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"149\" src=\"https:\/\/3.bp.blogspot.com\/-2Lw3DxApZrw\/VKBKMeAwTnI\/AAAAAAAANyU\/7IQxfszsclc\/s1600\/sift_pic.png\" width=\"320\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003EThe SIFT recipe: gradient orientations, normalization tricks\u003C\/div\u003E\u003Cdiv class=\"p2\"\u003E\u003Cspan class=\"s1\"\u003E\u003C\/span\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"p1\"\u003E\u003Cspan class=\"s1\"\u003ESIFT is something called a local feature descriptor -- it is one of those research findings which is the result of one ambitious man hackplaying with pixels for more than a decade. \u0026nbsp;Lowe and the University of British Columbia got a patent on SIFT and \u003Ci\u003ELowe released a nice compiled binary of his very own SIFT implementation for researchers to use in their work\u003C\/i\u003E. 
\u0026nbsp;SIFT allows a point inside an RGB image to be represented robustly by a low dimensional vector.\u0026nbsp; When you take multiple images of the same physical object while rotating the camera, the SIFT descriptors of corresponding points are very similar in their 128-D space.\u0026nbsp; At first glance it seems silly that you need to do something as complex as SIFT, but believe me: just because you, a human, can look at two image patches and quickly \"understand\" that they belong to the same physical point, this is not the same for machines.\u0026nbsp; SIFT had massive implications for the geometric side of computer vision (stereo, Structure from Motion, etc) and later became the basis for the popular Bag of Words model for object recognition.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan class=\"s1\"\u003E\u003Cbr \/\u003E\u003C\/span\u003ESeeing a technique like SIFT dramatically outperform an alternative method like Sum-of-Squared-Distances (SSD) Image Patch Matching firsthand is an important step in every aspiring vision scientist's career. And SIFT isn't just a vector of filter bank responses, the binning and normalization steps are very important. It is also worthwhile noting that while SIFT was initially (in its published form) applied to the output of an interest point detector, later it was found that the interest point detection step was not important in categorization problems. 
\u0026nbsp;For categorization, researchers eventually moved towards vector quantized SIFT applied densely across an image.\u003Cbr \/\u003E\u003Cbr \/\u003EI should also mention that other descriptors such as \u003Cb\u003ESpin Images\u003C\/b\u003E (see my \u003Ca href=\"http:\/\/www.computervisionblog.com\/2009\/07\/spin-images-for-object-recognition-in.html\"\u003E2009 blog post on spin images\u003C\/a\u003E) came out a little bit earlier than SIFT, but because Spin Images were solely applicable to 2.5D data, this feature's impact wasn't as great as that of SIFT.\u0026nbsp;\u003C\/div\u003E\u003Cdiv class=\"p2\"\u003E\u003Cspan class=\"s1\"\u003E\u003C\/span\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"p1\"\u003E\u003Cspan class=\"s1\"\u003E\u003Cb\u003EThe modern dataset (aka the hardening of vision as science): ~2000 to ~2005\u003C\/b\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"p1\"\u003EHomography estimation, ground-plane estimation, robotic vision, SfM, and all other geometric problems in vision greatly benefited from robust image features such as SIFT. \u0026nbsp;But towards the end of the 1990s, it was clear that \u003Ci\u003Ethe internet was the next big thing\u003C\/i\u003E. \u0026nbsp;Images were going online. Datasets were being created. \u0026nbsp;And no longer was the current generation solely interested in structure recovery (aka geometric) problems. \u0026nbsp;This was the beginning of the large-scale dataset era with \u003Ca href=\"http:\/\/www.vision.caltech.edu\/Image_Datasets\/Caltech101\/\"\u003ECaltech-101\u003C\/a\u003E slowly gaining popularity and categorization research on the rise. No longer were researchers evaluating their own algorithms on their own in-house datasets -- we now had a more objective and standard way to determine if yours is bigger than mine. 
\u0026nbsp;Even though Caltech-101 is considered outdated by 2015 standards, it is fair to think of this dataset as the Grandfather of the more modern ImageNet dataset. Thanks \u003Ca href=\"http:\/\/vision.stanford.edu\/feifeili\/\"\u003EFei-Fei Li\u003C\/a\u003E.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/www.vision.caltech.edu\/Image_Datasets\/Caltech101\/\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"270\" src=\"https:\/\/3.bp.blogspot.com\/-GotW1sXXx_4\/VKFVGQ62vRI\/AAAAAAAANzU\/y5S_qZKAoG4\/s1600\/caltech101.jpg\" width=\"320\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003ECategory-based datasets: the infamous Caltech-101 TorralbaArt image\u003C\/div\u003E\u003Cbr \/\u003E\u003Cb\u003EBins, Grids, and Visual Words (aka Machine Learning meets descriptors): ~2000 to ~2005\u003C\/b\u003E\u003Cbr \/\u003E\u003Cspan class=\"s1\"\u003EAfter the community shifted towards more ambitious object recognition problems and away from geometry recovery problems, we had a flurry of research in Bag of Words, Spatial Pyramids, Vector Quantization, as well as machine learning tools used in any and all stages of the computer vision pipeline. \u0026nbsp;Raw SIFT was great for wide-baseline stereo, but it wasn't powerful enough to provide matches between two distinct object instances from the same visual object category. \u0026nbsp;What was needed was a way to encode the following ideas: object parts can deform relative to each other and some image patches can be missing. 
\u0026nbsp;Overall, a much more \u003Ci\u003Estatistical way to characterize objects was needed\u003C\/i\u003E.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan class=\"s1\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003Cspan class=\"s1\"\u003EVisual Words were introduced by Josef Sivic and Andrew Zisserman in approximately 2003 and this was a clever way of taking algorithms from large-scale text matching and applying them to visual content. \u0026nbsp;A visual dictionary can be obtained by performing unsupervised learning (basically just K-means) on SIFT descriptors which maps these 128-D real-valued vectors into integers (which are cluster center assignments). \u0026nbsp;A histogram of these visual words is a fairly robust way to represent images. \u0026nbsp;Variants of the Bag of Words model are still heavily utilized in vision research.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/3.bp.blogspot.com\/-pfeV3FAW_fA\/VKFaZcYaxjI\/AAAAAAAANzk\/cMErRKX7rAA\/s1600\/lola.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"121\" src=\"https:\/\/3.bp.blogspot.com\/-pfeV3FAW_fA\/VKFaZcYaxjI\/AAAAAAAANzk\/cMErRKX7rAA\/s1600\/lola.png\" width=\"320\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003EJosef Sivic's \"Video Google\": Matching Graffiti inside the Run Lola Run video\u003C\/div\u003E\u003Cspan class=\"s1\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"p1\"\u003E\u003Cspan class=\"s1\"\u003EAnother idea which was gaining traction at the time was the idea of using some sort of binning structure for matching objects. \u0026nbsp;Caltech-101 images mostly contained objects, so these grids were initially placed around entire images, and later on they would be placed around object bounding boxes. 
\u0026nbsp;Here is a picture from Kristen Grauman's famous \u003Ca href=\"http:\/\/www.cs.utexas.edu\/~grauman\/research\/projects\/pmk\/pmk_projectpage.htm\"\u003EPyramid Match Kernel\u003C\/a\u003E paper which introduced a powerful and hierarchical way of integrating spatial information into the image matching process.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan class=\"s1\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/3.bp.blogspot.com\/-5aTdQ2Py6ak\/VKBO33A5xII\/AAAAAAAANyg\/x9TWuramoKw\/s1600\/pmk.jpg\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"213\" src=\"https:\/\/3.bp.blogspot.com\/-5aTdQ2Py6ak\/VKBO33A5xII\/AAAAAAAANyg\/x9TWuramoKw\/s1600\/pmk.jpg\" width=\"320\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003EGrauman's Pyramid Match Kernel for Improved Image Matching\u003C\/div\u003E\u003Cdiv class=\"p1\"\u003E\u003Cspan class=\"s1\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"p2\"\u003E\u003Cspan class=\"s1\"\u003E\u003C\/span\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"p1\"\u003E\u003Cspan class=\"s1\"\u003EAt some point it was not clear whether researchers should focus on better features, better comparison metrics, or better learning. 
\u0026nbsp;In the mid 2000s it wasn't clear if young PhD students should spend more time concocting new descriptors or kernelizing their support vector machines to death.\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"p2\"\u003E\u003Cspan class=\"s1\"\u003E\u003C\/span\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"p1\"\u003E\u003Cspan class=\"s1\"\u003E\u003Cb\u003EObject Templates (aka the reign of HOG and DPM): ~2005 to ~2010\u003C\/b\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"p1\"\u003E\u003Cspan class=\"s1\"\u003EAt around 2005, a young researcher named Navneet Dalal showed the world just what can be done with his own new badass feature descriptor, HOG. \u0026nbsp;(It is sometimes written as HoG, but because it is an acronym for “Histogram of Oriented Gradients” it should really be HOG. The confusion must have come from an earlier approach called DoG which stood for Difference of Gaussian, in which case the “o” should definitely be lower case.)\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"p1\"\u003E\u003Cspan class=\"s1\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/2.bp.blogspot.com\/-7BaGjkSq6rc\/VKBPTRYzNgI\/AAAAAAAANyo\/4VsIBTP-NVY\/s1600\/hog.jpg\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"90\" src=\"https:\/\/2.bp.blogspot.com\/-7BaGjkSq6rc\/VKBPTRYzNgI\/AAAAAAAANyo\/4VsIBTP-NVY\/s1600\/hog.jpg\" width=\"320\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003ENavneet Dalal's HOG Descriptor\u003C\/div\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv class=\"p1\"\u003E\u003Cspan class=\"s1\"\u003EHOG came at the time when everybody was applying spatial binning to bags of words, using multiple layers of learning, and making their systems overly complicated. 
Dalal’s ingenious descriptor was actually quite simple.\u0026nbsp; The seminal HOG paper was published in 2005 by Navneet and his PhD advisor, Bill Triggs. Triggs got his fame from earlier work on geometric vision, and Dr. Dalal got his fame from his newly found descriptor.\u0026nbsp; HOG was initially applied to the problem of pedestrian detection, and one of the reasons it became so popular was that the machine learning tool used on top of HOG was quite simple and well understood, it was the linear Support Vector Machine.\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"p2\"\u003E\u003Cbr \/\u003E\u003Cspan class=\"s1\"\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"p2\"\u003E\u003Cspan class=\"s1\"\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"p1\"\u003E\u003Cspan class=\"s1\"\u003EI should point out that in 2008, a follow-up paper on object detection, which introduced a technique called the Deformable Parts-based Model (or DPM as we vision guys call it), helped reinforce the popularity and strength of the HOG technique. I personally jumped on the HOG bandwagon in about 2008.\u0026nbsp; My first few years as a grad student (2005-2008) I was hackplaying with my own vector quantized filter bank responses, and definitely developed some strong intuition regarding features. \u0026nbsp;In the end I realized that my own features were only \"okay,\" and because I was applying them to the outputs of image segmentation algorithms they were extremely slow.\u0026nbsp; Once I started using HOG, it didn’t take me long to realize there was no going back to custom, slow, features. 
\u0026nbsp;Once I started using a multiscale feature pyramid with a slightly improved version of HOG introduced by master hackers such as Ramanan and Felzenszwalb, I was processing images at 100x the speed of multiple segmentations + custom features (my earlier work).\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/1.bp.blogspot.com\/-9ZrYA5J3R3k\/VKBPu-uaCMI\/AAAAAAAANyw\/FyEgea8HL5o\/s1600\/dpm.jpg\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"152\" src=\"https:\/\/1.bp.blogspot.com\/-9ZrYA5J3R3k\/VKBPu-uaCMI\/AAAAAAAANyw\/FyEgea8HL5o\/s1600\/dpm.jpg\" width=\"200\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003EThe infamous Deformable Part-based Model (for a Person)\u003C\/div\u003E\u003Cdiv class=\"p2\"\u003E\u003Cspan class=\"s1\"\u003E\u003C\/span\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"p1\"\u003E\u003Cspan class=\"s1\"\u003EDPM was the reigning champ on the PASCAL VOC challenge, and one of the reasons why it became so popular was \u003Ci\u003Ethe excellent MATLAB\/C++\u0026nbsp;implementation by Ramanan and Felzenszwalb\u003C\/i\u003E.\u0026nbsp; I still know many researchers who never fully acknowledged what releasing such great code really meant for the fresh generation of incoming PhD students, but at some point it seems like everybody was modifying the DPM codebase for their own CVPR attempts.\u0026nbsp; Too many incoming students were lacking solid software engineering skills and giving them the DPM code was a surefire way to get some experiments up and running.\u0026nbsp; Personally, I never jumped on the parts-based methodology, but I did take apart the DPM codebase several times.\u0026nbsp; However, when I put it back together, the \u003Ca 
href=\"http:\/\/www.cs.cmu.edu\/~tmalisie\/projects\/iccv11\/\"\u003EExemplar-SVM\u003C\/a\u003E was the result.\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"p2\"\u003E\u003Cbr \/\u003E\u003Cspan class=\"s1\"\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"p1\"\u003E\u003Cspan class=\"s1\"\u003E\u003Cb\u003EBig data, Convolutional Neural Networks and the promise of Deep Learning: ~2010 to ~2015\u003C\/b\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"p1\"\u003E\u003Cspan class=\"s1\"\u003ESometime around 2008, it was pretty clear that scientists were getting more and more comfortable with large datasets.\u0026nbsp; It wasn’t just the rise of “Cloud Computing” and “Big Data,” it was the rise of the data scientists.\u0026nbsp; Hacking on equations by morning, developing a prototype during lunch, deploying large scale computations in the evening, and integrating the findings into a production system by sunset.\u0026nbsp; I spent two summers at Google Research, I saw lots of guys who had made their fame as vision hackers.\u0026nbsp; But they weren’t just writing “academic” papers at Google -- sharding datasets with one hand, compiling results for their managers, writing Borg scripts in their sleep, and piping results into gnuplot (because Jedis don’t need GUIs?). It was pretty clear that big data, and a DevOps mentality was here to stay, and the vision researcher of tomorrow would be quite comfortable with large datasets. 
\u0026nbsp;No longer did you need one guy with a mathy PhD, one software engineer, one manager, and one tester.\u0026nbsp; Plenty of guys who could do all of those jobs.\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"p2\"\u003E\u003Cspan class=\"s1\"\u003E\u003C\/span\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"p1\"\u003E\u003Cspan class=\"s1\"\u003E\u003Cb\u003EDeep Learning: 1980s - 2015\u003C\/b\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"p1\"\u003E\u003Cspan class=\"s1\"\u003E2014 was definitely a big year for Deep Learning.\u0026nbsp; What’s interesting about Deep Learning is that it is a very old technique. \u0026nbsp;What we're seeing now is essentially the Neural Network 2.0 revolution -- but this time around, we're 20 years ahead R\u0026amp;D-wise and our computers are orders of magnitude faster. \u0026nbsp;And what’s funny is that the same guys that were championing such techniques in the early 90s were the same guys we were laughing at in the late 90s (because clearly convex methods were superior to the magical NN learning-rate knobs). 
I guess they really had the last laugh because eventually these relentless neural network gurus became the same guys we now all look up to.\u0026nbsp; \u003Cb\u003EGeoffrey Hinton, Yann LeCun, Andrew Ng, and Yoshua Bengio are the 4 Titans of Deep Learning.\u003C\/b\u003E\u0026nbsp; By now, just about everybody has jumped ship to become a champion of Deep Learning.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan class=\"s1\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003Cspan class=\"s1\"\u003EBut with Google, Facebook, Baidu, and a multitude of little startups riding the Deep Learning wave, \u003Cb\u003Ewho will rise to the top as the master of artificial intelligence?\u003C\/b\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"p1\"\u003E\u003Cspan class=\"s1\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cimg border=\"0\" src=\"http:\/\/www.cs.nyu.edu\/~yann\/research\/deep\/images\/ff1.gif\" height=\"320\" width=\"301\" \/\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/www.cs.nyu.edu\/~yann\/research\/deep\/\"\u003EYann's Deep Learning Page\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"p1\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"p2\"\u003E\u003Cb\u003EHow do today's deep learning systems resemble the recognition systems of yesteryear?\u003C\/b\u003E\u003C\/div\u003E\u003Cdiv class=\"p1\"\u003EMultiscale convolutional neural networks aren't that much different than the feature-based systems of the past. \u0026nbsp;The first level neurons in deep learning systems learn to utilize gradients in a way that is similar to hand-crafted features such as SIFT and HOG. \u0026nbsp;Objects used to be found in a sliding-window fashion, but now it is easier and sexier to think of this operation as convolving an image with a filter. 
Some of the best detection systems used to use multiple linear SVMs, combined in some ad-hoc way, and now we are essentially using even more of such linear decision boundaries. \u0026nbsp;Deep learning systems can be thought of as multiple stages of applying linear operators and piping them through a non-linear activation function, but deep learning is more similar to a clever combination of linear SVMs than a memory-ish Kernel-based learning system.\u003Cbr \/\u003E\u003Cbr \/\u003EFeatures these days aren't engineered by hand. \u0026nbsp;However, architectures of Deep systems are still being designed manually -- and it looks like the experts are the best at this task. \u0026nbsp;The operations on the inside of both classic and modern recognition systems are still very much the same. \u0026nbsp;You still need to be clever to play in the game, but \u003Ci\u003Enow you need a big computer\u003C\/i\u003E. There's still a lot of room for improvement, so I encourage all of you to be creative in your research.\u003Cbr \/\u003E\u003Cbr \/\u003EResearch-wise, it never hurts to know where we have been before so that we can better plan for our journey ahead. 
\u0026nbsp;I hope you enjoyed this brief history lesson and the next time you look for insights in your research, don't be afraid to look back.\u003C\/div\u003E\u003Cdiv class=\"p1\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"p1\"\u003E\u003Cspan class=\"s1\"\u003ETo learn more about computer vision techniques:\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"p1\"\u003E\u003Cspan class=\"s1\"\u003E\u003Ca href=\"http:\/\/en.wikipedia.org\/wiki\/Scale-invariant_feature_transform\"\u003ESIFT article on Wikipedia\u003C\/a\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"p1\"\u003E\u003Ca href=\"http:\/\/en.wikipedia.org\/wiki\/Bag-of-words_model_in_computer_vision\"\u003EBag of Words article on Wikipedia\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"p1\"\u003E\u003Ca href=\"http:\/\/en.wikipedia.org\/wiki\/Histogram_of_oriented_gradients\"\u003EHOG article on Wikipedia\u003C\/a\u003E\u003Cbr \/\u003E\u003Ca href=\"http:\/\/www.cs.berkeley.edu\/~rbg\/latent\/\"\u003EDeformable Part-based Model Homepage\u003C\/a\u003E\u003Cbr \/\u003E\u003Ca href=\"http:\/\/www.cs.utexas.edu\/~grauman\/research\/projects\/pmk\/pmk_projectpage.htm\"\u003EPyramid Match Kernel Homepage\u003C\/a\u003E\u003Cbr \/\u003E\u003Ca href=\"http:\/\/www.robots.ox.ac.uk\/~vgg\/research\/vgoogle\/\"\u003E\"Video Google\" Image Retrieval System\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"p1\"\u003E\u003Cbr \/\u003ESome Computer Vision datasets:\u003Cbr \/\u003E\u003Ca href=\"http:\/\/www.vision.caltech.edu\/Image_Datasets\/Caltech101\/\"\u003ECaltech-101 Dataset\u003C\/a\u003E\u003Cbr \/\u003E\u003Ca href=\"http:\/\/www.image-net.org\/\"\u003EImageNet Dataset\u003C\/a\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"p1\"\u003ETo learn about the people mentioned in this article:\u003C\/div\u003E\u003Cdiv class=\"p1\"\u003E\u003Ca href=\"http:\/\/www.cs.utexas.edu\/~grauman\/\"\u003EKristen Grauman\u003C\/a\u003E\u0026nbsp;(creator of Pyramid 
Match Kernel, Prof at Univ of Texas)\u003Cbr \/\u003E\u003Ca href=\"http:\/\/lear.inrialpes.fr\/people\/triggs\/\"\u003EBill Triggs's\u003C\/a\u003E\u0026nbsp;(co-creator of HOG, Researcher at INRIA)\u003Cbr \/\u003E\u003Ca href=\"https:\/\/sites.google.com\/site\/navneetdalal\/\"\u003ENavneet Dalal\u003C\/a\u003E\u0026nbsp;(co-creator of HOG, now at Google)\u003C\/div\u003E\u003Cdiv class=\"p1\"\u003E\u003Ca href=\"http:\/\/yann.lecun.com\/\"\u003EYann LeCun\u003C\/a\u003E\u0026nbsp;(one of the Titans of Deep Learning, at NYU and Facebook)\u003C\/div\u003E\u003Cdiv class=\"p1\"\u003E\u003Ca href=\"http:\/\/www.cs.toronto.edu\/~hinton\/\"\u003EGeoffrey Hinton\u003C\/a\u003E\u0026nbsp;(one of the Titans of Deep Learning, at Univ of Toronto and Google)\u003Cbr \/\u003E\u003Ca href=\"http:\/\/cs.stanford.edu\/people\/ang\/\"\u003EAndrew Ng\u003C\/a\u003E\u0026nbsp;(leading the Deep Learning effort at Baidu, Prof at Stanford)\u003Cbr \/\u003E\u003Ca href=\"http:\/\/www.iro.umontreal.ca\/~bengioy\/yoshua_en\/index.html\"\u003EYoshua Bengio\u003C\/a\u003E\u0026nbsp;(one of the Titans of Deep Learning, Prof at U Montreal)\u003C\/div\u003E\u003Cdiv class=\"p1\"\u003E\u003Ca href=\"http:\/\/www.ics.uci.edu\/~dramanan\/\"\u003EDeva Ramanan\u003C\/a\u003E\u0026nbsp;(one of the creators of DPM, Prof at UC Irvine)\u003Cbr \/\u003E\u003Ca href=\"http:\/\/cs.brown.edu\/~pff\/\"\u003EPedro Felzenszwalb\u003C\/a\u003E\u0026nbsp;(one of the creators of DPM, Prof at Brown)\u003Cbr \/\u003E\u003Ca href=\"http:\/\/vision.stanford.edu\/feifeili\/\"\u003EFei-fei Li\u003C\/a\u003E\u0026nbsp;(Caltech101 and ImageNet, Prof at Stanford)\u003Cbr \/\u003E\u003Ca href=\"http:\/\/www.di.ens.fr\/~josef\/\"\u003EJosef Sivic\u003C\/a\u003E\u0026nbsp;(Video Google and Visual Words, Researcher at INRIA\/ENS)\u003Cbr \/\u003E\u003Ca href=\"http:\/\/en.wikipedia.org\/wiki\/Andrew_Zisserman\"\u003EAndrew Zisserman\u003C\/a\u003E\u0026nbsp;(Geometry-based methods in vision, Prof at Oxford)\u003Cbr 
\/\u003E\u003Ca href=\"http:\/\/www-robotics.jpl.nasa.gov\/people\/Andrew_Johnson\/\"\u003EAndrew E. Johnson\u003C\/a\u003E\u0026nbsp;(SPIN Images creator, Researcher at JPL)\u003Cbr \/\u003E\u003Ca href=\"http:\/\/www.cs.cmu.edu\/~hebert\/\"\u003EMartial Hebert\u003C\/a\u003E\u0026nbsp;(Geometry-based methods in vision, Prof at CMU)\u003Cbr \/\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"p1\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"p1\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"p1\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"p1\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E"},"link":[{"rel":"replies","type":"application/atom+xml","href":"https:\/\/www.computervisionblog.com\/feeds\/2752539545980854411\/comments\/default","title":"Post Comments"},{"rel":"replies","type":"text/html","href":"https:\/\/www.computervisionblog.com\/2015\/01\/from-feature-descriptors-to-deep.html#comment-form","title":"18 Comments"},{"rel":"edit","type":"application/atom+xml","href":"https:\/\/www.blogger.com\/feeds\/15418143\/posts\/default\/2752539545980854411"},{"rel":"self","type":"application/atom+xml","href":"https:\/\/www.blogger.com\/feeds\/15418143\/posts\/default\/2752539545980854411"},{"rel":"alternate","type":"text/html","href":"https:\/\/www.computervisionblog.com\/2015\/01\/from-feature-descriptors-to-deep.html","title":"From feature descriptors to deep learning: 20 years of computer vision"}],"author":[{"name":{"$t":"Tomasz 
Malisiewicz"},"uri":{"$t":"http:\/\/www.blogger.com\/profile\/17507234774392358321"},"email":{"$t":"noreply@blogger.com"},"gd$image":{"rel":"http://schemas.google.com/g/2005#thumbnail","width":"16","height":"16","src":"https:\/\/img1.blogblog.com\/img\/b16-rounded.gif"}}],"media$thumbnail":{"xmlns$media":"http://search.yahoo.com/mrss/","url":"https:\/\/3.bp.blogspot.com\/-2Lw3DxApZrw\/VKBKMeAwTnI\/AAAAAAAANyU\/7IQxfszsclc\/s72-c\/sift_pic.png","height":"72","width":"72"},"thr$total":{"$t":"18"}},{"id":{"$t":"tag:blogger.com,1999:blog-15418143.post-4411090877285667097"},"published":{"$t":"2014-11-27T12:06:00.003-05:00"},"updated":{"$t":"2014-11-27T12:06:40.487-05:00"},"category":[{"scheme":"http://www.blogger.com/atom/ns#","term":"adrian rosebrock"},{"scheme":"http://www.blogger.com/atom/ns#","term":"barcode"},{"scheme":"http://www.blogger.com/atom/ns#","term":"machine learning"},{"scheme":"http://www.blogger.com/atom/ns#","term":"opencv"},{"scheme":"http://www.blogger.com/atom/ns#","term":"pyimagesearch"},{"scheme":"http://www.blogger.com/atom/ns#","term":"python"},{"scheme":"http://www.blogger.com/atom/ns#","term":"training"},{"scheme":"http://www.blogger.com/atom/ns#","term":"vision products"},{"scheme":"http://www.blogger.com/atom/ns#","term":"vision software"},{"scheme":"http://www.blogger.com/atom/ns#","term":"visual training GUI"},{"scheme":"http://www.blogger.com/atom/ns#","term":"VMX"}],"title":{"type":"text","$t":"Barcodes: Realtime Training and Detection with VMX "},"content":{"type":"html","$t":"In this VMX screencast, witness the creation of a visual barcode detection program in under 9 minutes. 
You can see the entire training procedure -- creating an initial data set of labeled barcodes, improving the detector via a 5 minute interactive learning step, and finishing off with a qualitative evaluation of the trained barcode detector.\u003Cbr \/\u003E\u003Cdiv style=\"text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: center;\"\u003E\u003Ciframe allowfullscreen=\"\" frameborder=\"0\" height=\"315\" src=\"\/\/www.youtube.com\/embed\/-0eTty0uRt4\" width=\"420\"\u003E\u003C\/iframe\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003EThe inspiration came after reading Dr. Rosebrock's blog post on detecting barcodes using OpenCV and Python (\u003Ca href=\"http:\/\/www.pyimagesearch.com\/2014\/11\/24\/detecting-barcodes-images-python-opencv\/\"\u003Ehttp:\/\/www.pyimagesearch.com\/2014\/11\/24\/detecting-barcodes-images-python-opencv\/\u003C\/a\u003E). \u0026nbsp;While the code presented in Rosebrock's blog post is quite simple, it is most definitely domain-specific. \u0026nbsp;Different domain-specific programs must be constructed for different objects. \u0026nbsp;In other words, different kinds of morphological operations, features, and thresholds must be used for detecting different objects and it is not even clear how you would construct the rules to detect a complex object such as a \"monkey.\" \u0026nbsp;If you are just getting started with programming and want to learn how to construct some of these domain-specific programs, you're just going to have to subscribe to \u003Ca href=\"http:\/\/www.pyimagesearch.com\/\"\u003Ehttp:\/\/www.pyimagesearch.com\/\u003C\/a\u003E.\u003Cbr \/\u003E\u003Cbr \/\u003EWriting these kinds of vision programs is hard. \u0026nbsp;Unless... you address the problem with some advanced machine learning techniques. 
\u0026nbsp;Applying machine learning to visual problems is \"the backbone\" of what we do at vision.ai and computer vision research has been a personal passion of mine for over a decade. \u0026nbsp;So I decided to take our most recent piece of vision tech for a spin. \u0026nbsp;We try not to code while on vacation (a good team needs good rest), and I don't consider using our GUI-based VMX software as hardcore as \"coding.\" \u0026nbsp;\u003Cb\u003EUnlike traditional vision systems whose operation might leave you with an engineering-hangover, using VMX is more akin to playing Minecraft.\u003C\/b\u003E \u0026nbsp;I figured that playing a video game or two on vacation is permissible. \u003Cbr \/\u003E\u003Cbr \/\u003EEliminating the residual sunscreen from my hands, I rebooted my soul with an iced gulp of Spice Isle Coffee and fired up my trusty Macbook Pro. \u0026nbsp;I then grabbed the first few vacation-themed objects from the kitchen. (\u003Cb\u003EAnd yes, I'm on vacation for Thanksgiving\u003C\/b\u003E -- the objects include canned fruit, sunscreen, and a bottle of booze.) 
\u0026nbsp;Then it was time to throw the barcode detection problem at VMX.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ctable cellpadding=\"0\" cellspacing=\"0\" class=\"tr-caption-container\" style=\"float: right; margin-left: 1em; text-align: right;\"\u003E\u003Ctbody\u003E\u003Ctr\u003E\u003Ctd style=\"text-align: center;\"\u003E\u003Ca href=\"http:\/\/3.bp.blogspot.com\/-mkY7cgywdVY\/VHdXjaw2ctI\/AAAAAAAANx0\/MrytvKq6QnY\/s1600\/bar_code_creation.png\" imageanchor=\"1\" style=\"clear: right; margin-bottom: 1em; margin-left: auto; margin-right: auto;\"\u003E\u003Cimg border=\"0\" src=\"http:\/\/3.bp.blogspot.com\/-mkY7cgywdVY\/VHdXjaw2ctI\/AAAAAAAANx0\/MrytvKq6QnY\/s1600\/bar_code_creation.png\" height=\"193\" width=\"320\" \/\u003E\u003C\/a\u003E\u003C\/td\u003E\u003C\/tr\u003E\u003Ctr\u003E\u003Ctd class=\"tr-caption\" style=\"text-align: center;\"\u003E\u003Cb\u003EStep 1: \u003C\/b\u003EBarcode Initial Selections\u003C\/td\u003E\u003C\/tr\u003E\u003C\/tbody\u003E\u003C\/table\u003E30 seconds worth of initial clicks followed by several minutes worth of waving objects in front of the webcam is not hard work. \u0026nbsp;5 minutes later we have a sexy barcode detector. \u0026nbsp;Not too bad for computer vision in a non-laboratory setting. \u0026nbsp;While on vacation, I don't have access to a lab and neither should you. \u0026nbsp;A sun-filled patio will have to suffice. \u0026nbsp;In fact, it was so bright outside that I had to wear sunglasses the entire time. 
(Towards the end of the video, a \"sunglasses\" detector makes a cameo.)\u003Cbr \/\u003E\u003Cbr \/\u003EPlease note that the barcode is not actually \"read\" (so this program can't tell whether the region corresponds to canned pineapples or sunscreen), the region of interest is simply detected and tracked in real-time.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Ctable cellpadding=\"0\" cellspacing=\"0\" class=\"tr-caption-container\" style=\"float: left; margin-right: 1em; text-align: left;\"\u003E\u003Ctbody\u003E\u003Ctr\u003E\u003Ctd style=\"text-align: center;\"\u003E\u003Ca href=\"http:\/\/1.bp.blogspot.com\/-5t8GBjBVnVQ\/VHdXj4HbatI\/AAAAAAAANx4\/Vr-_HD5J1tQ\/s1600\/bar_code_positives_negatives.png\" imageanchor=\"1\" style=\"clear: left; margin-bottom: 1em; margin-left: auto; margin-right: auto;\"\u003E\u003Cimg border=\"0\" src=\"http:\/\/1.bp.blogspot.com\/-5t8GBjBVnVQ\/VHdXj4HbatI\/AAAAAAAANx4\/Vr-_HD5J1tQ\/s1600\/bar_code_positives_negatives.png\" height=\"166\" width=\"320\" \/\u003E\u003C\/a\u003E\u003C\/td\u003E\u003C\/tr\u003E\u003Ctr\u003E\u003Ctd class=\"tr-caption\" style=\"text-align: center;\"\u003E\u003Cb\u003EFinal Step: \u003C\/b\u003ETweaking Learned Positives and Negatives\u003C\/td\u003E\u003C\/tr\u003E\u003C\/tbody\u003E\u003C\/table\u003EThis video is \u003Cb\u003Ean example of a pure machine-learning based approach to barcode detection\u003C\/b\u003E. \u0026nbsp;The underlying algorithm can be used to learn just about any visual concept you're interested in detecting. \u0026nbsp;A bar code is just like a face or a car -- it is a 2D pattern which can be recognized by machines. \u0026nbsp;Throughout my career I've trained thousands of detectors (mostly in an academic setting). \u0026nbsp;VMX is the most fun with object recognition I've ever had and it lets me train detectors without having to worry about the mathematical details. 
\u0026nbsp;Once you get your own copy of VMX, what will \u003Ci\u003Eyou\u003C\/i\u003E train?\u003Cbr \/\u003E\u003Cbr \/\u003ETo learn how to get your hands on VMX, sign up on the mailing list at\u0026nbsp;\u003Ca href=\"http:\/\/vision.ai\/\"\u003Ehttp:\/\/vision.ai\u003C\/a\u003E\u0026nbsp;or if you're daring enough, you can purchase an early beta license key from\u0026nbsp;\u003Ca href=\"https:\/\/beta.vision.ai\/\"\u003Ehttps:\/\/beta.vision.ai\u003C\/a\u003E.\u003Cbr \/\u003E\u003Cbr \/\u003ESo what's next? \u0026nbsp;Should I build\u0026nbsp;\u003Ci\u003Ea boat detector\u003C\/i\u003E? Maybe I should train a detector to \u003Ci\u003Elet me know when I run low on Spice Isle Coffee\u003C\/i\u003E? Or how about going on a field trip and \u003Ci\u003Ecounting bikinis on the beach\u003C\/i\u003E?"},"link":[{"rel":"replies","type":"application/atom+xml","href":"https:\/\/www.computervisionblog.com\/feeds\/4411090877285667097\/comments\/default","title":"Post Comments"},{"rel":"replies","type":"text/html","href":"https:\/\/www.computervisionblog.com\/2014\/11\/barcodes-realtime-training-and.html#comment-form","title":"5 Comments"},{"rel":"edit","type":"application/atom+xml","href":"https:\/\/www.blogger.com\/feeds\/15418143\/posts\/default\/4411090877285667097"},{"rel":"self","type":"application/atom+xml","href":"https:\/\/www.blogger.com\/feeds\/15418143\/posts\/default\/4411090877285667097"},{"rel":"alternate","type":"text/html","href":"https:\/\/www.computervisionblog.com\/2014\/11\/barcodes-realtime-training-and.html","title":"Barcodes: Realtime Training and Detection with VMX "}],"author":[{"name":{"$t":"Tomasz 
Malisiewicz"},"uri":{"$t":"http:\/\/www.blogger.com\/profile\/17507234774392358321"},"email":{"$t":"noreply@blogger.com"},"gd$image":{"rel":"http://schemas.google.com/g/2005#thumbnail","width":"16","height":"16","src":"https:\/\/img1.blogblog.com\/img\/b16-rounded.gif"}}],"media$thumbnail":{"xmlns$media":"http://search.yahoo.com/mrss/","url":"http:\/\/3.bp.blogspot.com\/-mkY7cgywdVY\/VHdXjaw2ctI\/AAAAAAAANx0\/MrytvKq6QnY\/s72-c\/bar_code_creation.png","height":"72","width":"72"},"thr$total":{"$t":"5"}},{"id":{"$t":"tag:blogger.com,1999:blog-15418143.post-6921076024767410776"},"published":{"$t":"2014-10-26T11:07:00.001-05:00"},"updated":{"$t":"2014-10-26T11:25:37.132-05:00"},"category":[{"scheme":"http://www.blogger.com/atom/ns#","term":"software"},{"scheme":"http://www.blogger.com/atom/ns#","term":"training"},{"scheme":"http://www.blogger.com/atom/ns#","term":"vision"},{"scheme":"http://www.blogger.com/atom/ns#","term":"vision.ai"},{"scheme":"http://www.blogger.com/atom/ns#","term":"VMX"}],"title":{"type":"text","$t":"VMX is ready"},"content":{"type":"html","$t":"I haven't posted anything here in the last few months, so let me give you guys a brief update. VMX has matured since the Prototype stage last year and the vision.ai team has already started circulating some beta versions of our software. \u003Cbr \/\u003E\u003Cdiv\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv\u003EFor those of you who don't remember, last year I decided to leave my super-academic life at MIT and go the startup-route focusing on vision, learning, and automation. \u0026nbsp;Our goal is to make building and deploying vision applications as easy as pie. We want to be the Heroku of computer vision. \u0026nbsp;Personally, I've always wanted to expose the magic of vision to a broader audience. 
\u0026nbsp;I don't know if the robots of the future are going to have two legs, four arms, or they will forever be airborne -- but I can tell you that these creatures are going to have to perceive the world around them. 2014 is not a bad place to be for a vision company.\u003Cbr \/\u003E\u003Cdiv\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv\u003EVMX, the suite of vision and automation tools which we showcased last year in our Kickstarter campaign, \u003Cb\u003Eis going live very soon\u003C\/b\u003E. \u0026nbsp;VMX will be vision.ai's first product. \u0026nbsp;While VMX doesn't do everything vision-related (there's OpenCV for that), it makes training visual object detectors really easy. \u0026nbsp;Whether you're just starting out with vision or AI, have a killer vision-app idea, want to automate more things in your home, you're gonna want to experience VMX yourself.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/1.bp.blogspot.com\/-cDl8M5YIiLM\/VE0gddz1tOI\/AAAAAAAANxI\/xvhKZs3plLU\/s1600\/vmx_pkg_image.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" src=\"http:\/\/1.bp.blogspot.com\/-cDl8M5YIiLM\/VE0gddz1tOI\/AAAAAAAANxI\/xvhKZs3plLU\/s1600\/vmx_pkg_image.png\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv\u003EWe will be providing a \u003Cb\u003Enative installer for Mac OS X\u003C\/b\u003E as well as single command \u003Cb\u003Einstaller for Linux machines based on Docker\u003C\/b\u003E. VMX will run on your machine without an internet connection (the download plus all dependencies plus all necessary pre-trained files is approximately 2GB and an activation license will cost between $100 and $1000). 
\u0026nbsp;The VMX App Builder runs in your browser, is built in AngularJS, and our REST API will allow you to write your own scripts\/apps in any language you like. \u0026nbsp;We even have lots of command line examples if you're a curl-y kind of guy\/gal. If there's sufficient demand, we'll work on a native Windows installer.\u0026nbsp;\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv\u003EWe have been letting some of our close friends and colleagues beta-test our software and we're confident you're going to love it. \u0026nbsp;If you would like to beta-test our software, please sign up on the vision.ai mailing list and send us a beta-key request. \u0026nbsp;We have a limited number of beta-testing keys, so I'm sorry if we don't get back to you. \u0026nbsp;If you want a hands-on demo by one of the VMX creators, we are more than happy to take a hacking break and show off some VMX magic. \u0026nbsp;We can be found in Boston, MA and\/or Burlington, VT. \u0026nbsp;If you're thinking of competing in a Hackathon near one of our offices, drop us a line, we'll try to send a vision.ai jedi your way.\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv\u003EGeoff has been championing Docker for the last year and he's done amazing things Dockerizing our build pipeline while I refactored the vision engine API using some ideas I picked up from Haskell, and made considerable performance tweaks to the underlying learning algorithm. 
\u0026nbsp;I spent a few months toying with different deep network representations, and modernized the internal representation so I can find another deep learning guru to help us out with R\u0026amp;D in 2015.\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/3.bp.blogspot.com\/-NOj7R9hOfew\/VEwhgW1tw0I\/AAAAAAAANw0\/XZwqWDut6a8\/s1600\/vmx_server_running.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" src=\"http:\/\/3.bp.blogspot.com\/-NOj7R9hOfew\/VEwhgW1tw0I\/AAAAAAAANw0\/XZwqWDut6a8\/s1600\/vmx_server_running.png\" height=\"209\" width=\"320\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E4 VMXserver processes running on Macbook Pro\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv\u003EWe're going to release plenty of pre-trained models plus all the tools and video tutorials you'll need to create your own models from scratch. \u0026nbsp;\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv\u003E\u003Cdiv\u003EWe will be offering a $100 personal license and a $1000 professional license of VMX. \u0026nbsp;Beta testers get a personal license in return for helping find installation bugs. Internally, we are at version 0.1.3 of VMX and once we attain 90%+ code coverage we will have VMX 1.0 sometime in early 2015. 
\u0026nbsp;We typically release stable versions every 1 months and bleeding edge development builds every week.\u0026nbsp;\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv\u003E\u003Ci\u003EThe future of vision.ai\u0026nbsp;\u003C\/i\u003E\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv\u003EIn the upcoming months, we'll be perfecting our cloud-based deployment platform, so if you're interested in building on top of our vision.ai infrastructure or want to have fun running some massively parallel vision computations with us, just shoot us an email.\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003Cdiv\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/div\u003E\u003C\/div\u003E"},"link":[{"rel":"replies","type":"application/atom+xml","href":"https:\/\/www.computervisionblog.com\/feeds\/6921076024767410776\/comments\/default","title":"Post Comments"},{"rel":"replies","type":"text/html","href":"https:\/\/www.computervisionblog.com\/2014\/10\/vmx-is-ready.html#comment-form","title":"3 Comments"},{"rel":"edit","type":"application/atom+xml","href":"https:\/\/www.blogger.com\/feeds\/15418143\/posts\/default\/6921076024767410776"},{"rel":"self","type":"application/atom+xml","href":"https:\/\/www.blogger.com\/feeds\/15418143\/posts\/default\/6921076024767410776"},{"rel":"alternate","type":"text/html","href":"https:\/\/www.computervisionblog.com\/2014\/10\/vmx-is-ready.html","title":"VMX is ready"}],"author":[{"name":{"$t":"Tomasz 
Malisiewicz"},"uri":{"$t":"http:\/\/www.blogger.com\/profile\/17507234774392358321"},"email":{"$t":"noreply@blogger.com"},"gd$image":{"rel":"http://schemas.google.com/g/2005#thumbnail","width":"16","height":"16","src":"https:\/\/img1.blogblog.com\/img\/b16-rounded.gif"}}],"media$thumbnail":{"xmlns$media":"http://search.yahoo.com/mrss/","url":"http:\/\/1.bp.blogspot.com\/-cDl8M5YIiLM\/VE0gddz1tOI\/AAAAAAAANxI\/xvhKZs3plLU\/s72-c\/vmx_pkg_image.png","height":"72","width":"72"},"thr$total":{"$t":"3"}},{"id":{"$t":"tag:blogger.com,1999:blog-15418143.post-4740690553547688086"},"published":{"$t":"2014-01-20T16:00:00.003-05:00"},"updated":{"$t":"2014-01-20T16:01:07.363-05:00"},"category":[{"scheme":"http://www.blogger.com/atom/ns#","term":"kickstarter"},{"scheme":"http://www.blogger.com/atom/ns#","term":"object detection"},{"scheme":"http://www.blogger.com/atom/ns#","term":"reward"},{"scheme":"http://www.blogger.com/atom/ns#","term":"smile detection"},{"scheme":"http://www.blogger.com/atom/ns#","term":"sponsor"},{"scheme":"http://www.blogger.com/atom/ns#","term":"VMX"}],"title":{"type":"text","$t":"Sponsor Your Favorite Object Detector + VMX Smile Detector"},"content":{"type":"html","$t":"\u003Cspan style=\"background-color: white; color: #0b1902; font-family: 'Helvetica Neue', Helvetica, Arial, 'Liberation Sans', FreeSans, sans-serif; font-size: 16.363636016845703px; line-height: 21.81818199157715px;\"\u003EMany of you asked if the \u003Ca href=\"http:\/\/www.kickstarter.com\/projects\/visionai\/vmx-project-computer-vision-for-everyone\"\u003EVMX Project\u003C\/a\u003E will come with an initial set of object detectors. 
Yes!\u0026nbsp;\u003C\/span\u003E\u003Cb style=\"background-color: white; border: 0px; color: #0b1902; font-family: 'Helvetica Neue', Helvetica, Arial, 'Liberation Sans', FreeSans, sans-serif; font-size: 16.363636016845703px; line-height: 21.81818199157715px; margin: 0px; padding: 0px; vertical-align: baseline;\"\u003EVMX will come equipped with a library of pre-trained object detectors.\u003C\/b\u003E\u003Cspan style=\"background-color: white; color: #0b1902; font-family: 'Helvetica Neue', Helvetica, Arial, 'Liberation Sans', FreeSans, sans-serif; font-size: 16.363636016845703px; line-height: 21.81818199157715px;\"\u003E\u0026nbsp;We are committed to providing you with an amazing VMX computer vision experience and want to give you as much as possible when you start using VMX.\u003C\/span\u003E\u003Cbr \/\u003E\u003Cspan style=\"background-color: white; color: #0b1902; font-family: 'Helvetica Neue', Helvetica, Arial, 'Liberation Sans', FreeSans, sans-serif; font-size: 16.363636016845703px; line-height: 21.81818199157715px;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/3.bp.blogspot.com\/-0uthjIjAKCU\/Ut2MymHlLBI\/AAAAAAAANZM\/FzgJD4FBEQE\/s1600\/sponsor.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" src=\"http:\/\/3.bp.blogspot.com\/-0uthjIjAKCU\/Ut2MymHlLBI\/AAAAAAAANZM\/FzgJD4FBEQE\/s1600\/sponsor.png\" height=\"364\" width=\"640\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cspan style=\"background-color: white; color: #0b1902; font-family: 'Helvetica Neue', Helvetica, Arial, 'Liberation Sans', FreeSans, sans-serif; font-size: 16.363636016845703px; line-height: 21.81818199157715px;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003Cbr \/\u003E\u003Cdiv style=\"background-color: white; border: 0px; color: #0b1902; font-family: 'Helvetica Neue', Helvetica, Arial, 'Liberation Sans', FreeSans, 
sans-serif; font-size: 16.363636016845703px; line-height: 21.81818199157715px; margin-bottom: 20px; padding: 0px; vertical-align: baseline;\"\u003EToday, we’d like to introduce a special “\u003Cb style=\"border: 0px; font-family: inherit; font-size: 16.363636016845703px; font-style: inherit; font-variant: inherit; line-height: inherit; margin: 0px; padding: 0px; vertical-align: baseline;\"\u003Esponsor your favorite object detector\u003C\/b\u003E” reward. We’re introducing a\u0026nbsp;\u003Cb style=\"border: 0px; font-family: inherit; font-size: 16.363636016845703px; font-style: inherit; font-variant: inherit; line-height: inherit; margin: 0px; padding: 0px; vertical-align: baseline;\"\u003Enew $300 pledge level\u003C\/b\u003E\u0026nbsp;to our Kickstarter page, one which lets you sponsor an object detector that will come inside the VMX pre-trained object library. In addition to sponsorship, you will obtain all the other perks of being a $300 level backer: 650 Compute Hours, a local VMX install, early-access, the VMX cookbook, and VMX developer status. By sponsoring a detector,\u0026nbsp;\u003Cb style=\"border: 0px; font-family: inherit; font-size: 16.363636016845703px; font-style: inherit; font-variant: inherit; line-height: inherit; margin: 0px; padding: 0px; vertical-align: baseline;\"\u003Eyour name will appear inside the model library when a VMX user mouses over your favorite detector.\u003C\/b\u003E\u0026nbsp;This is your chance to make a pledge which will have an ever-lasting effect on our project. Consider the number of people that at some point use a generic car detector! Each time they visit the VMX model library, you will have your own claim to fame. 
“\u003Ci style=\"border: 0px; font-family: inherit; font-size: 16.363636016845703px; font-variant: inherit; font-weight: inherit; line-height: inherit; margin: 0px; padding: 0px; vertical-align: baseline;\"\u003ELook mom, I sponsored the car detector!\u003C\/i\u003E”\u0026nbsp;\u003C\/div\u003E\u003Cdiv style=\"background-color: white; border: 0px; color: #0b1902; font-family: 'Helvetica Neue', Helvetica, Arial, 'Liberation Sans', FreeSans, sans-serif; font-size: 16.363636016845703px; line-height: 21.81818199157715px; margin-bottom: 20px; padding: 0px; vertical-align: baseline;\"\u003EWe have\u0026nbsp;\u003Cb style=\"border: 0px; font-family: inherit; font-size: 16.363636016845703px; font-style: inherit; font-variant: inherit; line-height: inherit; margin: 0px; padding: 0px; vertical-align: baseline;\"\u003E100 slots for the $300\u003C\/b\u003E\u0026nbsp;“sponsor an object detector reward,” and the name of the backer sponsoring the an object detector will appear as you mouseover the object model in the model library. This way, your name will be inside the VMX webapp model library, in addition to the wall of backers on our company page. 
You will be able to choose your name, your best friend’s name, your twitter handle (such as @quantombone), or your nickname.\u0026nbsp;\u003Ci style=\"border: 0px; font-family: inherit; font-size: 16.363636016845703px; font-variant: inherit; font-weight: inherit; line-height: inherit; margin: 0px; padding: 0px; vertical-align: baseline;\"\u003ESorry, no profanity allowed.\u003C\/i\u003E\u003C\/div\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/4.bp.blogspot.com\/-GbBJ5U7wD9Q\/Ut2OB1Ais5I\/AAAAAAAANZk\/4UwQL44m9cM\/s1600\/sponsor2.jpg\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" src=\"http:\/\/4.bp.blogspot.com\/-GbBJ5U7wD9Q\/Ut2OB1Ais5I\/AAAAAAAANZk\/4UwQL44m9cM\/s1600\/sponsor2.jpg\" height=\"191\" width=\"200\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white; border: 0px; color: #0b1902; font-family: 'Helvetica Neue', Helvetica, Arial, 'Liberation Sans', FreeSans, sans-serif; font-size: 16.363636016845703px; line-height: 21.81818199157715px; margin-bottom: 20px; padding: 0px; vertical-align: baseline;\"\u003E\u003Cspan style=\"font-size: 16.363636016845703px;\"\u003EWe will release the list of 100 object detectors which will come with VMX at the end of January.\u0026nbsp;\u003C\/span\u003E\u003Cb style=\"border: 0px; font-family: inherit; font-size: 16.363636016845703px; font-style: inherit; font-variant: inherit; line-height: inherit; margin: 0px; padding: 0px; vertical-align: baseline;\"\u003ESponsors will get the chance to choose their object detectors on a first-come-first-serve basis.\u003C\/b\u003E\u003Cspan style=\"font-size: 16.363636016845703px;\"\u003E\u0026nbsp;If you are the first one to become a sponsor, you will get to choose “face,” “car,” “guitar” or whatever 
other object you might be excited about! As always, you can change your pledge level and reward. \u0026nbsp;So act now and don’t forget that by sponsoring an object detector you are supporting our dream project come to life!\u0026nbsp;\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white; border: 0px; color: #0b1902; font-family: 'Helvetica Neue', Helvetica, Arial, 'Liberation Sans', FreeSans, sans-serif; font-size: 16.363636016845703px; line-height: 21.81818199157715px; margin-bottom: 20px; padding: 0px; vertical-align: baseline;\"\u003E\u003Ci style=\"border: 0px; font-family: inherit; font-size: 16.363636016845703px; font-variant: inherit; font-weight: inherit; line-height: inherit; margin: 0px; padding: 0px; vertical-align: baseline;\"\u003E\u003C\/i\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white; border: 0px; color: #0b1902; font-family: 'Helvetica Neue', Helvetica, Arial, 'Liberation Sans', FreeSans, sans-serif; font-size: 16.363636016845703px; line-height: 21.81818199157715px; margin-bottom: 20px; padding: 0px; vertical-align: baseline;\"\u003EAnd for those of you interested in seeing more VMX action shots, here's a new video showing off VMX detecting smiles. \u0026nbsp;This one was taken with Tom's iPhone because the screencapture software on his computer slows everything down. \u0026nbsp;No post-processing, this is as fast as the prototype runs. 
Enjoy!\u003C\/div\u003E\u003Cdiv style=\"text-align: center;\"\u003E\u003Ciframe allowfullscreen=\"\" frameborder=\"0\" height=\"315\" src=\"\/\/www.youtube.com\/embed\/DmAwQu20anQ\" width=\"560\"\u003E\u003C\/iframe\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E(Cross-posted from the\u003Ca href=\"http:\/\/www.kickstarter.com\/projects\/visionai\/vmx-project-computer-vision-for-everyone\/posts\/724798\"\u003E VMX Project Kickstarter Update #10\u003C\/a\u003E)"},"link":[{"rel":"replies","type":"application/atom+xml","href":"https:\/\/www.computervisionblog.com\/feeds\/4740690553547688086\/comments\/default","title":"Post Comments"},{"rel":"replies","type":"text/html","href":"https:\/\/www.computervisionblog.com\/2014\/01\/sponsor-your-favorite-object-detector.html#comment-form","title":"3 Comments"},{"rel":"edit","type":"application/atom+xml","href":"https:\/\/www.blogger.com\/feeds\/15418143\/posts\/default\/4740690553547688086"},{"rel":"self","type":"application/atom+xml","href":"https:\/\/www.blogger.com\/feeds\/15418143\/posts\/default\/4740690553547688086"},{"rel":"alternate","type":"text/html","href":"https:\/\/www.computervisionblog.com\/2014\/01\/sponsor-your-favorite-object-detector.html","title":"Sponsor Your Favorite Object Detector + VMX Smile Detector"}],"author":[{"name":{"$t":"Tomasz 
Malisiewicz"},"uri":{"$t":"http:\/\/www.blogger.com\/profile\/17507234774392358321"},"email":{"$t":"noreply@blogger.com"},"gd$image":{"rel":"http://schemas.google.com/g/2005#thumbnail","width":"16","height":"16","src":"https:\/\/img1.blogblog.com\/img\/b16-rounded.gif"}}],"media$thumbnail":{"xmlns$media":"http://search.yahoo.com/mrss/","url":"http:\/\/3.bp.blogspot.com\/-0uthjIjAKCU\/Ut2MymHlLBI\/AAAAAAAANZM\/FzgJD4FBEQE\/s72-c\/sponsor.png","height":"72","width":"72"},"thr$total":{"$t":"3"}},{"id":{"$t":"tag:blogger.com,1999:blog-15418143.post-2089670277598138487"},"published":{"$t":"2014-01-14T07:40:00.001-05:00"},"updated":{"$t":"2014-01-14T07:40:39.705-05:00"},"category":[{"scheme":"http://www.blogger.com/atom/ns#","term":"computer vision"},{"scheme":"http://www.blogger.com/atom/ns#","term":"education"},{"scheme":"http://www.blogger.com/atom/ns#","term":"high school"},{"scheme":"http://www.blogger.com/atom/ns#","term":"kickstarter"},{"scheme":"http://www.blogger.com/atom/ns#","term":"research"},{"scheme":"http://www.blogger.com/atom/ns#","term":"stem"},{"scheme":"http://www.blogger.com/atom/ns#","term":"students"},{"scheme":"http://www.blogger.com/atom/ns#","term":"VMX project"}],"title":{"type":"text","$t":"10% of our Kickstarter campaign total will go to free High School Student technology licenses"},"content":{"type":"html","$t":"\u003Cdiv style=\"background-color: white; border: 0px; color: #0b1902; font-family: 'Helvetica Neue', Helvetica, Arial, 'Liberation Sans', FreeSans, sans-serif; font-size: 16.363636016845703px; line-height: 21.81818199157715px; margin-bottom: 20px; padding: 0px; vertical-align: baseline;\"\u003EDear Kickstarters, technology enthusiasts, and STEM educators,\u003C\/div\u003E\u003Cdiv style=\"background-color: white; border: 0px; color: #0b1902; font-family: 'Helvetica Neue', Helvetica, Arial, 'Liberation Sans', FreeSans, sans-serif; font-size: 16.363636016845703px; line-height: 21.81818199157715px; margin-bottom: 20px; padding: 0px; 
vertical-align: baseline;\"\u003EWe’re happy to announce a new reward in our Kickstarter project, one designed for free access of our robotic vision technology to high school students. If we reach our Kickstarter campaign milestone of $100K,\u0026nbsp;\u003Cb style=\"border: 0px; font-family: inherit; font-size: 16.363636016845703px; font-style: inherit; font-variant: inherit; line-height: inherit; margin: 0px; padding: 0px; vertical-align: baseline;\"\u003Ewe will give 10% of Kickstarter generated funds to high school students and clubs in the form of software licenses\u003C\/b\u003E. $100K raised will translate to 100 single-machine VMX licenses given out to 100 different high schools and clubs during the Summer of 2014, free of charge. Optionally, qualifying high schools can choose to claim 100 VMX Compute hours if they have a problem with local performance, don’t have access to a Linux machine, and\/or their security policy doesn’t allow virtual machines.\u003C\/div\u003E\u003Cdiv style=\"background-color: white; border: 0px; color: #0b1902; font-family: 'Helvetica Neue', Helvetica, Arial, 'Liberation Sans', FreeSans, sans-serif; font-size: 16.363636016845703px; line-height: 21.81818199157715px; margin-bottom: 20px; padding: 0px; vertical-align: baseline;\"\u003EOur Kickstarter project, the VMX Project, is an easy-to-use and fully trainable computer vision programming environment. With VMX, you can teach your computer to recognize objects using the webcam. We’ve already surpassed the 30% funding milestone and generated lots of great ideas from our community. Ideas ranging from medical disease diagnosis and 3D object reconstruction to smart wine inventory management. By bringing a computer vision app-building environment to students, we’re excited about the prospect of giving teens a sandbox for innovation -- an ecosystem to achieve their own technology-oriented Eureka moments. 
So whether a student decides to study computer science in college or comes up with the next great startup idea, we want to give them a headache-free entry to the world of computer vision.\u003C\/div\u003E\u003Cdiv style=\"background-color: white; border: 0px; color: #0b1902; font-family: 'Helvetica Neue', Helvetica, Arial, 'Liberation Sans', FreeSans, sans-serif; font-size: 16.363636016845703px; line-height: 21.81818199157715px; margin-bottom: 20px; padding: 0px; vertical-align: baseline;\"\u003EIf you want to learn more about the VMX Project, please see our Kickstarter page:\u0026nbsp;\u003C\/div\u003E\u003Cdiv style=\"background-color: white; border: 0px; color: #0b1902; font-family: 'Helvetica Neue', Helvetica, Arial, 'Liberation Sans', FreeSans, sans-serif; font-size: 16.363636016845703px; line-height: 21.81818199157715px; margin-bottom: 20px; padding: 0px; vertical-align: baseline;\"\u003E\u003Ca href=\"http:\/\/www.kickstarter.com\/projects\/visionai\/vmx-project-computer-vision-for-everyone\" style=\"border: 0px; color: #0096bb; font-family: inherit; font-size: 16.363636016845703px; font-style: inherit; font-variant: inherit; font-weight: inherit; line-height: inherit; margin: 0px; padding: 0px; text-decoration: none; vertical-align: baseline;\" target=\"_blank\"\u003Ehttp:\/\/www.kickstarter.com\/projects\/visionai\/vmx-project-computer-vision-for-everyone\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv style=\"background-color: white; border: 0px; color: #0b1902; font-family: 'Helvetica Neue', Helvetica, Arial, 'Liberation Sans', FreeSans, sans-serif; font-size: 16.363636016845703px; line-height: 21.81818199157715px; margin-bottom: 20px; padding: 0px; vertical-align: baseline;\"\u003E\u003Cb style=\"border: 0px; font-family: inherit; font-size: 16.363636016845703px; font-style: inherit; font-variant: inherit; line-height: inherit; margin: 0px; padding: 0px; vertical-align: baseline;\"\u003EThe VMX High School Program\u003C\/b\u003E\u0026nbsp;is designed to give a 
limited number of students and student clubs free access to VMX in-browser object recognition technology. We understand that “Computer Vision for Everyone” needs to include a broader range of individuals, individuals with little or no spending income. We’re committed to letting those who can be most influenced by new technology, the young innovators inside our classrooms, get access to our technology.\u003C\/div\u003E\u003Cdiv style=\"background-color: white; border: 0px; color: #0b1902; font-family: 'Helvetica Neue', Helvetica, Arial, 'Liberation Sans', FreeSans, sans-serif; font-size: 16.363636016845703px; line-height: 21.81818199157715px; margin-bottom: 20px; padding: 0px; vertical-align: baseline;\"\u003EBy supporting our Kickstarter campaign, you are backing our vision of bringing computer vision technology to the masses. So whether you want VMX for your own creative use or want to give something to your community, we hope you’ll appreciate our new VMX Project High School Program reward and back our project. In addition, backers of our project will be able to donate any of their unused Compute Hours into the Eureka fund so that additional high school students get access to our technology.\u003C\/div\u003E\u003Cdiv style=\"background-color: white; border: 0px; color: #0b1902; font-family: 'Helvetica Neue', Helvetica, Arial, 'Liberation Sans', FreeSans, sans-serif; font-size: 16.363636016845703px; line-height: 21.81818199157715px; margin-bottom: 20px; padding: 0px; vertical-align: baseline;\"\u003EIf you are a high school student or high school teacher and would like to get some cool computer vision technology for your school, please send an email to “admin@vision.ai” with “VMX Project High School Program” in the title, briefly describing what you’d like to do with VMX, your age, and school name. 
To generate interest among your students and friends, share our VMX Kickstarter video with your classroom and have one of your students email us with their idea.\u003C\/div\u003E\u003Cdiv style=\"background-color: white; border: 0px; color: #0b1902; font-family: 'Helvetica Neue', Helvetica, Arial, 'Liberation Sans', FreeSans, sans-serif; font-size: 16.363636016845703px; line-height: 21.81818199157715px; margin-bottom: 20px; padding: 0px; vertical-align: baseline;\"\u003EKickstarter is\u0026nbsp;\u003Cb style=\"border: 0px; font-family: inherit; font-size: 16.363636016845703px; font-style: inherit; font-variant: inherit; line-height: inherit; margin: 0px; padding: 0px; vertical-align: baseline;\"\u003Eall-or-nothing\u003C\/b\u003E, so we need to reach the $100K funding milestone to make this project a reality.\u003C\/div\u003E\u003Cdiv style=\"background-color: white; border: 0px; color: #0b1902; font-family: 'Helvetica Neue', Helvetica, Arial, 'Liberation Sans', FreeSans, sans-serif; font-size: 16.363636016845703px; line-height: 21.81818199157715px; margin-bottom: 20px; padding: 0px; vertical-align: baseline;\"\u003EWe are excited that as software developers, our creations have the potential to spread rapidly. 
But we want to make sure that one of valuable demographics, creative high school students, isn’t left-behind.\u0026nbsp;\u003Cb style=\"border: 0px; font-family: inherit; font-size: 16.363636016845703px; font-style: inherit; font-variant: inherit; line-height: inherit; margin: 0px; padding: 0px; vertical-align: baseline;\"\u003EHelp spread the word about VMX\u0026nbsp;\u003C\/b\u003Eusing social networking and let’s make 2014 the year of new technology by bringing computer vision technology to the masses.\u003C\/div\u003E\u003Cdiv style=\"background-color: white; border: 0px; color: #0b1902; font-family: 'Helvetica Neue', Helvetica, Arial, 'Liberation Sans', FreeSans, sans-serif; font-size: 16.363636016845703px; line-height: 21.81818199157715px; margin-bottom: 20px; padding: 0px; vertical-align: baseline;\"\u003ESincerely,\u0026nbsp;\u003C\/div\u003E\u003Cdiv style=\"background-color: white; border: 0px; color: #0b1902; font-family: 'Helvetica Neue', Helvetica, Arial, 'Liberation Sans', FreeSans, sans-serif; font-size: 16.363636016845703px; line-height: 21.81818199157715px; margin-bottom: 20px; padding: 0px; vertical-align: baseline;\"\u003ETomasz Malisiewicz, PhD\u0026nbsp;\u003C\/div\u003E\u003Cdiv style=\"background-color: white; border: 0px; color: #0b1902; font-family: 'Helvetica Neue', Helvetica, Arial, 'Liberation Sans', FreeSans, sans-serif; font-size: 16.363636016845703px; line-height: 21.81818199157715px; margin-bottom: 20px; padding: 0px; vertical-align: baseline;\"\u003ECo-Founder of vision.ai\u003C\/div\u003E\u003Cdiv style=\"background-color: white; border: 0px; color: #0b1902; font-family: 'Helvetica Neue', Helvetica, Arial, 'Liberation Sans', FreeSans, sans-serif; font-size: 16.363636016845703px; line-height: 21.81818199157715px; margin-bottom: 20px; padding: 0px; vertical-align: baseline;\"\u003E\u003Ca href=\"http:\/\/www.kickstarter.com\/projects\/visionai\/vmx-project-computer-vision-for-everyone\/posts\/719463\" style=\"font-size: 
medium;\"\u003E\u003Cspan style=\"font-size: xx-small;\"\u003E(Cross-posted from VMX Project kickstarter blog Entry #8)\u003C\/span\u003E\u003C\/a\u003E\u003C\/div\u003E"},"link":[{"rel":"replies","type":"application/atom+xml","href":"https:\/\/www.computervisionblog.com\/feeds\/2089670277598138487\/comments\/default","title":"Post Comments"},{"rel":"replies","type":"text/html","href":"https:\/\/www.computervisionblog.com\/2014\/01\/10-of-our-kickstarter-campaign-total.html#comment-form","title":"2 Comments"},{"rel":"edit","type":"application/atom+xml","href":"https:\/\/www.blogger.com\/feeds\/15418143\/posts\/default\/2089670277598138487"},{"rel":"self","type":"application/atom+xml","href":"https:\/\/www.blogger.com\/feeds\/15418143\/posts\/default\/2089670277598138487"},{"rel":"alternate","type":"text/html","href":"https:\/\/www.computervisionblog.com\/2014\/01\/10-of-our-kickstarter-campaign-total.html","title":"10% of our Kickstarter campaign total will go to free High School Student technology licenses"}],"author":[{"name":{"$t":"Tomasz Malisiewicz"},"uri":{"$t":"http:\/\/www.blogger.com\/profile\/17507234774392358321"},"email":{"$t":"noreply@blogger.com"},"gd$image":{"rel":"http://schemas.google.com/g/2005#thumbnail","width":"16","height":"16","src":"https:\/\/img1.blogblog.com\/img\/b16-rounded.gif"}}],"thr$total":{"$t":"2"}},{"id":{"$t":"tag:blogger.com,1999:blog-15418143.post-1271268792821663535"},"published":{"$t":"2014-01-12T18:37:00.000-05:00"},"updated":{"$t":"2016-06-13T07:47:15.836-05:00"},"category":[{"scheme":"http://www.blogger.com/atom/ns#","term":"face detection"},{"scheme":"http://www.blogger.com/atom/ns#","term":"face recognition"},{"scheme":"http://www.blogger.com/atom/ns#","term":"kickstarter"},{"scheme":"http://www.blogger.com/atom/ns#","term":"VMX project"}],"title":{"type":"text","$t":"Can a person-specific face recognition algorithm be used to determine a person's race?"},"content":{"type":"html","$t":"It's a valid question: can a 
person-specific face recognition algorithm be used to determine a person's race?\u003Cbr \/\u003E\u003Cbr \/\u003EI trained \u003Cb\u003Etwo separate person-specific face detectors\u003C\/b\u003E. \u0026nbsp;For each detector I used videos of the target person's face to generate positive examples and faces from [google image search for \"faces\"] as negative examples. \u0026nbsp;This is a fairly straightforward machine learning problem: find a decision boundary between the positive examples and the negative examples. \u0026nbsp;I used the VMX Project recognition algorithm which learns from videos with minimal human supervision. \u0026nbsp;In both cases, I used the VMX webapp for training (training each detector took about ~20 minutes from scratch). \u0026nbsp;In fact, I didn't even have to touch the command line. \u0026nbsp;Since videos were used as input, what I created are essentially full-blown sliding window detectors, meaning that they scan an entire image and can even find small faces. I then ran this detector on the large average male face image. \u0026nbsp;This average face image has been around the internet for a while now and it was created by averaging people's faces. \u0026nbsp;By running the algorithm on this one image, it analyzed all of the faces contained inside and I was able to see which country returned the highest scoring detection!\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cb\u003E\u003Cspan style=\"font-size: large;\"\u003EExperiment #1\u003C\/span\u003E\u003C\/b\u003E\u003Cbr \/\u003EFor the first experiment, I used a video of my own face. \u0026nbsp;Because I was using a live video stream, I was able to move my face around so that the algorithm saw lots of different viewing conditions. \u0026nbsp;Here is the output. 
\u0026nbsp;\u003Cb\u003ENotice the green box around \"Poland.\"\u003C\/b\u003E \u0026nbsp;Pretty good guess, especially since I moved from Poland to the US when I was 8.\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003C\/div\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/2.bp.blogspot.com\/-Bf3bMqXuXGQ\/UtTar6eFZII\/AAAAAAAANXs\/FFTDVurB4G8\/s1600\/tomasz_crop.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"320\" src=\"https:\/\/3.bp.blogspot.com\/-ENZ-749s86g\/UtMhEZWaDSI\/AAAAAAAANXM\/Fa3kaFB10po\/s400\/tomasz_top_detection.png\" width=\"276\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cbr \/\u003E\u003Cdiv style=\"text-align: left;\"\u003EHere is a 5 min video (VMX screencapture) of me running the \"Tomasz\" (that's my name in case you don't know) detector as I fly around the average male image. \u0026nbsp;You can see the scores on lots of different races. \u0026nbsp;High scoring detections are almost always on geographically relevant races.\u003C\/div\u003E\u003Cdiv style=\"text-align: left;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: center;\"\u003E\u003Ciframe allowfullscreen=\"\" frameborder=\"0\" height=\"315\" src=\"\/\/www.youtube.com\/embed\/2-zmU4rIcJA\" width=\"420\"\u003E\u003C\/iframe\u003E\u003C\/div\u003E\u003Cbr \/\u003E\u003Cb\u003E\u003Cspan style=\"font-size: large;\"\u003EExperiment #2\u003C\/span\u003E\u003C\/b\u003E\u003Cbr \/\u003EFor the second target, I used a few videos of \u003Ca href=\"http:\/\/cs.stanford.edu\/people\/ang\/\"\u003EAndrew Ng\u003C\/a\u003E\u0026nbsp;to get positives.\u0026nbsp; For those of you who don't know, Andrew Ng is a machine learning researcher, entrepreneur, professor at Stanford, and MOOC visionary. \u0026nbsp;Here is the result. 
\u0026nbsp;\u003Cb\u003ENotice the green box around \"Japan.\"\u003C\/b\u003E \u0026nbsp;Very reasonable answer -- especially since I didn't give the algorithm any extra Asian faces for negatives.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/2.bp.blogspot.com\/-2wijfQGhLvM\/UtMhQ-cFRgI\/AAAAAAAANXU\/F03THL7b3ws\/s1600\/top_andrewng_detection.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" height=\"320\" src=\"https:\/\/2.bp.blogspot.com\/-2wijfQGhLvM\/UtMhQ-cFRgI\/AAAAAAAANXU\/F03THL7b3ws\/s400\/top_andrewng_detection.png\" width=\"276\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: left;\"\u003EHere is a 5 min video (VMX screencapture) of me running the \"Andrew Ng\" detector as I fly around the average male image.\u003C\/div\u003E\u003Cdiv style=\"text-align: left;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: center;\"\u003E\u003Ciframe allowfullscreen=\"\" frameborder=\"0\" height=\"315\" src=\"\/\/www.youtube.com\/embed\/waTAkSGIbcM\" width=\"420\"\u003E\u003C\/iframe\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: left;\"\u003EIn conclusion, \u003Cb\u003Eperson-specific face detectors from VMX can be used to help determine a person's race\u003C\/b\u003E. \u0026nbsp;At least the two VMX face detectors I trained behaved as expected. \u0026nbsp;This is far from a full-out study, but I only had the chance to try out on two subjects and wanted to share what I found. \u0026nbsp;The underlying algorithm inside VMX is a non-parametric exemplar-based model. 
\u0026nbsp;During training the algorithm uses ideas from max-margin learning to create a separator between the positives and negatives. \u0026nbsp;\u003C\/div\u003E\u003Cdiv style=\"text-align: left;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: left;\"\u003EIf you've been following up on my computer vision research projects, you should have a good idea of how these things work. \u0026nbsp;I want to mention that while I showcase VMX being used for face detection, there is nothing face-specific inside the algorithm. \u0026nbsp;The same representation is used for bottles, cars, hands, mouths, etc. \u0026nbsp;\u003Cb\u003EVMX is a general purpose object recognition ecosystem\u003C\/b\u003E and we're excited to finally be releasing this technology to the world.\u003Cbr \/\u003E\u003Cbr \/\u003EThere are lots of cool applications of VMX detectors. \u0026nbsp;What app will \u003Ci\u003Eyou\u003C\/i\u003E build?\u003C\/div\u003E\u003Cdiv style=\"text-align: left;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: left;\"\u003ETo learn more about VMX and get-in on the action, simply checkout the \u003Ca href=\"http:\/\/www.kickstarter.com\/projects\/visionai\/vmx-project-computer-vision-for-everyone\"\u003EVMX Kickstarter project\u003C\/a\u003E\u0026nbsp;and back our campaign.\u0026nbsp;\u0026nbsp;\u003C\/div\u003E"},"link":[{"rel":"replies","type":"application/atom+xml","href":"https:\/\/www.computervisionblog.com\/feeds\/1271268792821663535\/comments\/default","title":"Post Comments"},{"rel":"replies","type":"text/html","href":"https:\/\/www.computervisionblog.com\/2014\/01\/can-person-specific-face-recognition.html#comment-form","title":"1 
Comments"},{"rel":"edit","type":"application/atom+xml","href":"https:\/\/www.blogger.com\/feeds\/15418143\/posts\/default\/1271268792821663535"},{"rel":"self","type":"application/atom+xml","href":"https:\/\/www.blogger.com\/feeds\/15418143\/posts\/default\/1271268792821663535"},{"rel":"alternate","type":"text/html","href":"https:\/\/www.computervisionblog.com\/2014\/01\/can-person-specific-face-recognition.html","title":"Can a person-specific face recognition algorithm be used to determine a person's race?"}],"author":[{"name":{"$t":"Tomasz Malisiewicz"},"uri":{"$t":"http:\/\/www.blogger.com\/profile\/17507234774392358321"},"email":{"$t":"noreply@blogger.com"},"gd$image":{"rel":"http://schemas.google.com/g/2005#thumbnail","width":"16","height":"16","src":"https:\/\/img1.blogblog.com\/img\/b16-rounded.gif"}}],"media$thumbnail":{"xmlns$media":"http://search.yahoo.com/mrss/","url":"https:\/\/3.bp.blogspot.com\/-ENZ-749s86g\/UtMhEZWaDSI\/AAAAAAAANXM\/Fa3kaFB10po\/s72-c\/tomasz_top_detection.png","height":"72","width":"72"},"thr$total":{"$t":"1"}},{"id":{"$t":"tag:blogger.com,1999:blog-15418143.post-5039329766928752607"},"published":{"$t":"2014-01-07T18:45:00.003-05:00"},"updated":{"$t":"2014-01-14T02:10:34.445-05:00"},"category":[{"scheme":"http://www.blogger.com/atom/ns#","term":"algorithm"},{"scheme":"http://www.blogger.com/atom/ns#","term":"angularjs"},{"scheme":"http://www.blogger.com/atom/ns#","term":"computer vision"},{"scheme":"http://www.blogger.com/atom/ns#","term":"face tracking"},{"scheme":"http://www.blogger.com/atom/ns#","term":"jsfeat"},{"scheme":"http://www.blogger.com/atom/ns#","term":"kickstarter"},{"scheme":"http://www.blogger.com/atom/ns#","term":"lucas-kanade"},{"scheme":"http://www.blogger.com/atom/ns#","term":"machine learning"},{"scheme":"http://www.blogger.com/atom/ns#","term":"object detection"},{"scheme":"http://www.blogger.com/atom/ns#","term":"programming"},{"scheme":"http://www.blogger.com/atom/ns#","term":"VMX 
project"}],"title":{"type":"text","$t":"Tracking points in a live camera feed: A behind-the-scenes look at the VMX Project webapp"},"content":{"type":"html","$t":"In our \u003Ca href=\"http:\/\/vision.ai\/\"\u003Ecomputer vision startup, vision.ai\u003C\/a\u003E,\u0026nbsp;we're using open-source tools to create a one-of-a-kind\u0026nbsp;object recognition experience. \u0026nbsp;Our goal is to make state-of-the-art visual object recognition as easy as waving an object in front of your laptop's or smartphone's camera. \u0026nbsp;We've made a webapp and programming environment called VMX that allows you to teach your computer about objects without any advanced programming, nor any bulky software installations -- you'll finally be able to put your computer's new visual reasoning abilities to good use. \u0026nbsp;Today's blog post is about some of the underlying technology that we used to build the VMX prototype. \u0026nbsp;(To learn about the entire project and how you can help, please visit \u003Ca href=\"http:\/\/www.kickstarter.com\/projects\/visionai\/vmx-project-computer-vision-for-everyone\"\u003EVMX Project on Kickstarter\u003C\/a\u003E.)\u003Cbr \/\u003E\u003Cbr \/\u003EThe VMX project utilizes many different programming languages and technologies. \u0026nbsp;Many of the behind-the-scenes machine learning algorithms have been developed in our lab, but to make a good product it takes more than just robust backed algorithms. \u0026nbsp;On the front-end, the two key open source (MIT licensed) projects we rely on are \u003Ca href=\"http:\/\/angularjs.org\/\"\u003EAngularJS\u003C\/a\u003E and \u003Ca href=\"http:\/\/inspirit.github.io\/jsfeat\/\"\u003EJSFeat\u003C\/a\u003E. AngularJS is an open-source JavaScript framework, maintained by Google, that assists with running single-page applications. \u0026nbsp;Today's focus will be on JSFeat, the Javascript Computer Vision Library we use inside the front-end webapp. \u0026nbsp;What is JSFeat? 
\u0026nbsp;Quoting \u003Ca href=\"http:\/\/twitter.com\/inspirit\"\u003EEugene Zatepyakin\u003C\/a\u003E, the author of JSFeat,\u0026nbsp;\"The project aim is to explore JS\/HTML5 possibilities using modern \u0026amp; state-of-art computer vision algorithms.\"\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cb\u003EWe use the JSFeat library to track points inside the video stream.\u003C\/b\u003E \u0026nbsp;Below is a YouTube video of our webapp in action, where we enabled the \"debug display\" to show you what is happening to tracked points behind the scenes. \u0026nbsp;The \u003Cb\u003Eblue points are being tracked\u003C\/b\u003E inside the browser, the \u003Cb\u003Egreen box is the output of our object detection service\u003C\/b\u003E (already trained on my face), and the \u003Cb\u003Eblack box is the interpolated result\u003C\/b\u003E which integrates the backend service and the frontend tracker.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv style=\"text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003E\u003Cdiv style=\"text-align: center;\"\u003E\u003Ciframe allowfullscreen=\"\" frameborder=\"0\" height=\"315\" src=\"\/\/www.youtube.com\/embed\/Pf7mKlj73As\" width=\"560\"\u003E\u003C\/iframe\u003E \u003C\/div\u003E\u003Cdiv style=\"text-align: center;\"\u003E\u003Cbr \/\u003E\u003C\/div\u003EThe tracker calculates an optical flow for a sparse feature set using the iterative Lucas-Kanade method with pyramids. \u0026nbsp;The algorithm basically looks at two consecutive video frames and determines how points move by using a straightforward least-squares optimization method. The Lucas-Kanade algorithm is a classic in the computer vision community -- to learn more see the\u0026nbsp;\u003Ca href=\"http:\/\/en.wikipedia.org\/wiki\/Lucas%E2%80%93Kanade_method\"\u003ELucas-Kanade Wikipedia page\u003C\/a\u003E\u0026nbsp;or take a graduate level computer vision course. 
Alternatively, if you find me on the street and ask nicely, I \u003Ci\u003Emight\u003C\/i\u003E give you an impromptu lecture on optical flow.\u003Cbr \/\u003E\u003Cbr \/\u003EInstead of using interest points, in our prototype video we used a regularly spaced grid of points covering the entire video stream. \u0026nbsp;This grid gets re-initialized every N seconds. \u0026nbsp;It avoids the extra expense of finding interest points inside every frame. \u0026nbsp;NOTE: inside our vision.ai computer vision lab, we are incessantly experimenting with better ways of integrating point tracks with strong object detector results. \u0026nbsp;What you're seeing is just an early snapshot of the technology in action.\u003Cbr \/\u003E\u003Cbr \/\u003ETo play with a Lucas-Kanade tracker, take a look at the JSFeat demo page which runs a point tracker directly inside your browser. \u0026nbsp;You'll have to click on points, one at a time. \u0026nbsp;You'll need Google Chrome or Firefox (just like our VMX project), and this will give you a good sense of what using VMX is going to be like once it is available.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv style=\"text-align: center;\"\u003ETry the\u0026nbsp;\u003Ca href=\"http:\/\/inspirit.github.io\/jsfeat\/sample_oflow_lk.html\"\u003EJSFeat Optical Flow Demo\u003C\/a\u003E!\u003C\/div\u003E\u003Cbr \/\u003ETo summarize, there are lots of great computer vision tools out there, but none of these tools can give you a comprehensive object recognition system which requires little-to-none programming experience. \u0026nbsp;\u003Cb\u003EThere is a lot of work needed to put together appropriate machine learning algorithms, object detection libraries, web services, trackers, video codecs, etc\u003C\/b\u003E. \u0026nbsp;Luckily, the team at vision.ai loves both code and machine learning. \u0026nbsp;In addition, having spent the last 10 years of my life working as a research in Computer Vision doesn't hurt. 
\u003Cbr \/\u003E\u003Cbr \/\u003EGetting a PhD in Computer Vision and learning how all of these technologies work is a truly amazing experience. \u0026nbsp;\u003Cb\u003EI encourage many students to undertake this 6+ year journey and learn all about computer vision.\u003C\/b\u003E \u0026nbsp;But I know the PhD path is not for everybody. \u0026nbsp;That's why we've built VMX. \u0026nbsp;So the rest of you can enjoy the power of industrial-grade computer vision algorithms and the ease of intuitive web-based interfaces, without the expertise needed to piece together many different technologies. \u0026nbsp;The number of applications of computer vision tech is astounding and it is a shame that such technology hasn't been delivered with such a lower barrier-to-entry earlier.\u003Cbr \/\u003E\u003Cbr \/\u003EWith VMX, \u003Cb\u003Ewe're excited that the world is going to experience visual object recognition the way it was meant to be experienced.\u003C\/b\u003E \u0026nbsp;But for that to happen, we still need \u003Ci\u003Eyour\u003C\/i\u003E support. 
\u0026nbsp;Check out our \u003Ca href=\"http:\/\/www.kickstarter.com\/projects\/visionai\/vmx-project-computer-vision-for-everyone\"\u003EVMX Project on Kickstarter\u003C\/a\u003E (the page has lots of additional VMX in action videos), and help spread the word.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/www.kickstarter.com\/projects\/visionai\/vmx-project-computer-vision-for-everyone\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" src=\"http:\/\/2.bp.blogspot.com\/-hjbZkvxmWSM\/UsyM6suKwkI\/AAAAAAAANWw\/42HCtxyI4qo\/s400\/input_formats-01.png\" height=\"91\" width=\"400\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/www.kickstarter.com\/projects\/visionai\/vmx-project-computer-vision-for-everyone\"\u003EVMX Project: Computer Vision for Everyone\u003C\/a\u003E\u003C\/div\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E"},"link":[{"rel":"replies","type":"application/atom+xml","href":"https:\/\/www.computervisionblog.com\/feeds\/5039329766928752607\/comments\/default","title":"Post Comments"},{"rel":"replies","type":"text/html","href":"https:\/\/www.computervisionblog.com\/2014\/01\/tracking-points-in-live-camera-feed.html#comment-form","title":"2 Comments"},{"rel":"edit","type":"application/atom+xml","href":"https:\/\/www.blogger.com\/feeds\/15418143\/posts\/default\/5039329766928752607"},{"rel":"self","type":"application/atom+xml","href":"https:\/\/www.blogger.com\/feeds\/15418143\/posts\/default\/5039329766928752607"},{"rel":"alternate","type":"text/html","href":"https:\/\/www.computervisionblog.com\/2014\/01\/tracking-points-in-live-camera-feed.html","title":"Tracking points in a live camera feed: A behind-the-scenes look at the VMX Project webapp"}],"author":[{"name":{"$t":"Tomasz 
Malisiewicz"},"uri":{"$t":"http:\/\/www.blogger.com\/profile\/17507234774392358321"},"email":{"$t":"noreply@blogger.com"},"gd$image":{"rel":"http://schemas.google.com/g/2005#thumbnail","width":"16","height":"16","src":"https:\/\/img1.blogblog.com\/img\/b16-rounded.gif"}}],"media$thumbnail":{"xmlns$media":"http://search.yahoo.com/mrss/","url":"http:\/\/2.bp.blogspot.com\/-hjbZkvxmWSM\/UsyM6suKwkI\/AAAAAAAANWw\/42HCtxyI4qo\/s72-c\/input_formats-01.png","height":"72","width":"72"},"thr$total":{"$t":"2"}},{"id":{"$t":"tag:blogger.com,1999:blog-15418143.post-5555036173190975114"},"published":{"$t":"2014-01-06T00:29:00.000-05:00"},"updated":{"$t":"2014-01-14T07:20:21.414-05:00"},"category":[{"scheme":"http://www.blogger.com/atom/ns#","term":"entrepreneurship"},{"scheme":"http://www.blogger.com/atom/ns#","term":"kickstarter"},{"scheme":"http://www.blogger.com/atom/ns#","term":"linux"},{"scheme":"http://www.blogger.com/atom/ns#","term":"local installation"},{"scheme":"http://www.blogger.com/atom/ns#","term":"local server"},{"scheme":"http://www.blogger.com/atom/ns#","term":"pivot"},{"scheme":"http://www.blogger.com/atom/ns#","term":"service"},{"scheme":"http://www.blogger.com/atom/ns#","term":"update"},{"scheme":"http://www.blogger.com/atom/ns#","term":"virtual image"},{"scheme":"http://www.blogger.com/atom/ns#","term":"virtualbox"},{"scheme":"http://www.blogger.com/atom/ns#","term":"vision as a service"},{"scheme":"http://www.blogger.com/atom/ns#","term":"vmx credits"},{"scheme":"http://www.blogger.com/atom/ns#","term":"VMX project"}],"title":{"type":"text","$t":"You asked, we listened. VMX will be available to run locally."},"content":{"type":"html","$t":"The following post is a result of my team launching a Kickstarter campaign two weeks ago and upgrading one of our rewards based on all the feedback we received from backers and potential backers. 
\u0026nbsp;We initially intended to launch the VMX project as a service meaning that it would only run over an internet connection to our serves. \u0026nbsp;But there were scenarios where this was not appropriate. Some people didn't have a fast enough internet connection at home, some people were worried that it would be too expensive to use our product, and some people couldn't use software which required an internet connection at work. \u0026nbsp;The VMX Project, our flagship computer vision in-the-browser software, will not run using a local object detection server.\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; font-family: 'Helvetica Neue', Helvetica, Arial, 'Liberation Sans', FreeSans, sans-serif; font-size: 16px; line-height: 24px; text-align: center;\"\u003E\u003Ca href=\"http:\/\/www.kickstarter.com\/projects\/visionai\/vmx-project-computer-vision-for-everyone\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" src=\"http:\/\/3.bp.blogspot.com\/-YQPDBIL5Eik\/Uso4So15POI\/AAAAAAAANWg\/Jcy7A2cmxeI\/s400\/local_install.png\" height=\"244\" width=\"320\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; font-family: 'Helvetica Neue', Helvetica, Arial, 'Liberation Sans', FreeSans, sans-serif; font-size: 16px; line-height: 24px; text-align: center;\"\u003E\u003Ca href=\"http:\/\/www.kickstarter.com\/projects\/visionai\/vmx-project-computer-vision-for-everyone\"\u003EVMX Project: Computer Vision for Everyone\u003C\/a\u003E\u003C\/div\u003E\u003Cbr \/\u003E\u003Cbr \/\u003E(Cross-posted from\u0026nbsp;\u003Ca href=\"http:\/\/www.kickstarter.com\/projects\/visionai\/vmx-project-computer-vision-for-everyone\/posts\/712334\"\u003EVMX Project Kickstarter January 5, 2013 update\u003C\/a\u003E\u0026nbsp;and post on\u0026nbsp;\u003Ca 
href=\"http:\/\/blog.vision.ai\/2014\/01\/06\/local-vmx-install\/\"\u003Eblog.vision.ai\u0026nbsp;\u003C\/a\u003E)\u003Cbr \/\u003E\u003Cbr \/\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: center;\"\u003E\u003Ca href=\"http:\/\/i.imgur.com\/QMhp0Ao.png\" imageanchor=\"1\" style=\"margin-left: 1em; margin-right: 1em;\"\u003E\u003Cimg border=\"0\" src=\"http:\/\/i.imgur.com\/QMhp0Ao.png\" height=\"24\" width=\"320\" \/\u003E\u003C\/a\u003E\u003C\/div\u003E\u003Cbr \/\u003E\u003Cdiv style=\"border: 0px; margin-bottom: 20px; padding: 0px; vertical-align: baseline;\"\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: start;\"\u003E\u003Cspan style=\"font-family: Helvetica Neue, Helvetica, Arial, Liberation Sans, FreeSans, sans-serif;\"\u003E\u003Cspan style=\"line-height: 24px;\"\u003EOver the last few weeks, \u003Cb\u003Ewe've listened\u003C\/b\u003E to many backers (and potential backers) talk about our technology and would like to thank everyone who gave us valuable feedback. Many of you didn’t like VMX being offered only as a service (requiring an internet connection), so \u003Cb\u003Ewe decided to offer a local VMX installation\u003C\/b\u003E in addition to making VMX available as a service. 
We didn’t anticipate such great demand for VMX running locally on people’s own computers and networks, but we are dedicated to letting developers have an exceptional computer vision experience and are eager to give our users what they want.\u0026nbsp;\u003C\/span\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: start;\"\u003E\u003Cspan style=\"font-family: Helvetica Neue, Helvetica, Arial, Liberation Sans, FreeSans, sans-serif;\"\u003E\u003Cspan style=\"line-height: 24px;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: start;\"\u003E\u003Cspan style=\"font-family: Helvetica Neue, Helvetica, Arial, Liberation Sans, FreeSans, sans-serif;\"\u003E\u003Cspan style=\"line-height: 24px;\"\u003EOnce the early-access period (March 2014 - June 2014) is over, VMX developers will have the option to receive a single-machine VMX license and install VMX on their own computer. With VMX running on your computer, you won’t have to worry about running out of VMX Compute Hours, accidentally making your data public, and most importantly: it won’t require an internet connection. You will also have the option of communicating between VMX running on your computer and our servers. 
You will be able to download object detectors, download the models you create during the early-access period, as well as back-up your object models and import them into the VMX as-a-service servers.\u0026nbsp;\u003C\/span\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: start;\"\u003E\u003Cspan style=\"font-family: Helvetica Neue, Helvetica, Arial, Liberation Sans, FreeSans, sans-serif;\"\u003E\u003Cspan style=\"line-height: 24px;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: start;\"\u003E\u003Cspan style=\"font-family: Helvetica Neue, Helvetica, Arial, Liberation Sans, FreeSans, sans-serif;\"\u003E\u003Cspan style=\"line-height: 24px;\"\u003EDuring our official launch in Summer 2014, a single-machine VMX license will be available to VMX Developers for $100. Kickstarter backers will be able to simply trade-in 100 of their VMX Compute Hours to obtain one single-machine license and download the software for their own use.\u0026nbsp;\u003C\/span\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: start;\"\u003E\u003Cspan style=\"font-family: Helvetica Neue, Helvetica, Arial, Liberation Sans, FreeSans, sans-serif;\"\u003E\u003Cspan style=\"line-height: 24px;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: start;\"\u003E\u003Cspan style=\"font-family: Helvetica Neue, Helvetica, Arial, Liberation Sans, FreeSans, sans-serif;\"\u003E\u003Cspan style=\"line-height: 24px;\"\u003EThe local VMX software will be installable directly on a computer running Linux. 
For VMX developers running MS Windows or Apple OS X, we will provide a Linux Virtual Image for download which will contain a pre-installed, and fully configured instance of VMX.\u003C\/span\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: start;\"\u003E\u003Cspan style=\"font-family: Helvetica Neue, Helvetica, Arial, Liberation Sans, FreeSans, sans-serif;\"\u003E\u003Cspan style=\"line-height: 24px;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: start;\"\u003E\u003Cspan style=\"font-family: Helvetica Neue, Helvetica, Arial, Liberation Sans, FreeSans, sans-serif;\"\u003E\u003Cspan style=\"line-height: 24px;\"\u003EWe hope this will make all VMX users more excited about our technology.\u003C\/span\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: start;\"\u003E\u003Cspan style=\"font-family: Helvetica Neue, Helvetica, Arial, Liberation Sans, FreeSans, sans-serif;\"\u003E\u003Cspan style=\"line-height: 24px;\"\u003E\u003Cbr \/\u003E\u003C\/span\u003E\u003C\/span\u003E\u003C\/div\u003E\u003Cdiv class=\"separator\" style=\"clear: both; text-align: start;\"\u003E\u003Cspan style=\"font-family: Helvetica Neue, Helvetica, Arial, Liberation Sans, FreeSans, sans-serif;\"\u003E\u003Cspan style=\"line-height: 24px;\"\u003E--the guys from \u003Ca href=\"http:\/\/vision.ai\/\"\u003EVISION.AI\u003C\/a\u003E\u003C\/span\u003E\u003C\/span\u003E\u003C\/div\u003E\u003C\/div\u003E"},"link":[{"rel":"replies","type":"application/atom+xml","href":"https:\/\/www.computervisionblog.com\/feeds\/5555036173190975114\/comments\/default","title":"Post Comments"},{"rel":"replies","type":"text/html","href":"https:\/\/www.computervisionblog.com\/2014\/01\/you-asked-we-listened-vmx-will-be.html#comment-form","title":"0 
Comments"},{"rel":"edit","type":"application/atom+xml","href":"https:\/\/www.blogger.com\/feeds\/15418143\/posts\/default\/5555036173190975114"},{"rel":"self","type":"application/atom+xml","href":"https:\/\/www.blogger.com\/feeds\/15418143\/posts\/default\/5555036173190975114"},{"rel":"alternate","type":"text/html","href":"https:\/\/www.computervisionblog.com\/2014\/01\/you-asked-we-listened-vmx-will-be.html","title":"You asked, we listened. VMX will be available to run locally."}],"author":[{"name":{"$t":"Tomasz Malisiewicz"},"uri":{"$t":"http:\/\/www.blogger.com\/profile\/17507234774392358321"},"email":{"$t":"noreply@blogger.com"},"gd$image":{"rel":"http://schemas.google.com/g/2005#thumbnail","width":"16","height":"16","src":"https:\/\/img1.blogblog.com\/img\/b16-rounded.gif"}}],"media$thumbnail":{"xmlns$media":"http://search.yahoo.com/mrss/","url":"http:\/\/3.bp.blogspot.com\/-YQPDBIL5Eik\/Uso4So15POI\/AAAAAAAANWg\/Jcy7A2cmxeI\/s72-c\/local_install.png","height":"72","width":"72"},"thr$total":{"$t":"0"}}]}});