dc.contributor.author | Owens, Andrew Hale | |
dc.contributor.author | Wu, Jiajun | |
dc.contributor.author | McDermott, Joshua H. | |
dc.contributor.author | Freeman, William T. | |
dc.contributor.author | Torralba, Antonio | |
dc.date.accessioned | 2017-09-12T13:32:52Z | |
dc.date.available | 2017-09-12T13:32:52Z | |
dc.date.issued | 2016-09 | |
dc.identifier.isbn | 978-3-319-46447-3 | |
dc.identifier.isbn | 978-3-319-46448-0 | |
dc.identifier.issn | 0302-9743 | |
dc.identifier.issn | 1611-3349 | |
dc.identifier.uri | http://hdl.handle.net/1721.1/111172 | |
dc.description.abstract | The sound of crashing waves, the roar of fast-moving cars – sound conveys important information about the objects in our surroundings. In this work, we show that ambient sounds can be used as a supervisory signal for learning visual models. To demonstrate this, we train a convolutional neural network to predict a statistical summary of the sound associated with a video frame. We show that, through this process, the network learns a representation that conveys information about objects and scenes. We evaluate this representation on several recognition tasks, finding that its performance is comparable to that of other state-of-the-art unsupervised learning methods. Finally, we show through visualizations that the network learns units that are selective to objects that are often associated with characteristic sounds. | en_US |
dc.description.sponsorship | National Science Foundation (U.S.) (Grant 1524817) | en_US |
dc.description.sponsorship | National Science Foundation (U.S.) (Grant 1447476) | en_US |
dc.description.sponsorship | National Science Foundation (U.S.) (Grant 1212849) | en_US |
dc.language.iso | en_US | |
dc.publisher | Springer-Verlag | en_US |
dc.relation.isversionof | http://dx.doi.org/10.1007/978-3-319-46448-0_48 | en_US |
dc.rights | Creative Commons Attribution-Noncommercial-Share Alike | en_US |
dc.rights.uri | http://creativecommons.org/licenses/by-nc-sa/4.0/ | en_US |
dc.source | arXiv | en_US |
dc.title | Ambient Sound Provides Supervision for Visual Learning | en_US |
dc.type | Article | en_US |
dc.identifier.citation | Owens, Andrew, et al. “Ambient Sound Provides Supervision for Visual Learning.” Lecture Notes in Computer Science 9905 (September 2016): 801–816. © 2016 Springer International Publishing AG | en_US |
dc.contributor.department | Massachusetts Institute of Technology. Department of Brain and Cognitive Sciences | en_US |
dc.contributor.department | Massachusetts Institute of Technology. Department of Electrical Engineering and Computer Science | en_US |
dc.contributor.mitauthor | Owens, Andrew Hale | |
dc.contributor.mitauthor | Wu, Jiajun | |
dc.contributor.mitauthor | McDermott, Joshua H. | |
dc.contributor.mitauthor | Freeman, William T. | |
dc.contributor.mitauthor | Torralba, Antonio | |
dc.relation.journal | Lecture Notes in Computer Science | en_US |
dc.eprint.version | Original manuscript | en_US |
dc.type.uri | http://purl.org/eprint/type/ConferencePaper | en_US |
eprint.status | http://purl.org/eprint/status/NonPeerReviewed | en_US |
dspace.orderedauthors | Owens, Andrew; Wu, Jiajun; McDermott, Josh H.; Freeman, William T.; Torralba, Antonio | en_US |
dspace.embargo.terms | N | en_US |
dc.identifier.orcid | https://orcid.org/0000-0001-9020-9593 | |
dc.identifier.orcid | https://orcid.org/0000-0002-4176-343X | |
dc.identifier.orcid | https://orcid.org/0000-0002-3965-2503 | |
dc.identifier.orcid | https://orcid.org/0000-0002-2231-7995 | |
dc.identifier.orcid | https://orcid.org/0000-0003-4915-0256 | |
dspace.mitauthor.error | true | |
mit.license | OPEN_ACCESS_POLICY | en_US |