<?xml version="1.0" encoding="UTF-8"?>
 <rdf:RDF xmlns="http://purl.org/rss/1.0/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:cc="http://web.resource.org/cc/" xmlns:syn="http://purl.org/rss/1.0/modules/syndication/" xmlns:admin="http://webns.net/mvcb/">
  <channel rdf:about="http://pinboard.in">
    <title>Pinboard (mraginsky)</title>
    <link>https://pinboard.in/u:mraginsky/public/</link>
    <description>recent bookmarks from mraginsky</description>
    <items>
      <rdf:Seq>	<rdf:li rdf:resource="https://www.pnas.org/doi/full/10.1073/pnas.2502353122"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2301.10414"/>
	<rdf:li rdf:resource="https://openreview.net/forum?id=UvmDCdSPDOW"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2402.07717"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2401.12119v1"/>
	<rdf:li rdf:resource="https://journals.aps.org/pre/abstract/10.1103/PhysRevE.102.052117"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2303.07279"/>
	<rdf:li rdf:resource="https://www.frontiersin.org/articles/10.3389/fpsyg.2020.00588/full"/>
	<rdf:li rdf:resource="https://link.springer.com/article/10.1007/s10867-010-9195-3"/>
	<rdf:li rdf:resource="https://royalsocietypublishing.org/doi/10.1098/rsif.2012.0869"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2108.08198"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2103.14723"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2008.10249"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2104.09557"/>
	<rdf:li rdf:resource="https://deontologistics.co/2019/11/01/tfe-information-and-energy/"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2101.06640"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2002.10567"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1808.07593"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1805.04625"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1701.02827"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1705.00302"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1707.05199"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1706.10260"/>
	<rdf:li rdf:resource="https://www.youtube.com/playlist?list=PLYq7WW565SZhPcf5PxsuBUz2PtB7Kfb5X"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1611.00814"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1511.01437"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1610.03592"/>
	<rdf:li rdf:resource="http://www.sciencedirect.com/science/article/pii/S0893608016300880"/>
	<rdf:li rdf:resource="https://hal.archives-ouvertes.fr/hal-01365314/"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1607.06494"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1508.04095"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1606.08920"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1605.02818"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1605.02268"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1205.2628"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1503.03585"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1510.04214"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1509.04555"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1510.01844"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1510.02190"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1510.00764"/>
	<rdf:li rdf:resource="http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6963478&amp;searchWithin%5B%5D=%22Authors%22%3A.QT.Saldi%2C+N..QT.&amp;newsearch=true"/>
	<rdf:li rdf:resource="http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=7138587&amp;punumber%3D18%26filter%3DAND%28p_IS_Number%3A7203188%29%26pageNumber%3D2"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1508.00335"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1506.02629"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1506.07216"/>
	<rdf:li rdf:resource="http://people.inf.ethz.ch/~shassani/papers/colt2015-mis.pdf"/>
	<rdf:li rdf:resource="http://plato.stanford.edu/entries/information-entropy/"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1506.06472"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/0709.1948"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1506.05483"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1504.04419"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1312.5276"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1403.5855"/>
	<rdf:li rdf:resource="http://link.springer.com/article/10.1007%2FBF00532723"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1310.6391"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1412.7172"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1205.3756"/>
	<rdf:li rdf:resource="http://rgmia.org/monographs/csiszar.htm"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1410.0503"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1409.6833"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1407.0381"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1406.0767"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1405.3629"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1405.0608"/>
	<rdf:li rdf:resource="http://www.eecs.berkeley.edu/~ananth/2014+/2014-02-24_VarunCISS.pdf"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1402.3067"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/0903.2962"/>
	<rdf:li rdf:resource="http://www2.isye.gatech.edu/~cguzman6/Files/ArxivLBAlgo.pdf"/>
	<rdf:li rdf:resource="http://www.cs.berkeley.edu/~jduchi/projects/ZhangDuJoWa13.pdf"/>
      </rdf:Seq>
    </items>
  </channel><item rdf:about="https://www.pnas.org/doi/full/10.1073/pnas.2502353122">
    <title>Information rate of meaningful communication | PNAS</title>
    <dc:date>2026-06-08T17:17:22+00:00</dc:date>
    <link>https://www.pnas.org/doi/full/10.1073/pnas.2502353122</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA[In Shannon’s seminal paper, the entropy of printed English, treated as a stationary stochastic process, was estimated to be roughly 1 bit per character. However, considered as a means of communication, language differs considerably from its printed form: i) the units of information are not characters or even words but clauses, i.e., shortest meaningful parts of speech; and ii) what is transmitted is principally the meaning of what is being said or written, while the precise phrasing that was used to communicate the meaning is typically ignored. In this study, we show that one can leverage recently developed large language models to quantify information communicated in meaningful narratives in terms of bits of meaning per clause.]]></description>
<dc:subject>papers to-read information-theory language large-language-models communication semantic-communication</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:c67a307c255b/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:language"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:large-language-models"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:communication"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:semantic-communication"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2301.10414">
    <title>[2301.10414] Towards a Unification of Logic and Information Theory</title>
    <dc:date>2025-09-17T21:27:23+00:00</dc:date>
    <link>https://arxiv.org/abs/2301.10414</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA[Today, the vast majority of the world's digital information is represented using the fundamental assumption, introduced by Claude Shannon in 1948, that ``...the semantic aspects of communication are irrelevant to the engineering problem (of the design of communication systems)...''. Consider, nonetheless, the observation that we often combine a message with other information in order to deduce new facts, thereby expanding the value of such a message. It is noteworthy that to-date, no rigorous theory of communication has been put forth which postulates the existence of deductive capabilities on the receiver's side.
The purpose of this paper is to present such a theory. We formally model such deductive capabilities using logic reasoning, and present a rigorous theory which covers the following generic scenario: Alice and Bob each have knowledge of some logic sentence, and they wish to communicate as efficiently as possible with the shared goal that, following their communication, Bob should be able to deduce a particular logic sentence that Alice knows to be true, but that Bob currently cannot prove. Many variants of this general setup are considered in this article; in all cases we are able to provide sharp upper and lower bounds. Our contribution includes the identification of the most fundamental requirements that we place on a logic and associated logical language for all of our results to apply. Practical algorithms that are in some cases asymptotically optimal are provided, and we illustrate the potential practical value of the design of communication systems that incorporate the assumption of deductive capabilities at the receiver using experimental results that suggest significant possible gains compared to classical systems. ]]></description>
<dc:subject>papers to-read information-theory logic formal-methods</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:463dce340e59/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:logic"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:formal-methods"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://openreview.net/forum?id=UvmDCdSPDOW">
    <title>Information-Theoretic Diffusion | OpenReview</title>
    <dc:date>2024-08-26T16:03:09+00:00</dc:date>
    <link>https://openreview.net/forum?id=UvmDCdSPDOW</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA[Denoising diffusion models have spurred significant gains in density modeling and image generation, precipitating an industrial revolution in text-guided AI art generation.  We introduce a new mathematical foundation for diffusion models inspired by classic results in information theory that connect Information with Minimum Mean Square Error regression, the so-called I-MMSE relations. We generalize the I-MMSE relations to \emph{exactly} relate the data distribution to an optimal denoising regression problem, leading to an elegant refinement of existing diffusion bounds.  This new insight leads to several improvements for probability distribution estimation, including a theoretical justification for diffusion model ensembling. Remarkably, our framework shows how continuous and discrete probabilities can be learned with the same regression objective, avoiding domain-specific generative models used in variational methods.]]></description>
<dc:subject>papers to-read diffusions generative-models information-theory estimation</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:2d447fa522bf/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:diffusions"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:generative-models"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:estimation"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2402.07717">
    <title>[2402.07717] Efficient reductions between some statistical models</title>
    <dc:date>2024-02-13T21:20:57+00:00</dc:date>
    <link>https://arxiv.org/abs/2402.07717</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA[We study the problem of approximately transforming a sample from a source statistical model to a sample from a target statistical model without knowing the parameters of the source model, and construct several computationally efficient such reductions between statistical experiments. In particular, we provide computationally efficient procedures that approximately reduce uniform, Erlang, and Laplace location models to general target families. We illustrate our methodology by establishing nonasymptotic reductions between some canonical high-dimensional problems, spanning mixtures of experts, phase retrieval, and signal denoising. Notably, the reductions are structure preserving and can accommodate missing data. We also point to a possible application in transforming one differentially private mechanism to another. ]]></description>
<dc:subject>papers to-read statistics Le_Cam-theory information-theory</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:8790375ae265/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:Le_Cam-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2401.12119v1">
    <title>Temperature as Joules per Bit</title>
    <dc:date>2024-01-23T03:58:20+00:00</dc:date>
    <link>https://arxiv.org/abs/2401.12119v1</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA[Boltzmann's constant reflects a historical misunderstanding of the concept of entropy, whose informational nature is obfuscated when expressed in J/K. We suggest that the development of temperature and energy, historically prior to that of entropy, does not amount to their logical priority: Temperature should be defined in terms of entropy, not vice versa. Following the precepts of information theory, entropy is measured in bits, and coincides with information capacity at thermodynamic equilibrium. Consequently, not only is the temperature of an equilibrated system expressed in J/bit, but it acquires an operational meaning: It is the cost in energy to increase its information capacity by 1 bit. Our proposal also supports the notion of available capacity, analogous to free energy. Finally, it simplifies Landauer's cost and clarifies that it is a cost of displacement, not of erasure. ]]></description>
<dc:subject>papers to-read thermodynamics information-theory</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:72bf621c25c1/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:thermodynamics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://journals.aps.org/pre/abstract/10.1103/PhysRevE.102.052117">
    <title>Phys. Rev. E 102, 052117 (2020) - Thermodynamics from relative entropy</title>
    <dc:date>2023-06-12T17:17:38+00:00</dc:date>
    <link>https://journals.aps.org/pre/abstract/10.1103/PhysRevE.102.052117</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA[Thermodynamics can be developed from a microscopic starting point in terms of entropy and the maximum entropy principle. We investigate here to what extent one can replace entropy with relative entropy which has several advantages, for example, in the context of local quantum field theory. We find that the principle of maximum entropy can be replaced by a principle of minimum expected relative entropy. Various ensembles and their thermodynamic potentials can be defined through relative entropy. We also show that thermal fluctuations are in fact governed by a relative entropy. Furthermore, we reformulate the third law of thermodynamics using relative entropy only.]]></description>
<dc:subject>papers to-read thermodynamics information-theory physics-of-information</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:73e1f3e391a6/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:thermodynamics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:physics-of-information"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2303.07279">
    <title>[2303.07279] Universal coding, intrinsic volumes, and metric complexity</title>
    <dc:date>2023-05-05T02:48:23+00:00</dc:date>
    <link>https://arxiv.org/abs/2303.07279</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA[We study sequential probability assignment in the Gaussian setting, where the goal is to predict, or equivalently compress, a sequence of real-valued observations almost as well as the best Gaussian distribution with mean constrained to a given subset of $\mathbf{R}^n$. First, in the case of a convex constraint set $K$, we express the hardness of the prediction problem (the minimax regret) in terms of the intrinsic volumes of $K$; specifically, it equals the logarithm of the Wills functional from convex geometry. We then establish a comparison inequality for the Wills functional in the general nonconvex case, which underlines the metric nature of this quantity and generalizes the Slepian-Sudakov-Fernique comparison principle for the Gaussian width. Motivated by this inequality, we characterize the exact order of magnitude of the considered functional for a general nonconvex set, in terms of global covering numbers and local Gaussian widths. This implies metric isomorphic estimates for the log-Laplace transform of the intrinsic volume sequence of a convex body. As part of our analysis, we also characterize the minimax redundancy for a general constraint set. We finally relate and contrast our findings with classical asymptotic results in information theory. ]]></description>
<dc:subject>papers to-read empirical-processes probability information-theory source-coding</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:9d5d5e5b6290/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:empirical-processes"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:probability"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:source-coding"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://www.frontiersin.org/articles/10.3389/fpsyg.2020.00588/full">
    <title>Frontiers | An Enactive-Ecological Approach to Information and Uncertainty</title>
    <dc:date>2022-08-03T18:20:56+00:00</dc:date>
    <link>https://www.frontiersin.org/articles/10.3389/fpsyg.2020.00588/full</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA[Information is a central notion for cognitive sciences and neurosciences, but there is no agreement on what it means for a cognitive system to acquire information about its surroundings. In this paper, we approximate three influential views on information: the one at play in ecological psychology, which is sometimes called information for action; the notion of information as covariance as developed by some enactivists, and the idea of information as a minimization of uncertainty as presented by Shannon. Our main thesis is that information for action can be construed as covariant information, and that learning to perceive covariant information is a matter of minimizing uncertainty through skilled performance. We argue that the agent’s cognitive system conveys information for acting in an environment by minimizing uncertainty about how to achieve intended goals in that environment. We conclude by reviewing empirical findings that support our view by showing how direct learning, seen as an instance of ecological rationality at work, is how mere possibilities for action are turned into embodied know-how. Finally, we indicate the affinity between direct learning and sense-making activity.]]></description>
<dc:subject>papers to-read information-theory cognition enactivism perception</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:97f48f251ba0/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:cognition"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:enactivism"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:perception"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://link.springer.com/article/10.1007/s10867-010-9195-3">
    <title>Information processing, computation, and cognition | SpringerLink</title>
    <dc:date>2022-08-03T17:15:40+00:00</dc:date>
    <link>https://link.springer.com/article/10.1007/s10867-010-9195-3</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA[Computation and information processing are among the most fundamental notions in cognitive science. They are also among the most imprecisely discussed. Many cognitive scientists take it for granted that cognition involves computation, information processing, or both – although others disagree vehemently. Yet different cognitive scientists use ‘computation’ and ‘information processing’ to mean different things, sometimes without realizing that they do. In addition, computation and information processing are surrounded by several myths; first and foremost, that they are the same thing. In this paper, we address this unsatisfactory state of affairs by presenting a general and theory-neutral account of computation and information processing. We also apply our framework by analyzing the relations between computation and information processing on one hand and classicism, connectionism, and computational neuroscience on the other. We defend the relevance to cognitive science of both computation, at least in a generic sense, and information processing, in three important senses of the term. Our account advances several foundational debates in cognitive science by untangling some of their conceptual knots in a theory-neutral way. By leveling the playing field, we pave the way for the future resolution of the debates’ empirical aspects.]]></description>
<dc:subject>papers to-read information-theory cognition computation biology neuroscience</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:580f27a1f05d/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:cognition"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:computation"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:biology"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:neuroscience"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://royalsocietypublishing.org/doi/10.1098/rsif.2012.0869">
    <title>The algorithmic origins of life | Journal of The Royal Society Interface</title>
    <dc:date>2022-08-02T16:00:01+00:00</dc:date>
    <link>https://royalsocietypublishing.org/doi/10.1098/rsif.2012.0869</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA[Although it has been notoriously difficult to pin down precisely what is it that makes life so distinctive and remarkable, there is general agreement that its informational aspect is one key property, perhaps the key property. The unique informational narrative of living systems suggests that life may be characterized by context-dependent causal influences, and, in particular, that top-down (or downward) causation—where higher levels influence and constrain the dynamics of lower levels in organizational hierarchies—may be a major contributor to the hierarchal structure of living systems. Here, we propose that the emergence of life may correspond to a physical transition associated with a shift in the causal structure, where information gains direct and context-dependent causal efficacy over the matter in which it is instantiated. Such a transition may be akin to more traditional physical transitions (e.g. thermodynamic phase transitions), with the crucial distinction that determining which phase (non-life or life) a given system is in requires dynamical information and therefore can only be inferred by identifying causal architecture. We discuss some novel research directions based on this hypothesis, including potential measures of such a transition that may be amenable to laboratory study, and how the proposed mechanism corresponds to the onset of the unique mode of (algorithmic) information processing characteristic of living systems.]]></description>
<dc:subject>papers to-read biogenesis information-theory algorithms causality control-theory dynamical-systems cybernetics</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:053e8aa6fdff/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:biogenesis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:algorithms"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:causality"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:control-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:dynamical-systems"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:cybernetics"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2108.08198">
    <title>[2108.08198] Dimension-free Bounds for Sums of Independent Matrices and Simple Tensors via the Variational Principle</title>
    <dc:date>2022-03-28T01:45:12+00:00</dc:date>
    <link>https://arxiv.org/abs/2108.08198</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA[We consider the deviation inequalities for the sums of independent d by d random matrices, as well as rank one random tensors. Our focus is on the non-isotropic case and the bounds that do not depend explicitly on the dimension d, but rather on the effective rank. In an elementary and unified manner, we show the following results:
1) A deviation bound for the sums of independent positive-semi-definite matrices of any rank. This result generalizes the dimension-free bound of Koltchinskii and Lounici [Bernoulli, 23(1): 110-133, 2017] on the sample covariance matrix in the sub-Gaussian case. 2) A dimension-free version of the bound of Adamczak, Litvak, Pajor and Tomczak-Jaegermann [Journal Of Amer. Math. Soc,. 23(2), 535-561, 2010] on the sample covariance matrix in the log-concave case. 3) Dimension-free bounds for the operator norm of the sums of random tensors of rank one formed either by sub-Gaussian or by log-concave random vectors. This complements the result of Guédon and Rudelson [Adv. in Math., 208: 798-823, 2007]. 4) A non-isotropic version of the result of Alesker [Geom. Asp. of Funct. Anal., 77: 1-4, 1995] on the deviation of the norm of sub-exponential random vectors. 5) A dimension-free lower tail bound for sums of positive semi-definite matrices with heavy-tailed entries, sharpening the bound of Oliveira [Prob. Th. and Rel. Fields, 166: 1175-1194, 2016].
Our approach is based on the duality formula between entropy and moment generating functions. In contrast to the known proofs of dimension-free bounds, we avoid Talagrand's majorizing measure theorem, as well as generic chaining bounds for empirical processes. Some of our tools were pioneered by O. Catoni and co-authors in the context of robust statistical estimation. ]]></description>
<dc:subject>papers to-read probability random-matrices empirical-processes information-theory</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:654b5de716ce/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:probability"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:random-matrices"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:empirical-processes"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2103.14723">
    <title>Lower Bounds on the Generalization Error of Nonlinear Learning Models</title>
    <dc:date>2022-01-27T18:33:42+00:00</dc:date>
    <link>https://arxiv.org/abs/2103.14723</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA[We study in this paper lower bounds for the generalization error of models derived from multi-layer neural networks, in the regime where the size of the layers is commensurate with the number of samples in the training data. We show that unbiased estimators have unacceptable performance for such nonlinear networks in this regime. We derive explicit generalization lower bounds for general biased estimators, in the cases of linear regression and of two-layered networks. In the linear case the bound is asymptotically tight. In the nonlinear case, we provide a comparison of our bounds with an empirical study of the stochastic gradient descent algorithm. The analysis uses elements from the theory of large random matrices. ]]></description>
<dc:subject>papers to-read learning-theory random-matrices information-theory statistics</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:89ae800369a8/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:learning-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:random-matrices"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:statistics"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2008.10249">
    <title>[2008.10249] Information Constrained Optimal Transport: From Talagrand, to Marton, to Cover</title>
    <dc:date>2021-05-03T22:25:44+00:00</dc:date>
    <link>https://arxiv.org/abs/2008.10249</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA[The optimal transport problem studies how to transport one measure to another in the most cost-effective way and has wide range of applications from economics to machine learning. In this paper, we introduce and study an information constrained variation of this problem. Our study yields a strengthening and generalization of Talagrand's celebrated transportation cost inequality. Following Marton's approach, we show that the new transportation cost inequality can be used to recover old and new concentration of measure results. Finally, we provide an application of this new inequality to network information theory. We show that it can be used to recover almost immediately a recent solution to a long-standing open problem posed by Cover regarding the capacity of the relay channel. ]]></description>
<dc:subject>papers to-read information-theory optimal-transportation measure-concentration probability</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:a97b0018567d/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:optimal-transportation"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:measure-concentration"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:probability"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2104.09557">
    <title>[2104.09557] Learning to Communicate with Strangers via Channel Randomisation Methods</title>
    <dc:date>2021-05-03T21:51:38+00:00</dc:date>
    <link>https://arxiv.org/abs/2104.09557</link>
    <dc:creator>mraginsky</dc:creator><dc:subject>papers to-read information-theory learning</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:f6de54b0e126/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:learning"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://deontologistics.co/2019/11/01/tfe-information-and-energy/">
    <title>TfE: Information and Energy – DEONTOLOGISTICS</title>
    <dc:date>2021-05-03T21:48:52+00:00</dc:date>
    <link>https://deontologistics.co/2019/11/01/tfe-information-and-energy/</link>
    <dc:creator>mraginsky</dc:creator><dc:subject>information-theory computation cybernetics</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:280635a607f3/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:computation"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:cybernetics"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2101.06640">
    <title>[2101.06640] Estimating informativeness of samples with Smooth Unique Information</title>
    <dc:date>2021-01-21T19:07:27+00:00</dc:date>
    <link>https://arxiv.org/abs/2101.06640</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA[We define a notion of information that an individual sample provides to the training of a neural network, and we specialize it to measure both how much a sample informs the final weights and how much it informs the function computed by the weights. Though related, we show that these quantities have a qualitatively different behavior. We give efficient approximations of these quantities using a linearized network and demonstrate empirically that the approximation is accurate for real-world architectures, such as pre-trained ResNets. We apply these measures to several problems, such as dataset summarization, analysis of under-sampled classes, comparison of informativeness of different data sources, and detection of adversarial and corrupted examples. Our work generalizes existing frameworks but enjoys better computational properties for heavily over-parametrized models, which makes it possible to apply it to real-world networks. ]]></description>
<dc:subject>papers to-read neural-networks information-theory deep-learning</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:e1b35f894124/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:neural-networks"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:deep-learning"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2002.10567">
    <title>[2002.10567] A universal energy accuracy tradeoff in nonequilibrium cellular sensing</title>
    <dc:date>2020-07-20T20:21:31+00:00</dc:date>
    <link>https://arxiv.org/abs/2002.10567</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA[We combine stochastic thermodynamics, large deviation theory, and information theory to derive fundamental limits on the accuracy with which single cells can detect external chemical concentrations through arbitrarily complex, energy consuming nonequilibrium cell-surface receptors. We show that if estimation is performed by an ideal observer of the entire trajectory of receptor states, then no energy consuming non-equilibrium receptor that can be divided into two pools of bound signaling and unbound non-signaling states can outperform a simple equilibrium two-state receptor. Moreover, we derive an energy accuracy tradeoff for such general non-equilibrium receptors when the estimation is performed by a simple observer of the duration the receptor is bound. Our tradeoff reveals that the simple observer can only attain the performance of the ideal observer in the limit of large receptor energy consumption and size. Our derivations generalize the classic 1977 Berg-Purcell limit on cellular chemosensation along multiple dimensions, and also yield a novel thermodynamic uncertainty relation for the time a physical system spends in a pool of states. This relation itself is of independent interest, with applications not only to cellular chemosensation, but also to the reliability of other biological processes, like clocks and motors, as a function of energy consumption. ]]></description>
<dc:subject>papers to-read biophysics information-theory</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:309631e71a0d/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:biophysics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1808.07593">
    <title>[1808.07593] Pathologies in information bottleneck for deterministic supervised learning</title>
    <dc:date>2018-08-24T00:52:02+00:00</dc:date>
    <link>https://arxiv.org/abs/1808.07593</link>
    <dc:creator>mraginsky</dc:creator><dc:subject>papers to-read machine-learning information-theory information-bottleneck</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:d1dc9b0a893f/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:machine-learning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-bottleneck"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1805.04625">
    <title>[1805.04625] Strong Converse using Change of Measure Arguments</title>
    <dc:date>2018-07-08T14:24:17+00:00</dc:date>
    <link>https://arxiv.org/abs/1805.04625</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA[The strong converse for a coding theorem shows that the optimal asymptotic rate possible with vanishing error cannot be improved by allowing a fixed error. Building on a method introduced by Gu and Effros for centralized coding problems, we develop a general and simple recipe for proving strong converse that is applicable for distributed problems as well. Heuristically, our proof of strong converse mimics the standard steps for proving a weak converse, except that we apply those steps to a modified distribution obtained by conditioning the original distribution on the event that no error occurs. A key component of our recipe is the replacement of the hard Markov constraints implied by the distributed nature of the problem with a soft information cost using a variational formula introduced by Oohama. We illustrate our method by providing a short proof of the strong converse for the Wyner-Ziv problem and new strong converse theorems for interactive function computation, common randomness and secret key agreement, and the wiretap channel. ]]></description>
<dc:subject>papers to-read heard-the-talk information-theory lower-bounds</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:cf489de5601a/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:heard-the-talk"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:lower-bounds"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1701.02827">
    <title>[1701.02827] Strong Functional Representation Lemma and Applications to Coding Theorems</title>
    <dc:date>2018-06-19T14:26:59+00:00</dc:date>
    <link>https://arxiv.org/abs/1701.02827</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA[This paper shows that for any random variables $X$ and $Y$, it is possible to represent $Y$ as a function of $(X,Z)$ such that $Z$ is independent of $X$ and $I(X;Z|Y)\le\log(I(X;Y)+1)+4$ bits. We use this strong functional representation lemma (SFRL) to establish a bound on the rate needed for one-shot exact channel simulation for general (discrete or continuous) random variables, strengthening the results by Harsha et al. and Braverman and Garg, and to establish new and simple achievability results for one-shot variable-length lossy source coding, multiple description coding and Gray-Wyner system. We also show that the SFRL can be used to reduce the channel with state noncausally known at the encoder to a point-to-point channel, which provides a simple achievability proof of the Gelfand-Pinsker theorem. ]]></description>
<dc:subject>papers to-read information-theory simulation</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:24fb2101f0c4/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:simulation"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1705.00302">
    <title>[1705.00302] Measure concentration and the weak Pinsker property</title>
    <dc:date>2017-11-19T04:47:17+00:00</dc:date>
    <link>https://arxiv.org/abs/1705.00302</link>
    <dc:creator>mraginsky</dc:creator><dc:subject>papers to-read information-theory ergodic-theory measure-concentration</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:2ac303d47cf8/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:ergodic-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:measure-concentration"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1707.05199">
    <title>[1707.05199] Online codes for analog signals</title>
    <dc:date>2017-07-19T16:44:40+00:00</dc:date>
    <link>https://arxiv.org/abs/1707.05199</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA[We revisit a classical scenario in communication theory: a source is generating a waveform which we sample at regular intervals; we wish to transform the signal in such a way as to minimize distortion in its reconstruction, despite noise. The transformation must be online (also called causal), in order to enable real-time signaling. The noise model we consider is adversarial $\ell_1$-bounded; this is the "atomic norm" convex relaxation of the standard adversary model in discrete-alphabet communications, namely sparsity (low Hamming weight). We require that our encoding not increase the power of the original signal. 
In the "block coding" setting such encoding is possible due to the existence of large almost-Euclidean sections in $\ell_1$ spaces (established in the work of Dvoretzky, Milman, Ka\v{s}in, and Figiel, Lindenstrauss and Milman). 
Our main result is that an analogous result is achievable even online. Equivalently, we show a "lower triangular" version of $\ell_1$ Dvoretzky theorems. In terms of communication, the result has the following form: If the signal is a stream of reals $x_1,\ldots$, one per unit time, which we encode causally into $\rho$ (a constant) reals per unit time (forming altogether an output stream $\mathcal{E}(x)$), and if the adversarial noise added to this encoded stream up to time $s$ is a vector $\vec{y}$, then at time $s$ the decoder's reconstruction of the input prefix $x_{[s]}$ is accurate in a time-weighted $\ell_2$ norm, to within $s^{-1/2+\delta}$ (any $\delta>0$) times the adversary's noise as measured in a time-weighted $\ell_1$ norm. The time-weighted decoding norm forces increasingly accurate reconstruction of the distant past, while the time-weighted noise norm permits only vanishing effect from noise in the distant past. 
Encoding is linear, and decoding is performed by an LP analogous to those used in compressed sensing.]]></description>
<dc:subject>papers to-read random-matrices communication information-theory analog-information-theory</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:64ebb3205535/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:random-matrices"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:communication"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:analog-information-theory"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1706.10260">
    <title>How biased is your model? Concentration Inequalities, Information and Model Bias</title>
    <dc:date>2017-07-04T02:18:14+00:00</dc:date>
    <link>https://arxiv.org/abs/1706.10260</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA[e derive tight and computable bounds on the bias of statistical estimators, or more generally of quantities of interest, when evaluated on a baseline model P rather than on the typically unknown true model Q. Our proposed method combines the scalable information inequality derived by P. Dupuis, K.Chowdhary, the authors and their collaborators together with classical concentration inequalities (such as Bennett's and Hoeffding-Azuma inequalities). Our bounds are expressed in terms of the Kullback-Leibler divergence R(Q||P) of model Q with respect to P and the moment generating function for the statistical estimator under P. Furthermore, concentration inequalities, i.e. bounds on moment generating functions, provide tight and computationally inexpensive model bias bounds for quantities of interest. Finally, they allow us to derive rigorous confidence bands for statistical estimators that account for model bias and are valid for an arbitrary amount of data.]]></description>
<dc:subject>papers to-read statistics information-theory</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:c2da7b1f98ef/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://www.youtube.com/playlist?list=PLYq7WW565SZhPcf5PxsuBUz2PtB7Kfb5X">
    <title>Information, Control, and Learning – The Ingredients of Intelligent Behavior - YouTube</title>
    <dc:date>2017-04-04T02:08:08+00:00</dc:date>
    <link>https://www.youtube.com/playlist?list=PLYq7WW565SZhPcf5PxsuBUz2PtB7Kfb5X</link>
    <dc:creator>mraginsky</dc:creator><dc:subject>information-theory machine-learning ai conferences control-theory</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:435fae84aea1/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:machine-learning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:ai"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:conferences"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:control-theory"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1611.00814">
    <title>[1611.00814] Information-theoretic thresholds from the cavity method</title>
    <dc:date>2016-11-04T03:20:20+00:00</dc:date>
    <link>https://arxiv.org/abs/1611.00814</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA[Vindicating a sophisticated but non-rigorous physics approach called the cavity method, we establish a formula for the mutual information in statistical inference problems induced by random graphs. This general result implies the conjecture on the information-theoretic threshold in the disassortative stochastic block model [Decelle et al.: Phys. Rev. E (2011)] and allows us to pinpoint the exact condensation phase transition in random constraint satisfaction problems such as random graph coloring, thereby proving a conjecture from [Krzakala et al.: PNAS (2007)]. As a further application we establish the formula for the mutual information in Low-Density Generator Matrix codes as conjectured in [Montanari: IEEE Transactions on Information Theory (2005)]. The proofs provide a conceptual underpinning of the replica symmetric variant of the cavity method, and we expect that the approach will find many future applications.]]></description>
<dc:subject>papers to-read statistical-physics information-theory optimization graphical-models</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:7682d0661b22/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:statistical-physics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:optimization"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:graphical-models"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1511.01437">
    <title>[1511.01437] The sample size required in importance sampling</title>
    <dc:date>2016-10-23T19:33:18+00:00</dc:date>
    <link>https://arxiv.org/abs/1511.01437</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA[The goal of importance sampling is to estimate the expected value of a given function with respect to a probability measure $\nu$ using a random sample of size $n$ drawn from a different probability measure $\mu$. If the two measures $\mu$ and $\nu$ are nearly singular with respect to each other, which is often the case in practice, the sample size required for accurate estimation is large. In this article it is shown that in a fairly general setting, a sample of size approximately $\exp(D(\nu||\mu))$ is necessary and sufficient for accurate estimation by importance sampling, where $D(\nu||\mu)$ is the Kullback--Leibler divergence of $\mu$ from $\nu$. In particular, the required sample size exhibits a kind of cut-off in the logarithmic scale. The theory is applied to obtain a fairly general formula for the sample size required in importance sampling for exponential families (Gibbs measures). We also show that the standard variance-based diagnostic for convergence of importance sampling is fundamentally problematic. An alternative diagnostic that provably works in certain situations is suggested.]]></description>
<dc:subject>papers to-read statistics simulation information-theory probability</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:5cab585a2303/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:simulation"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:probability"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1610.03592">
    <title>[1610.03592] On statistical learning via the lens of compression</title>
    <dc:date>2016-10-13T01:53:48+00:00</dc:date>
    <link>https://arxiv.org/abs/1610.03592</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA[This work continues the study of the relationship between sample compression schemes and statistical learning, which has been mostly investigated within the framework of binary classification. The central theme of this work is establishing equivalences between learnability and compressibility, and utilizing these equivalences in the study of statistical learning theory. 
We begin with the setting of multiclass categorization (zero/one loss). We prove that in this case learnability is equivalent to compression of logarithmic sample size, and that uniform convergence implies compression of constant size. 
We then consider Vapnik's general learning setting: we show that in order to extend the compressibility-learnability equivalence to this case, it is necessary to consider an approximate variant of compression. 
Finally, we provide some applications of the compressibility-learnability equivalences: 
(i) Agnostic-case learnability and realizable-case learnability are equivalent in multiclass categorization problems (in terms of sample complexity). 
(ii) This equivalence between agnostic-case learnability and realizable-case learnability does not hold for general learning problems: There exists a learning problem whose loss function takes just three values, under which agnostic-case and realizable-case learnability are not equivalent. 
(iii) Uniform convergence implies compression of constant size in multiclass categorization problems. Part of the argument includes an analysis of the uniform convergence rate in terms of the graph dimension, in which we improve upon previous bounds. 
(iv) A dichotomy for sample compression in multiclass categorization problems: If a non-trivial compression exists then a compression of logarithmic size exists. 
(v) A compactness theorem for multiclass categorization problems.]]></description>
<dc:subject>papers to-read statistical-learning information-theory</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:4c704045bf51/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:statistical-learning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://www.sciencedirect.com/science/article/pii/S0893608016300880">
    <title>A theory of local learning, the learning channel, and the optimality of backpropagation</title>
    <dc:date>2016-09-20T20:08:49+00:00</dc:date>
    <link>http://www.sciencedirect.com/science/article/pii/S0893608016300880</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA[In a physical neural system, where storage and processing are intimately intertwined, the rules for adjusting the synaptic weights can only depend on variables that are available locally, such as the activity of the pre- and post-synaptic neurons, resulting in local learning rules. A systematic framework for studying the space of local learning rules is obtained by first specifying the nature of the local variables, and then the functional form that ties them together into each learning rule. Such a framework enables also the systematic discovery of new learning rules and exploration of relationships between learning rules and group symmetries. We study polynomial local learning rules stratified by their degree and analyze their behavior and capabilities in both linear and non-linear units and networks. Stacking local learning rules in deep feedforward networks leads to deep local learning. While deep local learning can learn interesting representations, it cannot learn complex input–output functions, even when targets are available for the top layer. Learning complex input–output functions requires local deep learning where target information is communicated to the deep layers through a backward learning channel. The nature of the communicated information about the targets and the structure of the learning channel partition the space of learning algorithms. For any learning algorithm, the capacity of the learning channel can be defined as the number of bits provided about the error gradient per weight, divided by the number of required operations per weight. We estimate the capacity associated with several learning algorithms and show that backpropagation outperforms them by simultaneously maximizing the information rate and minimizing the computational cost. This result is also shown to be true for recurrent networks, by unfolding them in time. The theory clarifies the concept of Hebbian learning, establishes the power and limitations of local learning rules, introduces the learning channel which enables a formal analysis of the optimality of backpropagation, and explains the sparsity of the space of learning rules discovered so far.]]></description>
<dc:subject>papers to-read neural-networks information-theory machine-learning</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:9790f163190a/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:neural-networks"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:machine-learning"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://hal.archives-ouvertes.fr/hal-01365314/">
    <title>CONCENTRATION OF MEASURE PRINCIPLE AND ENTROPY-INEQUALITIES</title>
    <dc:date>2016-09-20T16:24:15+00:00</dc:date>
    <link>https://hal.archives-ouvertes.fr/hal-01365314/</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA[Abstract : The concentration measure principle is presented in an abstract way to encompass and unify different concentration properties. We give a general overview of the links between concentration properties, transport-entropy inequalities, and logarithmic Sobolev inequalities for some specific transport costs. By giving few examples, we emphasize optimal weak transport costs as an efficient tool to establish new transport inequality and new concentration principles for discrete measures (the binomial law, the Poisson measure, the uniform law on the symmetric group).]]></description>
<dc:subject>papers to-read measure-concentration information-theory optimal-transportation probability</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:0ca014389c93/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:measure-concentration"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:optimal-transportation"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:probability"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1607.06494">
    <title>[1607.06494] Stochastic Control via Entropy Compression</title>
    <dc:date>2016-07-25T00:47:49+00:00</dc:date>
    <link>http://arxiv.org/abs/1607.06494</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA[We consider an agent trying to bring a system to an acceptable state by repeated probabilistic action (stochastic control). Specifically, in each step the agent observes the flaws present in the current state, selects one of them, and addresses it by probabilistically moving to a new state, one where the addressed flaw is most likely absent, but where one or more new flaws may be present. Several recent works on algorithmizations of the Lov\'{a}sz Local Lemma have established sufficient conditions for such an agent to succeed. Motivated by the paradigm of Partially Observable Markov Decision Processes (POMDPs) we study whether such stochastic control is also possible in a noisy environment, where both the process of state-observation and the process of state-evolution are subject to adversarial perturbation (noise). The introduction of noise causes the tools developed for LLL algorithmization to break down since the key LLL ingredient, the sparsity of the causality (dependence) relationship, no longer holds. To overcome this challenge we develop a new analysis where entropy plays a central role, both to measure the rate at which progress towards an acceptable state is made and the rate at which the noise undoes this progress. The end result is a sufficient condition that allows a smooth tradeoff between the intensity of the noise and the amenability of the system, recovering an asymmetric LLL condition in the noiseless case. To our knowledge, this is the first tractability result for a nontrivial class of POMDPs under stochastic memoryless control.]]></description>
<dc:subject>papers to-read control-theory Markov-decision-processes information-theory</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:8e90a8909489/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:control-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:Markov-decision-processes"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1508.04095">
    <title>[1508.04095] Algorithmic Aspects of Optimal Channel Coding</title>
    <dc:date>2016-07-12T09:16:48+00:00</dc:date>
    <link>http://arxiv.org/abs/1508.04095</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA[A central question in information theory is to determine the maximum success probability that can be achieved in sending a fixed number of messages over a noisy channel. This was first studied in the pioneering work of Shannon who established a simple expression characterizing this quantity in the limit of multiple independent uses of the channel. Here we consider the general setting with only one use of the channel. We observe that the maximum success probability can be expressed as the maximum value of a submodular function. Using this connection, we establish the following results: 
1. There is a simple greedy polynomial-time algorithm that computes a code achieving a (1-1/e)-approximation of the maximum success probability. Moreover, for this problem it is NP-hard to obtain an approximation ratio strictly better than (1-1/e). 
2. Shared quantum entanglement between the sender and the receiver can increase the success probability by a factor of at most 1/(1-1/e). In addition, this factor is tight if one allows an arbitrary non-signaling box between the sender and the receiver. 
3. We give tight bounds on the one-shot performance of the meta-converse of Polyanskiy-Poor-Verdu.]]></description>
<dc:subject>papers to-read information-theory channel-coding heard-the-talk</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:85457006c28d/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:channel-coding"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:heard-the-talk"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1606.08920">
    <title>[1606.08920] Exact Lower Bounds for the Agnostic Probably-Approximately-Correct (PAC) Machine Learning Model</title>
    <dc:date>2016-07-01T03:07:26+00:00</dc:date>
    <link>http://arxiv.org/abs/1606.08920</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA[We provide an exact asymptotic lower bound on the minimax expected excess risk (EER) in the agnostic probably-approximately-correct (PAC) machine learning classification model. This bound is of the simple form $c_\infty/\sqrt{\nu}$ as $\nu\to\infty$, where $c_\infty=0.16997\dots$ is a universal constant, $\nu=m/d$, $m$ is the size of the training sample, and $d$ is the Vapnik--Chervonenkis dimension of the hypothesis class. In the case when randomization of learning algorithms is allowed, we also provide an exact non-asymptotic lower bound on the minimax EER and identify minimax learning algorithms as certain maximally symmetric and minimally randomized "voting" procedures. It is shown that the differences between these asymptotic and non-asymptotic bounds, as well as the differences between these two bounds and the maximum EER of any learning algorithms that minimize the empirical risk, are asymptotically negligible, and all these differences are due to ties in the mentioned "voting" procedures. A few easy to compute non-asymptotic lower bounds on the minimax EER are also obtained, which are shown to be close to the exact asymptotic lower bound $c_\infty/\sqrt{\nu}$ even for rather small values of the ratio $\nu=m/d$. As an application of these results, we substantially improve existing lower bounds on the tail probability of the excess risk. Among the tools used are Bayes estimation and apparently new identities and inequalities for binomial distributions.]]></description>
<dc:subject>papers to-read machine-learning learning-theory lower-bounds information-theory</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:7d50ce3dd8e7/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:machine-learning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:learning-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:lower-bounds"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1605.02818">
    <title>[1605.02818] Brascamp-Lieb Inequality and Its Reverse: An Information Theoretic View</title>
    <dc:date>2016-05-12T03:48:52+00:00</dc:date>
    <link>http://arxiv.org/abs/1605.02818</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA[We generalize a result by Carlen and Cordero-Erausquin on the equivalence between the Brascamp-Lieb inequality and the subadditivity of relative entropy by allowing for random transformations (a broadcast channel). This leads to a unified perspective on several functional inequalities that have been gaining popularity in the context of proving impossibility results. We demonstrate that the information theoretic dual of the Brascamp-Lieb inequality is a convenient setting for proving properties such as data processing, tensorization, convexity and Gaussian optimality. Consequences of the latter include an extension of the Brascamp-Lieb inequality allowing for Gaussian random transformations, the determination of the multivariate Wyner common information for Gaussian sources, and a multivariate version of Nelson's hypercontractivity theorem. Finally we present an information theoretic characterization of a reverse Brascamp-Lieb inequality involving a random transformation (a multiple access channel).]]></description>
<dc:subject>papers to-read information-theory probability strong-data-processing</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:44edfb32b6d6/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:probability"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:strong-data-processing"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1605.02268">
    <title>[1605.02268] Rate-Distortion Bounds on Bayes Risk in Supervised Learning</title>
    <dc:date>2016-05-10T02:39:02+00:00</dc:date>
    <link>http://arxiv.org/abs/1605.02268</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA[An information-theoretic framework is presented for estimating the number of labeled samples needed to train a classifier in a parametric Bayesian setting. Ideas from rate-distortion theory are used to derive bounds on the average L1 or L∞ distance between the learned classifier and the true maximum a posteriori classifier---which are well-established surrogates for the excess classification error due to imperfect learning---in terms of the differential entropy of the posterior distribution, the Fisher information of the parametric family, and the number of training samples available. The maximum {\em a posteriori} classifier is viewed as a random source, labeled training data are viewed as a finite-rate encoding of the source, and the L1 or L∞ Bayes risk is viewed as the average distortion. The result is a complementary framework to the well-known probably approximately correct (PAC) framework. PAC bounds characterize worst-case learning performance of a family of classifiers whose complexity is captured by the Vapnik-Chervonenkis (VC) dimension. The rate-distortion framework, on the other hand, characterizes the average-case performance of a family of data distributions in terms of a quantity called the interpolation dimension, which represents the complexity of the family of data distributions. The resulting bounds do not suffer from the pessimism typical of the PAC framework, particularly when the training set is small. The framework also naturally accommodates multi-class settings. Furthermore, Monte Carlo methods provide accurate estimates of the bounds even for complicated distributions. The effectiveness of this framework is demonstrated in both a binary and multi-class Gaussian setting.]]></description>
<dc:subject>papers to-read information-theory machine-learning rate-distortion bayesian-learning</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:57e3011ba13b/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:machine-learning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:rate-distortion"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:bayesian-learning"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1205.2628">
    <title>[1205.2628] Multiple Source Adaptation and the Renyi Divergence</title>
    <dc:date>2016-03-22T02:48:31+00:00</dc:date>
    <link>http://arxiv.org/abs/1205.2628</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA[This paper presents a novel theoretical study of the general problem of multiple source adaptation using the notion of Renyi divergence. Our results build on our previous work [12], but significantly broaden the scope of that work in several directions. We extend previous multiple source loss guarantees based on distribution weighted combinations to arbitrary target distributions P, not necessarily mixtures of the source distributions, analyze both known and unknown target distribution cases, and prove a lower bound. We further extend our bounds to deal with the case where the learner receives an approximate distribution for each source instead of the exact one, and show that similar loss guarantees can be achieved depending on the divergence between the approximate and true distributions. We also analyze the case where the labeling functions of the source domains are somewhat different. Finally, we report the results of experiments with both an artificial data set and a sentiment analysis task, showing the performance benefits of the distribution weighted combinations and the quality of our bounds based on the Renyi divergence.]]></description>
<dc:subject>papers to-read machine-learning representation-learning domain-adaptation information-theory</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:ce4aed1d2b5a/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:machine-learning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:representation-learning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:domain-adaptation"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1503.03585">
    <title>[1503.03585] Deep Unsupervised Learning using Nonequilibrium Thermodynamics</title>
    <dc:date>2015-12-08T13:38:21+00:00</dc:date>
    <link>http://arxiv.org/abs/1503.03585</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA[A central problem in machine learning involves modeling complex data-sets using highly flexible families of probability distributions in which learning, sampling, inference, and evaluation are still analytically or computationally tractable. Here, we develop an approach that simultaneously achieves both flexibility and tractability. The essential idea, inspired by non-equilibrium statistical physics, is to systematically and slowly destroy structure in a data distribution through an iterative forward diffusion process. We then learn a reverse diffusion process that restores structure in data, yielding a highly flexible and tractable generative model of the data. This approach allows us to rapidly learn, sample from, and evaluate probabilities in deep generative models with thousands of layers or time steps, as well as to compute conditional and posterior probabilities under the learned model. We additionally release an open source reference implementation of the algorithm.]]></description>
<dc:subject>papers deep-learning information-theory statistical-physics have-read meh</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:b817112477ae/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:deep-learning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:statistical-physics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:have-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:meh"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1510.04214">
    <title>[1510.04214] LQG Control with Minimal Information: Three-Stage Separation Principle and SDP-based Solution Synthesis</title>
    <dc:date>2015-10-15T03:08:47+00:00</dc:date>
    <link>http://arxiv.org/abs/1510.04214</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA[In the interest of evaluating an information-theoretic requirement for feedback control, this paper proposes a framework to synthesize a control policy that minimizes Massey's directed information from the state sequence to the control sequence while attaining required Linear-Quadratic-Gaussian (LQG) control performance. Interpretation and significance of this framework is discussed in the context of networked control theory. As the main result, we show that an optimal control policy can be realized by an attractively simple three-stage decision architecture comprising (1) a linear sensor with additive Gaussian noise, (2) a Kalman filter, and (3) a certainty equivalence controller. This result suggests an integration of two separation principles previously known in the literature: the filter-controller separation principle in the LQG control theory, and the sensor-filter separation principle in zero-delay rate-distortion theory for Gauss-Markov sources. It is also shown that an optimal policy can be synthesized by semidefinite programming (SDP). Both time-varying finite-horizon problems and time-invariant infinite-horizon problems are considered. Our results can be viewed as a generalization of the data-rate theorem for mean-square stability by Nair and Evans, extended for a control performance analysis.]]></description>
<dc:subject>papers to-read information-theory control-theory rational-inattention filtering estimation</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:268e8ef145e7/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:control-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:rational-inattention"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:filtering"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:estimation"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1509.04555">
    <title>[1509.04555] Understanding interdependency through complex information sharing</title>
    <dc:date>2015-10-13T03:21:54+00:00</dc:date>
    <link>http://arxiv.org/abs/1509.04555</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA[The interactions between three or more random variables are often nontrivial, poorly understood, and yet, are paramount for future advances in fields such as network information theory, neuroscience, genetics and many others. In this work, we propose to analyze these interactions as different modes of information sharing. Towards this end, we introduce a novel axiomatic framework for decomposing the joint entropy, which characterizes the various ways in which random variables can share information. The key contribution of our framework is to distinguish between interdependencies where the information is shared redundantly, and synergistic interdependencies where the sharing structure exists in the whole but not between the parts. We show that our axioms determine unique formulas for all the terms of the proposed decomposition for a number of cases of interest. Moreover, we show how these results can be applied to several network information theory problems, providing a more intuitive understanding of their fundamental limits.]]></description>
<dc:subject>papers to-read information-theory complex-systems complexity distributed-systems</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:97ff2c9c459f/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:complex-systems"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:complexity"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:distributed-systems"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1510.01844">
    <title>[1510.01844] Bounds between Contraction Coefficients</title>
    <dc:date>2015-10-10T05:12:11+00:00</dc:date>
    <link>http://arxiv.org/abs/1510.01844</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA[In this paper, we delineate how the contraction coefficient of the strong data processing inequality for KL divergence can be used to learn likelihood models. We then present an alternative formulation to learn likelihood models that forces the input KL divergence of the data processing inequality to vanish, and achieves a contraction coefficient equivalent to the squared maximal correlation. This formulation turns out to admit a linear algebraic solution. To analyze the performance loss in using this simple but suboptimal procedure, we bound these contraction coefficients in the discrete and finite regime, and prove their equivalence in the Gaussian regime.]]></description>
<dc:subject>papers information-theory have-read</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:c7171bffa5b8/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:have-read"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1510.02190">
    <title>[1510.02190] Data compression with low distortion and finite blocklength</title>
    <dc:date>2015-10-10T05:09:45+00:00</dc:date>
    <link>http://arxiv.org/abs/1510.02190</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA[This paper considers lossy source coding of n-dimensional continuous memoryless sources with low mean-square error distortion and shows a simple, explicit approximation to the minimum source coding rate. More precisely, a nonasymptotic version of Shannon's lower bound is presented. Lattice quantizers are shown to approach that lower bound, provided that the source density is smooth enough and the distortion is low, which implies that fine multidimensional lattice coverings are nearly optimal in the rate-distortion sense even at finite n. The achievability proof technique avoids both the usual random coding argument and the simplifying assumption of the presence of a dither signal.]]></description>
<dc:subject>papers to-read heard-the-talk information-theory rate-distortion</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:41cad441378d/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:heard-the-talk"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:rate-distortion"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1510.00764">
    <title>[1510.00764] Strategic Compression and Transmission of Information: Crawford-Sobel Meet Shannon</title>
    <dc:date>2015-10-06T05:17:17+00:00</dc:date>
    <link>http://arxiv.org/abs/1510.00764</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA[This paper analyzes the well-known strategic information transmission (SIT) concept of Crawford and Sobel in information economics, from the lens of information theory. SIT differs from the conventional communication paradigms in information theory since it involves different objectives for the encoder and the decoder, which are aware of this mismatch and act accordingly. This leads to a game whose equilibrium solutions are studied here. The problem is modeled as a Stackelberg game-as opposed to the Nash model used in prior work in economics. The transmitter is the leader, and the receiver is the follower. As leader, the transmitter announces an encoding strategy with full commitment, and its distortion measure depends on a private information sequence which is non-causally available --only to the transmitter. Three problem settings are considered, focusing on the quadratic distortion measures and jointly Gaussian source and private information: compression, communication, and the simple equilibrium conditions without any compression or communication. The equilibrium strategies and associated costs are characterized. The analysis is then extended to the receiver side information setting. Finally, several applications of the results within the broader context of decision theory are presented.]]></description>
<dc:subject>papers to-read information-theory economics decision-making decision-theory rational-inattention rate-distortion</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:6a5c1240101d/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:economics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:decision-making"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:decision-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:rational-inattention"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:rate-distortion"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6963478&amp;searchWithin%5B%5D=%22Authors%22%3A.QT.Saldi%2C+N..QT.&amp;newsearch=true">
    <title>IEEE Xplore Abstract - Randomized Quantization and Source Coding With Constrained Output Distribution</title>
    <dc:date>2015-08-19T04:30:37+00:00</dc:date>
    <link>http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=6963478&amp;searchWithin%5B%5D=%22Authors%22%3A.QT.Saldi%2C+N..QT.&amp;newsearch=true</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA[This paper studies fixed-rate randomized vector quantization under the constraint that the quantizer's output has a given fixed probability distribution. A general representation of randomized quantizers that includes the common models in the literature is introduced via appropriate mixtures of joint probability measures on the product of the source and reproduction alphabets. Using this representation and results from optimal transport theory, the existence of an optimal (minimum distortion) randomized quantizer having a given output distribution is shown under various conditions. For sources with densities and the mean square distortion measure, it is shown that this optimum can be attained by randomizing quantizers having convex codecells. For stationary and memoryless source and output distributions, a rate-distortion theorem is proved, providing a single-letter expression for the optimum distortion in the limit of large blocklengths.]]></description>
<dc:subject>papers have-read information-theory source-coding rate-distortion optimal-transportation</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:68c4c22d3482/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:have-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:source-coding"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:rate-distortion"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:optimal-transportation"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=7138587&amp;punumber%3D18%26filter%3DAND%28p_IS_Number%3A7203188%29%26pageNumber%3D2">
    <title>IEEE Xplore Abstract - Output Constrained Lossy Source Coding With Limited Common Randomness</title>
    <dc:date>2015-08-19T04:29:48+00:00</dc:date>
    <link>http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=7138587&amp;punumber%3D18%26filter%3DAND%28p_IS_Number%3A7203188%29%26pageNumber%3D2</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA[This paper studies a Shannon-theoretic version of the generalized distribution preserving quantization problem where a stationary and memoryless source is encoded subject to a distortion constraint and the additional requirement that the reproduction also be stationary and memoryless with a given distribution. The encoder and decoder are stochastic and assumed to have access to independent common randomness. Recent work has characterized the minimum achievable coding rate at a given distortion level when unlimited common randomness is available. Here, we consider the general case where the available common randomness may be rate limited. Our main result completely characterizes the set of achievable coding and common randomness rate pairs at any distortion level, thereby providing the optimal tradeoff between these two rate quantities. We also consider two variations of this problem where we investigate the effect of relaxing the strict output distribution constraint and the role of private randomness used by the decoder on the rate region. Our results have strong connections with Cuff’s recent work on distributed channel synthesis. In particular, our achievability proof combines a coupling argument with the approach developed by Cuff, where instead of explicitly constructing the encoder–decoder pair, a joint distribution is constructed from which a desired encoder–decoder pair is established. We show, however, that for our problem, the separated solution of first finding an optimal channel and then synthesizing this channel results in a suboptimal rate region.]]></description>
<dc:subject>papers to-read information-theory quantization rate-distortion</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:76032a6a91d4/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:quantization"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:rate-distortion"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1508.00335">
    <title>[1508.00335] Bounds among $f$-divergences</title>
    <dc:date>2015-08-04T18:02:24+00:00</dc:date>
    <link>http://arxiv.org/abs/1508.00335</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA[This paper studies bounds among various f-divergences, dealing with arbitrary alphabets and deriving bounds on the ratios of various distance measures. Special attention is placed on bounds in terms of the total variation distance, including "reverse Pinsker inequalities," as well as on the Eγ divergence, which generalizes the total variation distance. Pinsker's inequality is extended for this type of f-divergence, a result which leads to a new inequality that links the relative entropy and relative information spectrum. New expressions of the R\'enyi divergence in terms of the relative information spectrum are derived, leading to upper and lower bounds on the R\'enyi divergence in terms of the variational distance.]]></description>
<dc:subject>papers to-read information-theory divergence</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:e06fff32e922/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:divergence"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1506.02629">
    <title>[1506.02629] Generalization in Adaptive Data Analysis and Holdout Reuse</title>
    <dc:date>2015-06-28T03:16:36+00:00</dc:date>
    <link>http://arxiv.org/abs/1506.02629</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA[Overfitting is the bane of data analysts, even when data are plentiful. Formal approaches to understanding this problem focus on statistical inference and generalization of individual analysis procedures. Yet the practice of data analysis is an inherently interactive and adaptive process: new analyses and hypotheses are proposed after seeing the results of previous ones, parameters are tuned on the basis of obtained results, and datasets are shared and reused. An investigation of this gap has recently been initiated by the authors in (Dwork et al., 2014), where we focused on the problem of estimating expectations of adaptively chosen functions. 
In this paper, we give a simple and practical method for reusing a holdout (or testing) set to validate the accuracy of hypotheses produced by a learning algorithm operating on a training set. Reusing a holdout set adaptively multiple times can easily lead to overfitting to the holdout set itself. We give an algorithm that enables the validation of a large number of adaptively chosen hypotheses, while provably avoiding overfitting. We illustrate the advantages of our algorithm over the standard use of the holdout set via a simple synthetic experiment. 
We also formalize and address the general problem of data reuse in adaptive data analysis. We show how the differential-privacy based approach given in (Dwork et al., 2014) is applicable much more broadly to adaptive data analysis. We then show that a simple approach based on description length can also be used to give guarantees of statistical validity in adaptive settings. Finally, we demonstrate that these incomparable approaches can be unified via the notion of approximate max-information that we introduce.]]></description>
<dc:subject>papers machine-learning overfitting-control information-theory have-read</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:de2b426ab45a/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:machine-learning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:overfitting-control"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:have-read"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1506.07216">
    <title>[1506.07216] Communication Lower Bounds for Statistical Estimation Problems via a Distributed Data Processing Inequality</title>
    <dc:date>2015-06-26T00:17:20+00:00</dc:date>
    <link>http://arxiv.org/abs/1506.07216</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA[We study the tradeoff between the statistical error and communication cost for distributed statistical estimation problems in high dimensions. In the distributed sparse Gaussian mean estimation problem, each of the m machines receives n data points from a d-dimensional Gaussian distribution with unknown mean θ which is promised to be k-sparse. The machines communicate by message passing and aim to estimate the mean θ. We provide a tight (up to logarithmic factors) tradeoff between the estimation error and the number of bits communicated between the machines. This directly leads to a lower bound for the distributed \textit{sparse linear regression} problem: to achieve the statistical minimax error, the total communication is at least Ω(min{n,d}m), where n is the number of observations that each machine receives and d is the ambient dimension. As our main technique, we prove a \textit{distributed data processing inequality}, as a generalization of usual data processing inequalities, which might be of independent interest. Finally, we give a communication-optimal protocol for distributed Gaussian mean estimation, improving the number of rounds of the previous such protocol from O(logm) to 1.]]></description>
<dc:subject>papers to-read distributed-computing estimation lower-bounds information-theory</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:29507eecba65/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:distributed-computing"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:estimation"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:lower-bounds"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://people.inf.ethz.ch/~shassani/papers/colt2015-mis.pdf">
    <title>Sequential Information Maximization: When is Greedy Near-optimal?</title>
    <dc:date>2015-06-25T14:15:12+00:00</dc:date>
    <link>http://people.inf.ethz.ch/~shassani/papers/colt2015-mis.pdf</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA["Optimal information gathering is a central challenge in machine learning and science in general. A common objective that quantifies the usefulness of observations is Shannon’s mutual information, defined w.r.t. a probabilistic model. Greedily selecting observations that maximize the mutual information is the method of choice in numerous applications, ranging from Bayesian experimental design to automated diagnosis, to active learning in Bayesian models. Despite its importance and widespread use in applications, little is known about the theoretical properties of sequential information maximization, in particular under noisy observations. In this paper, we analyze the widely used greedy policy for this task, and identify problem instances where it provides provably near-maximal utility, even in the challenging setting of persistent noise. Our results depend on a natural separability condition associated with a channel injecting noise into the observations. We also identify examples where this separability parameter is necessary in the bound: if it is too small, then the greedy policy fails to select informative tests."]]></description>
<dc:subject>papers to-read information-theory sequential-inference re:active_feature_selection_project via:arsyed</dc:subject>
<dc:identifier>https://pinboard.in/u:mraginsky/b:d0ff0aca4af5/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:sequential-inference"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:re:active_feature_selection_project"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:via:arsyed"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://plato.stanford.edu/entries/information-entropy/">
    <title>Information Processing and Thermodynamic Entropy (Stanford Encyclopedia of Philosophy)</title>
    <dc:date>2015-06-24T14:18:01+00:00</dc:date>
    <link>http://plato.stanford.edu/entries/information-entropy/</link>
    <dc:creator>mraginsky</dc:creator><dc:subject>to-read physics-of-information thermodynamics information-theory</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:b4cc92319423/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:physics-of-information"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:thermodynamics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1506.06472">
    <title>[1506.06472] The Ebb and Flow of Deep Learning: a Theory of Local Learning</title>
    <dc:date>2015-06-24T04:21:06+00:00</dc:date>
    <link>http://arxiv.org/abs/1506.06472</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA[In a physical neural system, where storage and processing are intimately intertwined, the rules for adjusting the synaptic weights can only depend on variables that are available locally, such as the activity of the pre- and post-synaptic neurons, resulting in local learning rules. A systematic framework for studying the space of local learning rules must first define the nature of the local variables, and then the functional form that ties them together into each learning rule. We consider polynomial local learning rules and analyze their behavior and capabilities in both linear and non-linear networks. As a byproduct, this framework enables also the discovery of new learning rules as well as important relationships between learning rules and group symmetries. Stacking local learning rules in deep feedforward networks leads to deep local learning. While deep local learning can learn interesting representations, it cannot learn complex input-output functions, even when targets are available for the top layer. Learning complex input-output functions requires local deep learning where target information is propagated to the deep layers through a backward channel. The nature of the propagated information about the targets, and the backward channel through which this information is propagated, partition the space of learning algorithms. For any learning algorithm, the capacity of the backward channel can be defined as the number of bits provided about the gradient per weight, divided by the number of required operations per weight. We estimate the capacity associated with several learning algorithms and show that backpropagation outperforms them and achieves the maximum possible capacity. The theory clarifies the concept of Hebbian learning, what is learnable by Hebbian learning, and explains the sparsity of the space of learning rules discovered so far.]]></description>
<dc:subject>papers to-read neural-networks deep-learning information-theory feedback</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:f6d9164014cf/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:neural-networks"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:deep-learning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:feedback"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/0709.1948">
    <title>[0709.1948] Information theoretic approach to interactive learning</title>
    <dc:date>2015-06-23T19:52:43+00:00</dc:date>
    <link>http://arxiv.org/abs/0709.1948</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA["The principles of statistical mechanics and information theory play an important role in learning and have inspired both theory and the design of numerous machine learning algorithms. The new aspect in this paper is a focus on integrating feedback from the learner. A quantitative approach to interactive learning and adaptive behavior is proposed, integrating model- and decision-making into one theoretical framework. This paper follows simple principles by requiring that the observer's world model and action policy should result in maximal predictive power at minimal complexity. Classes of optimal action policies and of optimal models are derived from an objective function that reflects this trade-off between prediction and complexity. The resulting optimal models then summarize, at different levels of abstraction, the process's causal organization in the presence of the learner's actions. A fundamental consequence of the proposed principle is that the learner's optimal action policies balance exploration and control as an emerging property. Interestingly, the explorative component is present in the absence of policy randomness, i.e. in the optimal deterministic behavior. This is a direct result of requiring maximal predictive power in the presence of feedback."]]></description>
<dc:subject>papers to-read information-theory machine-learning reinforcement-learning via:arsyed</dc:subject>
<dc:identifier>https://pinboard.in/u:mraginsky/b:e3e649cb7276/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:machine-learning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:reinforcement-learning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:via:arsyed"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1506.05483">
    <title>[1506.05483] Asymptotic Optimality of Myopic Information-Based Strategies for Bayesian Adaptive Estimation</title>
    <dc:date>2015-06-19T14:18:14+00:00</dc:date>
    <link>http://arxiv.org/abs/1506.05483</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA[This paper presents a general asymptotic theory of sequential Bayesian estimation giving results for the strongest, almost sure convergence. We show that under certain smoothness conditions on the probability model, the greedy information gain maximization algorithm for adaptive Bayesian estimation is asymptotically optimal in the sense that the determinant of the posterior covariance in a certain neighborhood of the true parameter value is asymptotically minimal. Using this result, we also obtain an asymptotic expression for the posterior entropy based on a novel definition of almost sure convergence on "most trials" (meaning that the convergence holds on a fraction of trials that converges to one). Then, we extend the results to a recently published framework, which generalizes the usual adaptive estimation setting by allowing different trial placements to be associated with different, random costs of observation. For this setting, the author has proposed the heuristic of maximizing the expected information gain divided by the expected cost of that placement. In this paper, we show that this myopic strategy satisfies an analogous asymptotic optimality result when the convergence of the posterior distribution is considered as a function of the total cost (as opposed to the number of observations).]]></description>
<dc:subject>papers to-read information-theory bayesian-learning experimental-design</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:4fa79ef31bcf/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:bayesian-learning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:experimental-design"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1504.04419">
    <title>[1504.04419] Wasserstein continuity of entropy and outer bounds for interference channels</title>
    <dc:date>2015-06-14T08:36:33+00:00</dc:date>
    <link>http://arxiv.org/abs/1504.04419</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA[It is shown that under suitable regularity conditions, differential entropy is a Lipschitz functional on the space of distributions on n-dimensional Euclidean space with respect to the quadratic Wasserstein distance. Under similar conditions, (discrete) Shannon entropy is shown to be Lipschitz continuous in distributions over the product space with respect to Ornstein's d¯-distance (Wasserstein distance corresponding to the Hamming distance). These results together with Talagrand's and Marton's transportation-information inequalities allow one to replace the unknown multi-user interference with its i.i.d. approximations. As an application, a new outer bound for the two-user Gaussian interference channel is proved, which, in particular, settles the "missing corner point" problem of Costa (1985).]]></description>
<dc:subject>papers have-read information-theory probability measure-concentration optimal-transportation</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:19910a388840/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:have-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:probability"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:measure-concentration"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:optimal-transportation"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1312.5276">
    <title>[1312.5276] Integration by parts and representation of information functionals</title>
    <dc:date>2015-06-14T01:39:07+00:00</dc:date>
    <link>http://arxiv.org/abs/1312.5276</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA[We introduce a new formalism for computing expectations of functionals of arbitrary random vectors, by using generalised integration by parts formulae. In doing so we extend recent representation formulae for the score function introduced in Nourdin, Peccati and Swan (JFA, to appear) and also provide a new proof of a central identity first discovered in Guo, Shamai, and Verd{\'u} (IEEE Trans. Information Theory, 2005). We derive a representation for the standardized Fisher information of sums of i.i.d. random vectors which use our identities to provide rates of convergence in information theoretic central limit theorems (both in Fisher information distance and in relative entropy).]]></description>
<dc:subject>papers to-read information-theory probability</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:ede85048e82e/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:probability"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1403.5855">
    <title>[1403.5855] Stein's method, logarithmic Sobolev and transport inequalities</title>
    <dc:date>2015-06-14T01:38:40+00:00</dc:date>
    <link>http://arxiv.org/abs/1403.5855</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA[We develop connections between Stein's approximation method, logarithmic Sobolev and transport inequalities by introducing a new class of functional inequalities involving the relative entropy, the Stein kernel, the relative Fisher information and the Wasserstein distance with respect to a given reference distribution on ℝd. For the Gaussian model, the results improve upon the classical logarithmic Sobolev inequality and the Talagrand quadratic transportation cost inequality. Further examples of illustrations include multidimensional gamma distributions, beta distributions, as well as families of log-concave densities. As a by-product, the new inequalities are shown to be relevant towards convergence to equilibrium, concentration inequalities and entropic convergence expressed in terms of the Stein kernel. The tools rely on semigroup interpolation and bounds, in particular by means of the iterated gradients of the Markov generator with invariant measure the distribution under consideration. In a second part, motivated by the recent investigation by Nourdin, Peccati and Swan on Wiener chaoses, we address the issue of entropic bounds on multidimensional functionals F with the Stein kernel via a set of data on F and its gradients rather than on the Fisher information of the density. A natural framework for this investigation is given by the Markov Triple structure (E,μ,Γ) in which abstract Malliavin-type arguments may be developed and extend the Wiener chaos setting.]]></description>
<dc:subject>papers to-read measure-concentration probability information-theory</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:b06c167eb96a/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:measure-concentration"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:probability"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://link.springer.com/article/10.1007%2FBF00532723">
    <title>On entropy and information gain in random fields - Springer</title>
    <dc:date>2015-02-08T20:37:21+00:00</dc:date>
    <link>http://link.springer.com/article/10.1007%2FBF00532723</link>
    <dc:creator>mraginsky</dc:creator><dc:subject>papers to-read information-theory statistical-physics probability</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:bb682501fcd3/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:statistical-physics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:probability"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1310.6391">
    <title>[1310.6391] Robust bounds on risk-sensitive functionals via Renyi divergence</title>
    <dc:date>2015-01-03T00:12:40+00:00</dc:date>
    <link>http://arxiv.org/abs/1310.6391</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA[We extend the duality between exponential integrals and relative entropy to a variational formula for exponential integrals involving the Renyi divergence. This formula characterizes the dependence of risk-sensitive functionals and related quantities determined by tail behavior to perturbations in the underlying distributions, in terms of the Renyi divergence. The characterization gives rise to upper and lower bounds that are meaningful for all values of a large deviation scaling parameter, allowing one to quantify in explicit terms the robustness of risk-sensitive costs. As applications we consider problems of uncertainty quantification when aspects of the model are not fully known, as well their use in bounding tail properties of an intractable model in terms of a tractable one.]]></description>
<dc:subject>papers to-read information-theory probability robustness</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:a3fdee032e59/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:probability"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:robustness"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1412.7172">
    <title>[1412.7172] On the Speed of Social Learning</title>
    <dc:date>2014-12-27T14:20:59+00:00</dc:date>
    <link>http://arxiv.org/abs/1412.7172</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA[We study the speed of social learning, when two players learn from private signals as well as the actions of the other. Our main finding is that increased interaction between the agents can lower the rate of learning: learning is significantly slower when both players observe each other, than when one only observes the other.]]></description>
<dc:subject>papers to-read information-theory social-learning</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:bb56025e45d2/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:social-learning"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1205.3756">
    <title>[1205.3756] Achieving the Capacity of any DMC using only Polar Codes</title>
    <dc:date>2014-11-06T05:37:17+00:00</dc:date>
    <link>http://arxiv.org/abs/1205.3756</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA[We construct a channel coding scheme to achieve the capacity of any discrete memoryless channel based solely on the techniques of polar coding. In particular, we show how source polarization and randomness extraction via polarization can be employed to "shape" uniformly-distributed i.i.d. random variables into approximate i.i.d. random variables distributed ac- cording to the capacity-achieving distribution. We then combine this shaper with a variant of polar channel coding, constructed by the duality with source coding, to achieve the channel capacity. Our scheme inherits the low complexity encoder and decoder of polar coding. It differs conceptually from Gallager's method for achieving capacity, and we discuss the advantages and disadvantages of the two schemes. An application to the AWGN channel is discussed.]]></description>
<dc:subject>to-read information-theory coding-theory polar-codes</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:bcb397e01ca8/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:coding-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:polar-codes"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://rgmia.org/monographs/csiszar.htm">
    <title>Inequalities for Csiszár f-Divergences in Information Theory</title>
    <dc:date>2014-10-16T17:32:58+00:00</dc:date>
    <link>http://rgmia.org/monographs/csiszar.htm</link>
    <dc:creator>mraginsky</dc:creator><dc:subject>to-read information-theory</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:98757f9e3311/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1410.0503">
    <title>[1410.0503] On Bayes Risk Lower Bounds</title>
    <dc:date>2014-10-03T20:12:24+00:00</dc:date>
    <link>http://arxiv.org/abs/1410.0503</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA[This paper provides a general technique to lower bound the Bayes risk for arbitrary loss functions and prior distributions in the standard abstract decision theoretic setting. A lower bound on the Bayes risk not only serves as a lower bound on the minimax risk but also characterizes the fundamental limitations of the statistical difficulty of a decision problem under a given prior. Our bounds are based on then notion of f-informativity of the underlying class of probability measures and the prior. Application of our bounds requires upper bounds on the f-informativity and we derive new upper bounds on f-informativity for a class of f functions which lead to tight Bayes risk lower bounds. Our technique leads to generalizations of a variety of classical minimax bounds. As applications, we present Bayes risk lower bounds for several concrete estimation problems, including Gaussian location models, Bayesian Lasso, generalized linear models and principle component analysis for spiked covariance models.]]></description>
<dc:subject>papers to-read information-theory statistics heard-the-talk</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:e8aa46bbcc91/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:heard-the-talk"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1409.6833">
    <title>[1409.6833] Quantized Estimation of Gaussian Sequence Models in Euclidean Balls</title>
    <dc:date>2014-09-25T02:29:32+00:00</dc:date>
    <link>http://arxiv.org/abs/1409.6833</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA[A central result in statistical theory is Pinsker's theorem, which characterizes the minimax rate in the normal means model of nonparametric estimation. In this paper, we present an extension to Pinsker's theorem where estimation is carried out under storage or communication constraints. In particular, we place limits on the number of bits used to encode an estimator, and analyze the excess risk in terms of this constraint, the signal size, and the noise level. We give sharp upper and lower bounds for the case of a Euclidean ball, which establishes the Pareto-optimal minimax tradeoff between storage and risk in this setting.]]></description>
<dc:subject>papers to-read statistics information-theory rate-distortion estimation quantization</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:13aab932bd81/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:rate-distortion"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:estimation"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:quantization"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1407.0381">
    <title>[1407.0381] Minimax rates of entropy estimation on large alphabets via best polynomial approximation</title>
    <dc:date>2014-07-04T19:07:49+00:00</dc:date>
    <link>http://arxiv.org/abs/1407.0381</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA[Consider the problem of estimating the Shannon entropy of a distribution on $k$ elements from $n$ independent samples. We show that the minimax mean-square error is within universal multiplicative constant factors of $$\Big(\frac{k }{n \log k}\Big)^2 + \frac{\log^2 k}{n}$$ as long as $n$ grows no faster than a polynomial of $k$. This implies the recent result of Valiant-Valiant \cite{VV11} that the minimal sample size for consistent entropy estimation scales according to $\Theta(\frac{k}{\log k})$. The apparatus of best polynomial approximation plays a key role in both the minimax lower bound and the construction of optimal estimators.]]></description>
<dc:subject>papers have-read information-theory entropy-estimation statistics</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:0e74208b419f/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:have-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:entropy-estimation"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:statistics"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1406.0767">
    <title>[1406.0767] A generalization of Witsenhausen's zero-error rate for directed graphs</title>
    <dc:date>2014-06-04T01:39:05+00:00</dc:date>
    <link>http://arxiv.org/abs/1406.0767</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA[We investigate a communication setup where a source output is sent through a free noisy channel first and an additional codeword is sent through a noiseless but expensive channel later. With the help of the second message the decoder should be able to decide with zero-error whether its decoding of the first message was error-free. This scenario leads to the definition of a digraph parameter that generalizes Witsenhausen's zero-error rate for directed graphs. We investigate this new parameter for some specific directed graphs and explore its relations to other digraph parameters like Sperner capacity and dichromatic number. 
When the original problem is modified to require zero-error decoding of the whole message then we arrive back to the Witsenhausen rate of an appropriately defined undirected graph.]]></description>
<dc:subject>papers to-read information-theory graph-theory combinatorics</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:51e3b12cd4c7/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:graph-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:combinatorics"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1405.3629">
    <title>[1405.3629] Dissipation of information in channels with input constraints</title>
    <dc:date>2014-05-15T05:07:33+00:00</dc:date>
    <link>http://arxiv.org/abs/1405.3629</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA[One of the basic tenets in information theory, the data processing inequality states that output divergence does not exceed the input divergence for any channel. For channels without input constraints, various estimates on the amount of such contraction are known, Dobrushin's coefficient for the total variation being perhaps the most well known. This work investigates channels with average input cost constraint. It is found that while the contraction coefficient typically equals one (no contraction), the information nevertheless dissipates. A certain non-linear function, a \emph{Dobrushin curve} of the channel, is proposed to quantify the amount of dissipation. Tools for evaluating the Dobrushin curve of additive-noise channels are developed based on coupling arguments. Some basic applications in stochastic control, uniqueness of Gibbs measures and fundamental limits of noisy circuits are discussed.]]></description>
<dc:subject>papers have-read information-theory markov-chains</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:ac69dda62b44/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:have-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:markov-chains"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1405.0608">
    <title>[1405.0608] Approximate tensorization of entropy at high temperature</title>
    <dc:date>2014-05-08T18:02:20+00:00</dc:date>
    <link>http://arxiv.org/abs/1405.0608</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA[We show that for weakly dependent random variables the relative entropy functional satisfies an approximate version of the standard tensorization property which holds in the independent case. As a corollary one obtains a family of dimensionless logarithmic Sobolev inequalities. In the context of spin systems on a graph the weak dependence requirements resemble the well known Dobrushin uniqueness conditions. Our results can be considered as the discrete counterpart of a recent work of Katalin Marton.]]></description>
<dc:subject>papers to-read measure-concentration information-theory probability</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:f7ef14163a02/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:measure-concentration"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:probability"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://www.eecs.berkeley.edu/~ananth/2014+/2014-02-24_VarunCISS.pdf">
    <title>Convex Relative Entropy Decay in Markov Chains</title>
    <dc:date>2014-03-01T23:55:13+00:00</dc:date>
    <link>http://www.eecs.berkeley.edu/~ananth/2014+/2014-02-24_VarunCISS.pdf</link>
    <dc:creator>mraginsky</dc:creator><dc:subject>papers to-read markov-chains information-theory</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:b85757a05e70/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:markov-chains"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1402.3067">
    <title>[1402.3067] A Bayesian Characterization of Relative Entropy</title>
    <dc:date>2014-02-14T02:29:55+00:00</dc:date>
    <link>http://arxiv.org/abs/1402.3067</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA[We give a new characterization of relative entropy, also known as the Kullback-Leibler divergence. We use a number of interesting categories related to probability theory. In particular, we consider a category FinStat where an object is a finite set equipped with a probability distribution, while a morphism is a measure-preserving function f:X→Y together with a stochastic right inverse s:Y→X. The function f can be thought of as a measurement process, while s provides a hypothesis about the state of the measured system given the result of a measurement. Given this data we can define the entropy of the probability distribution on X relative to the "prior" given by pushing the probability distribution on Y forwards along s. We say that s is "optimal" if these distributions agree. We show that any convex linear, lower semicontinuous functor from FinStat to the additive monoid [0,∞] which vanishes when s is optimal must be a scalar multiple of this relative entropy. Our proof is independent of all earlier characterizations, but inspired by the work of Petz.]]></description>
<dc:subject>papers to-read information-theory probability categories</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:9ca9e338ca19/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:probability"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:categories"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/0903.2962">
    <title>[0903.2962] A symmetric entropy bound on the non-reconstruction regime of Markov chains on Galton-Watson trees</title>
    <dc:date>2014-01-17T12:55:45+00:00</dc:date>
    <link>http://arxiv.org/abs/0903.2962</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA[We give a criterion of the form Q(d)c(M)<1 for the non-reconstructability of tree-indexed q-state Markov chains obtained by broadcasting a signal from the root with a given transition matrix M. Here c(M) is an explicit function, which is convex over the set of M's with a given invariant distribution, that is defined in terms of a (q-1)-dimensional variational problem over symmetric entropies. Further Q(d) is the expected number of offspring on the Galton-Watson tree. This result is equivalent to proving the extremality of the free boundary condition-Gibbs measure within the corresponding Gibbs-simplex. Our theorem holds for possibly non-reversible M and its proof is based on a general Recursion Formula for expectations of a symmetrized relative entropy function, which invites their use as a Lyapunov function. 
In the case of the Potts model, the present theorem reproduces earlier results of the authors, with a simplified proof, in the case of the symmetric Ising model (where the argument becomes similar to the approach of Pemantle and Peres) the method produces the correct reconstruction threshold), in the case of the (strongly) asymmetric Ising model where the Kesten-Stigum bound is known to be not sharp the method provides improved numerical bounds.]]></description>
<dc:subject>papers to-read information-theory statistical-physics markov-chains</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:56ee7d45e679/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:statistical-physics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:markov-chains"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://www2.isye.gatech.edu/~cguzman6/Files/ArxivLBAlgo.pdf">
    <title>Unifying Lower Bounds on the Oracle Complexity of Nonsmooth Convex Optimization (Gábor Braun, Cristóbal Guzmán, and Sebastian Pokutta)</title>
    <dc:date>2014-01-02T16:16:38+00:00</dc:date>
    <link>http://www2.isye.gatech.edu/~cguzman6/Files/ArxivLBAlgo.pdf</link>
    <dc:creator>mraginsky</dc:creator><dc:subject>papers to-read optimization complexity convex-programming information-theory lower-bounds</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:682ee7556952/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:optimization"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:complexity"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:convex-programming"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:lower-bounds"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://www.cs.berkeley.edu/~jduchi/projects/ZhangDuJoWa13.pdf">
    <title>Information-theoretic lower bounds for distributed statistical estimation with communication constraints, Yuchen Zhang, John C. Duchi, Michael I. Jordan, and Martin Wainwright.</title>
    <dc:date>2013-11-17T21:58:08+00:00</dc:date>
    <link>http://www.cs.berkeley.edu/~jduchi/projects/ZhangDuJoWa13.pdf</link>
    <dc:creator>mraginsky</dc:creator><description><![CDATA[Information-theoretic lower bounds for distributed statistical estimation with communication constraints, Yuchen Zhang, John C. Duchi, Michael I. Jordan, and Martin Wainwright. Neural Information Processing Systems (NIPS 2013)]]></description>
<dc:subject>papers to-read information-theory complexity machine-learning statistics distributed-computing lower-bounds</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:mraginsky/b:69e64df0398d/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:information-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:complexity"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:machine-learning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:distributed-computing"/>
	<rdf:li rdf:resource="https://pinboard.in/u:mraginsky/t:lower-bounds"/>
</rdf:Bag></taxo:topics>
</item>
</rdf:RDF>