<?xml version="1.0" encoding="UTF-8"?>
 <rdf:RDF xmlns="http://purl.org/rss/1.0/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:cc="http://web.resource.org/cc/" xmlns:syn="http://purl.org/rss/1.0/modules/syndication/" xmlns:admin="http://webns.net/mvcb/">
  <channel rdf:about="http://pinboard.in">
    <title>Pinboard (cshalizi)</title>
    <link>https://pinboard.in/u:cshalizi/public/</link>
    <description>recent bookmarks from cshalizi</description>
    <items>
      <rdf:Seq>	<rdf:li rdf:resource="https://arxiv.org/abs/2601.05444"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2601.10825"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2603.12228"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2602.01011"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2405.06691v3"/>
	<rdf:li rdf:resource="https://www.cambridge.org/core/journals/philosophy-of-science/article/peirce-in-the-machine-how-mixture-of-experts-models-perform-hypothesis-construction/2C92DF1A6805195170683CC6EC446125?WT.mc_id=New%2520Cambridge%2520Alert%2520-%2520Articles"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2503.03213"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1503.02531"/>
	<rdf:li rdf:resource="https://dl.acm.org/doi/10.1145/1150402.1150464"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2408.05446"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2406.11741"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2303.00586"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2110.11216"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2402.01502"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2301.11562"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2202.02950"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2111.14377"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2207.08815"/>
	<rdf:li rdf:resource="https://jmlr.org/papers/v23/20-874.html"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2206.04902"/>
	<rdf:li rdf:resource="https://www.tandfonline.com/doi/full/10.1080/01621459.2021.1974867"/>
	<rdf:li rdf:resource="https://www.jmlr.org/papers/v23/20-874.html"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2109.00173"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2107.04208"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2002.05211"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2106.05918"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2008.07063"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2106.02589"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2106.02803"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2106.01092"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2005.14458"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2006.08855"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2101.11083"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2009.09036"/>
	<rdf:li rdf:resource="https://doi.org/10.1111/rssb.12425"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2105.04134"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2105.02569"/>
	<rdf:li rdf:resource="https://www.cambridge.org/core/journals/econometric-theory/article/abs/new-study-on-asymptotic-optimality-of-least-squares-model-averaging/5252EEB71F41DCF4B613C02F1440D4A1"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2103.05766"/>
	<rdf:li rdf:resource="https://journals.aom.org/doi/10.5465/ambpp.2015.15192abstract"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2101.11190"/>
	<rdf:li rdf:resource="https://www.tandfonline.com/doi/full/10.1080/01621459.2020.1851696"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2012.14563"/>
	<rdf:li rdf:resource="https://www.tandfonline.com/doi/full/10.1080/10618600.2020.1853548"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2012.12802"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2012.11649"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1902.03999"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2011.07476"/>
	<rdf:li rdf:resource="https://covid19-projections.com/about/"/>
	<rdf:li rdf:resource="https://www.jstatsoft.org/article/view/v054i02"/>
	<rdf:li rdf:resource="https://link.springer.com/article/10.1023%2FA%3A1010933404324"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1911.00190"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1910.11743"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1910.11445"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1906.01235"/>
	<rdf:li rdf:resource="https://link.springer.com/article/10.1007/s11023-019-09506-6"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1910.04743"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1909.12299"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1806.03467"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1909.11799"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1909.07578"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1908.06951"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1908.06852"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1908.02718"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1908.02723"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1908.01251"/>
	<rdf:li rdf:resource="http://journal.sjdm.org/19/190215/jdm190215.html"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1907.11452"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1907.08742"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1807.11408"/>
      </rdf:Seq>
    </items>
  </channel><item rdf:about="https://arxiv.org/abs/2601.05444">
    <title>[2601.05444] What Functions Does XGBoost Learn?</title>
    <dc:date>2026-06-04T18:10:45+00:00</dc:date>
    <link>https://arxiv.org/abs/2601.05444</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["This paper establishes a rigorous theoretical foundation for the function class implicitly learned by XGBoost, bridging the gap between its empirical success and our theoretical understanding. We introduce an infinite-dimensional function class $\mathcal{F}^{d,s}_{\infty\text{-ST}}$ that extends finite ensembles of bounded-depth regression trees, together with a complexity measure $V^{d,s}_{\infty\text{-XGB}}(\cdot)$ that generalizes the $L^1$ regularization penalty used in XGBoost. We show that every optimizer of the XGBoost objective is also an optimizer of an equivalent penalized regression problem over $\mathcal{F}^{d,s}_{\infty\text{-ST}}$ with penalty $V^{d,s}_{\infty\text{-XGB}}(\cdot)$, providing an interpretation of XGBoost as implicitly targeting a broader function class. We also develop a smoothness-based interpretation of $\mathcal{F}^{d,s}_{\infty\text{-ST}}$ and $V^{d,s}_{\infty\text{-XGB}}(\cdot)$ in terms of Hardy--Krause variation. We prove that the least squares estimator over $\{f \in \mathcal{F}^{d,s}_{\infty\text{-ST}} : V^{d,s}_{\infty\text{-XGB}}(f) \le V\}$ achieves a nearly minimax-optimal rate of convergence $n^{-2/3} (\log n)^{4(\min(s,d)-1)/3}$, thereby avoiding the curse of dimensionality. Our results provide the first rigorous characterization of the function space underlying XGBoost, clarify its connection to classical notions of variation, and identify an important open problem: whether the XGBoost algorithm itself achieves minimax optimality over this class."]]></description>
<dc:subject>to:NB functional_analysis boosting ensemble_methods decision_trees via:msw</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:85d5009a07b7/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:functional_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:boosting"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:decision_trees"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:via:msw"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2601.10825">
    <title>[2601.10825] Reasoning Models Generate Societies of Thought</title>
    <dc:date>2026-04-08T16:58:19+00:00</dc:date>
    <link>https://arxiv.org/abs/2601.10825</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Large language models have achieved remarkable capabilities across domains, yet mechanisms underlying sophisticated reasoning remain elusive. Recent reasoning models outperform comparable instruction-tuned models on complex cognitive tasks, attributed to extended computation through longer chains of thought. Here we show that enhanced reasoning emerges not from extended computation alone, but from simulating multi-agent-like interactions -- a society of thought -- which enables diversification and debate among internal cognitive perspectives characterized by distinct personality traits and domain expertise. Through quantitative analysis and mechanistic interpretability methods applied to reasoning traces, we find that reasoning models like DeepSeek-R1 and QwQ-32B exhibit much greater perspective diversity than instruction-tuned models, activating broader conflict between heterogeneous personality- and expertise-related features during reasoning. This multi-agent structure manifests in conversational behaviors, including question-answering, perspective shifts, and the reconciliation of conflicting views, and in socio-emotional roles that characterize sharp back-and-forth conversations, together accounting for the accuracy advantage in reasoning tasks. Controlled reinforcement learning experiments reveal that base models increase conversational behaviors when rewarded solely for reasoning accuracy, and fine-tuning models with conversational scaffolding accelerates reasoning improvement over base models. These findings indicate that the social organization of thought enables effective exploration of solution spaces. We suggest that reasoning models establish a computational parallel to collective intelligence in human groups, where diversity enables superior problem-solving when systematically structured, which suggests new opportunities for agent organization to harness the wisdom of crowds."]]></description>
<dc:subject>to:NB to_read artificial_intelligence large_language_models_(so_called) ensemble_methods evans.james kith_and_kin</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:f2cf63b3474b/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:artificial_intelligence"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:large_language_models_(so_called)"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:evans.james"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:kith_and_kin"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2603.12228">
    <title>[2603.12228] Neural Thickets: Diverse Task Experts Are Dense Around Pretrained Weights</title>
    <dc:date>2026-03-22T04:13:16+00:00</dc:date>
    <link>https://arxiv.org/abs/2603.12228</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Pretraining produces a learned parameter vector that is typically treated as a starting point for further iterative adaptation. In this work, we instead view the outcome of pretraining as a distribution over parameter vectors, whose support already contains task-specific experts. We show that in small models such expert solutions occupy a negligible fraction of the volume of this distribution, making their discovery reliant on structured optimization methods such as gradient descent. In contrast, in large, well-pretrained models the density of task-experts increases dramatically, so that diverse, task-improving specialists populate a substantial fraction of the neighborhood around the pretrained weights. Motivated by this perspective, we explore a simple, fully parallel post-training method that samples N parameter perturbations at random, selects the top K, and ensembles predictions via majority vote. Despite its simplicity, this approach is competitive with standard post-training methods such as PPO, GRPO, and ES for contemporary large-scale models."]]></description>
<dc:subject>to:NB neural_networks large_language_models_(so_called) ensemble_methods</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:a9f750012e1c/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:neural_networks"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:large_language_models_(so_called)"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2602.01011">
    <title>[2602.01011] Multi-Agent Teams Hold Experts Back</title>
    <dc:date>2026-02-12T17:22:39+00:00</dc:date>
    <link>https://arxiv.org/abs/2602.01011</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Multi-agent LLM systems are increasingly deployed as autonomous collaborators, where agents interact freely rather than execute fixed, pre-specified workflows. In such settings, effective coordination cannot be fully designed in advance and must instead emerge through interaction. However, most prior work enforces coordination through fixed roles, workflows, or aggregation rules, leaving open the question of how well self-organizing teams perform when coordination is unconstrained. Drawing on organizational psychology, we study whether self-organizing LLM teams achieve strong synergy, where team performance matches or exceeds the best individual member. Across human-inspired and frontier ML benchmarks, we find that -- unlike human teams -- LLM teams consistently fail to match their expert agent's performance, even when explicitly told who the expert is, incurring performance losses of up to 37.6%. Decomposing this failure, we show that expert leveraging, rather than identification, is the primary bottleneck. Conversational analysis reveals a tendency toward integrative compromise -- averaging expert and non-expert views rather than appropriately weighting expertise -- which increases with team size and correlates negatively with performance. Interestingly, this consensus-seeking behavior improves robustness to adversarial agents, suggesting a trade-off between alignment and effective expertise utilization. Our findings reveal a significant gap in the ability of self-organizing multi-agent teams to harness the collective expertise of their members."]]></description>
<dc:subject>to:NB large_language_models_(so_called) ensemble_methods collective_cognition via:henry_farrell</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:39e92ff5a350/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:large_language_models_(so_called)"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:collective_cognition"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:via:henry_farrell"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2405.06691v3">
    <title>[2405.06691v3] Fleet of Agents: Coordinated Problem Solving with Large Language Models</title>
    <dc:date>2025-09-05T16:01:14+00:00</dc:date>
    <link>https://arxiv.org/abs/2405.06691v3</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["While numerous frameworks have been developed to enhance the reasoning abilities of large language models (LLMs), there is a scarcity of methods that effectively balance the trade-off between cost and quality. In this paper, we introduce Fleet of Agents (FoA), a novel and intuitive yet principled framework utilizing LLMs as agents to navigate through dynamic tree searches, employing a genetic-type particle filtering approach. FoA spawns a multitude of agents, each exploring the search space autonomously, followed by a selection phase where resampling based on a heuristic value function optimizes the balance between exploration and exploitation. This mechanism enables dynamic branching, adapting the exploration strategy based on discovered solutions. We conduct extensive experiments on three benchmark tasks, “Game of 24”, “Mini-Crosswords”, and “WebShop”, utilizing four different LLMs, “GPT-3.5”, “GPT-4”, “LLaMA3.2-11B”, and “LLaMA3.2-90B”. On average across all tasks and LLMs, FoA obtains a quality improvement of ~5% while requiring only ~40% of the cost of previous SOTA methods. Notably, our analyses reveal that (1) FoA achieves the best cost-quality trade-off among all benchmarked methods and (2) FoA + LLaMA3.2-11B surpasses the Llama3.2-90B model."]]></description>
<dc:subject>to:NB ensemble_methods particle_filters collective_cognition large_language_models_(so_called)</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:4492f638f68c/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:particle_filters"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:collective_cognition"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:large_language_models_(so_called)"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://www.cambridge.org/core/journals/philosophy-of-science/article/peirce-in-the-machine-how-mixture-of-experts-models-perform-hypothesis-construction/2C92DF1A6805195170683CC6EC446125?WT.mc_id=New%2520Cambridge%2520Alert%2520-%2520Articles">
    <title>Peirce in the Machine: How Mixture of Experts Models Perform Hypothesis Construction | Philosophy of Science | Cambridge Core</title>
    <dc:date>2025-09-05T15:03:49+00:00</dc:date>
    <link>https://www.cambridge.org/core/journals/philosophy-of-science/article/peirce-in-the-machine-how-mixture-of-experts-models-perform-hypothesis-construction/2C92DF1A6805195170683CC6EC446125?WT.mc_id=New%2520Cambridge%2520Alert%2520-%2520Articles</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Mixture of experts is a prediction aggregation method in machine learning that aggregates the predictions of specialized experts. This method often outperforms Bayesian methods despite the Bayesian having stronger inductive guarantees. We argue that this is due to the greater functional capacity of mixture of experts. We prove that in a limiting case of mixture of experts will have greater capacity than equivalent Bayesian methods, which we vouchsafe through experiments on non-limiting cases. Finally, we conclude that mixture of experts is a type of abductive reasoning in the Peircean sense of hypothesis construction."

--- Last tag for the last sentence of this abstract (which I realize is the point of the paper).]]></description>
<dc:subject>to:NB philosophy_of_science ensemble_methods color_me_skeptical</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:b0495f96c9bf/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:philosophy_of_science"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:color_me_skeptical"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2503.03213">
    <title>[2503.03213] Convergence Rates for Softmax Gating Mixture of Experts</title>
    <dc:date>2025-04-09T14:16:09+00:00</dc:date>
    <link>https://arxiv.org/abs/2503.03213</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Mixture of experts (MoE) has recently emerged as an effective framework to advance the efficiency and scalability of machine learning models by softly dividing complex tasks among multiple specialized sub-models termed experts. Central to the success of MoE is an adaptive softmax gating mechanism which takes responsibility for determining the relevance of each expert to a given input and then dynamically assigning experts their respective weights. Despite its widespread use in practice, a comprehensive study on the effects of the softmax gating on the MoE has been lacking in the literature. To bridge this gap in this paper, we perform a convergence analysis of parameter estimation and expert estimation under the MoE equipped with the standard softmax gating or its variants, including a dense-to-sparse gating and a hierarchical softmax gating, respectively. Furthermore, our theories also provide useful insights into the design of sample-efficient expert structures. In particular, we demonstrate that it requires polynomially many data points to estimate experts satisfying our proposed strong identifiability condition, namely a commonly used two-layer feed-forward network. In stark contrast, estimating linear experts, which violate the strong identifiability condition, necessitates exponentially many data points as a result of intrinsic parameter interactions expressed in the language of partial differential equations. All the theoretical results are substantiated with a rigorous guarantee."]]></description>
<dc:subject>to:NB kith_and_kin rinaldo.alessandro ensemble_methods</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:69ffdb506cef/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:kith_and_kin"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:rinaldo.alessandro"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1503.02531">
    <title>[1503.02531] Distilling the Knowledge in a Neural Network</title>
    <dc:date>2025-03-04T18:58:35+00:00</dc:date>
    <link>https://arxiv.org/abs/1503.02531</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["A very simple way to improve the performance of almost any machine learning algorithm is to train many different models on the same data and then to average their predictions. Unfortunately, making predictions using a whole ensemble of models is cumbersome and may be too computationally expensive to allow deployment to a large number of users, especially if the individual models are large neural nets. Caruana and his collaborators have shown that it is possible to compress the knowledge in an ensemble into a single model which is much easier to deploy and we develop this approach further using a different compression technique. We achieve some surprising results on MNIST and we show that we can significantly improve the acoustic model of a heavily used commercial system by distilling the knowledge in an ensemble of models into a single model. We also introduce a new type of ensemble composed of one or more full models and many specialist models which learn to distinguish fine-grained classes that the full models confuse. Unlike a mixture of experts, these specialist models can be trained rapidly and in parallel."]]></description>
<dc:subject>to:NB ensemble_methods to_read to_teach:data-mining via:rvenkat</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:a950b4f51e90/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:via:rvenkat"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://dl.acm.org/doi/10.1145/1150402.1150464">
    <title>Model compression | Proceedings of the 12th ACM SIGKDD international conference on Knowledge discovery and data mining</title>
    <dc:date>2025-03-04T18:57:34+00:00</dc:date>
    <link>https://dl.acm.org/doi/10.1145/1150402.1150464</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Often the best performing supervised learning models are ensembles of hundreds or thousands of base-level classifiers. Unfortunately, the space required to store this many classifiers, and the time required to execute them at run-time, prohibits their use in applications where test sets are large (e.g. Google), where storage space is at a premium (e.g. PDAs), and where computational power is limited (e.g. hearing aids). We present a method for "compressing" large, complex ensembles into smaller, faster models, usually without significant loss in performance."

--- KDD '06!  WTH didn't I know about this?]]></description>
<dc:subject>to:NB ensemble_methods to_read to_teach:data-mining via:rvenkat</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:5a3d0fd453f6/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:via:rvenkat"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2408.05446">
    <title>[2408.05446] Ensemble everything everywhere: Multi-scale aggregation for adversarial robustness</title>
    <dc:date>2024-08-21T11:55:03+00:00</dc:date>
    <link>https://arxiv.org/abs/2408.05446</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Adversarial examples pose a significant challenge to the robustness, reliability and alignment of deep neural networks. We propose a novel, easy-to-use approach to achieving high-quality representations that lead to adversarial robustness through the use of multi-resolution input representations and dynamic self-ensembling of intermediate layer predictions. We demonstrate that intermediate layer predictions exhibit inherent robustness to adversarial attacks crafted to fool the full classifier, and propose a robust aggregation mechanism based on Vickrey auction that we call CrossMax to dynamically ensemble them. By combining multi-resolution inputs and robust ensembling, we achieve significant adversarial robustness on CIFAR-10 and CIFAR-100 datasets without any adversarial training or extra data, reaching an adversarial accuracy of ≈72% (CIFAR-10) and ≈48% (CIFAR-100) on the RobustBench AutoAttack suite (L∞=8/255) with a finetuned ImageNet-pretrained ResNet152. This represents a result comparable with the top three models on CIFAR-10 and a +5 % gain compared to the best current dedicated approach on CIFAR-100. Adding simple adversarial training on top, we get ≈78% on CIFAR-10 and ≈51% on CIFAR-100, improving SOTA by 5 % and 9 % respectively and seeing greater gains on the harder dataset. We validate our approach through extensive experiments and provide insights into the interplay between adversarial robustness, and the hierarchical nature of deep representations. We show that simple gradient-based attacks against our model lead to human-interpretable images of the target classes as well as interpretable image changes. As a byproduct, using our multi-resolution prior, we turn pre-trained classifiers and CLIP models into controllable image generators and develop successful transferable attacks on large vision language models."]]></description>
<dc:subject>to:NB adversarial_examples ensemble_methods neural_networks to_read</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:538176a719c4/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:adversarial_examples"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:neural_networks"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_read"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2406.11741">
    <title>[2406.11741] Transcendence: Generative Models Can Outperform The Experts That Train Them</title>
    <dc:date>2024-06-24T13:34:00+00:00</dc:date>
    <link>https://arxiv.org/abs/2406.11741</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Generative models are trained with the simple objective of imitating the conditional probability distribution induced by the data they are trained on. Therefore, when trained on data generated by humans, we may not expect the artificial model to outperform the humans on their original objectives. In this work, we study the phenomenon of transcendence: when a generative model achieves capabilities that surpass the abilities of the experts generating its data. We demonstrate transcendence by training an autoregressive transformer to play chess from game transcripts, and show that the trained model can sometimes achieve better performance than all players in the dataset. We theoretically prove that transcendence is enabled by low-temperature sampling, and rigorously assess this experimentally. Finally, we discuss other sources of transcendence, laying the groundwork for future investigation of this phenomenon in a broader setting."]]></description>
<dc:subject>to_read large_language_models_(so_called) ensemble_methods kakade.sham collective_cognition in_NB</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:81c40714a462/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:large_language_models_(so_called)"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:kakade.sham"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:collective_cognition"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:in_NB"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2303.00586">
    <title>[2303.00586] FAIR-Ensemble: When Fairness Naturally Emerges From Deep Ensembling</title>
    <dc:date>2024-03-06T15:18:30+00:00</dc:date>
    <link>https://arxiv.org/abs/2303.00586</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Ensembling multiple Deep Neural Networks (DNNs) is a simple and effective way to improve top-line metrics and to outperform a larger single model. In this work, we go beyond top-line metrics and instead explore the impact of ensembling on subgroup performances. Surprisingly, we observe that even with a simple homogeneous ensemble -- all the individual DNNs share the same training set, architecture, and design choices -- the minority group performance disproportionately improves with the number of models compared to the majority group, i.e. fairness naturally emerges from ensembling. Even more surprising, we find that this gain keeps occurring even when a large number of models is considered, e.g. 20, despite the fact that the average performance of the ensemble plateaus with fewer models. Our work establishes that simple DNN ensembles can be a powerful tool for alleviating disparate impact from DNN classifiers, thus curbing algorithmic harm. We also explore why this is the case. We find that even in homogeneous ensembles, varying the sources of stochasticity through parameter initialization, mini-batch sampling, and data-augmentation realizations, results in different fairness outcomes."]]></description>
<dc:subject>ensemble_methods algorithmic_fairness re:codename:one_law_for_the_lion_and_ox_is_oppression hooker.sara in_NB have_read</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:905e0f62063f/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:algorithmic_fairness"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:re:codename:one_law_for_the_lion_and_ox_is_oppression"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:hooker.sara"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:in_NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:have_read"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2110.11216">
    <title>[2110.11216] User-friendly introduction to PAC-Bayes bounds</title>
    <dc:date>2024-03-02T19:25:18+00:00</dc:date>
    <link>https://arxiv.org/abs/2110.11216</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Aggregated predictors are obtained by making a set of basic predictors vote according to some weights, that is, to some probability distribution.
"Randomized predictors are obtained by sampling in a set of basic predictors, according to some prescribed probability distribution.
"Thus, aggregated and randomized predictors have in common that they are not defined by a minimization problem, but by a probability distribution on the set of predictors. In statistical learning theory, there is a set of tools designed to understand the generalization ability of such procedures: PAC-Bayesian or PAC-Bayes bounds.
"Since the original PAC-Bayes bounds of D. McAllester, these tools have been considerably improved in many directions (we will for example describe a simplified version of the localization technique of O. Catoni that was missed by the community, and later rediscovered as "mutual information bounds"). Very recently, PAC-Bayes bounds received a considerable attention: for example there was workshop on PAC-Bayes at NIPS 2017, "(Almost) 50 Shades of Bayesian Learning: PAC-Bayesian trends and insights", organized by B. Guedj, F. Bach and P. Germain. One of the reason of this recent success is the successful application of these bounds to neural networks by G. Dziugaite and D. Roy.
"An elementary introduction to PAC-Bayes theory is still missing. This is an attempt to provide such an introduction."


--- Published version: [https://doi.org/10.1561/2200000100]]]></description>
<dc:subject>to:NB to_read learning_theory ensemble_methods to_teach:childs_garden_of_statistical_learning_theory alquier.pierre</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:27997bd5aeb9/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:learning_theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:childs_garden_of_statistical_learning_theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:alquier.pierre"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2402.01502">
    <title>[2402.01502] Why do Random Forests Work? Understanding Tree Ensembles as Self-Regularizing Adaptive Smoothers</title>
    <dc:date>2024-02-27T19:59:40+00:00</dc:date>
    <link>https://arxiv.org/abs/2402.01502</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Despite their remarkable effectiveness and broad application, the drivers of success underlying ensembles of trees are still not fully understood. In this paper, we highlight how interpreting tree ensembles as adaptive and self-regularizing smoothers can provide new intuition and deeper insight to this topic. We use this perspective to show that, when studied as smoothers, randomized tree ensembles not only make predictions that are quantifiably more smooth than the predictions of the individual trees they consist of, but also further regulate their smoothness at test-time based on the dissimilarity between testing and training inputs. First, we use this insight to revisit, refine and reconcile two recent explanations of forest success by providing a new way of quantifying the conjectured behaviors of tree ensembles objectively by measuring the effective degree of smoothing they imply. Then, we move beyond existing explanations for the mechanisms by which tree ensembles improve upon individual trees and challenge the popular wisdom that the superior performance of forests should be understood as a consequence of variance reduction alone. We argue that the current high-level dichotomy into bias- and variance-reduction prevalent in statistics is insufficient to understand tree ensembles -- because the prevailing definition of bias does not capture differences in the expressivity of the hypothesis classes formed by trees and forests. Instead, we show that forests can improve upon trees by three distinct mechanisms that are usually implicitly entangled. In particular, we demonstrate that the smoothing effect of ensembling can reduce variance in predictions due to noise in outcome generation, reduce variability in the quality of the learned function given fixed input data and reduce potential bias in learnable functions by enriching the available hypothesis space."]]></description>
<dc:subject>to_read ensemble_methods random_forests decision_trees learning_theory in_NB</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:afc37c93e049/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:random_forests"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:decision_trees"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:learning_theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:in_NB"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2301.11562">
    <title>[2301.11562] Is My Prediction Arbitrary? The Confounding Effects of Variance in Fair Classification Benchmarks</title>
    <dc:date>2023-09-15T19:39:01+00:00</dc:date>
    <link>https://arxiv.org/abs/2301.11562</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Variance in predictions across different trained models is a significant, under-explored source of error in fair classification. In practice, the variance on some data examples is so large that decisions can be effectively arbitrary. To investigate this problem, we take an experimental approach and make four overarching contributions: We 1) Define a metric called self-consistency, derived from variance, which we use as a proxy for measuring and reducing arbitrariness; 2) Develop an ensembling algorithm that abstains from classification when a prediction would be arbitrary; 3) Conduct the largest to-date empirical study of the role of variance (vis-a-vis self-consistency and arbitrariness) in fair classification; and, 4) Release a toolkit that makes the US Home Mortgage Disclosure Act (HMDA) datasets easily usable for future research. Altogether, our experiments reveal shocking insights about the reliability of conclusions on benchmark datasets. Most fairness classification benchmarks are close-to-fair when taking into account the amount of arbitrariness present in predictions -- before we even try to apply common fairness interventions. This finding calls into question the practical utility of common algorithmic fairness methods, and in turn suggests that we should fundamentally reconsider how we choose to measure fairness in machine learning."

--- "Variance" here is defined slightly non-standardly, as E[loss(Y_1, Y_2)] where Y_1 and Y_2 are (distinct) draws from the distribution.  (If loss is squared error, this comes out to twice the usual definition of variance.)  "Self-consistency" is just the probability that two models, bootstrapped from the same data set, give the same classification for a given individual.]]></description>
<dc:subject>algorithmic_fairness via:rvenkat classifiers have_read ensemble_methods uncertainty_for_neural_networks in_NB to_teach:data-mining</dc:subject>
<dc:identifier>https://pinboard.in/u:cshalizi/b:bec8057ed430/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:algorithmic_fairness"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:via:rvenkat"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:classifiers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:have_read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:uncertainty_for_neural_networks"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:in_NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2202.02950">
    <title>[2202.02950] Jury Learning: Integrating Dissenting Voices into Machine Learning Models</title>
    <dc:date>2023-08-10T19:33:54+00:00</dc:date>
    <link>https://arxiv.org/abs/2202.02950</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Whose labels should a machine learning (ML) algorithm learn to emulate? For ML tasks ranging from online comment toxicity to misinformation detection to medical diagnosis, different groups in society may have irreconcilable disagreements about ground truth labels. Supervised ML today resolves these label disagreements implicitly using majority vote, which overrides minority groups' labels. We introduce jury learning, a supervised ML approach that resolves these disagreements explicitly through the metaphor of a jury: defining which people or groups, in what proportion, determine the classifier's prediction. For example, a jury learning model for online toxicity might centrally feature women and Black jurors, who are commonly targets of online harassment. To enable jury learning, we contribute a deep learning architecture that models every annotator in a dataset, samples from annotators' models to populate the jury, then runs inference to classify. Our architecture enables juries that dynamically adapt their composition, explore counterfactuals, and visualize dissent."

--- This sounds like a potentially interesting way of dealing with inter-rater disagreement, if nothing else.
--- The very simple approach to not relying on majority vote would be to see what % of human raters labeled each training item as toxic, and then try to match that, i.e., to do regression limited to [0,1] rather than simply classification.  (This would avoid the unwarranted presupposition, or at least suggestion, that currently-salient identity groups are always homogeneous in their ratings.)  I will be interested to see if they give reasons for not just doing that.
--- The understanding of juries in this abstract is... curious, to say the least.
--- Also, per [https://pinboard.in/u:cshalizi/b:eb483f873534], the % difference in incidence of harassment by gender is actually pretty small, though the _forms_ of harassment are different in perhaps-relevant ways.  Similarly for racial/ethnic disparities, though the statistics are necessarily noisier for minority groups there.
]]></description>
<dc:subject>to:NB to_read ensemble_methods text_mining networked_life social_life_of_the_mind via:henry_farrell</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:dcc0ae80623f/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:text_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:networked_life"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:social_life_of_the_mind"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:via:henry_farrell"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2111.14377">
    <title>[2111.14377] Collective Intelligence for Deep Learning: A Survey of Recent Developments</title>
    <dc:date>2023-03-24T16:57:45+00:00</dc:date>
    <link>https://arxiv.org/abs/2111.14377</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["In the past decade, we have witnessed the rise of deep learning to dominate the field of artificial intelligence. Advances in artificial neural networks alongside corresponding advances in hardware accelerators with large memory capacity, together with the availability of large datasets enabled practitioners to train and deploy sophisticated neural network models that achieve state-of-the-art performance on tasks across several fields spanning computer vision, natural language processing, and reinforcement learning. However, as these neural networks become bigger, more complex, and more widely used, fundamental problems with current deep learning models become more apparent. State-of-the-art deep learning models are known to suffer from issues that range from poor robustness, inability to adapt to novel task settings, to requiring rigid and inflexible configuration assumptions. Collective behavior, commonly observed in nature, tends to produce systems that are robust, adaptable, and have less rigid assumptions about the environment configuration. Collective intelligence, as a field, studies the group intelligence that emerges from the interactions of many individuals. Within this field, ideas such as self-organization, emergent behavior, swarm optimization, and cellular automata were developed to model and explain complex systems. It is therefore natural to see these ideas incorporated into newer deep learning methods. In this review, we will provide a historical context of neural network research's involvement with complex systems, and highlight several active areas in modern deep learning research that incorporate the principles of collective intelligence to advance its current capabilities. We hope this review can serve as a bridge between the complex systems and deep learning communities."]]></description>
<dc:subject>to:NB neural_networks distributed_systems ensemble_methods complexity self-organization cellular_automata to_read</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:5fe97f6b6e1a/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:neural_networks"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:distributed_systems"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:complexity"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:self-organization"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:cellular_automata"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_read"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2207.08815">
    <title>[2207.08815] Why do tree-based models still outperform deep learning on tabular data?</title>
    <dc:date>2022-08-25T16:03:35+00:00</dc:date>
    <link>https://arxiv.org/abs/2207.08815</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["While deep learning has enabled tremendous progress on text and image datasets, its superiority on tabular data is not clear. We contribute extensive benchmarks of standard and novel deep learning methods as well as tree-based models such as XGBoost and Random Forests, across a large number of datasets and hyperparameter combinations. We define a standard set of 45 datasets from varied domains with clear characteristics of tabular data and a benchmarking methodology accounting for both fitting models and finding good hyperparameters. Results show that tree-based models remain state-of-the-art on medium-sized data (∼10K samples) even without accounting for their superior speed. To understand this gap, we conduct an empirical investigation into the differing inductive biases of tree-based models and Neural Networks (NNs). This leads to a series of challenges which should guide researchers aiming to build tabular-specific NNs: 1. be robust to uninformative features, 2. preserve the orientation of the data, and 3. be able to easily learn irregular functions. To stimulate research on tabular architectures, we contribute a standard benchmark and raw data for baselines: every point of a 20 000 compute hours hyperparameter search for each learner."]]></description>
<dc:subject>to:NB to_read your_favorite_deep_neural_network_sucks ensemble_methods decision_trees to_teach:data-mining</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:2b5e2a0c03ab/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:your_favorite_deep_neural_network_sucks"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:decision_trees"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://jmlr.org/papers/v23/20-874.html">
    <title>Model Averaging Is Asymptotically Better Than Model Selection For Prediction</title>
    <dc:date>2022-07-19T13:59:00+00:00</dc:date>
    <link>https://jmlr.org/papers/v23/20-874.html</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["We compare the performance of six model average predictors---Mallows' model averaging, stacking, Bayes model averaging, bagging, random forests, and boosting---to the components used to form them. In all six cases we identify conditions under which the model average predictor is consistent for its intended limit and performs as well or better than any of its components asymptotically. This is well known empirically, especially for complex problems, although theoretical results do not seem to have been formally established. We have focused our attention on the regression context since that is where model averaging techniques differ most often from current practice."

--- Could've sworn I bookmarked this already!]]></description>
<dc:subject>in_NB model_selection ensemble_methods regression to_teach:data-mining to_teach:childs_garden_of_statistical_learning_theory</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:9781f6f1983c/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:in_NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:model_selection"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:regression"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:childs_garden_of_statistical_learning_theory"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2206.04902">
    <title>[2206.04902] Forecasting macroeconomic data with Bayesian VARs: Sparse or dense? It depends!</title>
    <dc:date>2022-06-13T17:40:28+00:00</dc:date>
    <link>https://arxiv.org/abs/2206.04902</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Vectorautogressions (VARs) are widely applied when it comes to modeling and forecasting macroeconomic variables. In high dimensions, however, they are prone to overfitting. Bayesian methods, more concretely shrinking priors, have shown to be successful in improving prediction performance. In the present paper we introduce the recently developed R2-induced Dirichlet-decomposition prior to the VAR framework and compare it to refinements of well-known priors in the VAR literature. We demonstrate the virtues of the proposed prior in an extensive simulation study and in an empirical application forecasting data of the US economy. Further, we shed more light on the ongoing Illusion of Sparsity debate. We find that forecasting performances under sparse/dense priors vary across evaluated economic variables and across time frames; dynamic model averaging, however, can combine the merits of both worlds. All priors are implemented using the reduced-form VAR and all models feature stochastic volatility in the variance-covariance matrix."]]></description>
<dc:subject>to:NB time_series prediction macroeconomics re:your_favorite_dsge_sucks ensemble_methods</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:b6484bcc0127/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:time_series"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:prediction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:macroeconomics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:re:your_favorite_dsge_sucks"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://www.tandfonline.com/doi/full/10.1080/01621459.2021.1974867">
    <title>Bagged Filters for Partially Observed Interacting Systems: Journal of the American Statistical Association: Vol 0, No 0</title>
    <dc:date>2022-06-11T04:57:01+00:00</dc:date>
    <link>https://www.tandfonline.com/doi/full/10.1080/01621459.2021.1974867</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Bagging (i.e., bootstrap aggregating) involves combining an ensemble of bootstrap estimators. We consider bagging for inference from noisy or incomplete measurements on a collection of interacting stochastic dynamic systems. Each system is called a unit, and each unit is associated with a spatial location. A motivating example arises in epidemiology, where each unit is a city: the majority of transmission occurs within a city, with smaller yet epidemiologically important interactions arising from disease transmission between cities. Monte Carlo filtering methods used for inference on nonlinear non-Gaussian systems can suffer from a curse of dimensionality (COD) as the number of units increases. We introduce bagged filter (BF) methodology which combines an ensemble of Monte Carlo filters, using spatiotemporally localized weights to select successful filters at each unit and time. We obtain conditions under which likelihood evaluation using a BF algorithm can beat a COD, and we demonstrate applicability even when these conditions do not hold. BF can out-perform an ensemble Kalman filter on a coupled population dynamics model describing infectious disease transmission. A block particle filter (BPF) also performs well on this task, though the bagged filter respects smoothness and conservation laws that a BPF can violate. "]]></description>
<dc:subject>to:NB time_series state_estimation state-space_models particle_filters ensemble_methods ionides.edward</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:737c28aede3c/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:time_series"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:state_estimation"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:state-space_models"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:particle_filters"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ionides.edward"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://www.jmlr.org/papers/v23/20-874.html">
    <title>Model Averaging Is Asymptotically Better Than Model Selection For Prediction</title>
    <dc:date>2022-03-27T15:53:13+00:00</dc:date>
    <link>https://www.jmlr.org/papers/v23/20-874.html</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["We compare the performance of six model average predictors---Mallows' model averaging, stacking, Bayes model averaging, bagging, random forests, and boosting---to the components used to form them. In all six cases we identify conditions under which the model average predictor is consistent for its intended limit and performs as well or better than any of its components asymptotically. This is well known empirically, especially for complex problems, although theoretical results do not seem to have been formally established. We have focused our attention on the regression context since that is where model averaging techniques differ most often from current practice."

--- Of course I find this weeks after I teach model averaging.]]></description>
<dc:subject>to:NB ensemble_methods regression to_teach:data-mining</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:5b7a2e7ec1aa/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:regression"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2109.00173">
    <title>[2109.00173] FADE: FAir Double Ensemble Learning for Observable and Counterfactual Outcomes</title>
    <dc:date>2021-09-07T05:21:00+00:00</dc:date>
    <link>https://arxiv.org/abs/2109.00173</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Methods for building fair predictors often involve tradeoffs between fairness and accuracy and between different fairness criteria, but the nature of these tradeoffs varies. Recent work seeks to characterize these tradeoffs in specific problem settings, but these methods often do not accommodate users who wish to improve the fairness of an existing benchmark model without sacrificing accuracy, or vice versa. These results are also typically restricted to observable accuracy and fairness criteria. We develop a flexible framework for fair ensemble learning that allows users to efficiently explore the fairness-accuracy space or to improve the fairness or accuracy of a benchmark model. Our framework can simultaneously target multiple observable or counterfactual fairness criteria, and it enables users to combine a large number of previously trained and newly trained predictors. We provide theoretical guarantees that our estimators converge at fast rates. We apply our method on both simulated and real data, with respect to both observable and counterfactual accuracy and fairness criteria. We show that, surprisingly, multiple unfairness measures can sometimes be minimized simultaneously with little impact on accuracy, relative to unconstrained predictors or existing benchmark models."]]></description>
<dc:subject>to:NB prediction statistics ensemble_methods algorithmic_fairness kith_and_kin approved_the_thesis mishler.alan kennedy.edward_h.</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:9f7430fb6211/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:prediction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:algorithmic_fairness"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:kith_and_kin"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:approved_the_thesis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:mishler.alan"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:kennedy.edward_h."/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2107.04208">
    <title>[2107.04208] From Many to One: Consensus Inference in a MIP</title>
    <dc:date>2021-07-12T14:51:51+00:00</dc:date>
    <link>https://arxiv.org/abs/2107.04208</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["A Model Intercomparison Project (MIP) consists of teams who each estimate the same underlying quantity (e.g., temperature projections to the year 2070), and the spread of the estimates indicates their uncertainty. It recognizes that a community of scientists will not agree completely but that there is value in looking for a consensus and information in the range of disagreement. A simple average of the teams' outputs gives a consensus estimate, but it does not recognize that some outputs are more variable than others. Statistical analysis of variance (ANOVA) models offer a way to obtain a weighted consensus estimate of outputs with a variance that is the smallest possible and hence the tightest possible 'one-sigma' and 'two-sigma' intervals. Modulo dependence between MIP outputs, the ANOVA approach weights a team's output inversely proportional to its variation. When external verification data are available for evaluating the fidelity of each MIP output, ANOVA weights can also provide a prior distribution for Bayesian Model Averaging to yield a consensus estimate. We use a MIP of carbon dioxide flux inversions to illustrate the ANOVA-based weighting and subsequent consensus inferences."]]></description>
<dc:subject>to:NB ensemble_methods science_fiction cressie.noel statistics</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:8317ff977650/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:science_fiction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:cressie.noel"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2002.05211">
    <title>[2002.05211] Bagged filters for partially observed interacting systems</title>
    <dc:date>2021-06-28T03:45:08+00:00</dc:date>
    <link>https://arxiv.org/abs/2002.05211</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Bagging (i.e., bootstrap aggregating) involves combining an ensemble of bootstrap estimators. We consider bagging for inference from noisy or incomplete measurements on a collection of interacting stochastic dynamic systems. Each system is called a unit, and each unit is associated with a spatial location. A motivating example arises in epidemiology, where each unit is a city: the majority of transmission occurs within a city, with smaller yet epidemiologically important interactions arising from disease transmission between cities. Monte Carlo filtering methods used for inference on nonlinear non-Gaussian systems can suffer from a curse of dimensionality as the number of units increases. We introduce bagged filter (BF) methodology which combines an ensemble of Monte Carlo filters, using spatiotemporally localized weights to select successful filters at each unit and time. We obtain conditions under which likelihood evaluation using a BF algorithm can beat a curse of dimensionality, and we demonstrate applicability even when these conditions do not hold. BF can out-perform an ensemble Kalman filter on a coupled population dynamics model describing infectious disease transmission. A block particle filter also performs well on this task, though the bagged filter respects smoothness and conservation laws that a block particle filter can violate."]]></description>
<dc:subject>to:NB state-space_models ensemble_methods particle_filters spatio-temporal_statistics statistics</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:0b4c5b2a7b88/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:state-space_models"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:particle_filters"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:spatio-temporal_statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2106.05918">
    <title>[2106.05918] Bias, Consistency, and Alternative Perspectives of the Infinitesimal Jackknife</title>
    <dc:date>2021-06-24T20:42:42+00:00</dc:date>
    <link>https://arxiv.org/abs/2106.05918</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Though introduced nearly 50 years ago, the infinitesimal jackknife (IJ) remains a popular modern tool for quantifying predictive uncertainty in complex estimation settings. In particular, when supervised learning ensembles are constructed via bootstrap samples, recent work demonstrated that the IJ estimate of variance is particularly convenient and useful. However, despite the algebraic simplicity of its final form, its derivation is rather complex. As a result, studies clarifying the intuition behind the estimator or rigorously investigating its properties have been severely lacking. This work aims to take a step forward on both fronts. We demonstrate that surprisingly, the exact form of the IJ estimator can be obtained via a straightforward linear regression of the individual bootstrap estimates on their respective weights or via the classical jackknife. The latter realization is particularly useful as it allows us to formally investigate the bias of the IJ variance estimator and better characterize the settings in which its use is appropriate. Finally, we extend these results to the case of U-statistics where base models are constructed via subsampling rather than bootstrapping and provide a consistent estimate of the resulting variance."]]></description>
<dc:subject>to:NB bootstrap confidence_sets ensemble_methods statistics</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:a62e1cc12b78/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:bootstrap"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:confidence_sets"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2008.07063">
    <title>[2008.07063] To Bag is to Prune</title>
    <dc:date>2021-06-10T02:10:01+00:00</dc:date>
    <link>https://arxiv.org/abs/2008.07063</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["It is notoriously difficult to build a bad Random Forest (RF). Concurrently, RF blatantly overfits in-sample without any apparent consequence out-of-sample. Standard arguments, like the classic bias-variance trade-off or double descent, cannot rationalize this paradox. I propose a new explanation: bootstrap aggregation and model perturbation as implemented by RF automatically prune a latent "true" tree. More generally, randomized ensembles of greedily optimized learners implicitly perform optimal early stopping out-of-sample. So there is no need to tune the stopping point. By construction, novel variants of Boosting and MARS are also eligible for automatic tuning. I empirically demonstrate the property, with simulated and real data, by reporting that these new completely overfitting ensembles perform similarly to their tuned counterparts -- or better."]]></description>
<dc:subject>to:NB learning_theory ensemble_methods random_forests statistics</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:4e7116d33251/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:learning_theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:random_forests"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2106.02589">
    <title>[2106.02589] On Ensembling vs Merging: Least Squares and Random Forests under Covariate Shift</title>
    <dc:date>2021-06-08T13:55:10+00:00</dc:date>
    <link>https://arxiv.org/abs/2106.02589</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["It has been postulated and observed in practice that for prediction problems in which covariate data can be naturally partitioned into clusters, ensembling algorithms based on suitably aggregating models trained on individual clusters often perform substantially better than methods that ignore the clustering structure in the data. In this paper, we provide theoretical support to these empirical observations by asymptotically analyzing linear least squares and random forest regressions under a linear model. Our main results demonstrate that the benefit of ensembling compared to training a single model on the entire data, often termed 'merging', might depend on the underlying bias and variance interplay of the individual predictors to be aggregated. In particular, under both fixed and high dimensional linear models, we show that merging is asymptotically superior to optimal ensembling techniques for linear least squares regression due to the unbiased nature of least squares prediction. In contrast, for random forest regression under fixed dimensional linear models, our bounds imply a strict benefit of ensembling over merging. Finally, we also present numerical experiments to verify the validity of our asymptotic results across different situations."]]></description>
<dc:subject>to:NB ensemble_methods prediction clustering statistics</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:c784d0848e5a/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:prediction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:clustering"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2106.02803">
    <title>[2106.02803] Network Estimation by Mixing: Adaptivity and More</title>
    <dc:date>2021-06-08T13:54:31+00:00</dc:date>
    <link>https://arxiv.org/abs/2106.02803</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Networks analysis has been commonly used to study the interactions between units of complex systems. One problem of particular interest is learning the network's underlying connection pattern given a single and noisy instantiation. While many methods have been proposed to address this problem in recent years, they usually assume that the true model belongs to a known class, which is not verifiable in most real-world applications. Consequently, network modeling based on these methods either suffers from model misspecification or relies on additional model selection procedures that are not well understood in theory and can potentially be unstable in practice. To address this difficulty, we propose a mixing strategy that leverages available arbitrary models to improve their individual performances. The proposed method is computationally efficient and almost tuning-free; thus, it can be used as an off-the-shelf method for network modeling. We show that the proposed method performs equally well as the oracle estimate when the true model is included as individual candidates. More importantly, the method remains robust and outperforms all current estimates even when the models are misspecified. Extensive simulation examples are used to verify the advantage of the proposed mixing method. Evaluation of link prediction performance on 385 real-world networks from six domains also demonstrates the universal competitiveness of the mixing method across multiple domains."

]]></description>
<dc:subject>to:NB network_data_analysis ensemble_methods to_read statistics</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:06ee46836e84/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:network_data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2106.01092">
    <title>[2106.01092] Statistical optimality conditions for compressive ensembles</title>
    <dc:date>2021-06-07T03:51:56+00:00</dc:date>
    <link>https://arxiv.org/abs/2106.01092</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["We present a framework for the theoretical analysis of ensembles of low-complexity empirical risk minimisers trained on independent random compressions of high-dimensional data. First we introduce a general distribution-dependent upper-bound on the excess risk, framed in terms of a natural notion of compressibility. This bound is independent of the dimension of the original data representation, and explains the in-built regularisation effect of the compressive approach. We then instantiate this general bound to classification and regression tasks, considering Johnson-Lindenstrauss mappings as the compression scheme. For each of these tasks, our strategy is to develop a tight upper bound on the compressibility function, and by doing so we discover distributional conditions of geometric nature under which the compressive algorithm attains minimax-optimal rates up to at most poly-logarithmic factors. In the case of compressive classification, this is achieved with a mild geometric margin condition along with a flexible moment condition that is significantly more general than the assumption of bounded domain. In the case of regression with strongly convex smooth loss functions we find that compressive regression is capable of exploiting spectral decay with near-optimal guarantees. In addition, a key ingredient for our central upper bound is a high probability uniform upper bound on the integrated deviation of dependent empirical processes, which may be of independent interest."]]></description>
<dc:subject>to:NB learning_theory random_projections ensemble_methods statistics</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:18adca924829/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:learning_theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:random_projections"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2005.14458">
    <title>[2005.14458] Distributional Random Forests: Heterogeneity Adjustment and Multivariate Distributional Regression</title>
    <dc:date>2021-06-01T17:33:18+00:00</dc:date>
    <link>https://arxiv.org/abs/2005.14458</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Random Forests (Breiman, 2001) is a successful and widely used regression and classification algorithm. Part of its appeal and reason for its versatility is its (implicit) construction of a kernel-type weighting function on training data, which can also be used for targets other than the original mean estimation. We propose a novel forest construction for multivariate responses based on their joint conditional distribution, independent of the estimation target and the data model. It uses a new splitting criterion based on the MMD distributional metric, which is suitable for detecting heterogeneity in multivariate distributions. The induced weights define an estimate of the full conditional distribution, which in turn can be used for arbitrary and potentially complicated targets of interest. The method is very versatile and convenient to use, as we illustrate on a wide range of examples. The code is available as Python and R packages drf."]]></description>
<dc:subject>to:NB density_estimation random_fields buhlmann.peter ensemble_methods statistics</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:35774f99f672/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:density_estimation"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:random_fields"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:buhlmann.peter"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2006.08855">
    <title>[2006.08855] RaSE: Random Subspace Ensemble Classification</title>
    <dc:date>2021-06-01T13:35:35+00:00</dc:date>
    <link>https://arxiv.org/abs/2006.08855</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["We propose a flexible ensemble classification framework, Random Subspace Ensemble (RaSE), for sparse classification. In the RaSE algorithm, we aggregate many weak learners, where each weak learner is a base classifier trained in a subspace optimally selected from a collection of random subspaces. To conduct subspace selection, we propose a new criterion, ratio information criterion (RIC), based on weighted Kullback-Leibler divergence. The theoretical analysis includes the risk and Monte-Carlo variance of the RaSE classifier, establishing the screening consistency and weak consistency of RIC, and providing an upper bound for the misclassification rate of the RaSE classifier. In addition, we show that in a high-dimensional framework, the number of random subspaces needs to be very large to guarantee that a subspace covering signals is selected. Therefore, we propose an iterative version of the RaSE algorithm and prove that under some specific conditions, a smaller number of generated random subspaces are needed to find a desirable subspace through iteration. An array of simulations under various models and real-data applications demonstrate the effectiveness and robustness of the RaSE classifier and its iterative version in terms of low misclassification rate and accurate feature ranking. The RaSE algorithm is implemented in the R package RaSEn on CRAN."]]></description>
<dc:subject>to:NB ensemble_methods random_features statistics</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:d7e2672c3097/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:random_features"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2101.11083">
    <title>[2101.11083] Tree boosting for learning probability measures</title>
    <dc:date>2021-05-30T21:09:35+00:00</dc:date>
    <link>https://arxiv.org/abs/2101.11083</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Learning probability measures based on an i.i.d. sample is a fundamental inference task, but is challenging when the sample space is high-dimensional. Inspired by the success of tree boosting in high-dimensional classification and regression, we propose a tree boosting method for learning high-dimensional probability distributions. We formulate concepts of "addition" and "residuals" on probability distributions in terms of compositions of a new, more general notion of multivariate cumulative distribution functions (CDFs) than classical CDFs. This then gives rise to a simple boosting algorithm based on forward-stagewise (FS) fitting of an additive ensemble of measures, which sequentially minimizes the entropy loss. The output of the FS algorithm allows analytic computation of the probability density function for the fitted distribution. It also provides an exact simulator for drawing independent Monte Carlo samples from the fitted measure. Typical considerations in applying boosting--namely choosing the number of trees, setting the appropriate level of shrinkage/regularization in the weak learner, and the evaluation of variable importance--can all be accomplished in an analogous fashion to traditional boosting in supervised learning. Numerical experiments confirm that boosting can substantially improve the fit to multivariate distributions compared to the state-of-the-art single-tree learner and is computationally efficient. We illustrate through an application to a data set from mass cytometry how the simulator can be used to investigate various aspects of the underlying distribution."]]></description>
<dc:subject>to:NB ensemble_methods density_estimation statistics</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:1d7502cf7e69/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:density_estimation"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2009.09036">
    <title>[2009.09036] Causal Rule Ensemble: Interpretable Inference of Heterogeneous Treatment Effects</title>
    <dc:date>2021-05-30T20:43:38+00:00</dc:date>
    <link>https://arxiv.org/abs/2009.09036</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["In social and health sciences, it is critically important to identify subgroups of the study population where a treatment has a notably larger or smaller causal effect compared to the population average. In recent years, there have been many methodological developments for addressing heterogeneity of causal effects. A common approach is to estimate the conditional average treatment effect (CATE) given a pre-specified set of covariates. However, this approach does not allow to discover new subgroups. Recent causal machine learning (ML) approaches estimate the CATE at an individual level in presence of large number of observations and covariates with great accuracy. Nevertheless, the bulk of these ML approaches do not provide an interpretable characterization of the heterogeneous subgroups. In this paper, we propose a new Causal Rule Ensemble (CRE) method that: 1) discovers de novo subgroups with significantly heterogeneous treatment effects (causal rules); 2) ensures interpretability of these subgroups because they are defined in terms of decision rules; and 3) estimates the CATE for each of these newly discovered subgroups with small bias and high statistical precision. We provide theoretical results that guarantee consistency of the estimated causal effects for the newly discovered causal rules. A nice feature of CRE is that it is agnostic to the choices of the ML algorithms that can be used to discover the causal rules, and the estimation methods for the causal effects within the discovered causal rules. Via simulations, we show that the CRE method has competitive performance as compared to existing approaches while providing enhanced interpretability. We also introduce a new sensitivity analysis to unmeasured confounding bias. We apply the CRE method to discover subgroups that are more vulnerable to the causal effects of long-term exposure to air pollution on mortality."

--- From the abstract, I am willing to bet we are re-discovering the "classifier systems" of John Holland (of blessed memory).]]></description>
<dc:subject>to:NB causal_inference causal_discovery ensemble_methods statistics</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:43a67140d232/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:causal_inference"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:causal_discovery"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://doi.org/10.1111/rssb.12425">
    <title>AMF: Aggregated Mondrian forests for online learning - Mourtada - - Journal of the Royal Statistical Society: Series B (Statistical Methodology) - Wiley Online Library</title>
    <dc:date>2021-05-20T13:53:05+00:00</dc:date>
    <link>https://doi.org/10.1111/rssb.12425</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Random forest (RF) is one of the algorithms of choice in many supervised learning applications, be it classification or regression. The appeal of such tree-ensemble methods comes from a combination of several characteristics: a remarkable accuracy in a variety of tasks, a small number of parameters to tune, robustness with respect to features scaling, a reasonable computational cost for training and prediction, and their suitability in high-dimensional settings. The most commonly used RF variants, however, are ‘offline’ algorithms, which require the availability of the whole dataset at once. In this paper, we introduce AMF, an online RF algorithm based on Mondrian Forests. Using a variant of the context tree weighting algorithm, we show that it is possible to efficiently perform an exact aggregation over all prunings of the trees; in particular, this enables to obtain a truly online parameter-free algorithm which is competitive with the optimal pruning of the Mondrian tree, and thus adaptive to the unknown regularity of the regression function. Numerical experiments show that AMF is competitive with respect to several strong baselines on a large number of datasets for multi-class classification."]]></description>
<dc:subject>to:NB to_read ensemble_methods random_forests regression classifiers to_teach:data-mining online_learning statistics</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:42bd56d40bd2/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:random_forests"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:regression"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:classifiers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:online_learning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2105.04134">
    <title>[2105.04134] Bagging cross-validated bandwidth selection in nonparametric regression estimation with applications to large-sized samples</title>
    <dc:date>2021-05-12T18:15:40+00:00</dc:date>
    <link>https://arxiv.org/abs/2105.04134</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Cross-validation is a well-known and widely used bandwidth selection method in nonparametric regression estimation. However, this technique has two remarkable drawbacks: (i) the large variability of the selected bandwidths, and (ii) the inability to provide results in a reasonable time for very large sample sizes. To overcome these problems, bagging cross-validation bandwidths are analyzed in this paper. This approach consists in computing the cross-validation bandwidths for a finite number of subsamples and then rescaling the averaged smoothing parameters to the original sample size. Under a random-design regression model, asymptotic expressions up to a second-order for the bias and variance of the leave-one-out cross-validation bandwidth for the Nadaraya--Watson estimator are obtained. Subsequently, the asymptotic bias and variance and the limit distribution are derived for the bagged cross-validation selector. Suitable choices of the number of subsamples and the subsample size lead to an $n^{-1/2}$ rate for the convergence in distribution of the bagging cross-validation selector, outperforming the rate $n^{-3/10}$ of leave-one-out cross-validation. Several simulations and an illustration on a real dataset related to the COVID-19 pandemic show the behavior of our proposal and its better performance, in terms of statistical efficiency and computing time, when compared to leave-one-out cross-validation."]]></description>
<dc:subject>to:NB cross-validation ensemble_methods bootstrap to_teach:data-mining kernel_smoothing</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:f2a3494ec6a3/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:cross-validation"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:bootstrap"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:kernel_smoothing"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2105.02569">
    <title>[2105.02569] Machine Collaboration</title>
    <dc:date>2021-05-10T22:48:51+00:00</dc:date>
    <link>https://arxiv.org/abs/2105.02569</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["We propose a new ensemble framework for supervised learning, named machine collaboration (MaC), based on a collection of base machines for prediction tasks. Different from bagging/stacking (a parallel & independent framework) and boosting (a sequential & top-down framework), MaC is a type of circular & interactive learning framework. The circular & interactive feature helps the base machines to transfer information circularly and update their own structures and parameters accordingly. The theoretical result on the risk bound of the estimator based on MaC shows that circular & interactive feature can help MaC reduce the risk via a parsimonious ensemble. We conduct extensive experiments on simulated data and 119 benchmark real data sets. The results of the experiments show that in most cases, MaC performs much better than several state-of-the-art methods, including CART, neural network, stacking, and boosting."]]></description>
<dc:subject>to:NB ensemble_methods prediction re:democratic_cognition to_read</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:d3a50065611d/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:prediction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:re:democratic_cognition"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_read"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://www.cambridge.org/core/journals/econometric-theory/article/abs/new-study-on-asymptotic-optimality-of-least-squares-model-averaging/5252EEB71F41DCF4B613C02F1440D4A1">
    <title>A NEW STUDY ON ASYMPTOTIC OPTIMALITY OF LEAST SQUARES MODEL AVERAGING | Econometric Theory | Cambridge Core</title>
    <dc:date>2021-04-21T16:08:41+00:00</dc:date>
    <link>https://www.cambridge.org/core/journals/econometric-theory/article/abs/new-study-on-asymptotic-optimality-of-least-squares-model-averaging/5252EEB71F41DCF4B613C02F1440D4A1</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["In this article, we present a comprehensive study of asymptotic optimality of least squares model averaging methods. The concept of asymptotic optimality is that in a large-sample sense, the method results in the model averaging estimator with the smallest possible prediction loss among all such estimators. In the literature, asymptotic optimality is usually proved under specific weights restriction or using hardly interpretable assumptions. This article provides a new approach to proving asymptotic optimality, in which a general weight set is adopted, and some easily interpretable assumptions are imposed. In particular, we do not impose any assumptions on the maximum selection risk and allow a larger number of regressors than that of existing studies."]]></description>
<dc:subject>to:NB model_averaging ensemble_methods</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:9cbc6fcccb0c/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:model_averaging"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2103.05766">
    <title>[2103.05766] Interpretable Machines: Constructing Valid Prediction Intervals with Random Forests</title>
    <dc:date>2021-03-21T18:40:32+00:00</dc:date>
    <link>https://arxiv.org/abs/2103.05766</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["An important issue when using Machine Learning algorithms in recent research is the lack of interpretability. Although these algorithms provide accurate point predictions for various learning problems, uncertainty estimates connected with point predictions are rather sparse. A contribution to this gap for the Random Forest Regression Learner is presented here. Based on its Out-of-Bag procedure, several parametric and non-parametric prediction intervals are provided for Random Forest point predictions and theoretical guarantees for its correct coverage probability is delivered. In a second part, a thorough investigation through Monte-Carlo simulation is conducted evaluating the performance of the proposed methods from three aspects: (i) Analyzing the correct coverage rate of the proposed prediction intervals, (ii) Inspecting interval width and (iii) Verifying the competitiveness of the proposed intervals with existing methods. The simulation yields that the proposed prediction intervals are robust towards non-normal residual distributions and are competitive by providing correct coverage rates and comparably narrow interval lengths, even for comparably small samples."]]></description>
<dc:subject>to:NB confidence_sets prediction statistics random_forests ensemble_methods data_mining to_teach:data-mining</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:fdba38e0badc/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:confidence_sets"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:prediction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:random_forests"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://journals.aom.org/doi/10.5465/ambpp.2015.15192abstract">
    <title>Distilling the Wisdom of Crowds: Prediction Markets versus Prediction Polls | Academy of Management Proceedings</title>
    <dc:date>2021-03-01T07:53:19+00:00</dc:date>
    <link>https://journals.aom.org/doi/10.5465/ambpp.2015.15192abstract</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Crowd prediction methods offer the promise to collect valuable, widely dispersed information in organizations. To the extent that information is a source of power, crowdsourcing democratizes organizational governance. We report the results of the first large-scale, long-term, experimental test of crowd prediction methods. More than 2,400 participants made forecasts on 261 world events over two forecasting seasons, each lasting more than 9 months. Forecasters in prediction markets made trades about future events in a continuous double auction. Those in prediction polls submitted explicit probability judgments, independently or in teams. Probability values were aggregated statistically. In Study 1, which used full random assignment, prediction markets were more accurate than the unweighted mean of forecasts from prediction polls. However, team prediction polls aggregated with algorithms featuring decay, weighting and recalibration outperformed prediction markets by 12% in terms of Brier score. This pattern persisted in Study 2, and was stable across scoring rules. Prediction polls’ advantage was largest at the start of long-duration questions. Prediction polls with proper scoring, algorithmic aggregation and teaming offer an attractive method for distilling crowd wisdom."]]></description>
<dc:subject>to:NB collective_cognition ensemble_methods re:democratic_cognition via:henry_farrell</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:b270f873ed06/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:collective_cognition"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:re:democratic_cognition"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:via:henry_farrell"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2101.11190">
    <title>[2101.11190] Boost-S: Gradient Boosted Trees for Spatial Data and Its Application to FDG-PET Imaging Data</title>
    <dc:date>2021-02-05T20:12:15+00:00</dc:date>
    <link>https://arxiv.org/abs/2101.11190</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Boosting Trees are one of the most successful statistical learning approaches that involve sequentially growing an ensemble of simple regression trees (i.e., "weak learners"). However, gradient boosted trees are not yet available for spatially correlated data. This paper proposes a new gradient Boosted Trees algorithm for Spatial Data (Boost-S) with covariate information. Boost-S integrates the spatial correlation structure into the classical framework of gradient boosted trees. Each tree is grown by solving a regularized optimization problem, where the objective function involves two penalty terms on tree complexity and takes into account the underlying spatial correlation. A computationally-efficient algorithm is proposed to obtain the ensemble trees. The proposed Boost-S is applied to the spatially-correlated FDG-PET (fluorodeoxyglucose-positron emission tomography) imaging data collected during cancer chemoradiotherapy. Our numerical investigations successfully demonstrate the advantages of the proposed Boost-S over existing approaches for this particular application."]]></description>
<dc:subject>to:NB ensemble_methods boosting decision_trees spatial_statistics</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:9fdb3ea321fd/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:boosting"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:decision_trees"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:spatial_statistics"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://www.tandfonline.com/doi/full/10.1080/01621459.2020.1851696">
    <title>On Constraining Projections of Future Climate Using Observations and Simulations From Multiple Climate Models: Journal of the American Statistical Association: Vol 0, No 0</title>
    <dc:date>2021-01-27T15:05:40+00:00</dc:date>
    <link>https://www.tandfonline.com/doi/full/10.1080/01621459.2020.1851696</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Numerical climate models are used to project future climate change due to both anthropogenic and natural causes. Differences between projections from different climate models are a major source of uncertainty about future climate. Emergent relationships shared by multiple climate models have the potential to constrain our uncertainty when combined with historical observations. We combine projections from 13 climate models with observational data to quantify the impact of emergent relationships on projections of future warming in the Arctic at the end of the 21st century. We propose a hierarchical Bayesian framework based on a coexchangeable representation of the relationship between climate models and the Earth system. We show how emergent constraints fit into the coexchangeable representation, and extend it to account for internal variability simulated by the models and natural variability in the Earth system. Our analysis shows that projected warming in some regions of the Arctic may be more than 2 °C lower and our uncertainty reduced by up to 30% when constrained by historical observations. A detailed theoretical comparison with existing multi-model projection frameworks is also provided. In particular, we show that projections may be biased if we do not account for internal variability in climate model predictions."]]></description>
<dc:subject>to:NB climatology climate_change ensemble_methods prediction model_checking statistics</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:390646258391/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:climatology"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:climate_change"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:prediction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:model_checking"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2012.14563">
    <title>[2012.14563] Random Planted Forest: a directly interpretable tree ensemble</title>
    <dc:date>2021-01-03T20:06:28+00:00</dc:date>
    <link>https://arxiv.org/abs/2012.14563</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["We introduce a novel interpretable and tree-based algorithm for prediction in a regression setting in which each tree in a classical random forest is replaced by a family of planted trees that grow simultaneously. The motivation for our algorithm is to estimate the unknown regression function from a functional ANOVA decomposition perspective, where each tree corresponds to a function within that decomposition. Therefore, planted trees are limited in the number of interaction terms. The maximal order of approximation in the ANOVA decomposition can be specified or left unlimited. If a first order approximation is chosen, the result is an additive model. In the other extreme case, if the order of approximation is not limited, the resulting model puts no restrictions on the form of the regression function. In a simulation study we find encouraging prediction and visualisation properties of our random planted forest method. We also develop theory for an idealised version of random planted forests in the case of an underlying additive model. We show that in the additive case, the idealised version achieves up to a logarithmic factor asymptotically optimal one-dimensional convergence rates of order n^{-2/5}."]]></description>
<dc:subject>to:NB regression nonparametrics ensemble_methods decision_trees to_teach:data-mining</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:f6e4ba73e30f/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:regression"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:nonparametrics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:decision_trees"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://www.tandfonline.com/doi/full/10.1080/10618600.2020.1853548">
    <title>Kriging Riemannian Data via Random Domain Decompositions: Journal of Computational and Graphical Statistics: Vol 0, No 0</title>
    <dc:date>2021-01-03T19:47:56+00:00</dc:date>
    <link>https://www.tandfonline.com/doi/full/10.1080/10618600.2020.1853548</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Data taking value on a Riemannian manifold and observed over a complex spatial domain are becoming more frequent in applications, for example, in environmental sciences and in geoscience. The analysis of these data needs to rely on local models to account for the nonstationarity of the generating random process, the nonlinearity of the manifold, and the complex topology of the domain. In this article, we propose to use a random domain decomposition approach to estimate an ensemble of local models and then to aggregate the predictions of the local models through Fréchet averaging. The algorithm is introduced in complete generality and is valid for data belonging to any smooth Riemannian manifold but it is then described in details for the case of the manifold of positive definite matrices, the hypersphere and the Cholesky manifold. The predictive performances of the method are explored via simulation studies for covariance matrices and correlation matrices, where the Cholesky manifold geometry is used. Finally, the method is illustrated on an environmental dataset observed over the Chesapeake Bay (USA)."]]></description>
<dc:subject>to:NB statistics_on_manifolds spatial_statistics ensemble_methods</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:821c74f708d0/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics_on_manifolds"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:spatial_statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2012.12802">
    <title>[2012.12802] Machine Learning Advances for Time Series Forecasting</title>
    <dc:date>2020-12-24T15:33:23+00:00</dc:date>
    <link>https://arxiv.org/abs/2012.12802</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["In this paper we survey the most recent advances in supervised machine learning and high-dimensional models for time series forecasting. We consider both linear and nonlinear alternatives. Among the linear methods we pay special attention to penalized regressions and ensemble of models. The nonlinear methods considered in the paper include shallow and deep neural networks, in their feed-forward and recurrent versions, and tree-based methods, such as random forests and boosted trees. We also consider ensemble and hybrid models by combining ingredients from different alternatives. Tests for superior predictive ability are briefly reviewed. Finally, we discuss application of machine learning in economics and finance and provide an illustration with high-frequency financial data."]]></description>
<dc:subject>to:NB time_series prediction data_mining decision_trees random_forests neural_networks ensemble_methods to_teach:data_over_space_and_time</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:30ad1b56b9ba/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:time_series"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:prediction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:decision_trees"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:random_forests"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:neural_networks"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data_over_space_and_time"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2012.11649">
    <title>[2012.11649] On the Aggregation of Probability Assessments: Regularized Mixtures of Predictive Densities for Eurozone Inflation and Real Interest Rates</title>
    <dc:date>2020-12-23T02:22:20+00:00</dc:date>
    <link>https://arxiv.org/abs/2012.11649</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["We propose methods for constructing regularized mixtures of density forecasts. We explore a variety of objectives and regularization penalties, and we use them in a substantive exploration of Eurozone inflation and real interest rate density forecasts. All individual inflation forecasters (even the ex post best forecaster) are outperformed by our regularized mixtures. The log scores of the Simplex and Best-Average mixtures, for example, are approximately 7% better than that of the ex post best individual forecaster, and 15% better than that of the median forecaster. From the Great Recession onward, the optimal regularization tends to move density forecasts' probability mass from the centers to the tails, correcting for overconfidence."]]></description>
<dc:subject>to:NB prediction ensemble_methods</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:78ce8bc230bf/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:prediction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1902.03999">
    <title>[1902.03999] KTBoost: Combined Kernel and Tree Boosting</title>
    <dc:date>2020-11-19T19:46:20+00:00</dc:date>
    <link>https://arxiv.org/abs/1902.03999</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["We introduce a novel boosting algorithm called `KTBoost' which combines kernel boosting and tree boosting. In each boosting iteration, the algorithm adds either a regression tree or reproducing kernel Hilbert space (RKHS) regression function to the ensemble of base learners. Intuitively, the idea is that discontinuous trees and continuous RKHS regression functions complement each other, and that this combination allows for better learning of functions that have parts with varying degrees of regularity such as discontinuities and smooth parts. We empirically show that KTBoost significantly outperforms both tree and kernel boosting in terms of predictive accuracy in a comparison on a wide array of data sets."]]></description>
<dc:subject>to:NB ensemble_methods boosting kernel_methods decision_trees to_teach:data-mining</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:8dff463b6f9b/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:boosting"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:kernel_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:decision_trees"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2011.07476">
    <title>[2011.07476] Right Decisions from Wrong Predictions: A Mechanism Design Alternative to Individual Calibration</title>
    <dc:date>2020-11-18T17:19:44+00:00</dc:date>
    <link>https://arxiv.org/abs/2011.07476</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Decision makers often need to rely on imperfect probabilistic forecasts. While average performance metrics are typically available, it is difficult to assess the quality of individual forecasts and the corresponding utilities. To convey confidence about individual predictions to decision-makers, we propose a compensation mechanism ensuring that the forecasted utility matches the actually accrued utility. While a naive scheme to compensate decision-makers for prediction errors can be exploited and might not be sustainable in the long run, we propose a mechanism based on fair bets and online learning that provably cannot be exploited. We demonstrate an application showing how passengers could confidently optimize individual travel plans based on flight delay probabilities estimated by an airline."]]></description>
<dc:subject>to:NB prediction ensemble_methods mechanism_design</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:74be020d03a3/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:prediction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:mechanism_design"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://covid19-projections.com/about/">
    <title>COVID-19 Projections Using Machine Learning | We take a data-driven approach rooted in epidemiology to forecast infections, deaths, and recovery timelines of the COVID-19 / coronavirus epidemic in the US and around the world</title>
    <dc:date>2020-04-30T00:17:40+00:00</dc:date>
    <link>https://covid19-projections.com/about/</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA[It seems like they're (1) doing a brute-force search over parameter combinations, (2) evaluating them by forward-looking CV (it sounds like it's close to accumulated prediction error), and (3) doing an ensemble forecast which averages over parameter values, with weights that go down as the CV errors go up.]]></description>
<dc:subject>prediction epidemic_models coronavirus_pandemic_of_2019-- ensemble_methods to_teach:data_over_space_and_time via:carl_bergstrom</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:29b9c6fe79d5/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:prediction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:epidemic_models"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:coronavirus_pandemic_of_2019--"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data_over_space_and_time"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:via:carl_bergstrom"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://www.jstatsoft.org/article/view/v054i02">
    <title>adabag: An R Package for Classification with Boosting and Bagging | Alfaro | Journal of Statistical Software</title>
    <dc:date>2019-12-01T15:47:34+00:00</dc:date>
    <link>https://www.jstatsoft.org/article/view/v054i02</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Boosting and bagging are two widely used ensemble methods for classification. Their common goal is to improve the accuracy of a classifier combining single classifiers which are slightly better than random guessing. Among the family of boosting algorithms, AdaBoost (adaptive boosting) is the best known, although it is suitable only for dichotomous tasks. AdaBoost.M1 and SAMME (stagewise additive modeling using a multi-class exponential loss function) are two easy and natural extensions to the general case of two or more classes. In this paper, the adabag R package is introduced. This version implements AdaBoost.M1, SAMME and bagging algorithms with classification trees as base classifiers. Once the ensembles have been trained, they can be used to predict the class of new samples. The accuracy of these classifiers can be estimated in a separated data set or through cross validation. Moreover, the evolution of the error as the ensemble grows can be analysed and the ensemble can be pruned. In addition, the margin in the class prediction and the probability of each class for the observations can be calculated. Finally, several classic examples in classification literature are shown to illustrate the use of this package."]]></description>
<dc:subject>to:NB boosting bagging ensemble_methods classifiers decision_trees R to_teach:data-mining</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:af00024c2969/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:boosting"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:bagging"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:classifiers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:decision_trees"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:R"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://link.springer.com/article/10.1023%2FA%3A1010933404324">
    <title>Random Forests | SpringerLink</title>
    <dc:date>2019-11-25T15:59:39+00:00</dc:date>
    <link>https://link.springer.com/article/10.1023%2FA%3A1010933404324</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Random forests are a combination of tree predictors such that each tree depends on the values of a random vector sampled independently and with the same distribution for all trees in the forest. The generalization error for forests converges a.s. to a limit as the number of trees in the forest becomes large. The generalization error of a forest of tree classifiers depends on the strength of the individual trees in the forest and the correlation between them. Using a random selection of features to split each node yields error rates that compare favorably to Adaboost (Y. Freund & R. Schapire, Machine Learning: Proceedings of the Thirteenth International conference, ***, 148–156), but are more robust with respect to noise. Internal estimates monitor error, strength, and correlation and these are used to show the response to increasing the number of features used in the splitting. Internal estimates are also used to measure variable importance. These ideas are also applicable to regression."]]></description>
<dc:subject>have_read breiman.leo ensemble_methods decision_trees random_forests to_teach:data-mining machine_learning statistics prediction in_NB</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:11aee8d7c62c/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:have_read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:breiman.leo"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:decision_trees"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:random_forests"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:machine_learning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:prediction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:in_NB"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1911.00190">
    <title>[1911.00190] Randomization as Regularization: A Degrees of Freedom Explanation for Random Forest Success</title>
    <dc:date>2019-11-10T22:34:10+00:00</dc:date>
    <link>https://arxiv.org/abs/1911.00190</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Random forests remain among the most popular off-the-shelf supervised machine learning tools with a well-established track record of predictive accuracy in both regression and classification settings. Despite their empirical success as well as a bevy of recent work investigating their statistical properties, a full and satisfying explanation for their success has yet to be put forth. Here we aim to take a step forward in this direction by demonstrating that the additional randomness injected into individual trees serves as a form of implicit regularization, making random forests an ideal model in low signal-to-noise ratio (SNR) settings. Specifically, from a model-complexity perspective, we show that the mtry parameter in random forests serves much the same purpose as the shrinkage penalty in explicitly regularized regression procedures like lasso and ridge regression. To highlight this point, we design a randomized linear-model-based forward selection procedure intended as an analogue to tree-based random forests and demonstrate its surprisingly strong empirical performance. Numerous demonstrations on both real and synthetic data are provided."]]></description>
<dc:subject>to:NB random_forests statistics decision_trees ensemble_methods</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:cf2088f4c949/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:random_forests"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:decision_trees"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1910.11743">
    <title>[1910.11743] Boosting heritability: estimating the genetic component of phenotypic variation with multiple sample splitting</title>
    <dc:date>2019-10-29T14:29:28+00:00</dc:date>
    <link>https://arxiv.org/abs/1910.11743</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Heritability is a central measure in genetics quantifying how much of the variability observed in a trait is attributable to genetic differences. Existing methods for estimating heritability are most often based on random-effect models, typically for computational reasons. The alternative of using a fixed-effect model has received much more limited attention in the literature. In this paper, we propose a generic strategy for heritability inference, termed as "boosting heritability", by combining several advantageous features of different recent methods to produce an estimate of the heritability with a high-dimensional linear model. Boosting heritability uses in particular a multiple sample splitting strategy which leads to a more stable estimate. We use antibiotic resistance data from a major human pathogen, Streptococcus pneumoniae, to demonstrate the applicability of our inference strategy."]]></description>
<dc:subject>to:NB heritability statistics genetics ensemble_methods</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:427e435cf974/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:heritability"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:genetics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1910.11445">
    <title>[1910.11445] Finite Mixtures of ERGMs for Ensembles of Networks</title>
    <dc:date>2019-10-29T14:28:44+00:00</dc:date>
    <link>https://arxiv.org/abs/1910.11445</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Ensembles of networks arise in many scientific fields, but currently there are few statistical models aimed at understanding their generative processes. To fill in this gap, we propose characterizing network ensembles via finite mixtures of exponential family random graph models, employing a Metropolis-within-Gibbs algorithm to conduct Bayesian inference. Simulation studies show that the proposed procedure can recover the true cluster assignments and cluster-specific parameters. We demonstrate the utility of the proposed approach using an ensemble of political co-voting networks among U.S. Senators."]]></description>
<dc:subject>to:NB exponential_family_random_graphs ensemble_methods network_data_analysis statistics to_teach:baby-nets butts.carter_t.</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:132d040c5832/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:exponential_family_random_graphs"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:network_data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:baby-nets"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:butts.carter_t."/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1906.01235">
    <title>[1906.01235] Universal Boosting Variational Inference</title>
    <dc:date>2019-10-29T02:18:39+00:00</dc:date>
    <link>https://arxiv.org/abs/1906.01235</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Boosting variational inference (BVI) approximates an intractable probability density by iteratively building up a mixture of simple component distributions one at a time, using techniques from sparse convex optimization to provide both computational scalability and approximation error guarantees. But the guarantees have strong conditions that do not often hold in practice, resulting in degenerate component optimization problems; and we show that the ad-hoc regularization used to prevent degeneracy in practice can cause BVI to fail in unintuitive ways. We thus develop universal boosting variational inference (UBVI), a BVI scheme that exploits the simple geometry of probability densities under the Hellinger metric to prevent the degeneracy of other gradient-based BVI methods, avoid difficult joint optimizations of both component and weight, and simplify fully-corrective weight optimizations. We show that for any target density and any mixture component family, the output of UBVI converges to the best possible approximation in the mixture family, even when the mixture family is misspecified. We develop a scalable implementation based on exponential family mixture components and standard stochastic optimization techniques. Finally, we discuss statistical benefits of the Hellinger distance as a variational objective through bounds on posterior probability, moment, and importance sampling errors. Experiments on multiple datasets and models show that UBVI provides reliable, accurate posterior approximations."]]></description>
<dc:subject>to:NB density_estimation probability ensemble_methods computational_statistics</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:68d19002e973/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:density_estimation"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:probability"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:computational_statistics"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://link.springer.com/article/10.1007/s11023-019-09506-6">
    <title>The Rhetoric and Reality of Anthropomorphism in Artificial Intelligence | SpringerLink</title>
    <dc:date>2019-10-24T15:58:28+00:00</dc:date>
    <link>https://link.springer.com/article/10.1007/s11023-019-09506-6</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Artificial intelligence (AI) has historically been conceptualized in anthropomorphic terms. Some algorithms deploy biomimetic designs in a deliberate attempt to effect a sort of digital isomorphism of the human brain. Others leverage more general learning strategies that happen to coincide with popular theories of cognitive science and social epistemology. In this paper, I challenge the anthropomorphic credentials of the neural network algorithm, whose similarities to human cognition I argue are vastly overstated and narrowly construed. I submit that three alternative supervised learning methods—namely lasso penalties, bagging, and boosting—offer subtler, more interesting analogies to human reasoning as both an individual and a social phenomenon. Despite the temptation to fall back on anthropomorphic tropes when discussing AI, however, I conclude that such rhetoric is at best misleading and at worst downright dangerous. The impulse to humanize algorithms is an obstacle to properly conceptualizing the ethical challenges posed by emerging technologies."]]></description>
<dc:subject>to:NB anthropomorphism artificial_intelligence neural_networks ensemble_methods lasso rhetoric</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:c79f17e90fab/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:anthropomorphism"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:artificial_intelligence"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:neural_networks"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:lasso"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:rhetoric"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1910.04743">
    <title>[1910.04743] The Implicit Regularization of Ordinary Least Squares Ensembles</title>
    <dc:date>2019-10-12T03:47:52+00:00</dc:date>
    <link>https://arxiv.org/abs/1910.04743</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Ensemble methods that average over a collection of independent predictors that are each limited to a subsampling of both the examples and features of the training data command a significant presence in machine learning, such as the ever-popular random forest, yet the nature of the subsampling effect, particularly of the features, is not well understood. We study the case of an ensemble of linear predictors, where each individual predictor is fit using ordinary least squares on a random submatrix of the data matrix. We show that, under standard Gaussianity assumptions, when the number of features selected for each predictor is optimally tuned, the asymptotic risk of a large ensemble is equal to the asymptotic ridge regression risk, which is known to be optimal among linear predictors in this setting. In addition to eliciting this implicit regularization that results from subsampling, we also connect this ensemble to the dropout technique used in training deep (neural) networks, another strategy that has been shown to have a ridge-like regularizing effect."]]></description>
<dc:subject>to:NB ensemble_methods regression statistics</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:c6ad104d2722/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:regression"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1909.12299">
    <title>[1909.12299] ExpertoCoder: Capturing Divergent Brain Regions Using Mixture of Regression Experts</title>
    <dc:date>2019-10-01T17:37:43+00:00</dc:date>
    <link>https://arxiv.org/abs/1909.12299</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["fMRI semantic category understanding using linguistic encoding models attempts to learn a forward mapping that relates stimuli to the corresponding brain activation. Classical encoding models use linear multivariate methods to predict brain activation (all the voxels) given the stimulus. However, these methods mainly assume multiple regions as one vast uniform region or several independent regions, ignoring connections among them. In this paper, we present a mixture of experts model for predicting brain activity patterns. Given a new stimulus, the model predicts the entire brain activation as a weighted linear combination of activation of multiple experts. We argue that each expert captures activity patterns related to a particular region of interest (ROI) in the human brain. Thus, the utility of the proposed model is twofold. It not only accurately predicts the brain activation for a given stimulus, but it also reveals the level of activation of individual brain regions. Results of our experiments highlight the importance of the proposed model for predicting brain activation. This study also helps in understanding which of the brain regions get activated together, given a certain kind of stimulus. Importantly, we suggest that the mixture of regression experts (MoRE) framework successfully combines the two principles of organization of function in the brain, namely that of specialization and integration."]]></description>
<dc:subject>to:NB neural_data_analysis fmri mixture_models statistics ensemble_methods</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:dffb8437bc01/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:neural_data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:fmri"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:mixture_models"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1806.03467">
    <title>[1806.03467] Orthogonal Random Forest for Causal Inference</title>
    <dc:date>2019-10-01T16:11:28+00:00</dc:date>
    <link>https://arxiv.org/abs/1806.03467</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["We propose the orthogonal random forest, an algorithm that combines Neyman-orthogonality to reduce sensitivity with respect to estimation error of nuisance parameters with generalized random forests (Athey et al., 2017)--a flexible non-parametric method for statistical estimation of conditional moment models using random forests. We provide a consistency rate and establish asymptotic normality for our estimator. We show that under mild assumptions on the consistency rate of the nuisance estimator, we can achieve the same error rate as an oracle with a priori knowledge of these nuisance parameters. We show that when the nuisance functions have a locally sparse parametrization, then a local ℓ1-penalized regression achieves the required rate. We apply our method to estimate heterogeneous treatment effects from observational data with discrete treatments or continuous treatments, and we show that, unlike prior work, our method provably allows to control for a high-dimensional set of variables under standard sparsity conditions. We also provide a comprehensive empirical evaluation of our algorithm on both synthetic and real data."]]></description>
<dc:subject>to:NB decision_trees ensemble_methods regression causal_inference statistics nonparametrics random_forests</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:64b039275305/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:decision_trees"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:regression"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:causal_inference"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:nonparametrics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:random_forests"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1909.11799">
    <title>[1909.11799] Manifold Forests: Closing the Gap on Neural Networks</title>
    <dc:date>2019-10-01T13:53:57+00:00</dc:date>
    <link>https://arxiv.org/abs/1909.11799</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Decision forests (DF), in particular random forests and gradient boosting trees, have demonstrated state-of-the-art accuracy compared to other methods in many supervised learning scenarios. In particular, DFs dominate other methods in tabular data, that is, when the feature space is unstructured, so that the signal is invariant to permuting feature indices. However, in structured data lying on a manifold---such as images, text, and speech---neural nets (NN) tend to outperform DFs. We conjecture that at least part of the reason for this is that the input to NN is not simply the feature magnitudes, but also their indices (for example, the convolution operation uses "feature locality"). In contrast, naïve DF implementations fail to explicitly consider feature indices. A recently proposed DF approach demonstrates that DFs, for each node, implicitly sample a random matrix from some specific distribution. Here, we build on that to show that one can choose distributions in a \emph{manifold aware fashion}. For example, for image classification, rather than randomly selecting pixels, one can randomly select contiguous patches. We demonstrate the empirical performance of data living on three different manifolds: images, time-series, and a torus. In all three cases, our Manifold Forest (\Mf) algorithm empirically dominates other state-of-the-art approaches that ignore feature space structure, achieving a lower classification error on all sample sizes. This dominance extends to the MNIST data set as well. Moreover, both training and test time is significantly faster for manifold forests as compared to deep nets. This approach, therefore, has promise to enable DFs and other machine learning methods to close the gap with deep nets on manifold-valued data."]]></description>
<dc:subject>to:NB decision_trees ensemble_methods to_read random_forests</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:9e7ddf9742fb/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:decision_trees"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:random_forests"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1909.07578">
    <title>[1909.07578] Stacking Models for Nearly Optimal Link Prediction in Complex Networks</title>
    <dc:date>2019-09-25T03:26:04+00:00</dc:date>
    <link>https://arxiv.org/abs/1909.07578</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Most real-world networks are incompletely observed. Algorithms that can accurately predict which links are missing can dramatically speedup the collection of network data and improve the validity of network models. Many algorithms now exist for predicting missing links, given a partially observed network, but it has remained unknown whether a single best predictor exists, how link predictability varies across methods and networks from different domains, and how close to optimality current methods are. We answer these questions by systematically evaluating 203 individual link predictor algorithms, representing three popular families of methods, applied to a large corpus of 548 structurally diverse networks from six scientific domains. We first show that individual algorithms exhibit a broad diversity of prediction errors, such that no one predictor or family is best, or worst, across all realistic inputs. We then exploit this diversity via meta-learning to construct a series of "stacked" models that combine predictors into a single algorithm. Applied to a broad range of synthetic networks, for which we may analytically calculate optimal performance, these stacked models achieve optimal or nearly optimal levels of accuracy. Applied to real-world networks, stacked models are also superior, but their accuracy varies strongly by domain, suggesting that link prediction may be fundamentally easier in social networks than in biological or technological networks. These results indicate that the state-of-the-art for link prediction comes from combining individual algorithms, which achieves nearly optimal predictions. We close with a brief discussion of limitations and opportunities for further improvement of these results."]]></description>
<dc:subject>to:NB ensemble_methods network_data_analysis link_prediction statistics clauset.aaron airoldi.edo galstyan.aram</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:9eaee2511c1c/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:network_data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:link_prediction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:clauset.aaron"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:airoldi.edo"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:galstyan.aram"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1908.06951">
    <title>[1908.06951] Gradient Boosting Machine: A Survey</title>
    <dc:date>2019-08-20T14:23:05+00:00</dc:date>
    <link>https://arxiv.org/abs/1908.06951</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["In this survey, we discuss several different types of gradient boosting algorithms and illustrate their mathematical frameworks in detail: 1. introduction of gradient boosting leads to 2. objective function optimization, 3. loss function estimations, and 4. model constructions. 5. application of boosting in ranking."]]></description>
<dc:subject>to:NB ensemble_methods boosting statistics machine_learning</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:c32dc04478cc/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:boosting"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:machine_learning"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1908.06852">
    <title>[1908.06852] SIRUS: making random forests interpretable</title>
    <dc:date>2019-08-20T14:22:39+00:00</dc:date>
    <link>https://arxiv.org/abs/1908.06852</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["State-of-the-art learning algorithms, such as random forests or neural networks, are often qualified as "black-boxes" because of the high number and complexity of operations involved in their prediction mechanism. This lack of interpretability is a strong limitation for applications involving critical decisions, typically the analysis of production processes in the manufacturing industry. In such critical contexts, models have to be interpretable, i.e., simple, stable, and predictive. To address this issue, we design SIRUS (Stable and Interpretable RUle Set), a new classification algorithm based on random forests, which takes the form of a short list of rules. While simple models are usually unstable with respect to data perturbation, SIRUS achieves a remarkable stability improvement over cutting-edge methods. Furthermore, SIRUS inherits a predictive accuracy close to random forests, combined with the simplicity of decision trees. These properties are assessed both from a theoretical and empirical point of view, through extensive numerical experiments based on our R/C++ software implementation sirus."

--- Not sure that there's really much new here, beyond limiting the forest to very shallow trees.]]></description>
<dc:subject>to:NB classifiers ensemble_methods random_forests decision_trees data_mining statistics to_teach:data-mining have_skimmed</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:e72b24d4a589/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:classifiers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:random_forests"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:decision_trees"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:have_skimmed"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1908.02718">
    <title>[1908.02718] A Characterization of Mean Squared Error for Estimator with Bagging</title>
    <dc:date>2019-08-08T13:06:15+00:00</dc:date>
    <link>https://arxiv.org/abs/1908.02718</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Bagging can significantly improve the generalization performance of unstable machine learning algorithms such as trees or neural networks. Though bagging is now widely used in practice and many empirical studies have explored its behavior, we still know little about the theoretical properties of bagged predictions. In this paper, we theoretically investigate how the bagging method can reduce the Mean Squared Error (MSE) when applied on a statistical estimator. First, we prove that for any estimator, increasing the number of bagged estimators N in the average can only reduce the MSE. This intuitive result, observed empirically and discussed in the literature, has not yet been rigorously proved. Second, we focus on the standard estimator of variance called unbiased sample variance and we develop an exact analytical expression of the MSE for this estimator with bagging. 
"This allows us to rigorously discuss the number of iterations N and the batch size m of the bagging method. From this expression, we state that only if the kurtosis of the distribution is greater than 32, the MSE of the variance estimator can be reduced with bagging. This result is important because it demonstrates that for distribution with low kurtosis, bagging can only deteriorate the performance of a statistical prediction. Finally, we propose a novel general-purpose algorithm to estimate with high precision the variance of a sample."]]></description>
<dc:subject>to:NB ensemble_methods prediction regression statistics</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:28984b0de8c0/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:prediction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:regression"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1908.02723">
    <title>[1908.02723] Advocacy Learning: Learning through Competition and Class-Conditional Representations</title>
    <dc:date>2019-08-08T12:57:47+00:00</dc:date>
    <link>https://arxiv.org/abs/1908.02723</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["We introduce advocacy learning, a novel supervised training scheme for attention-based classification problems. Advocacy learning relies on a framework consisting of two connected networks: 1) N Advocates (one for each class), each of which outputs an argument in the form of an attention map over the input, and 2) a Judge, which predicts the class label based on these arguments. Each Advocate produces a class-conditional representation with the goal of convincing the Judge that the input example belongs to their class, even when the input belongs to a different class. Applied to several different classification tasks, we show that advocacy learning can lead to small improvements in classification accuracy over an identical supervised baseline. Through a series of follow-up experiments, we analyze when and how such class-conditional representations improve discriminative performance. Though somewhat counter-intuitive, a framework in which subnetworks are trained to competitively provide evidence in support of their class shows promise, in many cases performing on par with standard learning approaches. This provides a foundation for further exploration into competition and class-conditional representations in supervised learning."

--- Drs. Mercier and Sperber, please call your office.  (Also Drs. Jordan and Jacobs...)]]></description>
<dc:subject>to:NB machine_learning collective_cognition ensemble_methods to_read</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:3f593a60f6e1/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:machine_learning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:collective_cognition"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_read"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1908.01251">
    <title>[1908.01251] Measuring the Algorithmic Convergence of Randomized Ensembles: The Regression Setting</title>
    <dc:date>2019-08-06T14:47:00+00:00</dc:date>
    <link>https://arxiv.org/abs/1908.01251</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["When randomized ensemble methods such as bagging and random forests are implemented, a basic question arises: Is the ensemble large enough? In particular, the practitioner desires a rigorous guarantee that a given ensemble will perform nearly as well as an ideal infinite ensemble (trained on the same data). The purpose of the current paper is to develop a bootstrap method for solving this problem in the context of regression --- which complements our companion paper in the context of classification (Lopes 2019). In contrast to the classification setting, the current paper shows that theoretical guarantees for the proposed bootstrap can be established under much weaker assumptions. In addition, we illustrate the flexibility of the method by showing how it can be adapted to measure algorithmic convergence for variable selection. Lastly, we provide numerical results demonstrating that the method works well in a range of situations."]]></description>
<dc:subject>to:NB ensemble_methods computational_statistics statistics to_teach:data-mining</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:8e148c2e58df/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:computational_statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://journal.sjdm.org/19/190215/jdm190215.html">
    <title>A universal method for evaluating the quality of aggregators</title>
    <dc:date>2019-08-03T23:06:47+00:00</dc:date>
    <link>http://journal.sjdm.org/19/190215/jdm190215.html</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["We propose a new method to facilitate comparison of aggregated forecasts based on different aggregation, elicitation and calibration methods. Aggregates are evaluated by their relative position on the cumulative distribution of the corresponding individual scores. This allows one to compare methods using different measures of quality that use different scales. We illustrate the use of the method by re-analyzing various estimates from Budescu and Du (Management Science, 2007)."]]></description>
<dc:subject>to:NB ensemble_methods statistics</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:bf2e20f601a6/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1907.11452">
    <title>[1907.11452] An Information-theoretic On-line Learning Principle for Specialization in Hierarchical Decision-Making Systems</title>
    <dc:date>2019-07-30T00:08:02+00:00</dc:date>
    <link>https://arxiv.org/abs/1907.11452</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Information-theoretic bounded rationality describes utility-optimizing decision-makers whose limited information-processing capabilities are formalized by information constraints. One of the consequences of bounded rationality is that resource-limited decision-makers can join together to solve decision-making problems that are beyond the capabilities of each individual. Here, we study an information-theoretic principle that drives division of labor and specialization when decision-makers with information constraints are joined together. We devise an on-line learning rule of this principle that learns a partitioning of the problem space such that it can be solved by specialized linear policies. We demonstrate the approach for decision-making problems whose complexity exceeds the capabilities of individual decision-makers, but can be solved by combining the decision-makers optimally. The strength of the model is that it is abstract and principled, yet has direct applications in classification, regression, reinforcement learning and adaptive control."]]></description>
<dc:subject>to:NB information_theory bounded_rationality collective_cognition social_life_of_the_mind ensemble_methods re:democratic_cognition to_read</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:44a695733c24/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:information_theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:bounded_rationality"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:collective_cognition"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:social_life_of_the_mind"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:re:democratic_cognition"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_read"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1907.08742">
    <title>[1907.08742] Estimating the Algorithmic Variance of Randomized Ensembles via the Bootstrap</title>
    <dc:date>2019-07-24T14:00:32+00:00</dc:date>
    <link>https://arxiv.org/abs/1907.08742</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Although the methods of bagging and random forests are some of the most widely used prediction methods, relatively little is known about their algorithmic convergence. In particular, there are not many theoretical guarantees for deciding when an ensemble is "large enough" --- so that its accuracy is close to that of an ideal infinite ensemble. Due to the fact that bagging and random forests are randomized algorithms, the choice of ensemble size is closely related to the notion of "algorithmic variance" (i.e. the variance of prediction error due only to the training algorithm). In the present work, we propose a bootstrap method to estimate this variance for bagging, random forests, and related methods in the context of classification. To be specific, suppose the training dataset is fixed, and let the random variable Err_t denote the prediction error of a randomized ensemble of size t. Working under a "first-order model" for randomized ensembles, we prove that the centered law of Err_t can be consistently approximated via the proposed method as t→∞. Meanwhile, the computational cost of the method is quite modest, by virtue of an extrapolation technique. As a consequence, the method offers a practical guideline for deciding when the algorithmic fluctuations of Err_t are negligible."]]></description>
<dc:subject>to:NB ensemble_methods computational_statistics statistics prediction to_teach:data-mining</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:e4f2433d5ca3/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:computational_statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:prediction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1807.11408">
    <title>[1807.11408] Local Linear Forests</title>
    <dc:date>2019-06-23T17:33:39+00:00</dc:date>
    <link>https://arxiv.org/abs/1807.11408</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Random forests are a powerful method for non-parametric regression, but are limited in their ability to fit smooth signals, and can show poor predictive performance in the presence of strong, smooth effects. Taking the perspective of random forests as an adaptive kernel method, we pair the forest kernel with a local linear regression adjustment to better capture smoothness. The resulting procedure, local linear forests, enables us to improve on asymptotic rates of convergence for random forests with smooth signals, and provides substantial gains in accuracy on both real and simulated data. We prove a central limit theorem valid under regularity conditions on the forest and smoothness constraints, and propose a computationally efficient construction for confidence intervals. Moving to a causal inference application, we discuss the merits of local regression adjustments for heterogeneous treatment effect estimation, and give an example on a dataset exploring the effect word choice has on attitudes to the social safety net. Last, we include simulation results on real and generated data."]]></description>
<dc:subject>to:NB linear_regression ensemble_methods decision_trees athey.susan statistics random_forests</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:e4550a9d18a1/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:linear_regression"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:decision_trees"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:athey.susan"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:random_forests"/>
</rdf:Bag></taxo:topics>
</item>
</rdf:RDF>