<?xml version="1.0" encoding="UTF-8"?>
 <rdf:RDF xmlns="http://purl.org/rss/1.0/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:cc="http://web.resource.org/cc/" xmlns:syn="http://purl.org/rss/1.0/modules/syndication/" xmlns:admin="http://webns.net/mvcb/">
  <channel rdf:about="http://pinboard.in">
    <title>Pinboard (cshalizi)</title>
    <link>https://pinboard.in/u:cshalizi/public/</link>
    <description>recent bookmarks from cshalizi</description>
    <items>
      <rdf:Seq>	<rdf:li rdf:resource="https://elevanth.org/blog/2023/07/17/none-of-the-above/"/>
	<rdf:li rdf:resource="https://papers.ssrn.com/sol3/papers.cfm?abstract_id=2694998"/>
	<rdf:li rdf:resource="https://www.pnas.org/doi/10.1073/pnas.2203150119"/>
	<rdf:li rdf:resource="https://press.uchicago.edu/ucp/books/book/chicago/B/bo136254067"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2203.06498"/>
	<rdf:li rdf:resource="https://www.cambridge.org/core/books/between-the-spreadsheets/E595C14F50034C30ECDCC6B835710EA8#fndtn-information"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2105.03082"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2009.06864"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2105.03122"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1301.1034"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2104.04628"/>
	<rdf:li rdf:resource="https://www.aeaweb.org/articles?id=10.1257/jel.20191526"/>
	<rdf:li rdf:resource="https://www.annualreviews.org/doi/abs/10.1146/annurev-soc-121919-054621"/>
	<rdf:li rdf:resource="https://ieeexplore.ieee.org/document/8861141"/>
	<rdf:li rdf:resource="https://www.biorxiv.org/content/10.1101/2020.10.15.341495v1"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2002.10992"/>
	<rdf:li rdf:resource="https://press.princeton.edu/books/hardcover/9780691172361/humanities-data-analysis"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2002.09770"/>
	<rdf:li rdf:resource="https://press.princeton.edu/books/hardcover/9780691182377/dark-data"/>
	<rdf:li rdf:resource="https://journals.aps.org/rmp/abstract/10.1103/RevModPhys.91.045002"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1910.08707"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1907.09864"/>
	<rdf:li rdf:resource="http://matthewlincoln.net/2015/03/21/confabulation-in-the-humanities.html"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1908.01014"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1904.02101"/>
	<rdf:li rdf:resource="http://dx.doi.org/10.1037/h0071325"/>
	<rdf:li rdf:resource="https://www.tandfonline.com/doi/abs/10.1080/14786440109462720"/>
	<rdf:li rdf:resource="http://www.pnas.org/content/115/19/4891"/>
	<rdf:li rdf:resource="http://www.pnas.org/content/115/10/2317"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1802.03426"/>
	<rdf:li rdf:resource="https://press.princeton.edu/titles/11218.html"/>
	<rdf:li rdf:resource="https://lukeoakdenrayner.wordpress.com/2017/12/18/the-chestxray14-dataset-problems/"/>
	<rdf:li rdf:resource="https://www.nytimes.com/2017/10/18/magazine/when-the-revolution-came-for-amy-cuddy.html"/>
	<rdf:li rdf:resource="https://www.nap.edu/catalog/24893/federal-statistics-multiple-data-sources-and-privacy-protection-next-steps?mc_cid=a967e2da8c&amp;mc_eid=1fdc781427"/>
	<rdf:li rdf:resource="https://mitpress.mit.edu/books/artificial-unintelligence"/>
	<rdf:li rdf:resource="http://papers.nips.cc/paper/7213-poincare-embeddings-for-learning-hierarchical-representations"/>
	<rdf:li rdf:resource="http://statweb.stanford.edu/~donoho/Lectures/AMS2000/Curses.pdf"/>
	<rdf:li rdf:resource="http://www.ucpress.edu/book.php?isbn=9780520289949&amp;mc_cid=3d6dd98934&amp;mc_eid=3d88be13af"/>
	<rdf:li rdf:resource="http://press.uchicago.edu/ucp/books/book/chicago/E/bo26176063"/>
	<rdf:li rdf:resource="http://www.springer.com/us/book/9783709107409"/>
	<rdf:li rdf:resource="http://www.nap.edu/catalog/23616/refining-the-concept-of-scientific-interface-when-working-with-big-data"/>
	<rdf:li rdf:resource="http://shop.oreilly.com/product/0636920049081.do"/>
	<rdf:li rdf:resource="https://github.com/Quartz/bad-data-guide"/>
	<rdf:li rdf:resource="https://www.jacobinmag.com/2014/10/data-journalism-done-wrong/"/>
	<rdf:li rdf:resource="http://www.powells.com/biblio/62-9780822357445-1"/>
	<rdf:li rdf:resource="https://medium.com/@mrtz/how-big-data-is-unfair-9aa544d739de"/>
	<rdf:li rdf:resource="http://datacolada.org/2014/05/01/20-we-cannot-afford-to-study-effect-size-in-the-lab/"/>
	<rdf:li rdf:resource="http://www.nicebread.de/a-comment-on-we-cannot-afford-to-study-effect-size-in-the-lab-from-the-datacolada-blog/"/>
	<rdf:li rdf:resource="http://www.niemanlab.org/2014/07/alberto-cairo-data-journalism-needs-to-up-its-own-standards"/>
	<rdf:li rdf:resource="http://www.slate.com/articles/technology/bitwise/2014/08/what_is_big_data_good_for_incremental_change_not_big_paradigm_shifts.html?wpsrc=sh_all_dt_tw_top"/>
	<rdf:li rdf:resource="http://www.slate.com/articles/technology/bitwise/2014/07/facebook_okcupid_user_experiments_ethics_aside_they_show_us_the_limitations.html?wpsrc=sh_all_dt_tw_top"/>
	<rdf:li rdf:resource="http://sss.sagepub.com/content/44/4/555.abstract?etoc"/>
	<rdf:li rdf:resource="http://www.nytimes.com/2014/05/02/upshot/how-not-to-be-misled-by-the-jobs-report.html"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1406.0873"/>
	<rdf:li rdf:resource="http://www.springer.com/statistics/computational+statistics/book/978-3-319-03163-7"/>
	<rdf:li rdf:resource="https://www.jacobinmag.com/2014/06/bro-bash/"/>
	<rdf:li rdf:resource="http://piketty.pse.ens.fr/files/capital21c/en/Piketty2014TechnicalAppendixResponsetoFT.pdf"/>
	<rdf:li rdf:resource="http://blogs.ft.com/money-supply/2014/05/23/data-problems-with-capital-in-the-21st-century/"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1403.7400"/>
	<rdf:li rdf:resource="http://overview.ap.org/blog/2014/01/algorithms-are-not-enough-lessons-bringing-computer-science-to-journalism/"/>
	<rdf:li rdf:resource="http://modelviewculture.com/pieces/quantify-everything-a-dream-of-a-feminist-data-future"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1401.0742"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1401.5226"/>
	<rdf:li rdf:resource="http://www.jstatsoft.org/v55/i14"/>
	<rdf:li rdf:resource="http://liorpachter.wordpress.com/2014/01/19/why-do-you-look-at-the-speck-in-your-sisters-quilt-plot-and-pay-no-attention-to-the-plank-in-your-own-heat-map/"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1402.2965"/>
	<rdf:li rdf:resource="http://simplystatistics.org/2014/02/14/on-the-scalability-of-statistical-procedures-why-the-p-value-bashers-just-dont-get-it/"/>
	<rdf:li rdf:resource="http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3037423/"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1010.0520"/>
	<rdf:li rdf:resource="http://www.aeaweb.org/articles.php?doi=10.1257/aer.103.7.3001"/>
      </rdf:Seq>
    </items>
  </channel><item rdf:about="https://elevanth.org/blog/2023/07/17/none-of-the-above/">
    <title>None of the Above | Elements of Evolutionary Anthropology</title>
    <dc:date>2025-03-23T17:17:45+00:00</dc:date>
    <link>https://elevanth.org/blog/2023/07/17/none-of-the-above/</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA[--- If I am honest with myself, incorporating something like this (or even my own paper with Gelman!) into undergrad ADA would require a big re-design of the course, because it's currently "here is an array of sometimes-useful statistical methods", not "here is how you turn scientific questions into data-analytic problems, and statistical solutions back into scientific answers".  Knowing a lot of methods is _helpful_ to that undertaking, but it's different.  Maybe that's too much to ask of an undergrad class with >200 students/year...]]></description>
<dc:subject>statistics data_analysis have_read mcelreath.richard closing_old_tabs re:phil-of-bayes_paper to_teach:undergrad-ADA</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:1079612a0b01/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:have_read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:mcelreath.richard"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:closing_old_tabs"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:re:phil-of-bayes_paper"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:undergrad-ADA"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://papers.ssrn.com/sol3/papers.cfm?abstract_id=2694998">
    <title>Specification Curve: Descriptive and Inferential Statistics on All Reasonable Specifications by Uri Simonsohn, Joseph P. Simmons, Leif D. Nelson :: SSRN</title>
    <dc:date>2024-12-09T21:26:14+00:00</dc:date>
    <link>https://papers.ssrn.com/sol3/papers.cfm?abstract_id=2694998</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Empirical results often hinge on data analytic decisions that are simultaneously defensible, arbitrary, and motivated. To mitigate this problem we introduce Specification-Curve Analysis, which consists of three steps: (i) identifying the set of theoretically justified, statistically valid, and non-redundant analytic specifications, (ii) displaying alternative results graphically, allowing the identification of decisions producing different results, and (iii) conducting statistical tests to determine whether as a whole results are inconsistent with the null hypothesis. We illustrate its use by applying it to three published findings. One proves robust, one weak, one not robust at all."

--- Item (1) seems like a tall order!]]></description>
<dc:subject>to:NB data_analysis statistics model_checking color_me_skeptical more_exactly_the_impulse_is_sound_but_i_doubt_they_can_really_do_it</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:888da2dd46e4/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:model_checking"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:color_me_skeptical"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:more_exactly_the_impulse_is_sound_but_i_doubt_they_can_really_do_it"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://www.pnas.org/doi/10.1073/pnas.2203150119">
    <title>Observing many researchers using the same data and hypothesis reveals a hidden universe of uncertainty | PNAS</title>
    <dc:date>2022-11-19T20:36:42+00:00</dc:date>
    <link>https://www.pnas.org/doi/10.1073/pnas.2203150119</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["This study explores how researchers’ analytical choices affect the reliability of scientific findings. Most discussions of reliability problems in science focus on systematic biases. We broaden the lens to emphasize the idiosyncrasy of conscious and unconscious decisions that researchers make during data analysis. We coordinated 161 researchers in 73 research teams and observed their research decisions as they used the same data to independently test the same prominent social science hypothesis: that greater immigration reduces support for social policies among the public. In this typical case of social science research, research teams reported both widely diverging numerical findings and substantive conclusions despite identical start conditions. Researchers’ expertise, prior beliefs, and expectations barely predict the wide variation in research outcomes. More than 95% of the total variance in numerical results remains unexplained even after qualitative coding of all identifiable decisions in each team’s workflow. This reveals a universe of uncertainty that remains hidden when considering a single study in isolation. The idiosyncratic nature of how researchers’ results and conclusions varied is a previously underappreciated explanation for why many scientific hypotheses remain contested. These results call for greater epistemic humility and clarity in reporting scientific findings."]]></description>
<dc:subject>to:NB to_read data_analysis statistics social_science_methodology</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:1d84ac94f53f/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:social_science_methodology"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://press.uchicago.edu/ucp/books/book/chicago/B/bo136254067">
    <title>Big Data for Twenty-First-Century Economic Statistics, Abraham, Jarmin, Moyer</title>
    <dc:date>2022-05-11T17:12:03+00:00</dc:date>
    <link>https://press.uchicago.edu/ucp/books/book/chicago/B/bo136254067</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["The existing infrastructure for the production of key economic statistics relies heavily on data collected through sample surveys and periodic censuses, together with administrative records generated in connection with tax administration. The increasing difficulty of obtaining survey and census responses threatens the viability of existing data collection approaches. The growing availability of new sources of Big Data—such as scanner data on purchases, credit card transaction records, payroll information, and prices of various goods scraped from the websites of online sellers—has changed the data landscape. These new sources of data hold the promise of allowing the statistical agencies to produce more accurate, more disaggregated, and more timely economic data to meet the needs of policymakers and other data users. This volume documents progress made toward that goal and the challenges to be overcome to realize the full potential of Big Data in the production of economic statistics. It describes the deployment of Big Data to solve both existing and novel challenges in economic measurement, and it will be of interest to statistical agency staff, academic researchers, and serious users of economic statistics."]]></description>
<dc:subject>to:NB data_analysis econometrics computational_statistics social_measurement books:noted books:suggest_to_library</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:e7a4adbeaf79/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:econometrics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:computational_statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:social_measurement"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:books:noted"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:books:suggest_to_library"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2203.06498">
    <title>[2203.06498] The worst of both worlds: A comparative analysis of errors in learning from data in psychology and machine learning</title>
    <dc:date>2022-03-31T23:35:44+00:00</dc:date>
    <link>https://arxiv.org/abs/2203.06498</link>
    <dc:creator>cshalizi</dc:creator><dc:subject>data_analysis bad_data_analysis psychology data_mining gelman.andrew to_teach:data-mining</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:6c960a97eea4/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:bad_data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:psychology"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:gelman.andrew"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://www.cambridge.org/core/books/between-the-spreadsheets/E595C14F50034C30ECDCC6B835710EA8#fndtn-information">
    <title>Between the Spreadsheets: Classifying and Fixing Dirty Data</title>
    <dc:date>2021-12-30T06:00:28+00:00</dc:date>
    <link>https://www.cambridge.org/core/books/between-the-spreadsheets/E595C14F50034C30ECDCC6B835710EA8#fndtn-information</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Dirty data is a problem that costs businesses thousands, if not millions, every year. In organisations large and small across the globe you will hear talk of data quality issues. What you will rarely hear about is the consequences or how to fix it.
"Between the Spreadsheets: Classifying and Fixing Dirty Data draws on classification expert Susan Walsh's decade of experience in data classification to present a fool-proof method for cleaning and classifying your data. The book covers everything from the very basics of data classification to normalisation, taxonomies and presents the author's proven COAT methodology, helping ensure an organisation's data is Consistent, Organised, Accurate and Trustworthy. A series of data horror stories outlines what can go wrong in managing data, and if it does, how it can be fixed.
"After reading this book, regardless of your level of experience, not only will you be able to work with your data more efficiently, but you will also understand the impact the work you do with it has, and how it affects the rest of the organisation.
"Written in an engaging and highly practical manner, Between the Spreadsheets gives readers of all levels a deep understanding of the dangers of dirty data and the confidence and skills to work more efficiently and effectively with it."

--- Last tag because I am very doubtful that there can be a general methodology here, much less a foolproof one.]]></description>
<dc:subject>to:NB books:noted data_analysis data_cleaning color_me_skeptical</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:f4b6b56f8578/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:books:noted"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_cleaning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:color_me_skeptical"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2105.03082">
    <title>[2105.03082] Comment on &quot;Reproducibility and Replication of Experimental Particle Physics Results&quot;</title>
    <dc:date>2021-05-13T15:04:21+00:00</dc:date>
    <link>https://arxiv.org/abs/2105.03082</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["I would like to thank Junk and Lyons (arXiv:2009.06864) for beginning a discussion about replication in high-energy physics (HEP). Junk and Lyons ultimately argue that HEP learned its lessons the hard way through past failures and that other fields could learn from our procedures. They emphasize that experimental collaborations would risk their legacies were they to make a type-1 error in a search for new physics and outline the vigilance taken to avoid one, such as data blinding and a strict 5σ threshold. The discussion, however, ignores an elephant in the room: there are regularly anomalies in searches for new physics that result in substantial scientific activity but don't replicate with more data."]]></description>
<dc:subject>to:NB particle_physics statistics data_analysis science_as_a_social_process</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:204285fadb51/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:particle_physics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:science_as_a_social_process"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2009.06864">
    <title>[2009.06864] Reproducibility and Replication of Experimental Particle Physics Results</title>
    <dc:date>2021-05-13T15:03:51+00:00</dc:date>
    <link>https://arxiv.org/abs/2009.06864</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Recently, much attention has been focused on the replicability of scientific results, causing scientists, statisticians, and journal editors to examine closely their methodologies and publishing criteria. Experimental particle physicists have been aware of the precursors of non-replicable research for many decades and have many safeguards to ensure that the published results are as reliable as possible. The experiments require large investments of time and effort to design, construct, and operate. Large collaborations produce and check the results, and many papers are signed by more than three thousand authors. This paper gives an introduction to what experimental particle physics is and to some of the tools that are used to analyze the data. It describes the procedures used to ensure that results can be computationally reproduced, both by collaborators and by non-collaborators. It describes the status of publicly available data sets and analysis tools that aid in reproduction and recasting of experimental results. It also describes methods particle physicists use to maximize the reliability of the results, which increases the probability that they can be replicated by other collaborations or even the same collaborations with more data and new personnel. Examples of results that were later found to be false are given, both with failed replication attempts and one with alarmingly successful replications. While some of the characteristics of particle physics experiments are unique, many of the procedures and techniques can be and are used in other fields."]]></description>
<dc:subject>to:NB statistics particle_physics science_as_a_social_process data_analysis</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:d93e3a092c34/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:particle_physics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:science_as_a_social_process"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2105.03122">
    <title>[2105.03122] From Graph Centrality to Data Depth</title>
    <dc:date>2021-05-10T22:51:17+00:00</dc:date>
    <link>https://arxiv.org/abs/2105.03122</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Given a sample of points in a Euclidean space, we can define a notion of depth by forming a neighborhood graph and applying a notion of centrality. In the present paper, we focus on the degree, iterates of the H-index, and the coreness, which are all well-known measures of centrality. We study their behaviors when applied to a sample of points drawn i.i.d. from an underlying density and with a connectivity radius properly chosen. Equivalently, we study these notions of centrality in the context of random neighborhood graphs. We show that, in the large-sample limit and under some standard condition on the connectivity radius, the degree converges to the likelihood depth (unsurprisingly), while iterates of the H-index and the coreness converge to new notions of depth."]]></description>
<dc:subject>to:NB graph_theory data_analysis statistics</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:ed8fb8095e49/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:graph_theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1301.1034">
    <title>[1301.1034] How many of the digits in a mean of 12.3456789012 are worth reporting?</title>
    <dc:date>2021-05-10T15:20:08+00:00</dc:date>
    <link>https://arxiv.org/abs/1301.1034</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["OBJECTIVE. A computer program tells me that a mean value is 12.3456789012, but how many of these digits are significant (the rest being random junk)? Should I report: 12.3?, 12.3456?, or even 10 (if only the first digit is significant)? There are several rules-of-thumb but, surprisingly (given that the problem is so common in science), none seem to be evidence-based. RESULTS. Here I show how the significance of a digit in a particular decade of a mean depends on the standard error of the mean (SEM). I define an index, DM that can be plotted in graphs. From these a simple evidence-based rule for the number of significant digits ("sigdigs") is distilled: the last sigdig in the mean is in the same decade as the first or second non-zero digit in the SEM. As example, for mean 34.63 (SEM 25.62), with n = 17, the reported value should be 35 (SEM 26). Digits beyond these contain little or no useful information, and should not be reported lest they damage your credibility."

--- I am pretty sure this is the rule I was taught in my first serious physics lab course in 1991.]]></description>
<dc:subject>to:NB data_analysis</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:f67835a31c6f/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2104.04628">
    <title>[2104.04628] Modeling Time-Varying Random Objects and Dynamic Networks</title>
    <dc:date>2021-04-13T04:00:44+00:00</dc:date>
    <link>https://arxiv.org/abs/2104.04628</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Samples of dynamic or time-varying networks and other random object data such as time-varying probability distributions are increasingly encountered in modern data analysis. Common methods for time-varying data such as functional data analysis are infeasible when observations are time courses of networks or other complex non-Euclidean random objects that are elements of general metric spaces. In such spaces, only pairwise distances between the data objects are available and a strong limitation is that one cannot carry out arithmetic operations due to the lack of an algebraic structure. We combat this complexity by a generalized notion of mean trajectory taking values in the object space. For this, we adopt pointwise Fréchet means and then construct pointwise distance trajectories between the individual time courses and the estimated Fréchet mean trajectory, thus representing the time-varying objects and networks by functional data. Functional principal component analysis of these distance trajectories can reveal interesting features of dynamic networks and object time courses and is useful for downstream analysis. Our approach also makes it possible to study the empirical dynamics of time-varying objects, including dynamic regression to the mean or explosive behavior over time. We demonstrate desirable asymptotic properties of sample based estimators for suitable population targets under mild assumptions. The utility of the proposed methodology is illustrated with dynamic networks, time-varying distribution data and longitudinal growth data."]]></description>
<dc:subject>to:NB functional_data_analysis time_series network_data_analysis networks_in_and_over_time statistics data_analysis statistics_on_manifolds</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:720c11a53ffa/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:functional_data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:time_series"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:network_data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:networks_in_and_over_time"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics_on_manifolds"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://www.aeaweb.org/articles?id=10.1257/jel.20191526">
    <title>How Well Do Automated Linking Methods Perform? Lessons from US Historical Data - American Economic Association</title>
    <dc:date>2020-12-11T01:56:44+00:00</dc:date>
    <link>https://www.aeaweb.org/articles?id=10.1257/jel.20191526</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["This paper reviews the literature in historical record linkage in the United States and examines the performance of widely used record-linking algorithms and common variations in their assumptions. We use two high-quality, hand-linked data sets and one synthetic ground truth to examine the direct effects of linking algorithms on data quality. We find that (i) no algorithm (including hand linking) consistently produces representative samples; (ii) 15 to 37 percent of links chosen by widely used algorithms are classified as errors by trained human reviewers; and (iii) false links are systematically related to baseline sample characteristics, showing that some algorithms may introduce systematic measurement error into analyses. A case study shows that the combined effects of (i)–(iii) attenuate estimates of the intergenerational income elasticity by up to 29 percent, and common variations in algorithm assumptions result in greater attenuation. As current practice moves to automate linking and increase link rates, these results highlight the important potential consequences of linking errors on inferences with linked data. We conclude with constructive suggestions for reducing linking errors and directions for future research."]]></description>
<dc:subject>to:NB record_linkage entity_resolution relational_learning data_analysis social_science_methodology social_measurement</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:0cfb0e0e0937/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:record_linkage"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:entity_resolution"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:relational_learning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:social_science_methodology"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:social_measurement"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://www.annualreviews.org/doi/abs/10.1146/annurev-soc-121919-054621">
    <title>Computational Social Science and Sociology | Annual Review of Sociology</title>
    <dc:date>2020-11-19T22:13:26+00:00</dc:date>
    <link>https://www.annualreviews.org/doi/abs/10.1146/annurev-soc-121919-054621</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["The integration of social science with computer science and engineering fields has produced a new area of study: computational social science. This field applies computational methods to novel sources of digital data such as social media, administrative records, and historical archives to develop theories of human behavior. We review the evolution of this field within sociology via bibliometric analysis and in-depth analysis of the following subfields where this new work is appearing most rapidly: (a) social network analysis and group formation; (b) collective behavior and political sociology; (c) the sociology of knowledge; (d) cultural sociology, social psychology, and emotions; (e) the production of culture; (f) economic sociology and organizations; and (g) demography and population studies. Our review reveals that sociologists are not only at the center of cutting-edge research that addresses longstanding questions about human behavior but also developing new lines of inquiry about digital spaces as well. We conclude by discussing challenging new obstacles in the field, calling for increased attention to sociological theory, and identifying new areas where computational social science might be further integrated into mainstream sociology."]]></description>
<dc:subject>to:NB sociology data_mining data_analysis network_data_analysis bail.christopher</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:2d69b6950b0e/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:sociology"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:network_data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:bail.christopher"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://ieeexplore.ieee.org/document/8861141">
    <title>How Much Does Your Data Exploration Overfit? Controlling Bias via Information Usage - IEEE Journals &amp; Magazine</title>
    <dc:date>2020-11-16T16:05:49+00:00</dc:date>
    <link>https://ieeexplore.ieee.org/document/8861141</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Modern data is messy and high-dimensional, and it is often not clear a priori what are the right questions to ask. Instead, the analyst typically needs to use the data to search for interesting analyses to perform and hypotheses to test. This is an adaptive process, where the choice of analysis to be performed next depends on the results of the previous analyses on the same data. Ultimately, which results are reported can be heavily influenced by the data. It is widely recognized that this process, even if well-intentioned, can lead to biases and false discoveries, contributing to the crisis of reproducibility in science. But while any data-exploration renders standard statistical theory invalid, experience suggests that different types of exploratory analysis can lead to disparate levels of bias, and the degree of bias also depends on the particulars of the data set. In this paper, we propose a general information usage framework to quantify and provably bound the bias and other error metrics of an arbitrary exploratory analysis. We prove that our mutual information based bound is tight in natural settings, and then use it to give rigorous insights into when commonly used procedures do or do not lead to substantially biased estimation. Through the lens of information usage, we analyze the bias of specific exploration procedures such as filtering, rank selection and clustering. Our general framework also naturally motivates randomization techniques that provably reduce exploration bias while preserving the utility of the data analysis. We discuss the connections between our approach and related ideas from differential privacy and blinded data analysis, and supplement our results with illustrative simulations."

--- Pretty sure I've previously bookmarked a pre-print.]]></description>
<dc:subject>to:NB to_read data_analysis data_mining model_selection statistics post-model-selection_inference to_teach:linear_models to_teach:undergrad-ADA to_teach:data-mining</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:3ef24ab591ab/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:model_selection"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:post-model-selection_inference"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:linear_models"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:undergrad-ADA"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://www.biorxiv.org/content/10.1101/2020.10.15.341495v1">
    <title>Numerical Instabilities in Analytical Pipelines Lead to Large and Meaningful Variability in Brain Networks | bioRxiv</title>
    <dc:date>2020-10-22T12:43:39+00:00</dc:date>
    <link>https://www.biorxiv.org/content/10.1101/2020.10.15.341495v1</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["The analysis of brain-imaging data requires complex and often non-linear transformations to support findings on brain function or pathologies. And yet, recent work has shown that variability in the choices that one makes when analyzing data can lead to quantitatively and qualitatively different results, endangering the trust in conclusions. Even within a given method or analytical technique, numerical instabilities could compromise findings. We instrumented a structural-connectome estimation pipeline with Monte Carlo Arithmetic, a technique to introduce random noise in floating-point computations, and evaluated the stability of the derived connectomes, their features, and the impact on a downstream analysis. The stability of results was found to be highly dependent upon which features of the connectomes were evaluated, and ranged from perfectly stable (i.e. no observed variability across executions) to highly unstable (i.e. the results contained no trustworthy significant information). While the extreme range and variability in results presented here could severely hamper our understanding of brain organization in brain-imaging studies, it also leads to an increase in the reliability of datasets. This paper highlights the potential of leveraging the induced variance in estimates of brain connectivity to reduce the bias in networks alongside increasing the robustness of their applications in the detection or classification of individual differences. This paper demonstrates that stability evaluations are necessary for understanding error and bias inherent to scientific computing, and that they should be a component of typical analytical workflows."]]></description>
<dc:subject>to:NB scientific_computing data_analysis functional_connectivity neuroscience statistics network_data_analysis</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:766c125979dc/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:scientific_computing"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:functional_connectivity"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:neuroscience"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:network_data_analysis"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2002.10992">
    <title>[2002.10992] Migration Networks: Applications of Network Analysis to Macroscale Migration Patterns</title>
    <dc:date>2020-07-27T18:41:29+00:00</dc:date>
    <link>https://arxiv.org/abs/2002.10992</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["An emerging area of research is the study of macroscale migration patterns as a network of nodes that represent places (e.g., countries, cities, and rural areas) and edges that encode migration ties that connect those places. In this chapter, we first review advances in the study of migration networks and recent work that has employed network analysis to examine such networks at different geographical scales. In our discussion, we focus in particular on global scale migration networks. We then propose ways to leverage network analysis in concert with digital technologies and online geolocated data to examine the structure and dynamics of migration networks. The implementation of such approaches for studying migration networks faces many challenges, including ethical ones, methodological ones, socio-technological ones (e.g., data availability and reuse), and research reproducibility. We detail these challenges, and we then consider possible ways of linking digital geolocated data to administrative and survey data as a way of harnessing new technologies to construct increasingly realistic migration networks (e.g., using multiplex networks). We also briefly discuss new methods (e.g., multilayer network analysis) in network analysis and adjacent fields (e.g., machine learning) that can help advance understanding of macroscale patterns of migration."]]></description>
<dc:subject>to:NB migration demography network_data_analysis social_networks spatial_statistics statistics data_analysis porter.mason_a. to_teach:baby-nets</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:ceccf1807439/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:migration"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:demography"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:network_data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:social_networks"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:spatial_statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:porter.mason_a."/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:baby-nets"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://press.princeton.edu/books/hardcover/9780691172361/humanities-data-analysis">
    <title>Humanities Data Analysis | Princeton University Press</title>
    <dc:date>2020-07-07T16:10:21+00:00</dc:date>
    <link>https://press.princeton.edu/books/hardcover/9780691172361/humanities-data-analysis</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["The use of quantitative methods in the humanities and related social sciences has increased considerably in recent years, allowing researchers to discover patterns in a vast range of source materials. Despite this growth, there are few resources addressed to students and scholars who wish to take advantage of these powerful tools. Humanities Data Analysis offers the first intermediate-level guide to quantitative data analysis for humanities students and scholars using the Python programming language. This practical textbook, which assumes a basic knowledge of Python, teaches readers the necessary skills for conducting humanities research in the rapidly developing digital environment.
"The book begins with an overview of the place of data science in the humanities, and proceeds to cover data carpentry: the essential techniques for gathering, cleaning, representing, and transforming textual and tabular data. Then, drawing from real-world, publicly available data sets that cover a variety of scholarly domains, the book delves into detailed case studies. Focusing on textual data analysis, the authors explore such diverse topics as network analysis, genre theory, onomastics, literacy, author attribution, mapping, stylometry, topic modeling, and time series analysis. Exercises and resources for further reading are provided at the end of each chapter.
"An ideal resource for humanities students and scholars aiming to take their Python skills to the next level, Humanities Data Analysis illustrates the benefits that quantitative methods can bring to complex research questions."]]></description>
<dc:subject>to:NB books:noted data_analysis statistics riddell.allen digital_humanities books:suggest_to_library</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:bc8bd901ac59/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:books:noted"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:riddell.allen"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:digital_humanities"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:books:suggest_to_library"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2002.09770">
    <title>[2002.09770] Allotaxonometry and rank-turbulence divergence: A universal instrument for comparing complex systems</title>
    <dc:date>2020-04-30T15:28:13+00:00</dc:date>
    <link>https://arxiv.org/abs/2002.09770</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Complex systems often comprise many kinds of components which vary over many orders of magnitude in size: Populations of cities in countries, individual and corporate wealth in economies, species abundance in ecologies, word frequency in natural language, and node degree in complex networks. Comparisons of component size distributions for two complex systems---or a system with itself at two different time points---generally employ information-theoretic instruments, such as Jensen-Shannon divergence. We argue that these methods lack transparency and adjustability, and should not be applied when component probabilities are non-sensible or are problematic to estimate. Here, we introduce `allotaxonometry' along with `rank-turbulence divergence', a tunable instrument for comparing any two (Zipfian) ranked lists of components. We analytically develop our rank-based divergence in a series of steps, and then establish a rank-based allotaxonograph which pairs a map-like histogram for rank-rank pairs with an ordered list of components according to divergence contribution. We explore the performance of rank-turbulence divergence for a series of distinct settings including: Language use on Twitter and in books, species abundance, baby name popularity, market capitalization, performance in sports, mortality causes, and job titles. We provide a series of supplementary flipbooks which demonstrate the tunability and storytelling power of rank-based allotaxonometry."

--- I'll just note in passing that Spearman's rank correlation coefficient seems to be mentioned in exactly one paragraph, and they don't provide any comparison of what they can get with their (complicated, tunable) average of change in inverse ranks to what we could learn from a simple calculation of Spearman's correlation.  (I'm prepared to believe that there are insights here, but I'd want to be shown compelling examples.)  I'd also like a comparison to Handcock and Morrison's relative distributions [http://www.jstor.org/pss/270964].  The last tag applies with vehemence.]]></description>
<dc:subject>to:NB data_analysis comparison_of_distributions descriptive_statistics heavy_tails dodds.peter_sheridan via:rvenkat color_me_skeptical</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:990d61c2bca3/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:comparison_of_distributions"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:descriptive_statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:heavy_tails"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:dodds.peter_sheridan"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:via:rvenkat"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:color_me_skeptical"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://press.princeton.edu/books/hardcover/9780691182377/dark-data">
    <title>Dark Data | Princeton University Press</title>
    <dc:date>2020-03-12T14:57:50+00:00</dc:date>
    <link>https://press.princeton.edu/books/hardcover/9780691182377/dark-data</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["In the era of big data, it is easy to imagine that we have all the information we need to make good decisions. But in fact the data we have are never complete, and may be only the tip of the iceberg. Just as much of the universe is composed of dark matter, invisible to us but nonetheless present, the universe of information is full of dark data that we overlook at our peril. In Dark Data, data expert David Hand takes us on a fascinating and enlightening journey into the world of the data we don’t see.
"Dark Data explores the many ways in which we can be blind to missing data and how that can lead us to conclusions and actions that are mistaken, dangerous, or even disastrous. Examining a wealth of real-life examples, from the Challenger shuttle explosion to complex financial frauds, Hand gives us a practical taxonomy of the types of dark data that exist and the situations in which they can arise, so that we can learn to recognize and control for them. In doing so, he teaches us not only to be alert to the problems presented by the things we don’t know, but also shows how dark data can be used to our advantage, leading to greater understanding and better decisions."]]></description>
<dc:subject>to:NB books:noted data_analysis statistics missing_data hand.david_j. popular_social_science books:suggest_to_library</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:915c87a48e3c/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:books:noted"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:missing_data"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:hand.david_j."/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:popular_social_science"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:books:suggest_to_library"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://journals.aps.org/rmp/abstract/10.1103/RevModPhys.91.045002">
    <title>Rev. Mod. Phys. 91, 045002 (2019) - Machine learning and the physical sciences</title>
    <dc:date>2020-01-12T22:59:26+00:00</dc:date>
    <link>https://journals.aps.org/rmp/abstract/10.1103/RevModPhys.91.045002</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Machine learning (ML) encompasses a broad range of algorithms and modeling tools used for a vast array of data processing tasks, which has entered most scientific disciplines in recent years. This article reviews in a selective way the recent research on the interface between machine learning and the physical sciences. This includes conceptual developments in ML motivated by physical insights, applications of machine learning techniques to several domains in physics, and cross fertilization between the two fields. After giving a basic notion of machine learning methods and principles, examples are described of how statistical physics is used to understand methods in ML. This review then describes applications of ML methods in particle physics and cosmology, quantum many-body physics, quantum computing, and chemical and material physics. Research and development into novel computing architectures aimed at accelerating ML are also highlighted. Each of the sections describe recent successes as well as domain-specific methodology and challenges."]]></description>
<dc:subject>to:NB machine_learning data_mining physics data_analysis equations_of_motion_from_a_time_series</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:344ce2216288/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:machine_learning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:physics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:equations_of_motion_from_a_time_series"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1910.08707">
    <title>[1910.08707] Introduction to Coresets: Accurate Coresets</title>
    <dc:date>2019-10-22T13:32:56+00:00</dc:date>
    <link>https://arxiv.org/abs/1910.08707</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["A coreset (or core-set) of an input set is its small summation, such that solving a problem on the coreset as its input, provably yields the same result as solving the same problem on the original (full) set, for a given family of problems (models, classifiers, loss functions). Over the past decade, coreset construction algorithms have been suggested for many fundamental problems in e.g. machine/deep learning, computer vision, graphics, databases, and theoretical computer science. This introductory paper was written following requests from (usually non-expert, but also colleagues) regarding the many inconsistent coreset definitions, lack of available source code, the required deep theoretical background from different fields, and the dense papers that make it hard for beginners to apply coresets and develop new ones.
"The paper provides folklore, classic and simple results including step-by-step proofs and figures, for the simplest (accurate) coresets of very basic problems, such as: sum of vectors, minimum enclosing ball, SVD/ PCA and linear regression. Nevertheless, we did not find most of their constructions in the literature. Moreover, we expect that putting them together in a retrospective context would help the reader to grasp modern results that usually extend and generalize these fundamental observations. Experts might appreciate the unified notation and comparison table that links between existing results.
"Open source code with example scripts are provided for all the presented algorithms, to demonstrate their practical usage, and to support the readers who are more familiar with programming than math."]]></description>
<dc:subject>to:NB optimization computational_statistics statistics data_analysis</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:5f863cd94cc3/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:optimization"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:computational_statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1907.09864">
    <title>[1907.09864] Doubts on the efficacy of outliers correction methods</title>
    <dc:date>2019-10-02T15:49:58+00:00</dc:date>
    <link>https://arxiv.org/abs/1907.09864</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["While the utilisation of different methods of outliers correction has been shown to counteract the inferential error produced by the presence of contaminating data not belonging to the studied population; the effects produced by their utilisation when samples do not contain contaminating outliers are less clear. Here a simulation approach shows that the most popular methods of outliers correction (2 Sigma, 3 Sigma, MAD, IQR, Grubbs and winsorizing) worsen the inferential evaluation of the studied population in this condition, in particular producing an inflation of Type I error and increasing the error committed in estimating the population mean and STD. We show that those methods that have the highest efficacy in counteract the inflation of Type I and Type II errors in the presence of contaminating outliers also produce the stronger increase of false positive results in their absence, suggesting that the systematic utilisation of methods for outliers correction risk to produce more harmful than beneficial effect on statistical inference. We finally propose that the safest way to deal with the presence of outliers for statistical comparisons is the utilisation of non-parametric tests."]]></description>
<dc:subject>to:NB outliers anomaly_detection statistics data_analysis</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:adaa4665799a/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:outliers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:anomaly_detection"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://matthewlincoln.net/2015/03/21/confabulation-in-the-humanities.html">
    <title>Confabulation in the humanities - Matthew Lincoln, PhD</title>
    <dc:date>2019-08-15T19:51:55+00:00</dc:date>
    <link>http://matthewlincoln.net/2015/03/21/confabulation-in-the-humanities.html</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA[Now, realize that this doesn't _just_ apply to interpreting quantitative analyses, but also to more traditionally-humanistic explanations...]]></description>
<dc:subject>data_analysis humanities everything_is_obvious_once_you_know_the_answer to_teach via:? have_read</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:19823c3df5de/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:humanities"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:everything_is_obvious_once_you_know_the_answer"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:via:?"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:have_read"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1908.01014">
    <title>[1908.01014] A Survey on Compressive Sensing: Classical Results and Recent Advancements</title>
    <dc:date>2019-08-06T14:53:11+00:00</dc:date>
    <link>https://arxiv.org/abs/1908.01014</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Recovering sparse signals from linear measurements has demonstrated outstanding utility in a vast variety of real-world applications. Compressive sensing is the topic that studies the associated raised questions for the possibility of a successful recovery. This topic is well-nourished and numerous results are available in the literature. However, their dispersity makes it challenging and time-consuming for new readers and practitioners to quickly grasp its main ideas and classical algorithms, and further touch upon the recent advancements in this surging field. Besides, the sparsity notion has already demonstrated its effectiveness in many contemporary fields. Thus, these results are useful and inspiring for further investigation of related questions in these emerging fields from new perspectives. In this survey, we gather and overview vital classical tools and algorithms in compressive sensing and describe significant recent advancements. We conclude this survey by a numerical comparison of the performance of described approaches on an interesting application."]]></description>
<dc:subject>to:NB compressed_sensing optimization data_analysis</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:6316decf1639/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:compressed_sensing"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:optimization"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1904.02101">
    <title>[1904.02101] The Landscape of R Packages for Automated Exploratory Data Analysis</title>
    <dc:date>2019-08-01T13:44:55+00:00</dc:date>
    <link>https://arxiv.org/abs/1904.02101</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["The increasing availability of large but noisy data sets with a large number of heterogeneous variables leads to the increasing interest in the automation of common tasks for data analysis. The most time-consuming part of this process is the Exploratory Data Analysis, crucial for better domain understanding, data cleaning, data validation, and feature engineering.
"There is a growing number of libraries that attempt to automate some of the typical Exploratory Data Analysis tasks to make the search for new insights easier and faster. In this paper, we present a systematic review of existing tools for Automated Exploratory Data Analysis (autoEDA). We explore the features of twelve popular R packages to identify the parts of analysis that can be effectively automated with the current tools and to point out new directions for further autoEDA development."]]></description>
<dc:subject>to:NB R exploratory_data_analysis data_analysis statistics to_teach:data-mining</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:704bfbb81807/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:R"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:exploratory_data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://dx.doi.org/10.1037/h0071325">
    <title>Analysis of a complex of statistical variables into principal components.</title>
    <dc:date>2018-09-08T18:52:35+00:00</dc:date>
    <link>http://dx.doi.org/10.1037/h0071325</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["The problem is stated in detail, a method of analysis is derived and its geometrical meaning shown, methods of solution are illustrated and certain derivative problems are discussed. (To be concluded in October issue.) "

--- In which Harold Hotelling re-invents principal components analysis, 32 years after Karl Pearson.  (Part 2: http://dx.doi.org/10.1037/h0070888)]]></description>
<dc:subject>to:NB have_read principal_components data_analysis hotelling.harold re:ADAfaEPoV</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:2e89a74cb6f8/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:have_read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:principal_components"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:hotelling.harold"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:re:ADAfaEPoV"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://www.tandfonline.com/doi/abs/10.1080/14786440109462720">
    <title>On lines and planes of closest fit to systems of points in space (K. Pearson, 1901)</title>
    <dc:date>2018-09-08T18:49:55+00:00</dc:date>
    <link>https://www.tandfonline.com/doi/abs/10.1080/14786440109462720</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA[In which Karl Pearson invents principal components analysis, with the entirely sensible objective of finding low-dimensional approximations to high-dimensional data.  (i.e., basically the way I teach it!)]]></description>
<dc:subject>to:NB principal_components data_analysis pearson.karl re:ADAfaEPoV have_read</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:79b5a2fdcf75/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:principal_components"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:pearson.karl"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:re:ADAfaEPoV"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:have_read"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://www.pnas.org/content/115/19/4891">
    <title>Sporadic sampling, not climatic forcing, drives observed early hominin diversity | PNAS</title>
    <dc:date>2018-05-09T15:03:33+00:00</dc:date>
    <link>http://www.pnas.org/content/115/19/4891</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["The role of climate change in the origin and diversification of early hominins is hotly debated. Most accounts of early hominin evolution link observed fluctuations in species diversity to directional shifts in climate or periods of intense climatic instability. None of these hypotheses, however, have tested whether observed diversity patterns are distorted by variation in the quality of the hominin fossil record. Here, we present a detailed examination of early hominin diversity dynamics, including both taxic and phylogenetically corrected diversity estimates. Unlike past studies, we compare these estimates to sampling metrics for rock availability (hominin-, primate-, and mammal-bearing formations) and collection effort, to assess the geological and anthropogenic controls on the sampling of the early hominin fossil record. Taxic diversity, primate-bearing formations, and collection effort show strong positive correlations, demonstrating that observed patterns of early hominin taxic diversity can be explained by temporal heterogeneity in fossil sampling rather than genuine evolutionary processes. Peak taxic diversity at 1.9 million years ago (Ma) is a sampling artifact, reflecting merely maximal rock availability and collection effort. In contrast, phylogenetic diversity estimates imply peak diversity at 2.4 Ma and show little relation to sampling metrics. We find that apparent relationships between early hominin diversity and indicators of climatic instability are, in fact, driven largely by variation in suitable rock exposure and collection effort. Our results suggest that significant improvements in the quality of the fossil record are required before the role of climate in hominin evolution can be reliably determined."]]></description>
<dc:subject>to:NB human_evolution paleontology data_analysis</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:830db5a83687/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:human_evolution"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:paleontology"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://www.pnas.org/content/115/10/2317">
    <title>From global scaling to the dynamics of individual cities | PNAS</title>
    <dc:date>2018-05-08T13:22:04+00:00</dc:date>
    <link>http://www.pnas.org/content/115/10/2317</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Scaling has been proposed as a powerful tool to analyze the properties of complex systems and in particular for cities where it describes how various properties change with population. The empirical study of scaling on a wide range of urban datasets displays apparent nonlinear behaviors whose statistical validity and meaning were recently the focus of many debates. We discuss here another aspect, which is the implication of such scaling forms on individual cities and how they can be used for predicting the behavior of a city when its population changes. We illustrate this discussion in the case of delay due to traffic congestion with a dataset of 101 US cities in the years 1982–2014. We show that the scaling form obtained by agglomerating all of the available data for different cities and for different years does display a nonlinear behavior, but which appears to be unrelated to the dynamics of individual cities when their population grows. In other words, the congestion-induced delay in a given city does not depend on its population only, but also on its previous history. This strong path dependency prohibits the existence of a simple scaling form valid for all cities and shows that we cannot always agglomerate the data for many different systems. More generally, these results also challenge the use of transversal data for understanding longitudinal series for cities."]]></description>
<dc:subject>to:NB cities data_analysis re:urban_scaling_what_urban_scaling</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:41c2b412804d/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:cities"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:re:urban_scaling_what_urban_scaling"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1802.03426">
    <title>[1802.03426] UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction</title>
    <dc:date>2018-03-11T18:56:42+00:00</dc:date>
    <link>https://arxiv.org/abs/1802.03426</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["UMAP (Uniform Manifold Approximation and Projection) is a novel manifold learning technique for dimension reduction. UMAP is constructed from a theoretical framework based in Riemannian geometry and algebraic topology. The result is a practical scalable algorithm that applies to real world data. The UMAP algorithm is competitive with t-SNE for visualization quality, and arguably preserves more of the global structure with superior run time performance. Furthermore, UMAP as described has no computational restrictions on embedding dimension, making it viable as a general purpose dimension reduction technique for machine learning"]]></description>
<dc:subject>to:NB via:vaguery manifold_learning dimension_reduction data_analysis data_mining to_teach:data-mining re:ADAfaEPoV</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:b2108ad9d881/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:via:vaguery"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:manifold_learning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:dimension_reduction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:re:ADAfaEPoV"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://press.princeton.edu/titles/11218.html">
    <title>Muller, J.Z.: The Tyranny of Metrics | Princeton University Press</title>
    <dc:date>2018-02-26T01:44:24+00:00</dc:date>
    <link>https://press.princeton.edu/titles/11218.html</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["How the obsession with quantifying human performance threatens our schools, medical care, businesses, and government
"Today, organizations of all kinds are ruled by the belief that the path to success is quantifying human performance, publicizing the results, and dividing up the rewards based on the numbers. But in our zeal to instill the evaluation process with scientific rigor, we've gone from measuring performance to fixating on measuring itself. The result is a tyranny of metrics that threatens the quality of our lives and most important institutions. In this timely and powerful book, Jerry Muller uncovers the damage our obsession with metrics is causing--and shows how we can begin to fix the problem.
"Filled with examples from education, medicine, business and finance, government, the police and military, and philanthropy and foreign aid, this brief and accessible book explains why the seemingly irresistible pressure to quantify performance distorts and distracts, whether by encouraging "gaming the stats" or "teaching to the test." That's because what can and does get measured is not always worth measuring, may not be what we really want to know, and may draw effort away from the things we care about. Along the way, we learn why paying for measured performance doesn't work, why surgical scorecards may increase deaths, and much more. But metrics can be good when used as a complement to—rather than a replacement for—judgment based on personal experience, and Muller also gives examples of when metrics have been beneficial."

--- Muller is a conservative, and apparently here engaged in their centuries-old struggle against "sophists, economists and calculators".  But he's a smart, sane conservative...]]></description>
<dc:subject>in_NB social_measurement books:noted management statistics data_analysis muller.jerry books:owned</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:b1e73f6061a8/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:in_NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:social_measurement"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:books:noted"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:management"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:muller.jerry"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:books:owned"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://lukeoakdenrayner.wordpress.com/2017/12/18/the-chestxray14-dataset-problems/">
    <title>Exploring the ChestXray14 dataset: problems – Luke Oakden-Rayner</title>
    <dc:date>2018-01-30T17:25:50+00:00</dc:date>
    <link>https://lukeoakdenrayner.wordpress.com/2017/12/18/the-chestxray14-dataset-problems/</link>
    <dc:creator>cshalizi</dc:creator><dc:subject>have_read data_analysis data_sets medicine classifiers statistics via:tslumley spatial_statistics</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:63604ef7bed5/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:have_read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_sets"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:medicine"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:classifiers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:via:tslumley"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:spatial_statistics"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://www.nytimes.com/2017/10/18/magazine/when-the-revolution-came-for-amy-cuddy.html">
    <title>When the Revolution Came for Amy Cuddy - The New York Times</title>
    <dc:date>2018-01-30T17:09:54+00:00</dc:date>
    <link>https://www.nytimes.com/2017/10/18/magazine/when-the-revolution-came-for-amy-cuddy.html</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA[Morals under the "to teach" tag:
1. Don't do science like this.
2. Don't be a jerk when criticizing others for doing bad science.
(I realize that I am one to talk about #2.)]]></description>
<dc:subject>have_read social_science_methodology social_psychology psychology replication_crisis gelman.andrew popular_social_science data_analysis to_teach:undergrad-research</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:65a7d3a063da/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:have_read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:social_science_methodology"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:social_psychology"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:psychology"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:replication_crisis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:gelman.andrew"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:popular_social_science"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:undergrad-research"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://www.nap.edu/catalog/24893/federal-statistics-multiple-data-sources-and-privacy-protection-next-steps?mc_cid=a967e2da8c&amp;mc_eid=1fdc781427">
    <title>Federal Statistics, Multiple Data Sources, and Privacy Protection: Next Steps | The National Academies Press</title>
    <dc:date>2018-01-06T02:18:53+00:00</dc:date>
    <link>https://www.nap.edu/catalog/24893/federal-statistics-multiple-data-sources-and-privacy-protection-next-steps?mc_cid=a967e2da8c&amp;mc_eid=1fdc781427</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["The environment for obtaining information and providing statistical data for policy makers and the public has changed significantly in the past decade, raising questions about the fundamental survey paradigm that underlies federal statistics. New data sources provide opportunities to develop a new paradigm that can improve timeliness, geographic or subpopulation detail, and statistical efficiency. It also has the potential to reduce the costs of producing federal statistics.
"The panel's first report described federal statistical agencies’ current paradigm, which relies heavily on sample surveys for producing national statistics, and challenges agencies are facing; the legal frameworks and mechanisms for protecting the privacy and confidentiality of statistical data and for providing researchers access to data, and challenges to those frameworks and mechanisms; and statistical agencies access to alternative sources of data. The panel recommended a new approach for federal statistical programs that would combine diverse data sources from government and private sector sources and the creation of a new entity that would provide the foundational elements needed for this new approach, including legal authority to access data and protect privacy.
"This second of the panel's two reports builds on the analysis, conclusions, and recommendations in the first one. This report assesses alternative methods for implementing a new approach that would combine diverse data sources from government and private sector sources, including describing statistical models for combining data from multiple sources; examining statistical and computer science approaches that foster privacy protections; evaluating frameworks for assessing the quality and utility of alternative data sources; and various models for implementing the recommended new entity. Together, the two reports offer ideas and recommendations to help federal statistical agencies examine and evaluate data from alternative sources and then combine them as appropriate to provide the country with more timely, actionable, and useful information for policy makers, businesses, and individuals."]]></description>
<dc:subject>to:NB books:noted to_read data_analysis data_collection social_measurement statistics record_linkage privacy</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:c351379d5f39/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:books:noted"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_collection"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:social_measurement"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:record_linkage"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:privacy"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://mitpress.mit.edu/books/artificial-unintelligence">
    <title>Artificial Unintelligence: How Computers Misunderstand the World | The MIT Press</title>
    <dc:date>2017-12-17T18:03:20+00:00</dc:date>
    <link>https://mitpress.mit.edu/books/artificial-unintelligence</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["In Artificial Unintelligence, Meredith Broussard argues that our collective enthusiasm for applying computer technology to every aspect of life has resulted in a tremendous amount of poorly designed systems. We are so eager to do everything digitally—hiring, driving, paying bills, even choosing romantic partners—that we have stopped demanding that our technology actually work. Broussard, a software developer and journalist, reminds us that there are fundamental limits to what we can (and should) do with technology. With this book, she offers a guide to understanding the inner workings and outer limits of technology—and issues a warning that we should never assume that computers always get things right.
"Making a case against technochauvinism—the belief that technology is always the solution—Broussard argues that it’s just not true that social problems would inevitably retreat before a digitally enabled Utopia. To prove her point, she undertakes a series of adventures in computer programming. She goes for an alarming ride in a driverless car, concluding “the cyborg future is not coming any time soon”; uses artificial intelligence to investigate why students can’t pass standardized tests; deploys machine learning to predict which passengers survived the Titanic disaster; and attempts to repair the U.S. campaign finance system by building AI software. If we understand the limits of what we can do with technology, Broussard tells us, we can make better choices about what we should do with it to make the world better for everyone."]]></description>
<dc:subject>books:noted machine_learning artificial_intelligence computers data_analysis to_teach:data-mining from_library in_NB</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:01d8a9f74ae2/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:books:noted"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:machine_learning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:artificial_intelligence"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:computers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:from_library"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:in_NB"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://papers.nips.cc/paper/7213-poincare-embeddings-for-learning-hierarchical-representations">
    <title>Poincaré Embeddings for Learning Hierarchical Representations</title>
    <dc:date>2017-11-24T18:34:20+00:00</dc:date>
    <link>http://papers.nips.cc/paper/7213-poincare-embeddings-for-learning-hierarchical-representations</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Representation learning has become an invaluable approach for learning from symbolic data such as text and graphs. However, while complex symbolic datasets often exhibit a latent hierarchical structure, state-of-the-art methods typically learn embeddings in Euclidean vector spaces, which do not account for this property. For this purpose, we introduce a new approach for learning hierarchical representations of symbolic data by embedding them into hyperbolic space -- or more precisely into an n-dimensional Poincaré ball. Due to the underlying hyperbolic geometry, this allows us to learn parsimonious representations of symbolic data by simultaneously capturing hierarchy and similarity. We introduce an efficient algorithm to learn the embeddings based on Riemannian optimization and show experimentally that Poincaré embeddings outperform Euclidean embeddings significantly on data with latent hierarchies, both in terms of representation capacity and in terms of generalization ability."]]></description>
<dc:subject>to:NB hyperbolic_geometry dimension_reduction hierarchical_structure statistics data_analysis</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:c2b04267ee36/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:hyperbolic_geometry"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:dimension_reduction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:hierarchical_structure"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://statweb.stanford.edu/~donoho/Lectures/AMS2000/Curses.pdf">
    <title>High-Dimensional Data Analysis: The Curses and Blessings of Dimensionality (Donoho)</title>
    <dc:date>2017-08-08T21:10:10+00:00</dc:date>
    <link>http://statweb.stanford.edu/~donoho/Lectures/AMS2000/Curses.pdf</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["The coming century is surely the century of data. A combination of blind faith and
serious purpose makes our society invest massively in the collection and processing of data of all kinds, on scales unimaginable until recently. Hyperspectral Imagery, Internet Portals, Financial tick-by-tick data, and DNA Microarrays are just a few of the better-known sources, feeding data in torrential streams into scientific and business databases worldwide.
"In traditional statistical data analysis, we think of observations of instances of particular phenomena (e.g. instance ↔ human being), these observations being a vector of values we measured on several variables (e.g. blood pressure, weight, height, ...). In traditional statistical methodology, we assumed many observations and a few, well-chosen variables. The trend today is towards more observations but even more so, to radically larger numbers of variables – voracious, automatic, systematic collection of hyper-informative detail about each observed instance. We are seeing examples where the observations gathered on individual instances are curves, or spectra, or images, or even movies, so that a single observation has dimensions in the thousands or billions, while there are only tens or hundreds of instances available for study. Classical methods are simply not designed to cope with this kind of explosive growth of dimensionality of the observation vector. We can say with complete confidence that in the coming century, high-dimensional data analysis will be a very significant activity, and completely new methods of high-dimensional data analysis will be developed; we just don’t know what they are yet.
"Mathematicians are ideally prepared for appreciating the abstract issues involved
in finding patterns in such high-dimensional data. Two of the most influential principles in the coming century will be principles originally discovered and cultivated by mathematicians: the blessings of dimensionality and the curse of dimensionality.
"The curse of dimensionality is a phrase used by several subfields in the mathematical sciences; I use it here to refer to the apparent intractability of systematically searching through a high-dimensional space, the apparent intractability of accurately approximating a general high-dimensional function, the apparent intractability of integrating a high-dimensional function.
"The blessings of dimensionality are less widely noted, but they include the concentration of measure phenomenon (so-called in the geometry of Banach spaces), which means that certain random fluctuations are very well controlled in high dimensions and the success of asymptotic methods, used widely in mathematical statistics and statistical physics, which suggest that statements about very high-dimensional settings may be made where moderate dimensions would be too complicated.
"There is a large body of interesting work going on in the mathematical sciences,
both to attack the curse of dimensionality in specific ways, and to extend the benefits of dimensionality. I will mention work in high-dimensional approximation theory, in probability theory, and in mathematical statistics. I expect to see in the coming decades many further mathematical elaborations to our inventory of Blessings and Curses, and I expect such contributions to have a broad impact on society’s ability to extract meaning from the massive datasets it has decided to compile.
"At the end of my talk, I will also draw on my personal research experiences. This
suggest to me (1) ongoing developments in high-dimensional data analysis may lead mathematicians to study new problems in for example harmonic analysis; and (2) that many of the problems of low dimensional data analysis are unsolved and are similar to problems in harmonic analysis which have only recently been attacked, and for which only the merest beginnings have been made. Both fields can progress together."]]></description>
<dc:subject>to:NB to_read data_analysis statistics mathematics high-dimensional_statistics high-dimensional_probability donoho.david</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:be07d6c1000e/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:mathematics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:high-dimensional_statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:high-dimensional_probability"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:donoho.david"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://www.ucpress.edu/book.php?isbn=9780520289949&amp;mc_cid=3d6dd98934&amp;mc_eid=3d88be13af">
    <title>Principles of Data Management and Presentation - John P. Hoffmann - Paperback - University of California Press</title>
    <dc:date>2017-08-08T16:42:33+00:00</dc:date>
    <link>http://www.ucpress.edu/book.php?isbn=9780520289949&amp;mc_cid=3d6dd98934&amp;mc_eid=3d88be13af</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["The world is saturated with data. We are regularly presented with data in words, tables, and graphics. Students from many academic fields are now expected to be educated about data in one form or another. Yet the typical sequence of courses—introductory statistics and research methods—does not provide sufficient information about how to focus in on a research question, how to access data and work with datasets, or how to present data to various audiences.
"Principles of Data Management and Presentation addresses this gap. Assuming only that students have some familiarity with basic statistics and research methods, it provides a comprehensive set of principles for understanding and using data as part of a research project, including:
"• how to narrow a research topic to a specific research question
"• how to access and organize data that are useful for answering a research question
"• how to use software such as Stata, SPSS, and SAS to manage data
"• how to present data so that they convey a clear and effective message
 "A companion website includes material to enhance the learning experience—specifically statistical software code and the datasets used in the examples, in text format as well as Stata, SPSS, and SAS formats. "

--- The appearance of a radar plot on the cover is not a good sign, but ...]]></description>
<dc:subject>to:NB books:noted data_analysis to_teach</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:663fbd009f26/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:books:noted"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://press.uchicago.edu/ucp/books/book/chicago/E/bo26176063">
    <title>Evidence, Becker</title>
    <dc:date>2017-08-03T15:33:31+00:00</dc:date>
    <link>http://press.uchicago.edu/ucp/books/book/chicago/E/bo26176063</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Becker has for seventy years been mulling over the problem of evidence. He argues that social scientists don’t take questions about the usefulness of their data as evidence for their ideas seriously enough. For example, researchers have long used the occupation of a person’s father as evidence of the family’s social class, but studies have shown this to be a flawed measure—for one thing, a lot of people answer that question too vaguely to make the reasoning plausible. The book is filled with examples like this, and Becker uses them to expose a series of errors, suggesting ways to avoid them, or even to turn them into research topics in their own right. He argues strongly that because no data-gathering method produces totally reliable information, a big part of the research job consists of getting rid of error. Readers will find Becker’s newest guidebook a valuable tool, useful for social scientists of every variety."

Review: http://bactra.org/reviews/becker-evidence.html]]></description>
<dc:subject>social_science_methodology data_analysis methodological_advice data_cleaning to_teach:undergrad-research in_library in_NB books:recommended books:reviewed</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:1b08eb48caae/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:social_science_methodology"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:methodological_advice"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_cleaning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:undergrad-research"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:in_library"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:in_NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:books:recommended"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:books:reviewed"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://www.springer.com/us/book/9783709107409">
    <title>Network Analysis Literacy - A Practical Approach to the | Katharina A. Zweig | Springer</title>
    <dc:date>2016-11-02T14:20:16+00:00</dc:date>
    <link>http://www.springer.com/us/book/9783709107409</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["This book presents a perspective of network analysis as a tool to find and quantify significant structures in the interaction patterns between different types of  entities. Moreover, network analysis provides the basic means to relate these structures to properties of the entities. It has proven itself to be useful for the analysis of biological and social networks, but also for networks describing complex systems in economy, psychology, geography, and various other fields. Today, network analysis packages in the open-source platform R and other open-source software projects enable scientists from all fields to quickly apply network analytic methods to their data sets. Altogether, these applications offer such a wealth of network analytic methods that it can be overwhelming for someone just entering this field. This book provides a road map through this jungle of network analytic methods, offers advice on how to pick the best method for a given network analytic project, and how to avoid common pitfalls. It introduces the methods which are most often used to analyze complex networks, e.g., different global network measures, types of random graph models, centrality indices, and networks motifs. In addition to introducing these methods, the central focus is on network analysis literacy – the competence to decide when to use which of these methods for which type of question. Furthermore, the book intends to increase the reader's competence to read original literature on network analysis by providing a glossary and intensive translation of formal notation and mathematical symbols in everyday speech. Different aspects of network analysis literacy – understanding formal definitions, programming tasks, or the analysis of structural measures and their interpretation – are deepened in various exercises with provided solutions. 
This text is an excellent, if not the best starting point for all scientists who want to harness the power of network analysis for their field of expertise."]]></description>
<dc:subject>to:NB books:noted network_data_analysis statistics social_networks data_analysis to_teach:baby-nets in_library</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:14ed434c6586/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:books:noted"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:network_data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:social_networks"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:baby-nets"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:in_library"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://www.nap.edu/catalog/23616/refining-the-concept-of-scientific-interface-when-working-with-big-data">
    <title>Refining the Concept of Scientific Interface When Working with Big Data: Proceedings of a Workshop—in Brief | The National Academies Press</title>
    <dc:date>2016-09-01T14:49:55+00:00</dc:date>
    <link>http://www.nap.edu/catalog/23616/refining-the-concept-of-scientific-interface-when-working-with-big-data</link>
    <dc:creator>cshalizi</dc:creator><dc:subject>statistics computational_statistics data_mining data_analysis philosophy_of_science kith_and_kin self-promotion</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:8ecab3d28e46/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:computational_statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:philosophy_of_science"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:kith_and_kin"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:self-promotion"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://shop.oreilly.com/product/0636920049081.do">
    <title>Mastering Feature Engineering - O'Reilly Media</title>
    <dc:date>2016-06-24T19:19:31+00:00</dc:date>
    <link>http://shop.oreilly.com/product/0636920049081.do</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Feature engineering is essential to applied machine learning, but using domain knowledge to strengthen your predictive models can be difficult and expensive. To help fill the information gap on feature engineering, this complete hands-on guide teaches beginning-to-intermediate data scientists how to work with this widely practiced but little discussed topic.
"Author Alice Zheng explains common practices and mathematical principles to help engineer features for new data and tasks. If you understand basic machine learning concepts like supervised and unsupervised learning, you’re ready to get started. Not only will you learn how to implement feature engineering in a systematic and principled way, you’ll also learn how to practice better data science."]]></description>
<dc:subject>to:NB books:noted data_mining statistics data_analysis to_teach:data-mining kith_and_kin zheng.alice</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:9ef2370887f9/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:books:noted"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:kith_and_kin"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:zheng.alice"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://github.com/Quartz/bad-data-guide">
    <title>Quartz/bad-data-guide: An exhaustive reference to problems seen in real-world data along with suggestions on how to resolve them.</title>
    <dc:date>2016-05-02T20:03:53+00:00</dc:date>
    <link>https://github.com/Quartz/bad-data-guide</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA[This is pretty good (and not limited to "data journalism").]]></description>
<dc:subject>data_analysis to_teach:undergrad-ADA to_teach:undergrad-research have_read via:?</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:a215b6fbe9ee/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:undergrad-ADA"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:undergrad-research"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:have_read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:via:?"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://www.jacobinmag.com/2014/10/data-journalism-done-wrong/">
    <title>Data Journalism Done Wrong | Jacobin</title>
    <dc:date>2014-11-10T01:54:35+00:00</dc:date>
    <link>https://www.jacobinmag.com/2014/10/data-journalism-done-wrong/</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA[Well, yes.

(Now, to be fair, it might be that while there is little connection between low-end wage rates and the number of workers at _McDonalds_ restaurants across countries, there might still be a negative relationship between that wage rate and _total_ employment at such restaurants, or even more general between that wage rate and employment at the low end of the occupational spectrum.  But you couldn't use this to defend Yglesias's original post, since it didn't offer any such theses or evidence.)]]></description>
<dc:subject>why_oh_why_cant_we_have_a_better_press_corps evisceration data_analysis economics minimum_wage</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:47de1c06aa02/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:why_oh_why_cant_we_have_a_better_press_corps"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:evisceration"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:economics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:minimum_wage"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://www.powells.com/biblio/62-9780822357445-1">
    <title>Beautiful Data: A History of Vision and Reason Since 1945 (Experimental Futures) by Orit Halpern - Powell's Books</title>
    <dc:date>2014-10-30T00:28:32+00:00</dc:date>
    <link>http://www.powells.com/biblio/62-9780822357445-1</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Beautiful Data is both a history of big data and interactivity, and a sophisticated meditation on ideas about vision and cognition in the second half of the twentieth century. Contending that our forms of attention, observation, and truth are contingent and contested, Orit Halpern historicizes the ways that we are trained, and train ourselves, to observe and analyze the world. Tracing the postwar impact of cybernetics and the communication sciences on the social and human sciences, design, arts, and urban planning, she finds a radical shift in attitudes toward recording and displaying information. These changed attitudes produced what she calls communicative objectivity: new forms of observation, rationality, and economy based on the management and analysis of data. Halpern complicates assumptions about the value of data and visualization, arguing that changes in how we manage and train perception, and define reason and intelligence, are also transformations in governmentality. She also challenges the paradoxical belief that we are experiencing a crisis of attention caused by digital media, a crisis that can be resolved only through intensified media consumption."]]></description>
<dc:subject>books:noted visual_display_of_quantitative_information data_analysis history_of_ideas in_NB</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:57f638cc0e1c/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:books:noted"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:visual_display_of_quantitative_information"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:history_of_ideas"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:in_NB"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://medium.com/@mrtz/how-big-data-is-unfair-9aa544d739de">
    <title>How big data is unfair — Medium</title>
    <dc:date>2014-10-10T19:21:02+00:00</dc:date>
    <link>https://medium.com/@mrtz/how-big-data-is-unfair-9aa544d739de</link>
    <dc:creator>cshalizi</dc:creator><dc:subject>data_mining data_analysis evidence_based public_policy discrimination to:blog to_teach:data-mining algorithmic_fairness to_teach:statistics_of_inequality_and_discrimination</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:d7b446455683/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:evidence_based"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:public_policy"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:discrimination"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:blog"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:algorithmic_fairness"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:statistics_of_inequality_and_discrimination"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://datacolada.org/2014/05/01/20-we-cannot-afford-to-study-effect-size-in-the-lab/">
    <title>Data Colada | [20] We cannot afford to study effect size in the lab</title>
    <dc:date>2014-09-18T01:05:32+00:00</dc:date>
    <link>http://datacolada.org/2014/05/01/20-we-cannot-afford-to-study-effect-size-in-the-lab/</link>
    <dc:creator>cshalizi</dc:creator><dc:subject>statistics data_analysis psychology experimental_psychology estimation to_teach</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:422078f0deba/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:psychology"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:experimental_psychology"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:estimation"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://www.nicebread.de/a-comment-on-we-cannot-afford-to-study-effect-size-in-the-lab-from-the-datacolada-blog/">
    <title>Felix Schönbrodt's website</title>
    <dc:date>2014-09-18T01:05:19+00:00</dc:date>
    <link>http://www.nicebread.de/a-comment-on-we-cannot-afford-to-study-effect-size-in-the-lab-from-the-datacolada-blog/</link>
    <dc:creator>cshalizi</dc:creator><dc:subject>statistics data_analysis psychology experimental_psychology estimation to_teach</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:7272d0a3931b/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:psychology"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:experimental_psychology"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:estimation"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://www.niemanlab.org/2014/07/alberto-cairo-data-journalism-needs-to-up-its-own-standards">
    <title>Alberto Cairo: Data journalism needs to up its own standards » Nieman Journalism Lab</title>
    <dc:date>2014-09-13T03:17:30+00:00</dc:date>
    <link>http://www.niemanlab.org/2014/07/alberto-cairo-data-journalism-needs-to-up-its-own-standards</link>
    <dc:creator>cshalizi</dc:creator><dc:subject>have_read why_oh_why_cant_we_have_a_better_press_corps journalism data_analysis</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:47079008d302/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:have_read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:why_oh_why_cant_we_have_a_better_press_corps"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:journalism"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://www.slate.com/articles/technology/bitwise/2014/08/what_is_big_data_good_for_incremental_change_not_big_paradigm_shifts.html?wpsrc=sh_all_dt_tw_top">
    <title>What is big data good for? Incremental change, not big paradigm shifts.</title>
    <dc:date>2014-08-08T01:54:11+00:00</dc:date>
    <link>http://www.slate.com/articles/technology/bitwise/2014/08/what_is_big_data_good_for_incremental_change_not_big_paradigm_shifts.html?wpsrc=sh_all_dt_tw_top</link>
    <dc:creator>cshalizi</dc:creator><dc:subject>data_mining data_analysis auerbach.david to_teach:data-mining have_read</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:c8dd922d78de/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:auerbach.david"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:have_read"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://www.slate.com/articles/technology/bitwise/2014/07/facebook_okcupid_user_experiments_ethics_aside_they_show_us_the_limitations.html?wpsrc=sh_all_dt_tw_top">
    <title>Facebook, OkCupid user experiments: Ethics aside, they show us the limitations of big data.</title>
    <dc:date>2014-08-01T03:14:10+00:00</dc:date>
    <link>http://www.slate.com/articles/technology/bitwise/2014/07/facebook_okcupid_user_experiments_ethics_aside_they_show_us_the_limitations.html?wpsrc=sh_all_dt_tw_top</link>
    <dc:creator>cshalizi</dc:creator><dc:subject>data_mining data_analysis computational_statistics debunking auerbach.david to:blog</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:257f8780b3bd/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:computational_statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:debunking"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:auerbach.david"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:blog"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://sss.sagepub.com/content/44/4/555.abstract?etoc">
    <title>Multivariate statistics and the enactment of metabolic complexity</title>
    <dc:date>2014-07-29T14:59:49+00:00</dc:date>
    <link>http://sss.sagepub.com/content/44/4/555.abstract?etoc</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["This ethnographic study, based on fieldwork at the Computational and Systems Medicine laboratory at Imperial College London, shows how researchers in the field of metabolomics – the post-genomic study of the molecules and processes that make up metabolism – enact and coproduce complex views of biology with multivariate statistics. From this data-driven science, metabolism emerges as a multiple, informational and statistical object, which is both produced by and also necessitates particular forms of data production and analysis. Multivariate statistics emerge as ‘natural’ and ‘correct’ ways of engaging with a metabolism that is made up of many variables. In this sense, multivariate statistics allow researchers to engage with and conceptualize metabolism, and also disease and processes of life, as complex entities. Consequently, this article builds on studies of scientific practice and visualization to examine data as material objects rather than black-boxed representations. Data practices are not merely the technological components of experimentation, but are simultaneously technologies and methods and are intertwined with ways of seeing and enacting the biological world. Ultimately, this article questions the increasing invocation and role of complexity within biology, suggesting that discourses of complexity are often imbued with reductionist and determinist ways of thinking about biology, as scientists engage with complexity in calculated and controlled, but also limited, ways."]]></description>
<dc:subject>to:NB to_read ethnography science_as_a_social_process biochemical_networks biology statistics complexity data_analysis</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:944c59a7a680/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ethnography"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:science_as_a_social_process"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:biochemical_networks"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:biology"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:complexity"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://www.nytimes.com/2014/05/02/upshot/how-not-to-be-misled-by-the-jobs-report.html">
    <title>How Not to Be Misled by the Jobs Report - NYTimes.com</title>
    <dc:date>2014-07-12T01:50:05+00:00</dc:date>
    <link>http://www.nytimes.com/2014/05/02/upshot/how-not-to-be-misled-by-the-jobs-report.html</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA[This is an awesome demonstration.]]></description>
<dc:subject>statistics data_analysis via:civilstat to_teach why_oh_why_cant_we_have_a_better_press_corps to:blog</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:51b39a535303/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:via:civilstat"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:why_oh_why_cant_we_have_a_better_press_corps"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:blog"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1406.0873">
    <title>[1406.0873] Unifying linear dimensionality reduction</title>
    <dc:date>2014-07-12T00:26:03+00:00</dc:date>
    <link>http://arxiv.org/abs/1406.0873</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Linear dimensionality reduction methods are a cornerstone of analyzing high dimensional data, due to their simple geometric interpretations and typically attractive computational properties. These methods capture many data features of interest, such as covariance, dynamical structure, correlation between data sets, input-output relationships, and margin between data classes. Methods have been developed with a variety of names and motivations in many fields, and perhaps as a result the deeper connections between all these methods have not been understood. Here we unify methods from this disparate literature as optimization programs over matrix manifolds. We discuss principal component analysis, factor analysis, linear multidimensional scaling, Fisher's linear discriminant analysis, canonical correlations analysis, maximum autocorrelation factors, slow feature analysis, undercomplete independent component analysis, linear regression, and more. This optimization framework helps elucidate some rarely discussed shortcomings of well-known methods, such as the suboptimality of certain eigenvector solutions. Modern techniques for optimization over matrix manifolds enable a generic linear dimensionality reduction solver, which accepts as input data and an objective to be optimized, and returns, as output, an optimal low-dimensional projection of the data. This optimization framework further allows rapid development of novel variants of classical methods, which we demonstrate here by creating an orthogonal-projection canonical correlations analysis. More broadly, we suggest that our generic linear dimensionality reduction solver can move linear dimensionality reduction toward becoming a blackbox, objective-agnostic numerical technology."]]></description>
<dc:subject>data_analysis principal_components factor_analysis optimization statistics dimension_reduction ghahramani.zoubin in_NB</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:6bae21f8af20/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:principal_components"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:factor_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:optimization"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:dimension_reduction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ghahramani.zoubin"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:in_NB"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://www.springer.com/statistics/computational+statistics/book/978-3-319-03163-7">
    <title>Text Analysis with R for Students of Literature</title>
    <dc:date>2014-07-08T14:10:00+00:00</dc:date>
    <link>http://www.springer.com/statistics/computational+statistics/book/978-3-319-03163-7</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Text Analysis with R for Students of Literature is written with students and scholars of literature in mind but will be applicable to other humanists and social scientists wishing to extend their methodological tool kit to include quantitative and computational approaches to the study of text. Computation provides access to information in text that we simply cannot gather using traditional qualitative methods of close reading and human synthesis. Text Analysis with R for Students of Literature provides a practical introduction to computational text analysis using the open source programming language R. R is extremely popular throughout the sciences and because of its accessibility, R is now used increasingly in other research areas. Readers begin working with text right away and each chapter works through a new technique or process such that readers gain a broad exposure to core R procedures and a basic understanding of the possibilities of computational text analysis at both the micro and macro scale. Each chapter builds on the previous as readers move from small scale “microanalysis” of single texts to large scale “macroanalysis” of text corpora, and each chapter concludes with a set of practice exercises that reinforce and expand upon the chapter lessons. The book’s focus is on making the technical palatable and making the technical useful and immediately gratifying."]]></description>
<dc:subject>to:NB books:noted data_analysis R text_mining humanities electronic_copy books:owned</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:8410e555cf39/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:books:noted"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:R"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:text_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:humanities"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:electronic_copy"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:books:owned"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://www.jacobinmag.com/2014/06/bro-bash/">
    <title>Bro Bash | Jacobin</title>
    <dc:date>2014-06-09T17:22:09+00:00</dc:date>
    <link>https://www.jacobinmag.com/2014/06/bro-bash/</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["There’s nothing feminist about leaving numbers to the bros."

--- Has there really been much call for abandoning data & quantitative analysis & c. on the left?  It sounds remarkably idiotic.]]></description>
<dc:subject>progressive_forces feminism sexism data_analysis</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:2634c529c5dd/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:progressive_forces"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:feminism"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:sexism"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://piketty.pse.ens.fr/files/capital21c/en/Piketty2014TechnicalAppendixResponsetoFT.pdf">
    <title>Addendum: Response to FT</title>
    <dc:date>2014-05-30T02:49:58+00:00</dc:date>
    <link>http://piketty.pse.ens.fr/files/capital21c/en/Piketty2014TechnicalAppendixResponsetoFT.pdf</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA[Polite, yet brutal.]]></description>
<dc:subject>inequality economics data_analysis piketty.thomas evisceration via:unfogged</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:d5215ef5f9a6/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:inequality"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:economics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:piketty.thomas"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:evisceration"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:via:unfogged"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://blogs.ft.com/money-supply/2014/05/23/data-problems-with-capital-in-the-21st-century/">
    <title>Data problems with Capital in the 21st Century | Money Supply</title>
    <dc:date>2014-05-25T12:33:45+00:00</dc:date>
    <link>http://blogs.ft.com/money-supply/2014/05/23/data-problems-with-capital-in-the-21st-century/</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA[To be honest, these don't seem like huge issues, though they do need to be addressed.  The most concerning one for me is actually about averaging country by country, though I don't think making this population-weighted solves the issue.

- Piketty's (crushing, convincing) response: http://piketty.pse.ens.fr/files/capital21c/en/Piketty2014TechnicalAppendixResponsetoFT.pdf]]></description>
<dc:subject>economics inequality data_analysis piketty.thomas</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:84dd35140ef6/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:economics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:inequality"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:piketty.thomas"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1403.7400">
    <title>[1403.7400] Big Questions for Social Media Big Data: Representativeness, Validity and Other Methodological Pitfalls</title>
    <dc:date>2014-04-22T15:46:00+00:00</dc:date>
    <link>http://arxiv.org/abs/1403.7400</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Large-scale databases of human activity in social media have captured scientific and policy attention, producing a flood of research and discussion. This paper considers methodological and conceptual challenges for this emergent field, with special attention to the validity and representativeness of social media big data analyses. Persistent issues include the over-emphasis of a single platform, Twitter, sampling biases arising from selection by hashtags, and vague and unrepresentative sampling frames. The socio-cultural complexity of user behavior aimed at algorithmic invisibility (such as subtweeting, mock-retweeting, use of "screen captures" for text, etc.) further complicate interpretation of big data social media. Other challenges include accounting for field effects, i.e. broadly consequential events that do not diffuse only through the network under study but affect the whole society. The application of network methods from other fields to the study of human social activity may not always be appropriate. The paper concludes with a call to action on practical steps to improve our analytic capacity in this promising, rapidly-growing field."]]></description>
<dc:subject>to:NB social_science_methodology social_media data_analysis tufekci.zeynep social_measurement have_read to_teach:data-mining to_teach:baby-nets</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:bc75ace1a2f7/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:social_science_methodology"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:social_media"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:tufekci.zeynep"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:social_measurement"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:have_read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:baby-nets"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://overview.ap.org/blog/2014/01/algorithms-are-not-enough-lessons-bringing-computer-science-to-journalism/">
    <title>The Overview Project » Algorithms are not enough: lessons bringing computer science to journalism</title>
    <dc:date>2014-04-04T16:15:42+00:00</dc:date>
    <link>http://overview.ap.org/blog/2014/01/algorithms-are-not-enough-lessons-bringing-computer-science-to-journalism/</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA[(Makes loud and prolonged noises of approval)
(Looks guiltily at own practices)]]></description>
<dc:subject>data_analysis text_mining journalism programming design data_mining visual_display_of_quantitative_information to_teach:data-mining to_teach:statcomp to:blog</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:9e6f6ed730f6/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:text_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:journalism"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:programming"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:design"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:visual_display_of_quantitative_information"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:statcomp"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:blog"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://modelviewculture.com/pieces/quantify-everything-a-dream-of-a-feminist-data-future">
    <title>Quantify Everything: A Dream of a Feminist Data Future, by Amelia Abreu | Model View Culture</title>
    <dc:date>2014-04-03T15:51:14+00:00</dc:date>
    <link>http://modelviewculture.com/pieces/quantify-everything-a-dream-of-a-feminist-data-future</link>
    <dc:creator>cshalizi</dc:creator><dc:subject>feminism data_analysis measurement labor via:perspectivelute have_read</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:ad15fe501cda/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:feminism"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:measurement"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:labor"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:via:perspectivelute"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:have_read"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1401.0742">
    <title>[1401.0742] Data Smashing</title>
    <dc:date>2014-03-10T02:14:47+00:00</dc:date>
    <link>http://arxiv.org/abs/1401.0742</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Investigation of the underlying physics or biology from empirical data requires a quantifiable notion of similarity - when do two observed data sets indicate nearly identical generating processes, and when they do not. The discriminating characteristics to look for in data is often determined by heuristics designed by experts, e.g., distinct shapes of "folded" lightcurves may be used as "features" to classify variable stars, while determination of pathological brain states might require a Fourier analysis of brainwave activity. Finding good features is non-trivial. Here, we propose a universal solution to this problem: we delineate a principle for quantifying similarity between sources of arbitrary data streams, without a priori knowledge, features or training. We uncover an algebraic structure on a space of symbolic models for quantized data, and show that such stochastic generators may be added and uniquely inverted; and that a model and its inverse always sum to the generator of flat white noise. Therefore, every data stream has an anti-stream: data generated by the inverse model. Similarity between two streams, then, is the degree to which one, when summed to the other's anti-stream, mutually annihilates all statistical structure to noise. We call this data smashing. We present diverse applications, including disambiguation of brainwaves pertaining to epileptic seizures, detection of anomalous cardiac rhythms, and classification of astronomical objects from raw photometry. In our examples, the data smashing principle, without access to any domain knowledge, meets or exceeds the performance of specialized algorithms tuned by domain experts."]]></description>
<dc:subject>to:NB data_analysis statistics stochastic_processes lipson.hod re:AoS_project color_me_skeptical</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:677dfc46524a/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:stochastic_processes"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:lipson.hod"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:re:AoS_project"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:color_me_skeptical"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1401.5226">
    <title>[1401.5226] The Why and How of Nonnegative Matrix Factorization</title>
    <dc:date>2014-03-10T01:35:57+00:00</dc:date>
    <link>http://arxiv.org/abs/1401.5226</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Nonnegative matrix factorization (NMF) has become a widely used tool for the analysis of high-dimensional data as it automatically extracts sparse and meaningful features from a set of nonnegative data vectors. We first illustrate this property of NMF on three applications, in image processing, text mining and hyperspectral imaging --this is the why. Then we address the problem of solving NMF, which is NP-hard in general. We review some standard NMF algorithms, and also present a recent subclass of NMF problems, referred to as near-separable NMF, that can be solved efficiently (that is, in polynomial time), even in the presence of noise --this is the how. Finally, we briefly describe some problems in mathematics and computer science closely related to NMF via the nonnegative rank."]]></description>
<dc:subject>to:NB data_analysis low-rank_approximation optimization</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:6b063dad6a3a/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:low-rank_approximation"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:optimization"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://www.jstatsoft.org/v55/i14">
    <title>Scalable Strategies for Computing with Massive Data</title>
    <dc:date>2014-02-21T00:29:41+00:00</dc:date>
    <link>http://www.jstatsoft.org/v55/i14</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["This paper presents two complementary statistical computing frameworks that address challenges in parallel processing and the analysis of massive data. First, the foreach package allows users of the R programming environment to define parallel loops that may be run sequentially on a single machine, in parallel on a symmetric multiprocessing (SMP) machine, or in cluster environments without platform-specific code. Second, the bigmemory package implements memory- and file-mapped data structures that provide (a) access to arbitrarily large data while retaining a look and feel that is familiar to R users and (b) data structures that are shared across processor cores in order to support efficient parallel computing techniques. Although these packages may be used independently, this paper shows how they can be used in combination to address challenges that have effectively been beyond the reach of researchers who lack specialized software development skills or expensive hardware."]]></description>
<dc:subject>R computational_statistics data_analysis in_NB to_teach:statcomp have_read</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:55d52a2c8dbc/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:R"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:computational_statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:in_NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:statcomp"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:have_read"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://liorpachter.wordpress.com/2014/01/19/why-do-you-look-at-the-speck-in-your-sisters-quilt-plot-and-pay-no-attention-to-the-plank-in-your-own-heat-map/">
    <title>Why do you look at the speck in your sister’s quilt plot and pay no attention to the plank in your own heat map? | Bits of DNA</title>
    <dc:date>2014-02-18T02:45:18+00:00</dc:date>
    <link>http://liorpachter.wordpress.com/2014/01/19/why-do-you-look-at-the-speck-in-your-sisters-quilt-plot-and-pay-no-attention-to-the-plank-in-your-own-heat-map/</link>
    <dc:creator>cshalizi</dc:creator><dc:subject>visual_display_of_quantitative_information data_analysis bad_data_analysis</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:f0adfc6a81b9/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:visual_display_of_quantitative_information"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:bad_data_analysis"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1402.2965">
    <title>[1402.2965] Zipf's Law for All the Natural Cities around the World</title>
    <dc:date>2014-02-18T00:36:52+00:00</dc:date>
    <link>http://arxiv.org/abs/1402.2965</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Two fundamental issues surrounding research on Zipf's law regarding city sizes are whether and why Zipf's law holds. This paper does not deal with the latter issue with respect to why, and instead investigates whether Zipf's law holds in a global setting, thus involving all cities around the world. Unlike previous studies, which have mainly relied on conventional census data, and census- bureau-imposed definitions of cities, we adopt naturally and objectively delineated cities, or natural cities, to be more precise, in order to examine Zipf's law. We find that Zipf's law holds remarkably well for all natural cities at the global level, and remains almost valid at the continental level except for Africa at certain time instants. We further examine the law at the country level, and note that Zipf's law is violated from country to country or from time to time. This violation is mainly due to our limitations; we are limited to individual countries, and to a static view on city-size distributions. The central argument of this paper is that Zipf's law is universal, and we therefore must use the correct scope in order to observe it. We further find that this law is reflected in the distribution of cities: the number of cities in individual countries follows an inverse power relationship; the number of cities in the first largest country is twice as many as that in the second largest country, three times as many as that in the third largest country, and so on. "]]></description>
<dc:subject>to:NB cities heavy_tails data_analysis color_me_skeptical</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:7e32eeec9014/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:cities"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:heavy_tails"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:color_me_skeptical"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://simplystatistics.org/2014/02/14/on-the-scalability-of-statistical-procedures-why-the-p-value-bashers-just-dont-get-it/">
    <title>On the scalability of statistical procedures: why the p-value bashers just don’t get it. | Simply Statistics</title>
    <dc:date>2014-02-17T13:51:05+00:00</dc:date>
    <link>http://simplystatistics.org/2014/02/14/on-the-scalability-of-statistical-procedures-why-the-p-value-bashers-just-dont-get-it/</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Enforcing education and practice in data analysis is the only way to resolve the problems that people usually attribute to P-values. In the short term, we should at minimum require all the editors of journals who regularly handle data analysis to show competency in statistics and data analysis."]]></description>
<dc:subject>hypothesis_testing statistics bad_data_analysis data_analysis</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:fe2a977ccd05/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:hypothesis_testing"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:bad_data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3037423/">
    <title>Encoding and decoding in fMRI</title>
    <dc:date>2014-01-02T23:22:14+00:00</dc:date>
    <link>http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3037423/</link>
    <dc:creator>cshalizi</dc:creator><dc:subject>have_read neural_coding_and_decoding fmri data_analysis classifiers regression in_NB</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:1be814d11b9a/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:have_read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:neural_coding_and_decoding"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:fmri"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:classifiers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:regression"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:in_NB"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1010.0520">
    <title>[1010.0520] Successive normalization of rectangular arrays</title>
    <dc:date>2013-12-14T16:36:45+00:00</dc:date>
    <link>http://arxiv.org/abs/1010.0520</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Standard statistical techniques often require transforming data to have mean 0 and standard deviation 1. Typically, this process of "standardization" or "normalization" is applied across subjects when each subject produces a single number. High throughput genomic and financial data often come as rectangular arrays where each coordinate in one direction concerns subjects who might have different status (case or control, say), and each coordinate in the other designates "outcome" for a specific feature, for example, "gene," "polymorphic site" or some aspect of financial profile. It may happen, when analyzing data that arrive as a rectangular array, that one requires BOTH the subjects and the features to be "on the same footing." Thus there may be a need to standardize across rows and columns of the rectangular matrix. There arises the question as to how to achieve this double normalization. We propose and investigate the convergence of what seems to us a natural approach to successive normalization which we learned from our colleague Bradley Efron. We also study the implementation of the method on simulated data and also on data that arose from scientific experimentation."]]></description>
<dc:subject>to:NB data_analysis</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:b5148b24559a/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://www.aeaweb.org/articles.php?doi=10.1257/aer.103.7.3001">
    <title>AER (103,7) p. 3001 - Conclusions Regarding Cross-Group Differences in Happiness Depend on Difficulty of Reaching Respondents</title>
    <dc:date>2013-12-04T17:28:22+00:00</dc:date>
    <link>http://www.aeaweb.org/articles.php?doi=10.1257/aer.103.7.3001</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["A growing literature explores differences in subjective well-being across demographic groups, often relying on surveys with high nonresponse rates. By using the reported number of call attempts made to participants in the University of Michigan's Surveys of Consumers, we show that comparisons among easy-to-reach respondents differ from comparisons among hard-to-reach ones. Notably, easy-to-reach women are happier than easy-to-reach men, but hard-to-reach men are happier than hard-to-reach women, and conclusions of a survey could reverse with more attempted calls. Better alternatives to comparing group sample averages might include putting greater weight on hard-to-reach respondents or even extrapolating trends in responses."

- This could make a good teaching example, if it pans out.]]></description>
<dc:subject>to:NB data_analysis surveys statistics happiness color_me_skeptical</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:55ef0d334937/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:surveys"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:happiness"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:color_me_skeptical"/>
</rdf:Bag></taxo:topics>
</item>
</rdf:RDF>