<?xml version="1.0" encoding="UTF-8"?>
 <rdf:RDF xmlns="http://purl.org/rss/1.0/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:cc="http://web.resource.org/cc/" xmlns:syn="http://purl.org/rss/1.0/modules/syndication/" xmlns:admin="http://webns.net/mvcb/">
  <channel rdf:about="http://pinboard.in">
    <title>Pinboard (cshalizi)</title>
    <link>https://pinboard.in/u:cshalizi/public/</link>
    <description>recent bookmarks from cshalizi</description>
    <items>
      <rdf:Seq>	<rdf:li rdf:resource="https://www.nber.org/papers/w32917"/>
	<rdf:li rdf:resource="https://link.springer.com/article/10.1007/s00146-020-01097-6"/>
	<rdf:li rdf:resource="https://www.countbayesie.com/blog/2023/4/21/linear-diffusion"/>
	<rdf:li rdf:resource="https://direct.mit.edu/books/oa-monograph/5587/Computational-FormalismArt-History-and-Machine"/>
	<rdf:li rdf:resource="https://direct.mit.edu/books/oa-monograph/5674/Distant-ViewingComputational-Exploration-of"/>
	<rdf:li rdf:resource="https://philsci-archive.pitt.edu/22690/"/>
	<rdf:li rdf:resource="https://www.theverge.com/features/23764584/ai-artificial-intelligence-data-notation-labor-scale-surge-remotasks-openai-chatbots"/>
	<rdf:li rdf:resource="https://www.propublica.org/article/chicagos-race-neutral-traffic-cameras-ticket-black-and-latino-drivers-the-most"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2301.13188"/>
	<rdf:li rdf:resource="https://link.springer.com/article/10.1007/s10618-022-00907-3"/>
	<rdf:li rdf:resource="https://www.foreignaffairs.com/world/spirals-delusion-artificial-intelligence-decision-making"/>
	<rdf:li rdf:resource="https://www.nytimes.com/2022/06/25/technology/china-surveillance-police.html"/>
	<rdf:li rdf:resource="https://logicmag.io/play/my-stepdad's-huge-data-set/"/>
	<rdf:li rdf:resource="https://www.cambridge.org/core/books/big-data-and-the-welfare-state/340936CE478BD6264DE77D9123357D4A#fndtn-information"/>
	<rdf:li rdf:resource="https://www.nytimes.com/2022/06/25/technology/china-surveillance-police.html?action=click&amp;module=Well&amp;pgtype=Homepage&amp;section=Business"/>
	<rdf:li rdf:resource="https://www.nature.com/articles/s41467-019-10933-3/"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2203.06498"/>
	<rdf:li rdf:resource="https://www.sciencedirect.com/science/article/pii/S016792361400061X?casa_token=8mKjKuwIF58AAAAA:DjRmaJuDoBZjjHb2kA3iEoCckvybsakE7Ww6qdBRABxULXlOuE8FIvmSbgMgYO0ZwLasjyow"/>
	<rdf:li rdf:resource="https://press.princeton.edu/books/ebook/9780691207995/text-as-data"/>
	<rdf:li rdf:resource="https://www.annualreviews.org/doi/abs/10.1146/annurev-soc-090820-020800"/>
	<rdf:li rdf:resource="https://nymag.com/intelligencer/2020/09/inside-palantir-technologies-peter-thiel-alex-karp.html"/>
	<rdf:li rdf:resource="https://www.nature.com/articles/s41386-021-01020-7"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2105.14364"/>
	<rdf:li rdf:resource="https://www.cambridge.org/9781108845359"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2105.07020"/>
	<rdf:li rdf:resource="https://www.nber.org/papers/w28811"/>
	<rdf:li rdf:resource="https://www.sup.org/books/title/?id=32597"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2104.10443"/>
	<rdf:li rdf:resource="https://www.nber.org/papers/w23673"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2103.05766"/>
	<rdf:li rdf:resource="https://yalebooks.yale.edu/book/9780300209570/atlas-ai"/>
	<rdf:li rdf:resource="https://doi.org/10.5210/fm.v19i7.4901"/>
	<rdf:li rdf:resource="https://www.cambridge.org/core/journals/data-and-policy/article/from-satisficing-to-artificing-the-evolution-of-administrative-decisionmaking-in-the-age-of-the-algorithm/8962400DADAC3C740AC023A20B38E285"/>
	<rdf:li rdf:resource="https://doi.org/10.1093/oso/9780190684099.001.0001"/>
	<rdf:li rdf:resource="https://doi.org/10.1093/oso/9780198864165.001.0001"/>
	<rdf:li rdf:resource="https://nowpublishers.com/article/Details/MAL-078-3"/>
	<rdf:li rdf:resource="https://nowpublishers.com/article/Details/MAL-078-2"/>
	<rdf:li rdf:resource="https://nowpublishers.com/article/Details/MAL-078-1"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2101.04715"/>
	<rdf:li rdf:resource="https://sociologicalscience.com/articles-v8-2-26/"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1711.10427"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2012.12802"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2012.08496"/>
	<rdf:li rdf:resource="https://www.lse.ac.uk/cpnss/research/genetically-evolving-models-in-science"/>
	<rdf:li rdf:resource="https://www.cambridge.org/9781108477444"/>
	<rdf:li rdf:resource="https://www.nytimes.com/2020/01/20/opinion/facial-recognition-ban-privacy.html"/>
	<rdf:li rdf:resource="https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3727562"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2011.11483"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1909.10832"/>
	<rdf:li rdf:resource="https://www.annualreviews.org/doi/abs/10.1146/annurev-soc-121919-054621"/>
	<rdf:li rdf:resource="https://ieeexplore.ieee.org/document/8861141"/>
	<rdf:li rdf:resource="https://pubsonline.informs.org/doi/pdf/10.1287/mksc.2019.1188"/>
	<rdf:li rdf:resource="https://www.wired.com/story/ad-tech-could-be-the-next-internet-bubble/"/>
	<rdf:li rdf:resource="https://press.princeton.edu/books/ebook/9780691200002/metrics-at-work"/>
	<rdf:li rdf:resource="https://mitpress.mit.edu/books/data-action"/>
	<rdf:li rdf:resource="https://www.jstor.org/stable/j.ctt13x0hch"/>
	<rdf:li rdf:resource="https://www.pnas.org/content/117/15/8398"/>
	<rdf:li rdf:resource="https://law.stanford.edu/publications/ethnic-bias-in-big-data-analytics-how-private-biases-can-migrate-into-public-policy/"/>
	<rdf:li rdf:resource="https://global.oup.com/academic/product/predict-and-surveil-9780190684099"/>
	<rdf:li rdf:resource="https://cloud.ibm.com/docs/personality-insights?topic=personality-insights-science"/>
	<rdf:li rdf:resource="https://www.pnas.org/content/early/2020/07/13/1920484117"/>
	<rdf:li rdf:resource="https://wwnorton.com/books/9780393634846/about-the-book/description"/>
	<rdf:li rdf:resource="https://wwnorton.com/books/9781631496103"/>
	<rdf:li rdf:resource="https://www-wsj-com.cdn.ampproject.org/v/s/www.wsj.com/amp/articles/ai-isnt-magical-and-wont-help-you-reopen-your-business-11590811201?usqp=mq331AQFKAGwASA%3D&amp;amp_js_v=0.1#referrer=https%3A%2F%2Fwww.google.com&amp;amp_tf=From%20%251%24s&amp;ampshare=https%3A%2F%2Fwww.wsj.com%2Farticles%2Fai-isnt-magical-and-wont-help-you-reopen-your-business-11590811201"/>
	<rdf:li rdf:resource="https://www.nytimes.com/2020/06/24/technology/facial-recognition-arrest.html?action=click&amp;module=Top%20Stories&amp;pgtype=Homepage"/>
	<rdf:li rdf:resource="https://academic.oup.com/jla/article/doi/10.1093/jla/laz001/5476086"/>
	<rdf:li rdf:resource="https://www.buzzfeednews.com/article/ryanmac/clearview-ai-fbi-ice-global-law-enforcement"/>
	<rdf:li rdf:resource="https://papers.ssrn.com/sol3/papers.cfm?abstract_id=2477899"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2002.05193"/>
	<rdf:li rdf:resource="https://hdsr.mitpress.mit.edu/pub/56lnenzj"/>
      </rdf:Seq>
    </items>
  </channel><item rdf:about="https://www.nber.org/papers/w32917">
    <title>Credit Scores: Performance and Equity | NBER</title>
    <dc:date>2025-03-08T21:24:22+00:00</dc:date>
    <link>https://www.nber.org/papers/w32917</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Credit scores are critical for allocating consumer debt in the United States, yet little evidence is available on their performance. We benchmark a widely used credit score against a machine learning model of consumer default and find significant misclassification of borrowers, especially those with low scores. Our model improves predictive accuracy for young, low-income, and minority groups due to its superior performance with low quality data, resulting in a gain in standing for these populations. Our findings suggest that improving credit scoring performance could lead to more equitable access to credit."

--- ??? Ain't I been reading evaluations of credit scoring algorithms in stats / ML / data mining venues for 20 years now?]]></description>
<dc:subject>to:NB color_me_skeptical credit_ratings prediction data_mining economistic_imperialism</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:ee5ffe715728/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:color_me_skeptical"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:credit_ratings"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:prediction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:economistic_imperialism"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://link.springer.com/article/10.1007/s00146-020-01097-6">
    <title>The Nooscope manifested: AI as instrument of knowledge extractivism | AI &amp; SOCIETY</title>
    <dc:date>2024-12-04T14:38:02+00:00</dc:date>
    <link>https://link.springer.com/article/10.1007/s00146-020-01097-6</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["The purpose of the Nooscope map is to secularize AI from the ideological status of ‘intelligent machine’ to one of knowledge instruments. Rather than evoking legends of alien cognition, it is more reasonable to consider machine learning as an instrument of knowledge magnification that helps to perceive features, patterns, and correlations through vast spaces of data beyond human reach. In the history of science and technology, this is no news; it has already been pursued by optical instruments throughout the histories of astronomy and medicine.Footnote3 In the tradition of science, machine learning is just a Nooscope, an instrument to see and navigate the space of knowledge (from the Greek skopein ‘to examine, look’ and noos ‘knowledge’)."

--- I'm skeptical of this as an account of _all of machine learning_, but initially sympathetic as an account of LLMs.]]></description>
<dc:subject>large_language_models_(so_called) artificial_intelligence data_mining re:shoggothim via:warrenellis no_really_via:warrenellis in_NB</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:7860d376a580/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:large_language_models_(so_called)"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:artificial_intelligence"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:re:shoggothim"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:via:warrenellis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:no_really_via:warrenellis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:in_NB"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://www.countbayesie.com/blog/2023/4/21/linear-diffusion">
    <title>Linear Diffusion: Building a Diffusion Model from linear Components — Count Bayesie</title>
    <dc:date>2024-11-28T00:35:00+00:00</dc:date>
    <link>https://www.countbayesie.com/blog/2023/4/21/linear-diffusion</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA[--- This is awesome.  It doesn't work that great, but it's awesome.  I'm tempted to have The Kids duplicate it with the eigendresses assignments in data mining, but unfortunately I don't think I saved the text when I scraped the images.]]></description>
<dc:subject>have_read data_mining principal_components generative_diffusion_models in_NB to_teach:statistics_and_generative_ai</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:6ea3912b2b43/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:have_read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:principal_components"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:generative_diffusion_models"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:in_NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:statistics_and_generative_ai"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://direct.mit.edu/books/oa-monograph/5587/Computational-FormalismArt-History-and-Machine">
    <title>Computational FormalismArt History and Machine Learning | Books Gateway | MIT Press</title>
    <dc:date>2024-04-30T14:44:43+00:00</dc:date>
    <link>https://direct.mit.edu/books/oa-monograph/5587/Computational-FormalismArt-History-and-Machine</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["How the use of machine learning to analyze art images has revived formalism in art history, presenting a golden opportunity for art historians and computer scientists to learn from one another.
"Though formalism is an essential tool for art historians, much recent art history has focused on the social and political aspects of art. But now art historians are adopting machine learning methods to develop new ways to analyze the purely visual in datasets of art images. Amanda Wasielewski uses the term “computational formalism” to describe this use of machine learning and computer vision technique in art historical research. At the same time that art historians are analyzing art images in new ways, computer scientists are using art images for experiments in machine learning and computer vision. Their research, says Wasielewski, would be greatly enriched by the inclusion of humanistic issues.
"The main purpose in applying computational techniques such as machine learning to art datasets is to automate the process of categorization using metrics such as style, a historically fraught concept in art history. After examining a fifteen-year trajectory in image categorization and art dataset creation in the fields of machine learning and computer vision, Wasielewski considers deep learning techniques that both create and detect forgeries and fakes in art. She investigates examples of art historical analysis in the fields of computer and information sciences, placing this research in the context of art historiography. She also raises questions as which artworks are chosen for digitization, and of those artworks that are born digital, which works gain acceptance into the canon of high art."]]></description>
<dc:subject>to:NB downloaded art_history data_mining digitial_humanities</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:30a0f90c67b1/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:downloaded"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:art_history"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:digitial_humanities"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://direct.mit.edu/books/oa-monograph/5674/Distant-ViewingComputational-Exploration-of">
    <title>Distant ViewingComputational Exploration of Digital Images | Books Gateway | MIT Press</title>
    <dc:date>2024-04-29T14:22:26+00:00</dc:date>
    <link>https://direct.mit.edu/books/oa-monograph/5674/Distant-ViewingComputational-Exploration-of</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["A new theory and methodology for the application of computer vision methods to the computational analysis of collected, digitized visual materials, called “distant viewing.”
"'Distant Viewing: Computational Exploration of Digital Images presents a new theory and methodology for the computational analysis of digital images, offering a lively, constructive critique of computer vision that you can actually use. What does it mean to say that computer vision “understands” visual inputs? Annotations never capture a whole image. The way digital images convey information requires what researchers Taylor Arnold and Lauren Tilton call “distant viewing”—a play on the well-known term “distant reading” from computational literary analysis.
"Recognizing computer vision's limitations, Arnold and Tilton's spirited examination makes the technical exciting by applying distant viewing to the sitcoms Bewitched and I Dream of Jeannie, movie posters and other popular forms of advertising, and Dorothea Lange's photography. In the tradition of visual culture studies and computer vision, Distant Viewing's interdisciplinary perspective encompasses film and media studies, visual semiotics, and the sciences to create a playful, accessible guide for an international audience working in digital humanities, data science, media studies, and visual culture studies."]]></description>
<dc:subject>to:NB books:noted downloaded art_history digital_humanities image_processing data_mining</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:6699678ef830/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:books:noted"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:downloaded"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:art_history"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:digital_humanities"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:image_processing"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://philsci-archive.pitt.edu/22690/">
    <title>The Devil in the Data: Machine Learning &amp; the Theory-Free Ideal - PhilSci-Archive</title>
    <dc:date>2023-12-10T02:36:46+00:00</dc:date>
    <link>https://philsci-archive.pitt.edu/22690/</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Philosophers of science have argued that the widespread adoption of the methods of machine learning (ML) will entail radical changes to the variety of epistemic outputs science is capable of producing. Call this the disruption claim. This, in turn, rests on a distinctness claim, which holds ML to exist on novel epistemic footing relative to classical modelling approaches in virtue of its atheoreticity. We describe the operation of ML systems in scientific practice and reveal it to be a necessarily theory-laden exercise. This undercuts claims of epistemic distinctness and, therefore, at least one path to claims of disruption."

--- Apparently Andrews thought better of draft title of "The Immortal Science of ML".  (Part of me hopes she reconsiders.)]]></description>
<dc:subject>to:NB philosophy_of_science data_mining</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:bb4879318782/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:philosophy_of_science"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://www.theverge.com/features/23764584/ai-artificial-intelligence-data-notation-labor-scale-surge-remotasks-openai-chatbots">
    <title>Inside the AI Factory: the humans that make tech seem human - The Verge</title>
    <dc:date>2023-06-28T15:53:18+00:00</dc:date>
    <link>https://www.theverge.com/features/23764584/ai-artificial-intelligence-data-notation-labor-scale-surge-remotasks-openai-chatbots</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA[--- This is well-written, and no doubt accurate.  I am very much inclined to assign it, the next time I teach data mining, as a look behind the curtain of where labels come from.
--- BUT: it stacks the deck rhetorically pretty strongly.
--- Instance 1, which leapt out at me as the child of a development economist: It never mentions the prevailing wages in these countries, to give any sense of whether those are good jobs _in context_.  For the record, the World Bank puts Kenya's national income in 2021 at just under $2100 [https://data.worldbank.org/country/kenya?view=chart].  At an 8 hour day * 5 days/week * 50 weeks/year that comes to $1.05/hr.  Suddenly paying $1--$3 an hour does not sound that bad!  (And the initial rates of up to $10/hr were princely --- the same ratio to national income in the US would be around $350/hr!) [*] 
--- Now, whenever there's a positive-sum productive activity, there is a zero-sum competition over how to divide the surplus.  I am always in favor of the workers getting a bigger share.  I would 100% support (e.g.) the Kenyan annotators unionizing to get more stable and better-paid jobs.  But creating a small labor aristocracy is neither a development strategy nor a moral obligation.
--- Instance 2, the unfavorable comparisons to mid-20th-century office work in the US and other developed countries.  Those jobs were famously alienating!  We developed whole artistic genres about how alienating they were!

*: Obviously, the average wage for wage-earners has to be higher than the national income per person.  But, for comparison, the US national income per person divided by the length of the working year comes out to $35/hr.  (Somewhat to my surprise, the World Bank puts Kenya's Gini index at 40.8, vs. 39.7 for the US [https://data.worldbank.org/indicator/SI.POV.GINI].)  Again: prevailing wages in big cities like Nairobi are probably higher than the rest of the country; I didn't find any good figures on that in five minutes of search.]]></description>
<dc:subject>data_mining data_sets machine_learning have_read via:alison_gopnik !_at_the_via in_NB</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:5077d9479d14/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_sets"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:machine_learning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:have_read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:via:alison_gopnik"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:!_at_the_via"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:in_NB"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://www.propublica.org/article/chicagos-race-neutral-traffic-cameras-ticket-black-and-latino-drivers-the-most">
    <title>Chicago’s “Race-Neutral” Traffic Cameras Ticket Black and Latino Drivers the Most — ProPublica</title>
    <dc:date>2023-05-02T20:24:17+00:00</dc:date>
    <link>https://www.propublica.org/article/chicagos-race-neutral-traffic-cameras-ticket-black-and-latino-drivers-the-most</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["A summary of the UIC research provided to ProPublica last week confirmed the racial disparities in red-light and speed-camera ticketing and found that most of the speed cameras improve safety."

--- Pretty sure they never come back to the "improve safety" part, which seems important!]]></description>
<dc:subject>have_read chicago track_down_references data_mining algorithmic_fairness to_teach:statistics_of_inequality_and_discrimination</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:1fa35ec0cf69/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:have_read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:chicago"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:track_down_references"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:algorithmic_fairness"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:statistics_of_inequality_and_discrimination"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2301.13188">
    <title>[2301.13188] Extracting Training Data from Diffusion Models</title>
    <dc:date>2023-02-15T20:07:10+00:00</dc:date>
    <link>https://arxiv.org/abs/2301.13188</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Image diffusion models such as DALL-E 2, Imagen, and Stable Diffusion have attracted significant attention due to their ability to generate high-quality synthetic images. In this work, we show that diffusion models memorize individual images from their training data and emit them at generation time. With a generate-and-filter pipeline, we extract over a thousand training examples from state-of-the-art models, ranging from photographs of individual people to trademarked company logos. We also train hundreds of diffusion models in various settings to analyze how different modeling and data decisions affect privacy. Overall, our results show that diffusion models are much less private than prior generative models such as GANs, and that mitigating these vulnerabilities may require new advances in privacy-preserving training."

--- I find this genuinely surprising!]]></description>
<dc:subject>in_NB neural_networks data_mining generative_diffusion_models</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:50d279186918/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:in_NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:neural_networks"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:generative_diffusion_models"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://link.springer.com/article/10.1007/s10618-022-00907-3">
    <title>Approximation trees: statistical reproducibility in model distillation | SpringerLink</title>
    <dc:date>2023-01-17T06:19:08+00:00</dc:date>
    <link>https://link.springer.com/article/10.1007/s10618-022-00907-3</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["This paper examines the reproducibility of learned explanations for black-box predictions via model distillation using classification trees. We find that common tree distillation methods fail to reproduce a single stable explanation when applied to the same teacher model due the randomness of the distillation process. We study this issue of reliable interpretation and propose a standardized framework for tree distillation to achieve reproducibility. The proposed framework consists of (1) a statistical test to stabilize tree splits, and (2) a stopping rule for tree building when using a teacher that provides an estimate of the uncertainty of its predictions, e.g. random forests. We demonstrate the empirical performance of the proposed distillation method on a variety of synthetic and real-world datasets."]]></description>
<dc:subject>to:NB decision_trees data_mining hooker.giles</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:2edc09ab8e94/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:decision_trees"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:hooker.giles"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://www.foreignaffairs.com/world/spirals-delusion-artificial-intelligence-decision-making">
    <title>Spirals of Delusion: How AI Distorts Decision-Making and Makes Dictators More Dangerous</title>
    <dc:date>2022-08-31T21:56:27+00:00</dc:date>
    <link>https://www.foreignaffairs.com/world/spirals-delusion-artificial-intelligence-decision-making</link>
    <dc:creator>cshalizi</dc:creator><dc:subject>have_read in_NB data_mining algorithmic_fairness kith_and_kin farrell.henry to_teach:data-mining re:democratic_cognition seeing_like_a_finite_state_machine</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:bf239783c563/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:have_read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:in_NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:algorithmic_fairness"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:kith_and_kin"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:farrell.henry"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:re:democratic_cognition"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:seeing_like_a_finite_state_machine"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://www.nytimes.com/2022/06/25/technology/china-surveillance-police.html">
    <title>How China Is Policing the Future - The New York Times</title>
    <dc:date>2022-07-22T14:57:34+00:00</dc:date>
    <link>https://www.nytimes.com/2022/06/25/technology/china-surveillance-police.html</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA[--- Remarkably little about whether these systems actually work, either in the sense of predicting accurately, _or_ in the sense of getting people to do what the authorities want.]]></description>
<dc:subject>have_read to_teach:data-mining seeing_like_a_finite_state_machine surveillance china:prc prediction data_mining</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:b19320bc850b/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:have_read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:seeing_like_a_finite_state_machine"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:surveillance"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:china:prc"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:prediction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://logicmag.io/play/my-stepdad's-huge-data-set/">
    <title>My Stepdad’s Huge Data Set</title>
    <dc:date>2022-07-03T04:11:52+00:00</dc:date>
    <link>https://logicmag.io/play/my-stepdad's-huge-data-set/</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA[--- A well-written and accessible article which I will _not_ be teaching in the data-mining class, because there's just no way that could go well.]]></description>
<dc:subject>have_read pr0n advertising data_mining epidemiology_of_representations practices_relating_to_the_transmission_of_genetic_information to_teach:data-mining sfw</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:9a15a8940499/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:have_read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:pr0n"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:advertising"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:epidemiology_of_representations"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:practices_relating_to_the_transmission_of_genetic_information"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:sfw"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://www.cambridge.org/core/books/big-data-and-the-welfare-state/340936CE478BD6264DE77D9123357D4A#fndtn-information">
    <title>Big Data and the Welfare State</title>
    <dc:date>2022-07-03T03:34:09+00:00</dc:date>
    <link>https://www.cambridge.org/core/books/big-data-and-the-welfare-state/340936CE478BD6264DE77D9123357D4A#fndtn-information</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["A core principle of the welfare state is that everyone pays taxes or contributions in exchange for universal insurance against social risks such as sickness, old age, unemployment, and plain bad luck. This solidarity principle assumes that everyone is a member of a single national insurance pool, and it is commonly explained by poor and asymmetric information, which undermines markets and creates the perception that we are all in the same boat. Living in the midst of an information revolution, this is no longer a satisfactory approach. This book explores, theoretically and empirically, the consequences of 'big data' for the politics of social protection. Torben Iversen and Philipp Rehm argue that more and better data polarize preferences over public insurance and often segment social insurance into smaller, more homogenous, and less redistributive pools, using cases studies of health and unemployment insurance and statistical analyses of life insurance, credit markets, and public opinion."]]></description>
<dc:subject>to:NB books:noted prediction data_mining political_economy welfare_state downloaded</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:01042e85ab57/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:books:noted"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:prediction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:political_economy"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:welfare_state"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:downloaded"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://www.nytimes.com/2022/06/25/technology/china-surveillance-police.html?action=click&amp;module=Well&amp;pgtype=Homepage&amp;section=Business">
    <title>How China Is Policing the Future - The New York Times</title>
    <dc:date>2022-06-28T18:24:38+00:00</dc:date>
    <link>https://www.nytimes.com/2022/06/25/technology/china-surveillance-police.html?action=click&amp;module=Well&amp;pgtype=Homepage&amp;section=Business</link>
    <dc:creator>cshalizi</dc:creator><dc:subject>surveillance data_mining prediction china:prc to_teach:data-mining police</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:a70ccc43e029/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:surveillance"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:prediction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:china:prc"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:police"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://www.nature.com/articles/s41467-019-10933-3/">
    <title>Estimating the success of re-identifications in incomplete datasets using generative models | Nature Communications</title>
    <dc:date>2022-04-13T03:01:09+00:00</dc:date>
    <link>https://www.nature.com/articles/s41467-019-10933-3/</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["While rich medical, behavioral, and socio-demographic data are key to modern data-driven research, their collection and use raise legitimate privacy concerns. Anonymizing datasets through de-identification and sampling before sharing them has been the main tool used to address those concerns. We here propose a generative copula-based method that can accurately estimate the likelihood of a specific person to be correctly re-identified, even in a heavily incomplete dataset. On 210 populations, our method obtains AUC scores for predicting individual uniqueness ranging from 0.84 to 0.97, with low false-discovery rate. Using our model, we find that 99.98% of Americans would be correctly re-identified in any dataset using 15 demographic attributes. Our results suggest that even heavily sampled anonymized datasets are unlikely to satisfy the modern standards for anonymization set forth by GDPR and seriously challenge the technical and legal adequacy of the de-identification release-and-forget model."]]></description>
<dc:subject>to:NB privacy data_mining to_teach:data-mining</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:6a95196c6bc1/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:privacy"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2203.06498">
    <title>[2203.06498] The worst of both worlds: A comparative analysis of errors in learning from data in psychology and machine learning</title>
    <dc:date>2022-03-31T23:35:44+00:00</dc:date>
    <link>https://arxiv.org/abs/2203.06498</link>
    <dc:creator>cshalizi</dc:creator><dc:subject>data_analysis bad_data_analysis psychology data_mining gelman.andrew to_teach:data-mining</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:6c960a97eea4/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:bad_data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:psychology"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:gelman.andrew"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://www.sciencedirect.com/science/article/pii/S016792361400061X?casa_token=8mKjKuwIF58AAAAA:DjRmaJuDoBZjjHb2kA3iEoCckvybsakE7Ww6qdBRABxULXlOuE8FIvmSbgMgYO0ZwLasjyow">
    <title>A data-driven approach to predict the success of bank telemarketing - ScienceDirect</title>
    <dc:date>2022-03-12T13:25:01+00:00</dc:date>
    <link>https://www.sciencedirect.com/science/article/pii/S016792361400061X?casa_token=8mKjKuwIF58AAAAA:DjRmaJuDoBZjjHb2kA3iEoCckvybsakE7Ww6qdBRABxULXlOuE8FIvmSbgMgYO0ZwLasjyow</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA[Eye-balling it, the random forest I did for the solutions looks pretty competitive here.

--- Link to my homework assignment:
http://www.stat.cmu.edu/~cshalizi/dm/22/hw/07/hw-07.pdf]]></description>
<dc:subject>data_mining marketing to_teach:data-mining</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:3c63a5c22abb/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:marketing"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://press.princeton.edu/books/ebook/9780691207995/text-as-data">
    <title>Text as Data | Princeton University Press</title>
    <dc:date>2022-02-05T21:05:55+00:00</dc:date>
    <link>https://press.princeton.edu/books/ebook/9780691207995/text-as-data</link>
    <dc:creator>cshalizi</dc:creator><dc:subject>books:suggest_to_library books:owned to:NB books:noted text_mining social_science_methodology data_mining social_measurement</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:932d4cca5bd9/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:books:suggest_to_library"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:books:owned"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:books:noted"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:text_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:social_science_methodology"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:social_measurement"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://www.annualreviews.org/doi/abs/10.1146/annurev-soc-090820-020800">
    <title>The Society of Algorithms | Annual Review of Sociology</title>
    <dc:date>2021-08-03T04:35:55+00:00</dc:date>
    <link>https://www.annualreviews.org/doi/abs/10.1146/annurev-soc-090820-020800</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["The pairing of massive data sets with processes—or algorithms—written in computer code to sort through, organize, extract, or mine them has made inroads in almost every major social institution. This article proposes a reading of the scholarly literature concerned with the social implications of this transformation. First, we discuss the rise of a new occupational class, which we call the coding elite. This group has consolidated power through their technical control over the digital means of production and by extracting labor from a newly marginalized or unpaid workforce, the cybertariat. Second, we show that the implementation of techniques of mathematical optimization across domains as varied as education, medicine, credit and finance, and criminal justice has intensified the dominance of actuarial logics of decision-making, potentially transforming pathways to social reproduction and mobility but also generating a pushback by those so governed. Third, we explore how the same pervasive algorithmic intermediation in digital communication is transforming the way people interact, associate, and think. We conclude by cautioning against the wildest promises of artificial intelligence but acknowledging the increasingly tight coupling between algorithmic processes, social structures, and subjectivities."]]></description>
<dc:subject>to:NB data_mining networked_life to_teach:data-mining fourcade.marion</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:08a0ff6c918b/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:networked_life"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:fourcade.marion"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://nymag.com/intelligencer/2020/09/inside-palantir-technologies-peter-thiel-alex-karp.html">
    <title>Inside Palantir, Silicon Valley’s Most Secretive Unicorn</title>
    <dc:date>2021-07-15T17:49:26+00:00</dc:date>
    <link>https://nymag.com/intelligencer/2020/09/inside-palantir-technologies-peter-thiel-alex-karp.html</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Rooms Full of People" is good.]]></description>
<dc:subject>data_mining national_surveillance_state to_teach:data-mining have_read</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:a0a558670d10/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:national_surveillance_state"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:have_read"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://www.nature.com/articles/s41386-021-01020-7">
    <title>Systematic misestimation of machine learning performance in neuroimaging studies of depression | Neuropsychopharmacology</title>
    <dc:date>2021-06-11T18:03:12+00:00</dc:date>
    <link>https://www.nature.com/articles/s41386-021-01020-7</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["We currently observe a disconcerting phenomenon in machine learning studies in psychiatry: While we would expect larger samples to yield better results due to the availability of more data, larger machine learning studies consistently show much weaker performance than the numerous small-scale studies. Here, we systematically investigated this effect focusing on one of the most heavily studied questions in the field, namely the classification of patients suffering from Major Depressive Disorder (MDD) and healthy controls based on neuroimaging data. Drawing upon structural MRI data from a balanced sample of N = 1868 MDD patients and healthy controls from our recent international Predictive Analytics Competition (PAC), we first trained and tested a classification model on the full dataset which yielded an accuracy of 61%. Next, we mimicked the process by which researchers would draw samples of various sizes (N = 4 to N = 150) from the population and showed a strong risk of misestimation. Specifically, for small sample sizes (N = 20), we observe accuracies of up to 95%. For medium sample sizes (N = 100) accuracies up to 75% were found. Importantly, further investigation showed that sufficiently large test sets effectively protect against performance misestimation whereas larger datasets per se do not. While these results question the validity of a substantial part of the current literature, we outline the relatively low-cost remedy of larger test sets, which is readily available in most cases."

--- I haven't read the paper yet so there might be alternative explanations, but I can't help noting that this is 100% consistent with the most cynical possible interpretation of [http://bactra.org/weblog/698.html].]]></description>
<dc:subject>to:NB neural_data_analysis statistics classifiers to_teach:data-mining re:neutral_model_of_inquiry data_mining</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:a31d8ee83a33/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:neural_data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:classifiers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:re:neutral_model_of_inquiry"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2105.14364">
    <title>[2105.14364] Graph Similarity Description: How Are These Graphs Similar?</title>
    <dc:date>2021-06-01T17:48:31+00:00</dc:date>
    <link>https://arxiv.org/abs/2105.14364</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["How do social networks differ across platforms? How do information networks change over time? Answering questions like these requires us to compare two or more graphs. This task is commonly treated as a measurement problem, but numerical answers give limited insight. Here, we argue that if the goal is to gain understanding, we should treat graph similarity assessment as a description problem instead. We formalize this problem as a model selection task using the Minimum Description Length principle, capturing the similarity of the input graphs in a common model and the differences between them in transformations to individual models. To discover good models, we propose Momo, which breaks the problem into two parts and introduces efficient algorithms for each. Through an extensive set of experiments on a wide range of synthetic and real-world graphs, we confirm that Momo works well in practice."]]></description>
<dc:subject>to:NB data_mining network_data_analysis re:network_differences statistics</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:ec0f72bf02dc/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:network_data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:re:network_differences"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://www.cambridge.org/9781108845359">
    <title>Deep learning in science | Pattern recognition and machine learning | Cambridge University Press</title>
    <dc:date>2021-05-28T16:34:18+00:00</dc:date>
    <link>https://www.cambridge.org/9781108845359</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["This is the first rigorous, self-contained treatment of the theory of deep learning. Starting with the foundations of the theory and building it up, this is essential reading for any scientists, instructors, and students interested in artificial intelligence and deep learning. It provides guidance on how to think about scientific questions, and leads readers through the history of the field and its fundamental connections to neuroscience. The author discusses many applications to beautiful problems in the natural sciences, in physics, chemistry, and biomedicine. Examples include the search for exotic particles and dark matter in experimental physics, the prediction of molecular properties and reaction outcomes in chemistry, and the prediction of protein structures and the diagnostic analysis of biomedical images in the natural sciences. The text is accompanied by a full set of exercises at different difficulty levels and encourages out-of-the-box thinking."

--- Baldi has been around long enough that I am curious to see his take.

]]></description>
<dc:subject>books:noted neural_networks data_mining statistics baldi.pierre your_favorite_deep_neural_network_sucks books:have_suggested_to_library books:in_library downloaded in_NB</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:5f32ee3469e2/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:books:noted"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:neural_networks"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:baldi.pierre"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:your_favorite_deep_neural_network_sucks"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:books:have_suggested_to_library"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:books:in_library"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:downloaded"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:in_NB"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2105.07020">
    <title>[2105.07020] Urban Analytics: History, Trajectory, and Critique</title>
    <dc:date>2021-05-18T13:55:44+00:00</dc:date>
    <link>https://arxiv.org/abs/2105.07020</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Urban analytics combines spatial analysis, statistics, computer science, and urban planning to understand and shape city futures. While it promises better policymaking insights, concerns exist around its epistemological scope and impacts on privacy, ethics, and social control. This chapter reflects on the history and trajectory of urban analytics as a scholarly and professional discipline. In particular, it considers the direction in which this field is going and whether it improves our collective and individual welfare. It first introduces early theories, models, and deductive methods from which the field originated before shifting toward induction. It then explores urban network analytics that enrich traditional representations of spatial interaction and structure. Next it discusses urban applications of spatiotemporal big data and machine learning. Finally, it argues that privacy and ethical concerns are too often ignored as ubiquitous monitoring and analytics can empower social repression. It concludes with a call for a more critical urban analytics that recognizes its epistemological limits, emphasizes human dignity, and learns from and supports marginalized communities."]]></description>
<dc:subject>to:NB data_mining cities schweitzer.lisa batty.michael</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:260376c6ecdd/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:cities"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:schweitzer.lisa"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:batty.michael"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://www.nber.org/papers/w28811">
    <title>AI Adoption and System-Wide Change | NBER</title>
    <dc:date>2021-05-17T14:40:30+00:00</dc:date>
    <link>https://www.nber.org/papers/w28811</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Analyses of AI adoption focus on its adoption at the individual task level. What has received significantly less attention is how AI adoption is shaped by the fact that organisations are composed of many interacting tasks. AI adoption may, therefore, require system-wide change which is both a constraint and an opportunity. We provide the first formal analysis where multiple tasks may be part of a modular or non-modular system. We find that reliance on AI, a prediction tool, increases decision variation which, in turn, raises challenges if decisions across the organisation interact. Modularity, which leads to task independence rather than system-level inter-dependencies, softens that impact. Thus, modularity can facilitate AI adoption. However, it does this at the expense of synergies. By contrast, when there are mechanisms for inter-decision coordination, AI adoption is enhanced when there is a non-modular environment. Consequently, we show that there are important cases where AI adoption will be enhanced when it can be adopted beyond tasks but as part of a designed organisational system."

--- Deliberately not tagged "artificial_intelligence".]]></description>
<dc:subject>to:NB economics data_mining to_teach:data-mining organizations</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:29d9f6d466a9/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:economics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:organizations"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://www.sup.org/books/title/?id=32597">
    <title>The AI Marketing Canvas: A Five-Stage Road Map to Implementing Artificial Intelligence in Marketing | Raj Venkatesan and Jim Lecinski</title>
    <dc:date>2021-05-17T13:01:26+00:00</dc:date>
    <link>https://www.sup.org/books/title/?id=32597</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["This book offers a direct, actionable plan CMOs can use to map out initiatives that are properly sequenced and designed for success—regardless of where their marketing organization is in the process.
"The authors pose the following critical questions to marketers: (1) How should modern marketers be thinking about artificial intelligence and machine learning? and (2) How should marketers be developing a strategy and plan to implement AI into their marketing toolkit?
"The opening chapters provide marketing leaders with an overview of what exactly AI is and how is it different than traditional computer science approaches. Venkatesan and Lecinski, then, propose a best-practice, five-stage framework for implementing what they term the "AI Marketing Canvas." Their approach is based on research and interviews they conducted with leading marketers, and offers many tangible examples of what brands are doing at each stage of the AI Marketing Canvas. By way of guidance, Venkatesan and Lecinski provide examples of brands—including Google, Lyft, Ancestry.com, and Coca-Cola—that have successfully woven AI into their marketing strategies. The book concludes with a discussion of important implications for marketing leaders—for your team and culture."

--- I am quite sure that I would find this book equally horrifying in aims, content and style, but I am also equally sure that a fair chunk of my students will end up working for these people.]]></description>
<dc:subject>in_NB books:noted marketing data_mining deceiving_us_has_become_an_industrial_process to_teach:data-mining</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:be76d8874382/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:in_NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:books:noted"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:marketing"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:deceiving_us_has_become_an_industrial_process"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2104.10443">
    <title>[2104.10443] Interpretability of machine-learning models in physical sciences</title>
    <dc:date>2021-04-22T15:27:48+00:00</dc:date>
    <link>https://arxiv.org/abs/2104.10443</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["In machine learning (ML), it is in general challenging to provide a detailed explanation on how a trained model arrives at its prediction. Thus, usually we are left with a black-box, which from a scientific standpoint is not satisfactory. Even though numerous methods have been recently proposed to interpret ML models, somewhat surprisingly, interpretability in ML is far from being a consensual concept, with diverse and sometimes contrasting motivations for it. Reasonable candidate properties of interpretable models could be model transparency (i.e. how does the model work?) and post hoc explanations (i.e., what else can the model tell me?). Here, I review the current debate on ML interpretability and identify key challenges that are specific to ML applied to materials science."]]></description>
<dc:subject>to:NB data_mining equations_of_motion_from_a_time_series</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:21b982acac32/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:equations_of_motion_from_a_time_series"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://www.nber.org/papers/w23673">
    <title>Opportunities and Challenges: Lessons from Analyzing Terabytes of Scanner Data | NBER</title>
    <dc:date>2021-04-11T03:31:25+00:00</dc:date>
    <link>https://www.nber.org/papers/w23673</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["This paper seeks to better understand what makes big data analysis different, what we can and cannot do with existing econometric tools, and what issues need to be dealt with in order to work with the data efficiently. As a case study, I set out to extract any business cycle information that might exist in four terabytes of weekly scanner data. The main challenge is to handle the volume, variety, and characteristics of the data within the constraints of our computing environment. Scalable and efficient algorithms are available to ease the computation burden, but they often have unknown statistical properties and are not designed for the purpose of efficient estimation or optimal inference. As well, economic data have unique characteristics that generic algorithms may not accommodate. There is a need for computationally efficient econometric methods as big data is likely here to stay."]]></description>
<dc:subject>to:NB data_mining macroeconomics ng.serena heard_the_talk to_read re:your_favorite_dsge_sucks</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:59ca1ea7aed8/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:macroeconomics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ng.serena"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:heard_the_talk"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:re:your_favorite_dsge_sucks"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2103.05766">
    <title>[2103.05766] Interpretable Machines: Constructing Valid Prediction Intervals with Random Forests</title>
    <dc:date>2021-03-21T18:40:32+00:00</dc:date>
    <link>https://arxiv.org/abs/2103.05766</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["An important issue when using Machine Learning algorithms in recent research is the lack of interpretability. Although these algorithms provide accurate point predictions for various learning problems, uncertainty estimates connected with point predictions are rather sparse. A contribution to this gap for the Random Forest Regression Learner is presented here. Based on its Out-of-Bag procedure, several parametric and non-parametric prediction intervals are provided for Random Forest point predictions and theoretical guarantees for its correct coverage probability is delivered. In a second part, a thorough investigation through Monte-Carlo simulation is conducted evaluating the performance of the proposed methods from three aspects: (i) Analyzing the correct coverage rate of the proposed prediction intervals, (ii) Inspecting interval width and (iii) Verifying the competitiveness of the proposed intervals with existing methods. The simulation yields that the proposed prediction intervals are robust towards non-normal residual distributions and are competitive by providing correct coverage rates and comparably narrow interval lengths, even for comparably small samples."]]></description>
<dc:subject>to:NB confidence_sets prediction statistics random_forests ensemble_methods data_mining to_teach:data-mining</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:fdba38e0badc/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:confidence_sets"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:prediction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:random_forests"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://yalebooks.yale.edu/book/9780300209570/atlas-ai">
    <title>Atlas of AI | Yale University Press</title>
    <dc:date>2021-02-28T02:35:08+00:00</dc:date>
    <link>https://yalebooks.yale.edu/book/9780300209570/atlas-ai</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["What happens when artificial intelligence saturates political life and depletes the planet? How is AI shaping our understanding of ourselves and our societies? In this book Kate Crawford reveals how this planetary network is fueling a shift toward undemocratic governance and increased inequality. Drawing on more than a decade of research, award-winning science, and technology, Crawford reveals how AI is a technology of extraction: from the energy and minerals needed to build and sustain its infrastructure, to the exploited workers behind “automated” services, to the data AI collects from us. 
"Rather than taking a narrow focus on code and algorithms, Crawford offers us a political and a material perspective on what it takes to make artificial intelligence and where it goes wrong. While technical systems present a veneer of objectivity, they are always systems of power. This is an urgent account of what is at stake as technology companies use artificial intelligence to reshape the world."]]></description>
<dc:subject>books:noted data_mining to_teach:data-mining via:henry_farrell coveted algorithmic_fairness in_NB downloaded</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:62f1b55fef86/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:books:noted"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:via:henry_farrell"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:coveted"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:algorithmic_fairness"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:in_NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:downloaded"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://doi.org/10.5210/fm.v19i7.4901">
    <title>Engineering the public: Big data, surveillance and computational politics | First Monday</title>
    <dc:date>2021-02-04T18:52:17+00:00</dc:date>
    <link>https://doi.org/10.5210/fm.v19i7.4901</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Digital technologies have given rise to a new combination of big data and computational practices which allow for massive, latent data collection and sophisticated computational modeling, increasing the capacity of those with resources and access to use these tools to carry out highly effective, opaque and unaccountable campaigns of persuasion and social engineering in political, civic and commercial spheres. I examine six intertwined dynamics that pertain to the rise of computational politics: the rise of big data, the shift away from demographics to individualized targeting, the opacity and power of computational modeling, the use of persuasive behavioral science, digital media enabling dynamic real-time experimentation, and the growth of new power brokers who own the data or social media environments. I then examine the consequences of these new mechanisms on the public sphere and political campaigns."

--- I would insert some qualifications about how well targeted advertising / persuasion actually works, currently, but this holds up very well for something published in 2014 (and apparently mostly written in 2012?).  Somewhat surprised I didn't bookmark it back then.]]></description>
<dc:subject>in_NB data_mining advertising re:actually-dr-internet-is-the-name-of-the-monsters-creator tufekci.zeynep to_teach:data-mining have_read</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:1f37d36add25/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:in_NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:advertising"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:re:actually-dr-internet-is-the-name-of-the-monsters-creator"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:tufekci.zeynep"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:have_read"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://www.cambridge.org/core/journals/data-and-policy/article/from-satisficing-to-artificing-the-evolution-of-administrative-decisionmaking-in-the-age-of-the-algorithm/8962400DADAC3C740AC023A20B38E285">
    <title>From satisficing to artificing: The evolution of administrative decision-making in the age of the algorithm | Data &amp; Policy | Cambridge Core</title>
    <dc:date>2021-02-04T15:27:49+00:00</dc:date>
    <link>https://www.cambridge.org/core/journals/data-and-policy/article/from-satisficing-to-artificing-the-evolution-of-administrative-decisionmaking-in-the-age-of-the-algorithm/8962400DADAC3C740AC023A20B38E285</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Algorithmic decision tools (ADTs) are being introduced into public sector organizations to support more accurate and consistent decision-making. Whether they succeed turns, in large part, on how administrators use these tools. This is one of the first empirical studies to explore how ADTs are being used by Street Level Bureaucrats (SLBs). The author develops an original conceptual framework and uses in-depth interviews to explore whether SLBs are ignoring ADTs (algorithm aversion); deferring to ADTs (automation bias); or using ADTs together with their own judgment (an approach the author calls “artificing”). Interviews reveal that artificing is the most common use-type, followed by aversion, while deference is rare. Five conditions appear to influence how practitioners use ADTs: (a) understanding of the tool (b) perception of human judgment (c) seeing value in the tool (d) being offered opportunities to modify the tool (e) alignment of tool with expectations."]]></description>
<dc:subject>data_mining ethnography algorithmic_fairness to_teach:data-mining in_NB</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:bbe1b34260a8/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ethnography"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:algorithmic_fairness"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:in_NB"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://doi.org/10.1093/oso/9780190684099.001.0001">
    <title>Predict and Surveil: Data, Discretion, and the Future of Policing - Oxford Scholarship</title>
    <dc:date>2021-01-16T05:17:01+00:00</dc:date>
    <link>https://doi.org/10.1093/oso/9780190684099.001.0001</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA[10.1093/oso/9780190684099.001.0001]]></description>
<dc:subject>in_NB books:noted crime prediction data_mining police downloaded</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:e1883fd38699/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:in_NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:books:noted"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:crime"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:prediction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:police"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:downloaded"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://doi.org/10.1093/oso/9780198864165.001.0001">
    <title>Phantom Pattern Problem: The Mirage of Big Data - Oxford Scholarship</title>
    <dc:date>2021-01-16T04:09:24+00:00</dc:date>
    <link>https://doi.org/10.1093/oso/9780198864165.001.0001</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Pattern recognition prowess served our ancestors well. However, today we are confronted by a deluge of data that are far more abstract, complicated, and difficult to interpret than were annual seasons and the sounds of predators. The number of possible patterns that can be identified relative to the number that are genuinely useful has grown exponentially—which means that the chances that a discovered pattern is useful is rapidly approaching zero. Coincidental streaks, clusters, and correlations are the norm—not the exception. Our challenge is to overcome our inherited inclination to think that all patterns are meaningful. Computer algorithms can easily identify an essentially unlimited number of phantom patterns and relationships that vanish when confronted with fresh data. The paradox of big data is that the more data we ransack for patterns, the more likely it is that what we find will be worthless. Our challenge is to overcome our inherited inclination to think that all patterns are meaningful."

--- Last tag because I doubt there's anything new here (but that may not be the point --- perhaps this is intended as more expository / popularizing?)]]></description>
<dc:subject>to:NB books:noted multiple_testing high-dimensional_statistics statistics data_mining color_me_skeptical</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:8072b64fafe5/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:books:noted"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:multiple_testing"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:high-dimensional_statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:color_me_skeptical"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://nowpublishers.com/article/Details/MAL-078-3">
    <title>now publishers - Data Analytics on Graphs Part III: Machine Learning on Graphs, from Graph Topology to Applications</title>
    <dc:date>2021-01-14T19:23:00+00:00</dc:date>
    <link>https://nowpublishers.com/article/Details/MAL-078-3</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Modern data analytics applications on graphs often operate on domains where graph topology is not known a priori, and hence its determination becomes part of the problem definition, rather than serving as prior knowledge which aids the problem solution. Part III of this monograph starts by a comprehensive account of ways to learn the pertinent graph topology, ranging from the simplest case where the physics of the problem already suggest a possible graph structure, through to general cases where the graph structure is to be learned from the data observed on a graph. A particular emphasis is placed on the use of standard “relationship measures” in this context, including the correlation and precision matrices, together with the ways to combine these with the available prior knowledge and structural conditions, such as the smoothness of the graph signals or sparsity of graph connections. Next, for learning sparse graphs (that is, graphs with a small number of edges), the utility of the least absolute shrinkage and selection operator, known as LASSO is addressed, along with its graph specific variant, the graphical LASSO. For completeness, both variants of LASSO are derived in an intuitive way, starting from basic principles. An in-depth elaboration of the graph topology learning paradigm is provided through examples on physically well defined graphs, such as electric circuits, linear heat transfer, social and computer networks, and spring-mass systems. We also review main trends in graph neural networks (GNN) and graph convolutional networks (GCN) from the perspective of graph signal filtering. Particular insight is given to the role of diffusion processes over graphs, to show that GCNs can be understood from the graph diffusion perspective. Given the largely heuristic nature of the existing GCNs, their treatment through graph diffusion processes may also serve as a basis for new designs of GCNs. 
Tensor representation of lattice-structured graphs is next considered, and it is shown that tensors (multidimensional data arrays) can be treated as a special class of graph signals, whereby the graph vertices reside on a high-dimensional regular lattice structure. The concept of graph tensor networks then provides a unifying framework for learning on irregular domains. This part of monograph concludes with an in-depth account of emerging applications in financial data processing and underground transportation network modeling. By means of portfolio cuts of an asset graph, we show how domain knowledge can be meaningfully incorporated into investment analysis. In the underground transportation example, we demonstrate how graph theory can be used to identify those stations in the London underground network which have the greatest influence on the functionality of the traffic, and proceed, in an innovative way, to assess the impact of a station closure on service levels across the city."]]></description>
<dc:subject>to:NB data_mining network_data_analysis</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:aaa4585b482d/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:network_data_analysis"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://nowpublishers.com/article/Details/MAL-078-2">
    <title>now publishers - Data Analytics on Graphs Part II: Signals on Graphs</title>
    <dc:date>2021-01-14T19:21:27+00:00</dc:date>
    <link>https://nowpublishers.com/article/Details/MAL-078-2</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["The area of Data Analytics on graphs deals with information processing of data acquired on irregular but structured graph domains. The focus of Part I of this monograph has been on both the fundamental and higher-order graph properties, graph topologies, and spectral representations of graphs. Part I also establishes rigorous frameworks for vertex clustering and graph segmentation, and illustrates the power of graphs in various data association tasks. Part II embarks on these concepts to address the algorithmic and practical issues related to data/signal processing on graphs, with the focus on the analysis and estimation of both deterministic and random data on graphs. The fundamental ideas related to graph signals are introduced through a simple and intuitive, yet general enough case study of multisensor temperature field estimation. The concept of systems on graph is defined using graph signal shift operators, which generalize the corresponding principles from traditional learning systems. At the core of the spectral domain representation of graph signals and systems is the Graph Fourier Transform (GFT), defined based on the eigendecomposition of both the adjacency matrix and the graph Laplacian. Spectral domain representations are then used as the basis to introduce graph signal filtering concepts and address their design, including Chebyshev series polynomial approximation. Ideas related to the sampling of graph signals, and in particular the challenging topic of data dimensionality reduction through graph subsampling, are presented and further linked with compressive sensing. The principles of time-varying signals on graphs and basic definitions related to random graph signals are next reviewed. 
Localized graph signal analysis in the joint vertex-spectral domain is referred to as the vertex-frequency analysis, since it can be considered as an extension of classical time-frequency analysis to the graph serving as signal domain. Important aspects of the local graph Fourier transform (LGFT) are covered, together with its various forms including the graph spectral and vertex domain windows and the inversion conditions and relations. A link between the LGFT with a varying spectral window and the spectral graph wavelet transform (SGWT) is also established. Realizations of the LGFT and SGWT using polynomial (Chebyshev) approximations of the spectral functions are further considered and supported by examples. Finally, energy versions of the vertex-frequency representations are introduced, along with their relations with classical time-frequency analysis, including a vertex-frequency distribution that can satisfy the marginal properties. The material is supported by illustrative examples."

]]></description>
<dc:subject>to:NB data_mining graph_theory network_data_analysis fourier_analysis</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:2f8de14b2961/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:graph_theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:network_data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:fourier_analysis"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://nowpublishers.com/article/Details/MAL-078-1">
    <title>now publishers - Data Analytics on Graphs Part I: Graphs and Spectra on Graphs</title>
    <dc:date>2021-01-14T19:20:33+00:00</dc:date>
    <link>https://nowpublishers.com/article/Details/MAL-078-1</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["The area of Data Analytics on graphs promises a paradigm shift, as we approach information processing of new classes of data which are typically acquired on irregular but structured domains (such as social networks, various ad-hoc sensor networks). Yet, despite the long history of Graph Theory, current approaches tend to focus on aspects of optimisation of graphs themselves rather than on eliciting strategies relevant to the objective application of the graph paradigm, such as detection, estimation, statistical and probabilistic inference, clustering and separation from signals and data acquired on graphs. In order to bridge this gap, we first revisit graph topologies from a Data Analytics point of view, to establish a taxonomy of graph networks through a linear algebraic formalism of graph topology (vertices, connections, directivity). This serves as a basis for spectral analysis of graphs, whereby the eigenvalues and eigenvectors of graph Laplacian and adjacency matrices are shown to convey physical meaning related to both graph topology and higher-order graph properties, such as cuts, walks, paths, and neighborhoods. Through a number of carefully chosen examples, we demonstrate that the isomorphic nature of graphs enables both the basic properties of data observed on graphs and their descriptors (features) to be preserved throughout the data analytics process, even in the case of reordering of graph vertices, where classical approaches fail. Next, to illustrate the richness and flexibility of estimation strategies performed on graph signals, spectral analysis of graphs is introduced through eigenanalysis of mathematical descriptors of graphs and in a generic way. 
Finally, benefiting from enhanced degrees of freedom associated with graph representations, a framework for vertex clustering and graph segmentation is established based on graph spectral representation (eigenanalysis) which demonstrates the power of graphs in various data association tasks, from image clustering and segmentation through to low-dimensional manifold representation. The supporting examples demonstrate the promise of Graph Data Analytics in modeling structural and functional/semantic inferences. At the same time, Part I serves as a basis for Part II and Part III which deal with theory, methods and applications of processing Data on Graphs and Graph Topology Learning from data."]]></description>
<dc:subject>to:NB graph_theory data_mining network_data_analysis spectral_methods</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:bfd5d06856f6/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:graph_theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:network_data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:spectral_methods"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2101.04715">
    <title>[2101.04715] A unified framework for correlation mining in ultra-high dimension</title>
    <dc:date>2021-01-14T16:00:40+00:00</dc:date>
    <link>https://arxiv.org/abs/2101.04715</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["An important problem in large scale inference is the identification of variables that have large correlations or partial correlations. Recent work has yielded breakthroughs in the ultra-high dimensional setting when the sample size n is fixed and the dimension p→∞ ([Hero, Rajaratnam 2011, 2012]). Despite these advances, the correlation screening framework suffers from some serious practical, methodological and theoretical deficiencies. For instance, theoretical safeguards for partial correlation screening requires that the population covariance matrix be block diagonal. This block sparsity assumption is however highly restrictive in numerous practical applications. As a second example, results for correlation and partial correlation screening framework requires the estimation of dependence measures or functionals, which can be highly prohibitive computationally. In this paper, we propose a unifying approach to correlation and partial correlation mining which specifically goes beyond the block diagonal correlation structure, thus yielding a methodology that is suitable for modern applications. By making connections to random geometric graphs, the number of highly correlated or partial correlated variables are shown to have novel compound Poisson finite-sample characterizations, which hold for both the finite p case and when p→∞. The unifying framework also demonstrates an important duality between correlation and partial correlation screening with important theoretical and practical consequences."]]></description>
<dc:subject>to:NB high-dimensional_statistics data_mining hero.alfred_o.</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:c37308da648f/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:high-dimensional_statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:hero.alfred_o."/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://sociologicalscience.com/articles-v8-2-26/">
    <title>Which Data Fairly Differentiate? American Views on the Use of Personal Data in Two Market Settings | Sociological Science</title>
    <dc:date>2021-01-14T15:15:48+00:00</dc:date>
    <link>https://sociologicalscience.com/articles-v8-2-26/</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Corporations increasingly use personal data to offer individuals different products and prices. I present first-of-its-kind evidence about how U.S. consumers assess the fairness of companies using personal information in this way. Drawing on a nationally representative survey that asks respondents to rate how fair or unfair it is for car insurers and lenders to use various sorts of information—from credit scores to web browser history to residential moves—I find that everyday Americans make strong moral distinctions among types of data, even when they are told data predict consumer behavior (insurance claims and loan defaults, respectively). Open-ended responses show that people adjudicate fairness by drawing on shared understandings of whether data are logically related to the predicted outcome and whether the categories companies use conflate morally distinct individuals. These findings demonstrate how dynamics long studied by economic sociologists manifest in legitimating a new and important mode of market allocation."]]></description>
<dc:subject>sociology prediction data_mining algorithmic_fairness to_teach:data-mining in_NB</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:d2f1aee86353/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:sociology"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:prediction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:algorithmic_fairness"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:in_NB"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1711.10427">
    <title>[1711.10427] Latent Association Mining in Binary Data</title>
    <dc:date>2021-01-11T16:34:05+00:00</dc:date>
    <link>https://arxiv.org/abs/1711.10427</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["We consider the problem of identifying stable sets of mutually associated features in moderate or high-dimensional binary data. In this context we develop and investigate a method called Latent Association Mining for Binary Data (LAMB). The LAMB method is based on a simple threshold model in which the observed binary values represent a random thresholding of a latent continuous vector that may have a complex association structure. We consider a measure of latent association that quantifies association in the latent continuous vector without bias due to the random thresholding. The LAMB method uses an iterative testing based search procedure to identify stable sets of mutually associated features. We compare the LAMB method with several competing methods on artificial binary-valued datasets and two real count-valued datasets. The LAMB method detects meaningful associations in these datasets. In the case of the count-valued datasets, associations detected by the LAMB method are based only on information about whether the counts are zero or non-zero, and is competitive with methods that have access to the full count data."]]></description>
<dc:subject>to:NB data_mining nobel.andrew to_teach:data-mining</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:ef1c23d8da18/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:nobel.andrew"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2012.12802">
    <title>[2012.12802] Machine Learning Advances for Time Series Forecasting</title>
    <dc:date>2020-12-24T15:33:23+00:00</dc:date>
    <link>https://arxiv.org/abs/2012.12802</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["In this paper we survey the most recent advances in supervised machine learning and high-dimensional models for time series forecasting. We consider both linear and nonlinear alternatives. Among the linear methods we pay special attention to penalized regressions and ensemble of models. The nonlinear methods considered in the paper include shallow and deep neural networks, in their feed-forward and recurrent versions, and tree-based methods, such as random forests and boosted trees. We also consider ensemble and hybrid models by combining ingredients from different alternatives. Tests for superior predictive ability are briefly reviewed. Finally, we discuss application of machine learning in economics and finance and provide an illustration with high-frequency financial data."]]></description>
<dc:subject>to:NB time_series prediction data_mining decision_trees random_forests neural_networks ensemble_methods to_teach:data_over_space_and_time</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:30ad1b56b9ba/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:time_series"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:prediction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:decision_trees"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:random_forests"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:neural_networks"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:ensemble_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data_over_space_and_time"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2012.08496">
    <title>[2012.08496] Spectral Methods for Data Science: A Statistical Perspective</title>
    <dc:date>2020-12-16T15:13:22+00:00</dc:date>
    <link>https://arxiv.org/abs/2012.08496</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Spectral methods have emerged as a simple yet surprisingly effective approach for extracting information from massive, noisy and incomplete data. In a nutshell, spectral methods refer to a collection of algorithms built upon the eigenvalues (resp. singular values) and eigenvectors (resp. singular vectors) of some properly designed matrices constructed from data. A diverse array of applications have been found in machine learning, data science, and signal processing. Due to their simplicity and effectiveness, spectral methods are not only used as a stand-alone estimator, but also frequently employed to initialize other more sophisticated algorithms to improve performance.
"While the studies of spectral methods can be traced back to classical matrix perturbation theory and methods of moments, the past decade has witnessed tremendous theoretical advances in demystifying their efficacy through the lens of statistical modeling, with the aid of non-asymptotic random matrix theory. This monograph aims to present a systematic, comprehensive, yet accessible introduction to spectral methods from a modern statistical perspective, highlighting their algorithmic implications in diverse large-scale applications. In particular, our exposition gravitates around several central questions that span various applications: how to characterize the sample efficiency of spectral methods in reaching a target level of statistical accuracy, and how to assess their stability in the face of random noise, missing data, and adversarial corruptions? In addition to conventional ℓ2 perturbation analysis, we present a systematic ℓ∞ and ℓ2,∞ perturbation theory for eigenspace and singular subspaces, which has only recently become available owing to a powerful "leave-one-out" analysis framework."]]></description>
<dc:subject>to:NB statistics spectral_methods data_mining approximation linear_algebra to_teach:data-mining</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:6c3188aa8c89/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:spectral_methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:approximation"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:linear_algebra"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://www.lse.ac.uk/cpnss/research/genetically-evolving-models-in-science">
    <title>Genetically Evolving Models in Science (GEMS)</title>
    <dc:date>2020-12-12T22:50:06+00:00</dc:date>
    <link>https://www.lse.ac.uk/cpnss/research/genetically-evolving-models-in-science</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["The development of scientific models suffers from two related problems: the ever-growing number of experimental results and scientists’ cognitive limitations (including cognitive biases). This multidisciplinary project (psychology, philosophy, computer modelling, computer science and cognitive neuroscience) addresses these problems by developing a novel methodology for generating scientific models automatically. The methodology is not specific to any particular discipline and can be applied to any science where experimental data are available. The method treats models as computer programs and evolves a population of models using genetic programming. The extent to which the models fit the empirical data is used as a fitness function. The best models – potentially modified by cross-over and mutation – are selected for the next generation. Pilot simulations have established the validity of the methodology with simple experiments in psychology."

--- ETA a little bit later.  I tracked down the only paper from this group I can find, [http://eprints.lse.ac.uk/106996/], and ohhh boy.  A quick scan shows absolutely no mention of symbolic regression, or any other work on, precisely, using genetic programming to evolve models to fit data.  (I also can't quite figure out their loss function.)  There seems to be no training/testing split, just implicit capacity control by limiting the tree used to transform inputs into predictions to a small number of nodes.
Much as I enjoyed revisiting the spirit of SFI summer school projects c. 1998 (bliss it was in that dawn to be alive), I find this a striking contrast between ambition and execution.  But then again, being strictly fair, it's only a preliminary paper.]]></description>
<dc:subject>track_down_references genetic_algorithms curve_fitting data_mining its_lems_world_we_just_live_in_it symbolic_regression</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:f91e6e5ea694/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:track_down_references"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:genetic_algorithms"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:curve_fitting"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:its_lems_world_we_just_live_in_it"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:symbolic_regression"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://www.cambridge.org/9781108477444">
    <title>Small summaries for big data | Knowledge management, databases and data mining | Cambridge University Press</title>
    <dc:date>2020-11-30T17:32:10+00:00</dc:date>
    <link>https://www.cambridge.org/9781108477444</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["The massive volume of data generated in modern applications can overwhelm our ability to conveniently transmit, store, and index it. For many scenarios, building a compact summary of a dataset that is vastly smaller enables flexibility and efficiency in a range of queries over the data, in exchange for some approximation. This comprehensive introduction to data summarization, aimed at practitioners and students, showcases the algorithms, their behavior, and the mathematical underpinnings of their operation. The coverage starts with simple sums and approximate counts, building to more advanced probabilistic structures such as the Bloom Filter, distinct value summaries, sketches, and quantile summaries. Summaries are described for specific types of data, such as geometric data, graphs, and vectors and matrices. The authors offer detailed descriptions of and pseudocode for key algorithms that have been incorporated in systems from companies such as Google, Apple, Microsoft, Netflix and Twitter."]]></description>
<dc:subject>to:NB books:noted random_projections locality-sensitive_hashing dimension_reduction clustering data_mining computational_statistics to_teach:data-mining books:in_library books:have_suggested_to_library downloaded re:codename:catherine_wheel</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:8fb436ef1b8f/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:books:noted"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:random_projections"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:locality-sensitive_hashing"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:dimension_reduction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:clustering"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:computational_statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:books:in_library"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:books:have_suggested_to_library"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:downloaded"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:re:codename:catherine_wheel"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://www.nytimes.com/2020/01/20/opinion/facial-recognition-ban-privacy.html">
    <title>We’re Banning Facial Recognition. We’re Missing the Point.</title>
    <dc:date>2020-11-27T05:28:52+00:00</dc:date>
    <link>https://www.nytimes.com/2020/01/20/opinion/facial-recognition-ban-privacy.html</link>
    <dc:creator>cshalizi</dc:creator><dc:subject>facial_recognition data_mining privacy algorithmic_fairness schneier.bruce</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:3435da20fc1c/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:facial_recognition"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:privacy"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:algorithmic_fairness"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:schneier.bruce"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3727562">
    <title>Democratic Data: A Relational Theory For Data Governance by Salome Viljoen :: SSRN</title>
    <dc:date>2020-11-26T16:03:32+00:00</dc:date>
    <link>https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3727562</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Data governance law — the law regulating how data about people is collected, processed, and used — is the subject of lively theorizing. Concerns over datafication (the transformation of information or knowledge about people into a commodity) and its harmful personal and social effects have produced an abundance of proposals for reform. Different theories advance different legal interests in information, resulting in various individualist claims and remedies. Some seek to reassert individual control for data subjects over the terms of their datafication, while others aim to maximize data subject financial gain. But these proposals share a common conceptual flaw: they miss the central importance of population-level relations among individuals for how data collection produces both social value and social harm. The data collection practices of the most powerful technology companies are primarily aimed at deriving population-level insights from data subjects for population-level applicability, not individual-level insights specific to the data subject in question. Put simply, the point of data production is to put people into population-based relations with one another; this activity drives data collection practices in the digital economy and results in some of the most pressing forms of social informational harm. Individualist data subject rights cannot represent, let alone address, these population-level effects.
"Treating data’s population-level effects as central to the task of data governance opens up new terrain. The proper aim of data governance is not to reassert individual control over the terms of one’s own datafication or to maximize personal gain, but instead to develop the institutional responses necessary to represent the relevant population-level interests at stake in data production. This shifts the task of reform from granting individuals rights to exit or payment, to securing recognition and standing to shape the purposes and conditions of data production for those with interests at stake in such choices. From this reorientation, data governance law may develop legal reforms capable of responding to the harms of datafication without foreclosing socially beneficial forms of data production."]]></description>
<dc:subject>law data_mining privacy in_NB</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:1ca2a6473ed5/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:law"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:privacy"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:in_NB"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2011.11483">
    <title>[2011.11483] Social Determinants of Recidivism: A Machine Learning Solution</title>
    <dc:date>2020-11-25T14:32:55+00:00</dc:date>
    <link>https://arxiv.org/abs/2011.11483</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["In this study, we propose advancements in criminal justice analytics along three dimensions. First, for the long-standing problem of recidivism risk assessment, we shift the focus from predicting the likelihood of recidivism to identifying its underlying determinants within distinct subgroups. Second, to achieve this, we introduce a machine learning pipeline that combines unsupervised and supervised techniques to identify homogeneous clusters of individuals and find statistically significant determinants of recidivism within each cluster. We demonstrate useful heuristics to address key challenges in this pipeline related to parameter selection and data processing. Third, we use these results to compare outcomes across subgroups, enabling a more nuanced understanding of the root factors that lead to differences in recidivism. Overall, this approach aims to explore new ways of addressing long-standing criminal justice challenges, providing a reliable framework for informed policy intervention."]]></description>
<dc:subject>to:NB crime prediction data_mining color_me_skeptical</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:2e5c63e6f42a/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:crime"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:prediction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:color_me_skeptical"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1909.10832">
    <title>[1909.10832] High-dimensional clustering via Random Projections</title>
    <dc:date>2020-11-25T14:19:10+00:00</dc:date>
    <link>https://arxiv.org/abs/1909.10832</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["In this work, we address the unsupervised classification issue by exploiting the general idea of Random Projection Ensemble. Specifically, we propose to generate a set of low dimensional independent random projections and to perform model-based clustering on each of them. The top B∗ projections, i.e. the projections which show the best grouping structure are then retained. The final partition is obtained by aggregating the clusters found in the projections via consensus. The performances of the method are assessed on both real and simulated datasets. The obtained results suggest that the proposal represents a promising tool for high-dimensional clustering."]]></description>
<dc:subject>to:NB clustering random_projections data_mining</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:814eb4c5041a/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:clustering"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:random_projections"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://www.annualreviews.org/doi/abs/10.1146/annurev-soc-121919-054621">
    <title>Computational Social Science and Sociology | Annual Review of Sociology</title>
    <dc:date>2020-11-19T22:13:26+00:00</dc:date>
    <link>https://www.annualreviews.org/doi/abs/10.1146/annurev-soc-121919-054621</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["The integration of social science with computer science and engineering fields has produced a new area of study: computational social science. This field applies computational methods to novel sources of digital data such as social media, administrative records, and historical archives to develop theories of human behavior. We review the evolution of this field within sociology via bibliometric analysis and in-depth analysis of the following subfields where this new work is appearing most rapidly: (a) social network analysis and group formation; (b) collective behavior and political sociology; (c) the sociology of knowledge; (d) cultural sociology, social psychology, and emotions; (e) the production of culture; (f) economic sociology and organizations; and (g) demography and population studies. Our review reveals that sociologists are not only at the center of cutting-edge research that addresses longstanding questions about human behavior but also developing new lines of inquiry about digital spaces as well. We conclude by discussing challenging new obstacles in the field, calling for increased attention to sociological theory, and identifying new areas where computational social science might be further integrated into mainstream sociology."]]></description>
<dc:subject>to:NB sociology data_mining data_analysis network_data_analysis bail.christopher</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:2d69b6950b0e/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:sociology"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:network_data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:bail.christopher"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://ieeexplore.ieee.org/document/8861141">
    <title>How Much Does Your Data Exploration Overfit? Controlling Bias via Information Usage - IEEE Journals &amp; Magazine</title>
    <dc:date>2020-11-16T16:05:49+00:00</dc:date>
    <link>https://ieeexplore.ieee.org/document/8861141</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Modern data is messy and high-dimensional, and it is often not clear a priori what are the right questions to ask. Instead, the analyst typically needs to use the data to search for interesting analyses to perform and hypotheses to test. This is an adaptive process, where the choice of analysis to be performed next depends on the results of the previous analyses on the same data. Ultimately, which results are reported can be heavily influenced by the data. It is widely recognized that this process, even if well-intentioned, can lead to biases and false discoveries, contributing to the crisis of reproducibility in science. But while any data-exploration renders standard statistical theory invalid, experience suggests that different types of exploratory analysis can lead to disparate levels of bias, and the degree of bias also depends on the particulars of the data set. In this paper, we propose a general information usage framework to quantify and provably bound the bias and other error metrics of an arbitrary exploratory analysis. We prove that our mutual information based bound is tight in natural settings, and then use it to give rigorous insights into when commonly used procedures do or do not lead to substantially biased estimation. Through the lens of information usage, we analyze the bias of specific exploration procedures such as filtering, rank selection and clustering. Our general framework also naturally motivates randomization techniques that provably reduce exploration bias while preserving the utility of the data analysis. We discuss the connections between our approach and related ideas from differential privacy and blinded data analysis, and supplement our results with illustrative simulations."

--- Pretty sure I've previously bookmarked a pre-print.]]></description>
<dc:subject>to:NB to_read data_analysis data_mining model_selection statistics post-model-selection_inference to_teach:linear_models to_teach:undergrad-ADA to_teach:data-mining</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:3ef24ab591ab/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:model_selection"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:post-model-selection_inference"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:linear_models"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:undergrad-ADA"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://pubsonline.informs.org/doi/pdf/10.1287/mksc.2019.1188">
    <title>How Effective Is Third-Party Consumer Profiling? Evidence from Field Studies</title>
    <dc:date>2020-11-11T15:29:11+00:00</dc:date>
    <link>https://pubsonline.informs.org/doi/pdf/10.1287/mksc.2019.1188</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Data brokers often use online browsing records to create digital consumer
profiles that they sell to marketers as predefined audiences for ad targeting. However, this
process is a “black box”—little is known about the reliability of the digital profiles that are
created or of the audience identification provided by buying platforms. In this paper, we
investigate using three field tests the accuracy of a variety of demographic and audience-interest segments. We examine the accuracy of more than 90 third-party audiences across
19 data brokers. Audience segments vary greatly in quality and are often inaccurate across
leading data brokers. In comparison with random audience selection, the use of black box
data profiles, on average, increased identification of a user with a desired single attribute
by 0%–77%. Audience identification can be improved, on average, by 123% when combined with optimization software. However, given the high extra costs of targeting solutions and the relative inaccuracy, we find that third-party audiences are often economically unattractive except for higher-priced media placements."]]></description>
<dc:subject>to:NB to_read advertising data_mining to_teach:data-mining</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:a05c464a09d0/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:advertising"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://www.wired.com/story/ad-tech-could-be-the-next-internet-bubble/">
    <title>Ad Tech Could Be the Next Internet Bubble | WIRED</title>
    <dc:date>2020-11-06T14:32:04+00:00</dc:date>
    <link>https://www.wired.com/story/ad-tech-could-be-the-next-internet-bubble/</link>
    <dc:creator>cshalizi</dc:creator><dc:subject>why_oh_why_cant_we_have_a_better_press_corps advertising networked_life internet data_mining track_down_references our_decrepit_institutions to_teach:data-mining</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:90e222793ad3/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:why_oh_why_cant_we_have_a_better_press_corps"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:advertising"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:networked_life"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:internet"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:track_down_references"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:our_decrepit_institutions"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://press.princeton.edu/books/ebook/9780691200002/metrics-at-work">
    <title>Metrics at Work | Princeton University Press</title>
    <dc:date>2020-10-23T02:15:10+00:00</dc:date>
    <link>https://press.princeton.edu/books/ebook/9780691200002/metrics-at-work</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["When the news moved online, journalists suddenly learned what their audiences actually liked, through algorithmic technologies that scrutinize web traffic and activity. Has this advent of audience metrics changed journalists’ work practices and professional identities? In Metrics at Work, Angèle Christin documents the ways that journalists grapple with audience data in the form of clicks, and analyzes how new forms of clickbait journalism travel across national borders.
"Drawing on four years of fieldwork in web newsrooms in the United States and France, including more than one hundred interviews with journalists, Christin reveals many similarities among the media groups examined—their editorial goals, technological tools, and even office furniture. Yet she uncovers crucial and paradoxical differences in how American and French journalists understand audience analytics and how these affect the news produced in each country. American journalists routinely disregard traffic numbers and primarily rely on the opinion of their peers to define journalistic quality. Meanwhile, French journalists fixate on internet traffic and view these numbers as a sign of their resonance in the public sphere. Christin offers cultural and historical explanations for these disparities, arguing that distinct journalistic traditions structure how journalists make sense of digital measurements in the two countries.
"Contrary to the popular belief that analytics and algorithms are globally homogenizing forces, Metrics at Work shows that computational technologies can have surprisingly divergent ramifications for work and organizations worldwide."]]></description>
<dc:subject>to:NB books:noted journalism data_mining networked_life books:in_library books:have_suggested_to_library downloaded</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:1eade603ff41/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:books:noted"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:journalism"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:networked_life"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:books:in_library"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:books:have_suggested_to_library"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:downloaded"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://mitpress.mit.edu/books/data-action">
    <title>Data Action | The MIT Press</title>
    <dc:date>2020-09-21T03:55:27+00:00</dc:date>
    <link>https://mitpress.mit.edu/books/data-action</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Big data can be used for good—from tracking disease to exposing human rights violations—and for bad: implementing surveillance and control. Data inevitably represents the ideologies of those who control its use; data analytics and algorithms too often exclude women, the poor, and ethnic groups. In Data Action, Sarah Williams provides a guide for working with data in more ethical and responsible ways. Williams outlines a method that emphasizes collaboration among data scientists, policy experts, data designers, and the public. The approach generates policy debates, influences civic decisions, and informs design to help ensure that the voices of people represented in the data are neither marginalized nor left unheard."]]></description>
<dc:subject>books:noted data_mining to_download in_NB</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:ca32b0c5f33c/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:books:noted"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_download"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:in_NB"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://www.jstor.org/stable/j.ctt13x0hch">
    <title>The Black Box Society: The Secret Algorithms That Control Money and Information on JSTOR</title>
    <dc:date>2020-08-17T15:20:38+00:00</dc:date>
    <link>https://www.jstor.org/stable/j.ctt13x0hch</link>
    <dc:creator>cshalizi</dc:creator><dc:subject>books:noted downloaded data_mining finance to_read to_teach:data-mining in_NB</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:98fede5cdf9c/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:books:noted"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:downloaded"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:finance"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:in_NB"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://www.pnas.org/content/117/15/8398">
    <title>Measuring the predictability of life outcomes with a scientific mass collaboration | PNAS</title>
    <dc:date>2020-07-29T15:17:33+00:00</dc:date>
    <link>https://www.pnas.org/content/117/15/8398</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["How predictable are life trajectories? We investigated this question with a scientific mass collaboration using the common task method; 160 teams built predictive models for six life outcomes using data from the Fragile Families and Child Wellbeing Study, a high-quality birth cohort study. Despite using a rich dataset and applying machine-learning methods optimized for prediction, the best predictions were not very accurate and were only slightly better than those from a simple benchmark model. Within each outcome, prediction error was strongly associated with the family being predicted and weakly associated with the technique used to generate the prediction. Overall, these results suggest practical limits to the predictability of life outcomes in some settings and illustrate the value of mass collaborations in the social sciences."]]></description>
<dc:subject>to:NB prediction data_mining social_science_methodology sociology to_teach:data-mining</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:75b10a8bff25/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:prediction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:social_science_methodology"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:sociology"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://law.stanford.edu/publications/ethnic-bias-in-big-data-analytics-how-private-biases-can-migrate-into-public-policy/">
    <title>When Algorithms Import Private Bias into Public Enforcement: The Promise and Limitations of Statistical Debiasing Solutions - Journal Article - Stanford Law School</title>
    <dc:date>2020-07-28T18:40:16+00:00</dc:date>
    <link>https://law.stanford.edu/publications/ethnic-bias-in-big-data-analytics-how-private-biases-can-migrate-into-public-policy/</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["We make two contributions to understanding the role of algorithms in regulatory enforcement. First, we illustrate how big-data analytics can inadvertently import private biases into public policy. We show that a much-hyped use of predictive analytics – using consumer data to target food-safety enforcement – can disproportionately harm Asian establishments. Second, we study a solution by Pope and Sydnor (2011), which aims to debias predictors via marginalization, while still using information of contested predictors. We find the solution may be limited when protected groups have distinct predictor distributions, due to model extrapolation. Common machine-learning techniques heighten these problems."]]></description>
<dc:subject>data_mining algorithmic_fairness to_teach:data-mining in_NB</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:e4e4e6824a03/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:algorithmic_fairness"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:in_NB"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://global.oup.com/academic/product/predict-and-surveil-9780190684099">
    <title>Predict and Surveil - Sarah Brayne - Oxford University Press</title>
    <dc:date>2020-07-26T20:24:27+00:00</dc:date>
    <link>https://global.oup.com/academic/product/predict-and-surveil-9780190684099</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["The scope of criminal justice surveillance, from the police to the prisons, has expanded rapidly in recent decades. At the same time, the use of big data has spread across a range of fields, including finance, politics, health, and marketing. While law enforcement's use of big data is hotly contested, very little is known about how the police actually use it in daily operations and with what consequences.
"In Predict and Surveil, Sarah Brayne offers an unprecedented, inside look at how police use big data and new surveillance technologies, leveraging on-the-ground fieldwork with one of the most technologically advanced law enforcement agencies in the world—the Los Angeles Police Department. Drawing on original interviews and ethnographic observations from over two years of fieldwork with the LAPD, Brayne examines the causes and consequences of big data and algorithmic control. She reveals how the police use predictive analytics and new surveillance technologies to deploy resources, identify criminal suspects, and conduct investigations; how the adoption of big data analytics transforms police organizational practices; and how the police themselves respond to these new data-driven practices. While big data analytics has the potential to reduce bias, increase efficiency, and improve prediction accuracy, Brayne argues that it also reproduces and deepens existing patterns of inequality, threatens privacy, and challenges civil liberties."
]]></description>
<dc:subject>books:noted police prediction data_mining to_teach:data-mining via:? books:owned in_NB books:suggest_to_library</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:ef0f9f67a0de/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:books:noted"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:police"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:prediction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:via:?"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:books:owned"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:in_NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:books:suggest_to_library"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://cloud.ibm.com/docs/personality-insights?topic=personality-insights-science">
    <title>The science behind the service [IBM Watson Personality Insights]</title>
    <dc:date>2020-07-16T18:57:42+00:00</dc:date>
    <link>https://cloud.ibm.com/docs/personality-insights?topic=personality-insights-science</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA[Words on Twitter -> Glove embedding -> prediction of Big 5 personality scores (et al.).

Correlations of about 0.3 for English in their samples where they actually administered personality tests.  No details about how the subjects were recruited, or even whether this is the correlation on a test set or on a training set.


--- ETA: At the bottom of the document, they admit that "While the correlation between inferred and survey-based scores is both positive and significant, the results imply that inferred scores might not always correlate with survey-based results", and give three references to three non-IBM publications which (supposedly) "conducted experiments to compare how well inferred scores match scores obtained from surveys".  The third of these is: "Mairesse and Walker (2006) reported 60- to 70-percent accuracy for Big Five personality prediction."  I was intrigued by what 60% accuracy would mean for continuous vectors, so I followed their link; this reported both regression results and classifications where the threshold between the classes was, for each personality dimension, set at the median.  What's remarkable there is that _most_ of the reported results are not marked as statistically significant improvements over the baseline (always predict the mean or always predict the more common class).  For self-reported speech, they do 40 hypothesis tests and get 2 (!) significant improvements over baseline results at the 5% level.  For corpora judged by others, the models do better; over-all I count 130 hypothesis tests and 36 results significant at the 5% level, so it's not _quite_ a case for the neutral model of inquiry, but it's Pluto's Republic all the way down.]]></description>
<dc:subject>personality_tests text_mining data_mining trapped_in_plutos_republic to:blog re:career_advising_in_plutos_republic</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:643a68d73f09/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:personality_tests"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:text_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:trapped_in_plutos_republic"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:blog"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:re:career_advising_in_plutos_republic"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://www.pnas.org/content/early/2020/07/13/1920484117">
    <title>Predicting personality from patterns of behavior collected with smartphones | PNAS</title>
    <dc:date>2020-07-16T15:59:43+00:00</dc:date>
    <link>https://www.pnas.org/content/early/2020/07/13/1920484117</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Smartphones enjoy high adoption rates around the globe. Rarely more than an arm’s length away, these sensor-rich devices can easily be repurposed to collect rich and extensive records of their users’ behaviors (e.g., location, communication, media consumption), posing serious threats to individual privacy. Here we examine the extent to which individuals’ Big Five personality dimensions can be predicted on the basis of six different classes of behavioral information collected via sensor and log data harvested from smartphones. Taking a machine-learning approach, we predict personality at broad domain (r_median = 0.37) and narrow facet levels (r_median = 0.40) based on behavioral data collected from 624 volunteers over 30 consecutive days (25,347,089 logging events). Our cross-validated results reveal that specific patterns in behaviors in the domains of 1) communication and social behavior, 2) music consumption, 3) app usage, 4) mobility, 5) overall phone activity, and 6) day- and night-time activity are distinctively predictive of the Big Five personality traits. The accuracy of these predictions is similar to that found for predictions based on digital footprints from social media platforms and demonstrates the possibility of obtaining information about individuals’ private traits from behavioral patterns passively collected from their smartphones. Overall, our results point to both the benefits (e.g., in research settings) and dangers (e.g., privacy implications, psychological targeting) presented by the widespread collection and modeling of behavioral data obtained from smartphones."]]></description>
<dc:subject>to:NB personality_tests data_mining i_really_dont_want_to_read_this_but_maybe_i_should to_teach:data-mining</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:6dcd8646579b/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:personality_tests"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:i_really_dont_want_to_read_this_but_maybe_i_should"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://wwnorton.com/books/9780393634846/about-the-book/description">
    <title>A Scheme of Heaven | Alexander Boxer | W. W. Norton &amp; Company</title>
    <dc:date>2020-07-15T21:19:14+00:00</dc:date>
    <link>https://wwnorton.com/books/9780393634846/about-the-book/description</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["Humans are pattern-matching creatures, and astrology is the universe’s grandest pattern-matching game. In this refreshing work of history and analysis, data scientist Alexander Boxer examines classical texts on astrology to expose its underlying scientific and mathematical framework. Astrology, he argues, was the ancient world’s most ambitious applied mathematics problem, a monumental data-analysis enterprise sustained by some of history’s most brilliant minds, from Ptolemy to al-Kindi to Kepler.
"Thousands of years ago, astrologers became the first to stumble upon the powerful storytelling possibilities inherent in numerical data. To correlate the configurations of the cosmos with our day-to-day lives, astrologers relied upon a “scheme of heaven,” or horoscope, showing the precise configuration of the planets at a particular instant in time as viewed from a particular place on Earth. Although recognized as pseudoscience today, horoscopes were once considered a cutting-edge scientific tool. Boxer teaches us how to read these esoteric charts—and appreciate the complex astronomical calculations needed to generate them—by diagramming how the heavens appeared at important moments in astrology’s history, from the assassination of Julius Caesar as viewed from Rome to the Apollo 11 lunar landing as seen from the surface of the Moon. He then puts these horoscopes to the test using modern data sets and statistical science, arguing that today’s data scientists do work similar to astrologers of yore. By looking back at the algorithms of ancient astrology, he suggests, we can better recognize the patterns that are timeless characteristics of our own pattern-matching tendencies."

--- I am reminded of how, when I tried to explain my research on optimal prediction to AEO, she said "Oh, like Cardano!" and gave me Grafton's book.]]></description>
<dc:subject>to:NB books:noted astrology data_mining to_teach:data-mining history_of_science books:owned books:suggest_to_library</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:f8bac23f61e6/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to:NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:books:noted"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:astrology"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:history_of_science"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:books:owned"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:books:suggest_to_library"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://wwnorton.com/books/9781631496103">
    <title>If Then: How the Simulmatics Corporation Invented the Future | Jill Lepore | W. W. Norton &amp; Company</title>
    <dc:date>2020-07-15T20:18:10+00:00</dc:date>
    <link>https://wwnorton.com/books/9781631496103</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["The Simulmatics Corporation, founded in 1959, mined data, targeted voters, accelerated news, manipulated consumers, destabilized politics, and disordered knowledge—decades before Facebook, Google, Amazon, and Cambridge Analytica. Silicon Valley likes to imagine that it has no past, but the scientists of Simulmatics are the long-dead grandfathers of Mark Zuckerberg and Elon Musk. Borrowing from psychological warfare, they used computers to predict and direct human behavior, deploying their “People Machine” from New York, Cambridge, and Saigon for clients that included John Kennedy’s presidential campaign, the New York Times, Young & Rubicam, and, during the Vietnam War, the Department of Defense.
"Jill Lepore, distinguished Harvard historian and New Yorker staff writer, unearthed from archives the almost unbelievable story of this long-vanished corporation, and of the women hidden behind it. In the 1950s and 1960s, Lepore argues, Simulmatics invented the future by building the machine in which the world now finds itself trapped and tormented, algorithm by algorithm."]]></description>
<dc:subject>books:noted history_of_technology data_mining lepore.jill the_present_before_it_was_widely_distributed books:owned in_NB books:suggest_to_library</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:fa4e3695a53f/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:books:noted"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:history_of_technology"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:lepore.jill"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:the_present_before_it_was_widely_distributed"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:books:owned"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:in_NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:books:suggest_to_library"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://www-wsj-com.cdn.ampproject.org/v/s/www.wsj.com/amp/articles/ai-isnt-magical-and-wont-help-you-reopen-your-business-11590811201?usqp=mq331AQFKAGwASA%3D&amp;amp_js_v=0.1#referrer=https%3A%2F%2Fwww.google.com&amp;amp_tf=From%20%251%24s&amp;ampshare=https%3A%2F%2Fwww.wsj.com%2Farticles%2Fai-isnt-magical-and-wont-help-you-reopen-your-business-11590811201">
    <title>AI Isn’t Magical and Won’t Help You Reopen Your Business - WSJ</title>
    <dc:date>2020-07-13T17:02:18+00:00</dc:date>
    <link>https://www-wsj-com.cdn.ampproject.org/v/s/www.wsj.com/amp/articles/ai-isnt-magical-and-wont-help-you-reopen-your-business-11590811201?usqp=mq331AQFKAGwASA%3D&amp;amp_js_v=0.1#referrer=https%3A%2F%2Fwww.google.com&amp;amp_tf=From%20%251%24s&amp;ampshare=https%3A%2F%2Fwww.wsj.com%2Farticles%2Fai-isnt-magical-and-wont-help-you-reopen-your-business-11590811201</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA[TL;DR: Regression models are distribution-dependent and aren't reliable after distributional shift.]]></description>
<dc:subject>have_read data_mining to_teach:data-mining coronavirus_pandemic_of_2019--</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:ef307c5b5efa/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:have_read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:coronavirus_pandemic_of_2019--"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://www.nytimes.com/2020/06/24/technology/facial-recognition-arrest.html?action=click&amp;module=Top%20Stories&amp;pgtype=Homepage">
    <title>Wrongfully Accused by an Algorithm - The New York Times</title>
    <dc:date>2020-06-25T16:05:42+00:00</dc:date>
    <link>https://www.nytimes.com/2020/06/24/technology/facial-recognition-arrest.html?action=click&amp;module=Top%20Stories&amp;pgtype=Homepage</link>
    <dc:creator>cshalizi</dc:creator><dc:subject>classifiers data_mining police utter_stupidity to_teach:data-mining</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:b4f394d2fc25/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:classifiers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:police"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:utter_stupidity"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://academic.oup.com/jla/article/doi/10.1093/jla/laz001/5476086">
    <title>Discrimination in the Age of Algorithms | Journal of Legal Analysis | Oxford Academic</title>
    <dc:date>2020-06-12T17:34:19+00:00</dc:date>
    <link>https://academic.oup.com/jla/article/doi/10.1093/jla/laz001/5476086</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["The law forbids discrimination. But the ambiguity of human decision-making often makes it hard for the legal system to know whether anyone has discriminated. To understand how algorithms affect discrimination, we must understand how they affect the detection of discrimination. With the appropriate requirements in place, algorithms create the potential for new forms of transparency and hence opportunities to detect discrimination that are otherwise unavailable. The specificity of algorithms also makes transparent tradeoffs among competing values. This implies algorithms are not only a threat to be regulated; with the right safeguards, they can be a potential positive force for equity."]]></description>
<dc:subject>algorithmic_fairness inequality data_mining to_teach:statistics_of_inequality_and_discrimination via:chouldechova discrimination in_NB</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:e4061487a1b8/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:algorithmic_fairness"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:inequality"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:statistics_of_inequality_and_discrimination"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:via:chouldechova"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:discrimination"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:in_NB"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://www.buzzfeednews.com/article/ryanmac/clearview-ai-fbi-ice-global-law-enforcement">
    <title>Clearview AI's Facial Recognition Tech Is Being Used By The Justice Department, ICE, And The FBI</title>
    <dc:date>2020-04-14T17:36:39+00:00</dc:date>
    <link>https://www.buzzfeednews.com/article/ryanmac/clearview-ai-fbi-ice-global-law-enforcement</link>
    <dc:creator>cshalizi</dc:creator><dc:subject>surveillance data_mining to_teach:data-mining</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:1a6d10afc48e/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:surveillance"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://papers.ssrn.com/sol3/papers.cfm?abstract_id=2477899">
    <title>Big Data's Disparate Impact by Solon Barocas, Andrew D. Selbst :: SSRN</title>
    <dc:date>2020-04-08T15:58:29+00:00</dc:date>
    <link>https://papers.ssrn.com/sol3/papers.cfm?abstract_id=2477899</link>
    <dc:creator>cshalizi</dc:creator><dc:subject>to_read algorithmic_fairness prediction data_mining to_teach:data-mining to_teach:statistics_of_inequality_and_discrimination law in_NB</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:9ab505518792/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:algorithmic_fairness"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:prediction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:statistics_of_inequality_and_discrimination"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:law"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:in_NB"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2002.05193">
    <title>[2002.05193] A Hierarchy of Limitations in Machine Learning</title>
    <dc:date>2020-02-19T18:55:01+00:00</dc:date>
    <link>https://arxiv.org/abs/2002.05193</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA[""All models are wrong, but some are useful", wrote George E. P. Box (1979). Machine learning has focused on the usefulness of probability models for prediction in social systems, but is only now coming to grips with the ways in which these models are wrong---and the consequences of those shortcomings. This paper attempts a comprehensive, structured overview of the specific conceptual, procedural, and statistical limitations of models in machine learning when applied to society. Machine learning modelers themselves can use the described hierarchy to identify possible failure points and think through how to address them, and consumers of machine learning models can know what to question when confronted with the decision about if, where, and how to apply machine learning. The limitations go from commitments inherent in quantification itself, through to showing how unmodeled dependencies can lead to cross-validation being overly optimistic as a way of assessing model performance."]]></description>
<dc:subject>to_read prediction data_mining kith_and_kin in_NB malik.momin_m.</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:07675ced4632/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:prediction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:kith_and_kin"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:in_NB"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:malik.momin_m."/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://hdsr.mitpress.mit.edu/pub/56lnenzj">
    <title>Should We Trust Algorithms? · Harvard Data Science Review</title>
    <dc:date>2020-02-10T16:24:42+00:00</dc:date>
    <link>https://hdsr.mitpress.mit.edu/pub/56lnenzj</link>
    <dc:creator>cshalizi</dc:creator><description><![CDATA["There is increasing use of algorithms in the health care and criminal justice systems, and corresponding increased concern with their ethical use. But perhaps a more basic issue is whether we should believe what we hear about them and what the algorithm tells us. It is illuminating to distinguish between the trustworthiness of claims made about an algorithm, and those made by an algorithm, which reveals the potential contribution of statistical science to both evaluation and ‘intelligent transparency.’ In particular, a four-phase evaluation structure is proposed, parallel to that adopted for pharmaceuticals."]]></description>
<dc:subject>algorithmic_fairness statistics data_mining spiegelhalter.david to_teach:data-mining to_teach:statistics_of_inequality_and_discrimination in_NB</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:cshalizi/b:ab9f2582428e/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:algorithmic_fairness"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:data_mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:spiegelhalter.david"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:to_teach:statistics_of_inequality_and_discrimination"/>
	<rdf:li rdf:resource="https://pinboard.in/u:cshalizi/t:in_NB"/>
</rdf:Bag></taxo:topics>
</item>
</rdf:RDF>