<?xml version="1.0" encoding="UTF-8"?>
 <rdf:RDF xmlns="http://purl.org/rss/1.0/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:cc="http://web.resource.org/cc/" xmlns:syn="http://purl.org/rss/1.0/modules/syndication/" xmlns:admin="http://webns.net/mvcb/">
  <channel rdf:about="http://pinboard.in">
    <title>Pinboard (Vaguery)</title>
    <link>https://pinboard.in/u:Vaguery/public/</link>
    <description>recent bookmarks from Vaguery</description>
    <items>
      <rdf:Seq>	<rdf:li rdf:resource="https://arxiv.org/abs/2210.02580"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2205.14430"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1903.02915"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1804.10168"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2110.12261"/>
	<rdf:li rdf:resource="https://www.tidyverse.org/"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2005.00497"/>
	<rdf:li rdf:resource="https://github.com/noprompt/meander"/>
	<rdf:li rdf:resource="http://astrobiology.com/2021/04/inferring-exoplanet-disequilibria-with-multivariate-information-in-atmospheric-reaction-networks.html"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2012.11780"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/2002.00937"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1612.02487"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1810.02016"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1803.08625"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1901.06758"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1712.07381"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1811.06912"/>
	<rdf:li rdf:resource="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5411762/"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1812.05225"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1612.07545"/>
	<rdf:li rdf:resource="https://pudding.cool/2017/05/song-repetition/"/>
	<rdf:li rdf:resource="https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0198341"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1710.00992"/>
	<rdf:li rdf:resource="https://en.wikipedia.org/wiki/Fr%C3%A9chet_distance"/>
	<rdf:li rdf:resource="https://paulromer.net/jupyter-mathematica-and-the-future-of-the-research-paper/index.html"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1712.05630"/>
	<rdf:li rdf:resource="https://twitter.com/RadarAndStuff/status/1043575880054988800"/>
	<rdf:li rdf:resource="https://www.biorxiv.org/content/early/2018/09/19/421842"/>
	<rdf:li rdf:resource="http://trent.st/ffx/"/>
	<rdf:li rdf:resource="https://jasmcole.com/2018/01/16/cooking-the-books/"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1709.07097"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1602.01241"/>
	<rdf:li rdf:resource="http://caleydo.org/tools/upset/"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1606.06159"/>
	<rdf:li rdf:resource="https://jasmcole.com/2017/04/08/extractor-attractor/#more-76737"/>
	<rdf:li rdf:resource="http://www.stat.columbia.edu/~gelman/research/published/objectivityr5.pdf"/>
	<rdf:li rdf:resource="https://en.wikipedia.org/wiki/Abstract_simplicial_complex"/>
	<rdf:li rdf:resource="https://github.com/appliedtopology/javaplex/wiki/Tutorial"/>
	<rdf:li rdf:resource="https://en.wikipedia.org/wiki/Topological_data_analysis"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1104.5557"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1501.01573"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1608.04048"/>
	<rdf:li rdf:resource="http://www.sciencedirect.com/science/article/pii/S1751157712001034"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1609.08827"/>
	<rdf:li rdf:resource="https://research.googleblog.com/2016/12/open-sourcing-embedding-projector-tool.html"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1606.00856"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1602.03926"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1607.06274"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1605.08749"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1603.04626"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1605.07030"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1606.01081"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1511.05271"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1605.03373"/>
	<rdf:li rdf:resource="http://thenewinquiry.com/essays/view-from-nowhere/"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1601.07996"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1206.1386"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1503.03332"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1505.05211"/>
	<rdf:li rdf:resource="https://github.com/donnemartin/data-science-ipython-notebooks"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1511.01343"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1505.01866"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1507.06988"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1506.07800"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1408.3600"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1311.1911"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1406.7349"/>
	<rdf:li rdf:resource="http://vcg.github.io/upset/about/#"/>
	<rdf:li rdf:resource="http://biorxiv.org/content/early/2015/01/10/013623"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1403.6804"/>
      </rdf:Seq>
    </items>
  </channel><item rdf:about="https://arxiv.org/abs/2210.02580">
    <title>[2210.02580] Functional Labeled Optimal Partitioning</title>
    <dc:date>2025-04-17T14:14:29+00:00</dc:date>
    <link>https://arxiv.org/abs/2210.02580</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Peak detection is a problem in sequential data analysis that involves differentiating regions with higher counts (peaks) from regions with lower counts (background noise).
It is crucial to correctly predict areas that deviate from the background noise, in both the train and test sets of labels.
Dynamic programming changepoint algorithms have been proposed to solve the peak detection problem by constraining the mean to alternatively increase and then decrease.
The current constrained changepoint algorithms only create predictions on the test set, while completely ignoring the train set.
Changepoint algorithms that are both accurate when fitting the train set, and make predictions on the test set, have been proposed but not in the context of peak detection models.
We propose to resolve these issues by creating a new dynamic programming algorithm, FLOPART, that has zero train label errors, and is able to provide highly accurate predictions on the test set.
We provide an empirical analysis that shows FLOPART has a similar time complexity while being more accurate than the existing algorithms in terms of train and test label errors.
]]></description>
<dc:subject>machine-learning time-series algorithms rather-interesting data-analysis to-understand to-write-about consider:data-labeling consider:feature-discovery</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:96c4dc95518d/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:machine-learning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:time-series"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:algorithms"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:rather-interesting"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:to-understand"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:to-write-about"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:consider:data-labeling"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:consider:feature-discovery"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2205.14430">
    <title>[2205.14430] Angle-Uniform Parallel Coordinates</title>
    <dc:date>2024-05-07T11:14:00+00:00</dc:date>
    <link>https://arxiv.org/abs/2205.14430</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[We present angle-uniform parallel coordinates, a data-independent technique that deforms the image plane of parallel coordinates so that the angles of linear relationships between two variables are linearly mapped along the horizontal axis of the parallel coordinates plot. Despite being a common method for visualizing multidimensional data, parallel coordinates are ineffective for revealing positive correlations since the associated parallel coordinates points of such structures may be located at infinity in the image plane and the asymmetric encoding of negative and positive correlations may lead to unreliable estimations. To address this issue, we introduce a transformation that bounds all points horizontally using an angle-uniform mapping and shrinks them vertically in a structure-preserving fashion; polygonal lines become smooth curves and a symmetric representation of data correlations is achieved. We further propose a combined subsampling and density visualization approach to reduce visual clutter caused by overdrawing. Our method enables accurate visual pattern interpretation of data correlations, and its data-independent nature makes it applicable to all multidimensional datasets. The usefulness of our method is demonstrated using examples of synthetic and real-world datasets.
]]></description>
<dc:subject>data-analysis visualization parallel-coordinates multiobjective-optimization rather-interesting statistics data-science scientific-communication</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:5ad8f735baab/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:visualization"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:parallel-coordinates"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:multiobjective-optimization"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:rather-interesting"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-science"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:scientific-communication"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1903.02915">
    <title>[1903.02915] jMetalPy: a Python Framework for Multi-Objective Optimization with Metaheuristics</title>
    <dc:date>2022-05-14T11:25:08+00:00</dc:date>
    <link>https://arxiv.org/abs/1903.02915</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[This paper describes jMetalPy, an object-oriented Python-based framework for multi-objective optimization with metaheuristic techniques. Building upon our experiences with the well-known jMetal framework, we have developed a new multi-objective optimization software platform aiming not only at replicating the former one in a different programming language, but also at taking advantage of the full feature set of Python, including its facilities for fast prototyping and the large amount of available libraries for data processing, data analysis, data visualization, and high-performance computing. As a result, jMetalPy provides an environment for solving multi-objective optimization problems focused not only on traditional metaheuristics, but also on techniques supporting preference articulation and dynamic problems, along with a rich set of features related to the automatic generation of statistical data from the results generated, as well as the real-time and interactive visualization of the Pareto front approximations produced by the algorithms. jMetalPy offers additionally support for parallel computing in multicore and cluster systems. We include some use cases to explore the main features of jMetalPy and to illustrate how to work with it.
]]></description>
<dc:subject>python data-analysis visualization multiobjective-optimization rather-interesting</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:ce628f7bddfd/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:python"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:visualization"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:multiobjective-optimization"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:rather-interesting"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1804.10168">
    <title>[1804.10168] BEST : A decision tree algorithm that handles missing values</title>
    <dc:date>2022-01-29T12:52:52+00:00</dc:date>
    <link>https://arxiv.org/abs/1804.10168</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[The main contribution of this paper is the development of a new decision tree algorithm. The proposed approach allows users to guide the algorithm through the data partitioning process. We believe this feature has many applications but in this paper we demonstrate how to utilize this algorithm to analyse data sets containing missing values. We tested our algorithm against simulated data sets with various missing data structures and a real data set. The results demonstrate that this new classification procedure efficiently handles missing values and produces results that are slightly more accurate and more interpretable than most common procedures without any imputations or pre-processing.
]]></description>
<dc:subject>decision-trees statistics machine-learning rather-interesting missing-values data-analysis to-write-about to-understand consider:symbolic-regression</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:a8fdaa105b2e/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:decision-trees"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:machine-learning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:rather-interesting"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:missing-values"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:to-write-about"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:to-understand"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:consider:symbolic-regression"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2110.12261">
    <title>[2110.12261] espiownage: Tracking Transients in Steelpan Drum Strikes Using Surveillance Technology</title>
    <dc:date>2022-01-23T12:26:10+00:00</dc:date>
    <link>https://arxiv.org/abs/2110.12261</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[We present an improvement in the ability to meaningfully track features in high speed videos of Caribbean steelpan drums illuminated by Electronic Speckle Pattern Interferometry (ESPI). This is achieved through the use of up-to-date computer vision libraries for object detection and image segmentation as well as a significant effort toward cleaning the dataset previously used to train systems for this application. Besides improvements on previous metric scores by 10% or more, noteworthy in this project are the introduction of a segmentation-regression map for the entire drum surface yielding interference fringe counts comparable to those obtained via object detection, as well as the accelerated workflow for coordinating the data-cleaning-and-model-training feedback loop for rapid iteration allowing this project to be conducted on a timescale of only 18 days.
]]></description>
<dc:subject>music signal-processing interferometry experimental-physics acoustics rather-interesting looking-to-see data-analysis</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:1d4ed4e54a68/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:music"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:signal-processing"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:interferometry"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:experimental-physics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:acoustics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:rather-interesting"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:looking-to-see"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://www.tidyverse.org/">
    <title>Tidyverse</title>
    <dc:date>2021-10-28T14:23:31+00:00</dc:date>
    <link>https://www.tidyverse.org/</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[The tidyverse is an opinionated collection of R packages designed for data science. All packages share an underlying design philosophy, grammar, and data structures.

]]></description>
<dc:subject>R data-analysis software-development-is-not-programming to-understand library statistics visualization</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:f35f64dd76b2/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:R"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:software-development-is-not-programming"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:to-understand"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:library"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:visualization"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2005.00497">
    <title>[2005.00497] The Grammar of Interactive Explanatory Model Analysis</title>
    <dc:date>2021-08-01T12:06:17+00:00</dc:date>
    <link>https://arxiv.org/abs/2005.00497</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[The growing need for in-depth analysis of predictive models leads to a series of new methods for explaining their local and global properties. Which of these methods is the best? It turns out that this is an ill-posed question. One cannot sufficiently explain a black-box machine learning model using a single method that gives only one perspective. Isolated explanations are prone to misunderstanding, which inevitably leads to wrong or simplistic reasoning. This problem is known as the Rashomon effect and refers to diverse, even contradictory interpretations of the same phenomenon. Surprisingly, the majority of methods developed for explainable machine learning focus on a single aspect of the model behavior. In contrast, we showcase the problem of explainability as an interactive and sequential analysis of a model. This paper presents how different Explanatory Model Analysis (EMA) methods complement each other and why it is essential to juxtapose them together. The introduced process of Interactive EMA (IEMA) derives from the algorithmic side of explainable machine learning and aims to embrace ideas developed in cognitive sciences. We formalize the grammar of IEMA to describe potential human-model dialogues. IEMA is implemented in the human-centered framework that adopts interactivity, customizability and automation as its main traits. Combined, these methods enhance the responsible approach to predictive modeling.
]]></description>
<dc:subject>machine-learning the-mangle-in-practice rather-interesting explainability user-experience interactivity statistics data-analysis</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:52f512d386c1/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:machine-learning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:the-mangle-in-practice"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:rather-interesting"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:explainability"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:user-experience"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:interactivity"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://github.com/noprompt/meander">
    <title>noprompt/meander: Tools for transparent data transformation</title>
    <dc:date>2021-05-18T22:15:46+00:00</dc:date>
    <link>https://github.com/noprompt/meander</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Meander is a Clojure/ClojureScript library that empowers you to write transparent data transformation code that allows you to plainly see the input and output of these transformations.

The latest version of the library can be found at the following link.

]]></description>
<dc:subject>clojure libraries data-analysis formal-languages rather-interesting no-really to-understand ontology type-theory</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:7d70cfe8dddd/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:clojure"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:libraries"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:formal-languages"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:rather-interesting"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:no-really"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:to-understand"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:ontology"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:type-theory"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://astrobiology.com/2021/04/inferring-exoplanet-disequilibria-with-multivariate-information-in-atmospheric-reaction-networks.html">
    <title>Inferring Exoplanet Disequilibria with Multivariate Information in Atmospheric Reaction Networks - Astrobiology</title>
    <dc:date>2021-05-18T22:08:56+00:00</dc:date>
    <link>http://astrobiology.com/2021/04/inferring-exoplanet-disequilibria-with-multivariate-information-in-atmospheric-reaction-networks.html</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Inferring the properties of exoplanets from their atmospheres, while confronting low resolution and low signal-to-noise in the context of the quantities we want to derive, poses rigorous demands upon the data collected from observation.

]]></description>
<dc:subject>astrobiology rather-interesting data-analysis spectroscopy machine-learning clustering nonlinear-dynamics origin-of-life</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:a6299daa7b34/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:astrobiology"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:rather-interesting"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:spectroscopy"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:machine-learning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:clustering"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:nonlinear-dynamics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:origin-of-life"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2012.11780">
    <title>[2012.11780] Towards an Automatic System for Extracting Planar Orientations from Software Generated Point Clouds</title>
    <dc:date>2021-05-09T11:42:24+00:00</dc:date>
    <link>https://arxiv.org/abs/2012.11780</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[In geology, a key activity is the characterisation of geological structures (surface formation topology and rock units) using Planar Orientation measurements such as Strike, Dip and Dip Direction. In general these measurements are collected manually using basic equipment; usually a compass/clinometer and a backboard, recorded on a map by hand. Various computing techniques and technologies, such as Lidar, have been utilised in order to automate this process and update the collection paradigm for these types of measurements. Techniques such as Structure from Motion (SfM) reconstruct of scenes and objects by generating a point cloud from input images, with detailed reconstruction possible on the decimetre scale. SfM-type techniques provide advantages in areas of cost and usability in more varied environmental conditions, while sacrificing the extreme levels of data fidelity. Here is presented a methodology of data acquisition and a Machine Learning-based software system: GeoStructure, developed to automate the measurement of orientation measurements. Rather than deriving measurements using a method applied to the input images, such as the Hough Transform, this method takes measurements directly from the reconstructed point cloud surfaces. Point cloud noise is mitigated using a Mahalanobis distance implementation. Significant structure is characterised using a k-nearest neighbour region growing algorithm, and final surface orientations are quantified using the plane, and normal direction cosines.
]]></description>
<dc:subject>machine-learning inference data-analysis geology image-processing rather-interesting constraint-satisfaction performance-measure</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:8a8c856e99b0/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:machine-learning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:inference"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:geology"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:image-processing"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:rather-interesting"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:constraint-satisfaction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:performance-measure"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/2002.00937">
    <title>[2002.00937] Radioactive data: tracing through training</title>
    <dc:date>2020-05-02T15:03:53+00:00</dc:date>
    <link>https://arxiv.org/abs/2002.00937</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[We want to detect whether a particular image dataset has been used to train a model. We propose a new technique, \emph{radioactive data}, that makes imperceptible changes to this dataset such that any model trained on it will bear an identifiable mark. The mark is robust to strong variations such as different architectures or optimization methods. Given a trained model, our technique detects the use of radioactive data and provides a level of confidence (p-value). Our experiments on large-scale benchmarks (Imagenet), using standard architectures (Resnet-18, VGG-16, Densenet-121) and training procedures, show that we can detect usage of radioactive data with high confidence (p<10^-4) even when only 1% of the data used to trained our model is radioactive. Our method is robust to data augmentation and the stochasticity of deep network optimization. As a result, it offers a much higher signal-to-noise ratio than data poisoning and backdoor methods.
]]></description>
<dc:subject>machine-learning data-analysis rather-interesting security looking-to-see metaheuristics privacy algorithms to-write-about to-simulate consider:parallels-for-other-metaheuristics</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:5c006b02c3bb/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:machine-learning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:rather-interesting"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:security"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:looking-to-see"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:metaheuristics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:privacy"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:algorithms"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:to-write-about"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:to-simulate"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:consider:parallels-for-other-metaheuristics"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1612.02487">
    <title>[1612.02487] Interactive Elicitation of Knowledge on Feature Relevance Improves Predictions in Small Data Sets</title>
    <dc:date>2019-10-26T12:45:09+00:00</dc:date>
    <link>https://arxiv.org/abs/1612.02487</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Providing accurate predictions is challenging for machine learning algorithms when the number of features is larger than the number of samples in the data. Prior knowledge can improve machine learning models by indicating relevant variables and parameter values. Yet, this prior knowledge is often tacit and only available from domain experts. We present a novel approach that uses interactive visualization to elicit the tacit prior knowledge and uses it to improve the accuracy of prediction models. The main component of our approach is a user model that models the domain expert's knowledge of the relevance of different features for a prediction task. In particular, based on the expert's earlier input, the user model guides the selection of the features on which to elicit user's knowledge next. The results of a controlled user study show that the user model significantly improves prior knowledge elicitation and prediction accuracy, when predicting the relative citation counts of scientific documents in a specific domain.
]]></description>
<dc:subject>data-analysis statistics user-experience the-mangle-in-practice rather-interesting artificial-colleagues asking-for-help to-understand to-write-about</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:19f278ff77bc/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:user-experience"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:the-mangle-in-practice"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:rather-interesting"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:artificial-colleagues"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:asking-for-help"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:to-understand"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:to-write-about"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1810.02016">
    <title>[1810.02016] The Four Point Permutation Test for Latent Block Structure in Incidence Matrices</title>
    <dc:date>2019-10-26T12:42:46+00:00</dc:date>
    <link>https://arxiv.org/abs/1810.02016</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Transactional data may be represented as a bipartite graph G:=(L∪R,E), where L denotes agents, R denotes objects visible to many agents, and an edge in E denotes an interaction between an agent and an object. Unsupervised learning seeks to detect block structures in the adjacency matrix Z between L and R, thus grouping together sets of agents with similar object interactions. New results on quasirandom permutations suggest a non-parametric \textbf{four point test} to measure the amount of block structure in G, with respect to vertex orderings on L and R. Take disjoint 4-edge random samples, order these four edges by left endpoint, and count the relative frequencies of the 4! possible orderings of the right endpoint. When these orderings are equiprobable, the edge set E corresponds to a quasirandom permutation π of |E| symbols. Total variation distance of the relative frequency vector away from the uniform distribution on 24 permutations measures the amount of block structure. Such a test statistic, based on ⌊|E|/4⌋ samples, is computable in O(|E|/p) time on p processors. Possibly block structure may be enhanced by precomputing \textbf{natural orders} on L and R, related to the second eigenvector of graph Laplacians. In practice this takes O(d|E|) time, where d is the graph diameter. Five open problems are described.
]]></description>
<dc:subject>combinatorics counting rather-interesting probability-theory data-analysis data-mining graph-theory network-theory hypergraphs to-write-about to-simulate</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:5d0e64d5a30a/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:combinatorics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:counting"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:rather-interesting"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:probability-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:graph-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:network-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:hypergraphs"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:to-write-about"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:to-simulate"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1803.08625">
    <title>[1803.08625] A Concept Learning Tool Based On Calculating Version Space Cardinality</title>
    <dc:date>2019-06-23T11:39:21+00:00</dc:date>
    <link>https://arxiv.org/abs/1803.08625</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[In this paper, we proposed VeSC-CoL (Version Space Cardinality based Concept Learning) to deal with concept learning on extremely imbalanced datasets, especially when cross-validation is not a viable option. VeSC-CoL uses version space cardinality as a measure for model quality to replace cross-validation. Instead of naive enumeration of the version space, Ordered Binary Decision Diagram and Boolean Satisfiability are used to compute the version space. Experiments show that VeSC-CoL can accurately learn the target concept when computational resource is allowed.
]]></description>
<dc:subject>machine-learning data-analysis representation to-understand error head-scratcher</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:13a0ac02110f/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:machine-learning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:representation"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:to-understand"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:error"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:head-scratcher"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1901.06758">
    <title>[1901.06758] A deep learning approach to real-time parking occupancy prediction in spatio-temporal networks incorporating multiple spatio-temporal data sources</title>
    <dc:date>2019-06-12T13:53:34+00:00</dc:date>
    <link>https://arxiv.org/abs/1901.06758</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[A deep learning model is applied for predicting block-level parking occupancy in real time. The model leverages Graph-Convolutional Neural Networks (GCNN) to extract the spatial relations of traffic flow in large-scale networks, and utilizes Recurrent Neural Networks (RNN) with Long-Short Term Memory (LSTM) to capture the temporal features. In addition, the model is capable of taking multiple heterogeneously structured traffic data sources as input, such as parking meter transactions, traffic speed, and weather conditions. The model performance is evaluated through a case study in Pittsburgh downtown area. The proposed model outperforms other baseline methods including multi-layer LSTM and Lasso with an average testing MAPE of 10.6\% when predicting block-level parking occupancies 30 minutes in advance. The case study also shows that, in generally, the prediction model works better for business areas than for recreational locations. We found that incorporating traffic speed and weather information can significantly improve the prediction performance. Weather data is particularly useful for improving predicting accuracy in recreational areas.
]]></description>
<dc:subject>machine-learning city-planning data-analysis looking-to-see prediction deep-learning to-write-about consider:data-sourcing</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:74a9aca209d1/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:machine-learning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:city-planning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:looking-to-see"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:prediction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:deep-learning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:to-write-about"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:consider:data-sourcing"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1712.07381">
    <title>[1712.07381] Extreme Value Analysis Without the Largest Values: What Can Be Done?</title>
    <dc:date>2019-05-03T11:21:33+00:00</dc:date>
    <link>https://arxiv.org/abs/1712.07381</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[In this paper we are concerned with the analysis of heavy-tailed data when a portion of the extreme values is unavailable. This research was motivated by an analysis of the degree distributions in a large social network. The degree distributions of such networks tend to have power law behavior in the tails. We focus on the Hill estimator, which plays a starring role in heavy-tailed modeling. The Hill estimator for this data exhibited a smooth and increasing "sample path" as a function of the number of upper order statistics used in constructing the estimator. This behavior became more apparent as we artificially removed more of the upper order statistics. Building on this observation we introduce a new version of the Hill estimator. It is a function of the number of the upper order statistics used in the estimation, but also depends on the number of unavailable extreme values. We establish functional convergence of the normalized Hill estimator to a Gaussian process. An estimation procedure is developed based on the limit theory to estimate the number of missing extremes and extreme value parameters including the tail index and the bias of Hill's estimator. We illustrate how this approach works in both simulations and real data examples.
]]></description>
<dc:subject>statistics extreme-values rather-interesting algorithms estimation inference data-analysis nudge-targets consider:looking-to-see consider:performance-measures</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:a732ab80dda1/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:extreme-values"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:rather-interesting"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:algorithms"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:estimation"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:inference"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:nudge-targets"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:consider:looking-to-see"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:consider:performance-measures"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1811.06912">
    <title>[1811.06912] Exploring Student Check-In Behavior for Improved Point-of-Interest Prediction</title>
    <dc:date>2019-04-10T10:28:38+00:00</dc:date>
    <link>https://arxiv.org/abs/1811.06912</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[With the availability of vast amounts of user visitation history on location-based social networks (LBSN), the problem of Point-of-Interest (POI) prediction has been extensively studied. However, much of the research has been conducted solely on voluntary checkin datasets collected from social apps such as Foursquare or Yelp. While these data contain rich information about recreational activities (e.g., restaurants, nightlife, and entertainment), information about more prosaic aspects of people's lives is sparse. This not only limits our understanding of users' daily routines, but more importantly the modeling assumptions developed based on characteristics of recreation-based data may not be suitable for richer check-in data. In this work, we present an analysis of education "check-in" data using WiFi access logs collected at Purdue University. We propose a heterogeneous graph-based method to encode the correlations between users, POIs, and activities, and then jointly learn embeddings for the vertices. We evaluate our method compared to previous state-of-the-art POI prediction methods, and show that the assumptions made by previous methods significantly degrade performance on our data with dense(r) activity signals. We also show how our learned embeddings could be used to identify similar students (e.g., for friend suggestions).
]]></description>
<dc:subject>social-dynamics social-networks data-analysis rather-interesting statistics machine-learning clustering data-pageant visualization</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:178a4234fb86/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:social-dynamics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:social-networks"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:rather-interesting"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:machine-learning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:clustering"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-pageant"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:visualization"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5411762/">
    <title>Genome graphs and the evolution of genome inference</title>
    <dc:date>2019-02-13T10:43:47+00:00</dc:date>
    <link>https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5411762/</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[The human reference genome is part of the foundation of modern human biology and a monumental scientific achievement. However, because it excludes a great deal of common human variation, it introduces a pervasive reference bias into the field of human genomics. To reduce this bias, it makes sense to draw on representative collections of human genomes, brought together into reference cohorts. There are a number of techniques to represent and organize data gleaned from these cohorts, many using ideas implicitly or explicitly borrowed from graph-based models. Here, we survey various projects underway to build and apply these graph-based structures—which we collectively refer to as genome graphs—and discuss the improvements in read mapping, variant calling, and haplotype determination that genome graphs are expected to produce.

]]></description>
<dc:subject>via:arthegall bioinformatics clustering visualization data-analysis rather-interesting consider:nonbiological-genomes review</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:8e2e925902f0/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:via:arthegall"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:bioinformatics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:clustering"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:visualization"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:rather-interesting"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:consider:nonbiological-genomes"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:review"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1812.05225">
    <title>[1812.05225] Finding the origin of noise transients in LIGO data with machine learning</title>
    <dc:date>2019-01-27T12:35:14+00:00</dc:date>
    <link>https://arxiv.org/abs/1812.05225</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Quality improvement of interferometric data collected by gravitational-wave detectors such as Advanced LIGO and Virgo is mission critical for the success of gravitational-wave astrophysics. Gravitational-wave detectors are sensitive to a variety of disturbances of non-astrophysical origin with characteristic frequencies in the instrument band of sensitivity. Removing non-astrophysical artifacts that corrupt the data stream is crucial for increasing the number and statistical significance of gravitational-wave detections and enabling refined astrophysical interpretations of the data. Machine learning has proved to be a powerful tool for analysis of massive quantities of complex data in astronomy and related fields of study. We present two machine learning methods, based on random forest and genetic programming algorithms, that can be used to determine the origin of non-astrophysical transients in the LIGO detectors. We use two classes of transients with known instrumental origin that were identified during the first observing run of Advanced LIGO to show that the algorithms can successfully identify the origin of non-astrophysical transients in real interferometric data and thus assist in the mitigation of instrumental and environmental disturbances in gravitational-wave searches. While the data sets described in this paper are specific to LIGO, and the exact procedures employed were unique to the same, the random forest and genetic programming code bases and means by which they were applied as a dual machine learning approach are completely portable to any number of instruments in which noise is believed to be generated through mechanical couplings, the source of which is not yet discovered.]]></description>
<dc:subject>genetic-programming hey-I-know-this-guy astrophysics data-analysis data-mining to-understand feature-construction classification</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:97dd967c5c54/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:genetic-programming"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:hey-I-know-this-guy"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:astrophysics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:to-understand"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:feature-construction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:classification"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1612.07545">
    <title>[1612.07545] A Revisit of Hashing Algorithms for Approximate Nearest Neighbor Search</title>
    <dc:date>2019-01-05T13:09:50+00:00</dc:date>
    <link>https://arxiv.org/abs/1612.07545</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Approximate Nearest Neighbor Search (ANNS) is a fundamental problem in many areas of machine learning and data mining. During the past decade, numerous hashing algorithms are proposed to solve this problem. Every proposed algorithm claims outperform other state-of-the-art hashing methods. However, the evaluation of these hashing papers was not thorough enough, and those claims should be re-examined. The ultimate goal of an ANNS method is returning the most accurate answers (nearest neighbors) in the shortest time. If implemented correctly, almost all the hashing methods will have their performance improved as the code length increases. However, many existing hashing papers only report the performance with the code length shorter than 128. In this paper, we carefully revisit the problem of search with a hash index, and analyze the pros and cons of two popular hash index search procedures. Then we proposed a very simple but effective two level index structures and make a thorough comparison of eleven popular hashing algorithms. Surprisingly, the random-projection-based Locality Sensitive Hashing (LSH) is the best performed algorithm, which is in contradiction to the claims in all the other ten hashing papers. Despite the extreme simplicity of random-projection-based LSH, our results show that the capability of this algorithm has been far underestimated. For the sake of reproducibility, all the codes used in the paper are released on GitHub, which can be used as a testing platform for a fair comparison between various hashing algorithms.]]></description>
<dc:subject>hashing algorithms approximation dimension-reduction representation data-analysis feature-extraction nudge-targets consider:looking-to-see to-write-about</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:daa7a178828f/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:hashing"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:algorithms"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:approximation"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:dimension-reduction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:representation"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:feature-extraction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:nudge-targets"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:consider:looking-to-see"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:to-write-about"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://pudding.cool/2017/05/song-repetition/">
    <title>Are Pop Lyrics Getting More Repetitive?</title>
    <dc:date>2018-12-23T14:00:25+00:00</dc:date>
    <link>https://pudding.cool/2017/05/song-repetition/</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[In 1977, the great computer scientist Donald Knuth published a paper called The Complexity of Songs, which is basically one long joke about the repetitive lyrics of newfangled music (example quote: "the advent of modern drugs has led to demands for still less memory, and the ultimate improvement of Theorem 1 has consequently just been announced").

I'm going to try to test this hypothesis with data. I'll be analyzing the repetitiveness of a dataset of 15,000 songs that charted on the Billboard Hot 100 between 1958 and 2017.

]]></description>
<dc:subject>visualization graphic-design data-analysis essay looking-to-see javascript rather-interesting via:cdzombak</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:12930fe77224/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:visualization"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:graphic-design"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:essay"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:looking-to-see"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:javascript"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:rather-interesting"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:via:cdzombak"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0198341">
    <title>How many landmarks are enough to characterize shape and size variation?</title>
    <dc:date>2018-12-10T13:55:27+00:00</dc:date>
    <link>https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0198341</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Accurate characterization of morphological variation is crucial for generating reliable results and conclusions concerning changes and differences in form. Despite the prevalence of landmark-based geometric morphometric (GM) data in the scientific literature, a formal treatment of whether sampled landmarks adequately capture shape variation has remained elusive. Here, I introduce LaSEC (Landmark Sampling Evaluation Curve), a computational tool to assess the fidelity of morphological characterization by landmarks. This task is achieved by calculating how subsampled data converge to the pattern of shape variation in the full dataset as landmark sampling is increased incrementally. While the number of landmarks needed for adequate shape variation is dependent on individual datasets, LaSEC helps the user (1) identify under- and oversampling of landmarks; (2) assess robustness of morphological characterization; and (3) determine the number of landmarks that can be removed without compromising shape information. In practice, this knowledge could reduce time and cost associated with data collection, maintain statistical power in certain analyses, and enable the incorporation of incomplete, but important, specimens to the dataset. Results based on simulated shape data also reveal general properties of landmark data, including statistical consistency where sampling additional landmarks has the tendency to asymptotically improve the accuracy of morphological characterization. As landmark-based GM data become more widely adopted, LaSEC provides a systematic approach to evaluate and refine the collection of shape data––a goal paramount for accumulation and analysis of accurate morphological information.

]]></description>
<dc:subject>inference data-analysis looking-to-see rather-interesting training-data data-balancing to-write-about</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:16483d624829/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:inference"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:looking-to-see"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:rather-interesting"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:training-data"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-balancing"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:to-write-about"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1710.00992">
    <title>[1710.00992] DimReader: Axis lines that explain non-linear projections</title>
    <dc:date>2018-12-09T11:35:59+00:00</dc:date>
    <link>https://arxiv.org/abs/1710.00992</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Non-linear dimensionality reduction (NDR) methods such as LLE and t-SNE are popular with visualization researchers and experienced data analysts, but present serious problems of interpretation. In this paper, we present DimReader, a technique that recovers readable axes from such techniques. DimReader is based on analyzing infinitesimal perturbations of the dataset with respect to variables of interest. The perturbations define exactly how we want to change each point in the original dataset and we measure the effect that these changes have on the projection. The recovered axes are in direct analogy with the axis lines (grid lines) of traditional scatterplots. We also present methods for discovering perturbations on the input data that change the projection the most. The calculation of the perturbations is efficient and easily integrated into programs written in modern programming languages. We present results of DimReader on a variety of NDR methods and datasets both synthetic and real-life, and show how it can be used to compare different NDR methods. Finally, we discuss limitations of our proposal and situations where further research is needed.
]]></description>
<dc:subject>user-interface visualization dimension-reduction rather-interesting data-analysis explanation the-mangle-in-practice to-write-about to-do</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:4618847d7e34/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:user-interface"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:visualization"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:dimension-reduction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:rather-interesting"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:explanation"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:the-mangle-in-practice"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:to-write-about"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:to-do"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://en.wikipedia.org/wiki/Fr%C3%A9chet_distance">
    <title>Fréchet distance - Wikipedia</title>
    <dc:date>2018-11-28T14:44:30+00:00</dc:date>
    <link>https://en.wikipedia.org/wiki/Fr%C3%A9chet_distance</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[In mathematics, the Fréchet distance is a measure of similarity between curves that takes into account the location and ordering of the points along the curves. It is named after Maurice Fréchet.
]]></description>
<dc:subject>measurement metrics data-analysis feature-construction distance computational-geometry to-consider ReQ</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:7cb0f92afb10/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:measurement"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:metrics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:feature-construction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:distance"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:computational-geometry"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:to-consider"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:ReQ"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://paulromer.net/jupyter-mathematica-and-the-future-of-the-research-paper/index.html">
    <title>Jupyter, Mathematica, and the Future of the Research Paper – Paul Romer</title>
    <dc:date>2018-11-01T09:08:15+00:00</dc:date>
    <link>https://paulromer.net/jupyter-mathematica-and-the-future-of-the-research-paper/index.html</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Jupyter rewards transparency; Mathematica rationalizes secrecy. Jupyter encourages individual integrity; Mathematica lets individuals hide behind corporate evasion. Jupyter exemplifies the social systems that emerged from the Scientific Revolution and the Enlightenment, systems that make it possible for people to cooperate by committing to objective truth; Mathematica exemplifies the horde of new Vandals whose pursuit of private gain threatens a far greater pubic loss–the collapse of social systems that took centuries to build.

]]></description>
<dc:subject>data-analysis user-experience open-source academic-culture startup-culture-must-die literate-programming open-access literary-criticism</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:11cfc2159c75/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:user-experience"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:open-source"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:academic-culture"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:startup-culture-must-die"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:literate-programming"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:open-access"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:literary-criticism"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1712.05630">
    <title>[1712.05630] Sparse principal component analysis via random projections</title>
    <dc:date>2018-10-04T10:38:53+00:00</dc:date>
    <link>https://arxiv.org/abs/1712.05630</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[We introduce a new method for sparse principal component analysis, based on the aggregation of eigenvector information from carefully-selected random projections of the sample covariance matrix. Unlike most alternative approaches, our algorithm is non-iterative, so is not vulnerable to a bad choice of initialisation. Our theory provides great detail on the statistical and computational trade-off in our procedure, revealing a subtle interplay between the effective sample size and the number of random projections that are required to achieve the minimax optimal rate. Numerical studies provide further insight into the procedure and confirm its highly competitive finite-sample performance.
]]></description>
<dc:subject>dimension-reduction statistics data-analysis algorithms performance-measure consider:lexicase sparseness</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:f7f5a9f87bf7/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:dimension-reduction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:algorithms"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:performance-measure"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:consider:lexicase"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:sparseness"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://twitter.com/RadarAndStuff/status/1043575880054988800">
    <title>Phil Stepanian on Twitter: &quot;So, fun fact: birds and insects show up on radar. Often. As in, pretty much every day. Can we visually delineate between migrating birds and insects on radar? Usually. Here is a bumbling threaded attempt to show some telltale s</title>
    <dc:date>2018-09-24T12:00:29+00:00</dc:date>
    <link>https://twitter.com/RadarAndStuff/status/1043575880054988800</link>
    <dc:creator>Vaguery</dc:creator><dc:subject>biology migration data-analysis rather-interesting to-write-about ecology technology</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:a96b7450e375/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:biology"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:migration"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:rather-interesting"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:to-write-about"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:ecology"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:technology"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://www.biorxiv.org/content/early/2018/09/19/421842">
    <title>A non-spatial account of place and grid cells based on clustering models of concept learning | bioRxiv</title>
    <dc:date>2018-09-20T11:38:37+00:00</dc:date>
    <link>https://www.biorxiv.org/content/early/2018/09/19/421842</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[One view is that conceptual knowledge is organized as a "cognitive map" in the brain, using the circuitry in the medial temporal lobe (MTL) that supports spatial navigation. In contrast, we find that a domain-general learning algorithm explains key findings in both spatial and conceptual domains. When the clustering model is applied to spatial navigation tasks, so called place and grid cells emerge because of the relatively uniform sampling of possible inputs in these tasks. The same mechanism applied to conceptual tasks, where the overall space can be higher-dimensional and sampling sparser, leads to representations more aligned with human conceptual knowledge. Although the types of memory supported by the MTL are superficially dissimilar, the information processing steps appear shared.

]]></description>
<dc:subject>models-and-modes emergence data-analysis rather-interesting to-write-about consider:the-mangle</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:10d28be7bd95/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:models-and-modes"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:emergence"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:rather-interesting"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:to-write-about"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:consider:the-mangle"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://trent.st/ffx/">
    <title>Trent McConaghy - FFX</title>
    <dc:date>2018-05-28T12:00:54+00:00</dc:date>
    <link>http://trent.st/ffx/</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[FFX is a technique for symbolic regression, to induce whitebox models given X/y training data. It does Fast Function Extraction. It is:

Fast - runtime 5-60 seconds, depending on problem size (1GHz cpu)
Scalable - 1000 input variables, no problem!
Deterministic - no need to "hope and pray".
If you ignore the whitebox-model aspect, FFX can be viewed as a regression tool. It's been used this way for thousands of industrial problems with 100K+ input variables. It can also be used as a classifier (FFXC), by wrapping the output with a logistic map. This has also been used successfully on thousands of industrial problems.]]></description>
<dc:subject>hey-I-know-this-guy symbolic-regression algorithms numerical-methods data-analysis to-write-about</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:5d858488992a/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:hey-I-know-this-guy"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:symbolic-regression"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:algorithms"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:numerical-methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:to-write-about"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://jasmcole.com/2018/01/16/cooking-the-books/">
    <title>Cooking the books – Almost looks like work</title>
    <dc:date>2018-05-03T11:04:32+00:00</dc:date>
    <link>https://jasmcole.com/2018/01/16/cooking-the-books/</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Since Christmas, at my house we’ve been cooking with 5 ingredients or fewer thanks to the acquisition of Jamie Oliver’s new book, the contents of which are mostly available online here. The recipes are unanimously very tasty, but that’s besides the point. The real mark of culinary excellence (in my humble opinion) is how efficiently one can buy ingredients to make as many of the recipes as possible in one shopping trip. Let’s investigate while the lamb is on.


Each of the 135 recipes in the book consists of 5 ingredients, some of which overlap. It is therefore not necessary to purchase 675 ingredients, there are actually only 239 unique ones. (Yes, I did spend a Sunday morning typing 675 individual ingredients into a spreadsheet.)

The question is then this:

In which order should I buy my ingredients to maximise the number of possible recipes as a function of number of ingredients?

Let’s start simply, and look at the frequency of occurrence of the ingredients.]]></description>
<dc:subject>mathematical-recreations looking-to-see cooking data-analysis leading-questions rather-interesting</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:11e3e677477f/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:mathematical-recreations"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:looking-to-see"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:cooking"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:leading-questions"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:rather-interesting"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1709.07097">
    <title>[1709.07097] Persistence Flamelets: multiscale Persistent Homology for kernel density exploration</title>
    <dc:date>2018-03-19T09:50:49+00:00</dc:date>
    <link>https://arxiv.org/abs/1709.07097</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[In recent years there has been noticeable interest in the study of the "shape of data". Among the many ways a "shape" could be defined, topology is the most general one, as it describes an object in terms of its connectivity structure: connected components (topological features of dimension 0), cycles (features of dimension 1) and so on. There is a growing number of techniques, generally denoted as Topological Data Analysis, aimed at estimating topological invariants of a fixed object; when we allow this object to change, however, little has been done to investigate the evolution in its topology. In this work we define the Persistence Flamelets, a multiscale version of one of the most popular tool in TDA, the Persistence Landscape. We examine its theoretical properties and we show how it could be used to gain insights on KDEs bandwidth parameter.]]></description>
<dc:subject>data-analysis feature-extraction representation topology rather-interesting algorithms visualization to-understand exploratory-data-analysis</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:3cfe8030926d/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:feature-extraction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:representation"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:topology"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:rather-interesting"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:algorithms"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:visualization"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:to-understand"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:exploratory-data-analysis"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1602.01241">
    <title>[1602.01241] Using separable non-negative matrix factorization techniques for the analysis of time-resolved Raman spectra</title>
    <dc:date>2017-10-15T11:35:03+00:00</dc:date>
    <link>https://arxiv.org/abs/1602.01241</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[The key challenge of time-resolved Raman spectroscopy is the identification of the constituent species and the analysis of the kinetics of the underlying reaction network. In this work we present an integral approach that allows for determining both the component spectra and the rate constants simultaneously from a series of vibrational spectra. It is based on an algorithm for non-negative matrix factorization which is applied to the experimental data set following a few pre-processing steps. As a prerequisite for physically unambiguous solutions, each component spectrum must include one vibrational band that does not significantly interfere with vibrational bands of other species. The approach is applied to synthetic "experimental" spectra derived from model systems comprising a set of species with component spectra differing with respect to their degree of spectral interferences and signal-to-noise ratios. In each case, the species involved are connected via monomolecular reaction pathways. The potential and limitations of the approach for recovering the respective rate constants and component spectra are discussed.
]]></description>
<dc:subject>spectroscopy data-analysis inference numerical-methods modeling statistics rather-interesting nudge-targets consider:representation</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:250c667d5ab4/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:spectroscopy"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:inference"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:numerical-methods"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:modeling"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:rather-interesting"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:nudge-targets"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:consider:representation"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://caleydo.org/tools/upset/">
    <title>Visualizing Intersecting Sets</title>
    <dc:date>2017-10-10T21:47:08+00:00</dc:date>
    <link>http://caleydo.org/tools/upset/</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Understanding relationships between sets is an important analysis task. The major challenge in this context is the combinatorial explosion of the number of set intersections if the number of sets exceeds a trivial threshold. To address this, we introduce UpSet, a novel visualization technique for the quantitative analysis of sets, their intersections, and aggregates of intersections.

]]></description>
<dc:subject>visualization set-theory data-analysis rather-interesting to-write-about to-do</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:e8e4d2f76c94/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:visualization"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:set-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:rather-interesting"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:to-write-about"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:to-do"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1606.06159">
    <title>[1606.06159] BiFold visualization of bipartite datasets</title>
    <dc:date>2017-09-27T12:01:59+00:00</dc:date>
    <link>https://arxiv.org/abs/1606.06159</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[The emerging domain of data-enabled science necessitates development of algorithms and tools for knowledge discovery. Human interaction with data through well-constructed graphical representation can take special advantage of our visual ability to identify patterns. We develop a data visualization framework, called BiFold, for exploratory analysis of bipartite datasets that describe binary relationships between groups of objects. Typical data examples would include voting records, organizational memberships, and pairwise associations, or other binary datasets. BiFold provides a low dimensional embedding of data that represents similarity by visual nearness, analogous to Multidimensional Scaling (MDS). The unique and new feature of BiFold is its ability to simultaneously capture both within-group and between-group relationships among objects, enhancing knowledge discovery. We benchmark BiFold using the Southern Women Dataset, where social groups are now visually evident. We construct BiFold plots for two US voting datasets: For the presidential election outcomes since 1976, BiFold illustrates the evolving geopolitical structures that underlie these election results. For Senate congressional voting, BiFold identifies a partisan coordinate, separating senators into two parties while simultaneously visualizing a bipartisan-coalition coordinate which captures the ultimate fate of the bills (pass/fail). Finally, we consider a global cuisine dataset of the association between recipes and food ingredients. BiFold allows us to visually compare and contrast cuisines while also allowing identification of signature ingredients of individual cuisines.
]]></description>
<dc:subject>data-analysis visualization rather-interesting to-write-about consider:looking-to-see algorithms plots statistics</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:0e216113a4a6/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:visualization"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:rather-interesting"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:to-write-about"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:consider:looking-to-see"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:algorithms"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:plots"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:statistics"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://jasmcole.com/2017/04/08/extractor-attractor/#more-76737">
    <title>Extractor attractor – Almost looks like work</title>
    <dc:date>2017-08-12T12:48:08+00:00</dc:date>
    <link>https://jasmcole.com/2017/04/08/extractor-attractor/#more-76737</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Recently the extractor fan in my bathroom has started malfunctioning, occasionally grinding and stalling. The infuriating thing is that the grinding noise isn’t perfectly periodic – it is approximately so, but there are occasionally long gaps and the short gaps vary slightly. This lack of predictability makes the noise incredibly annoying, and hard to tune out. Before getting it fixed, I decided to investigate it a bit further.


The terminally curious may listen to the sound here:

https://www.dropbox.com/s/4xh1gmrjry10eky/FanSound.ts?dl=0

This was recorded from my phone, you can also hear me puttering around in the background.

After dumping the audio data, I looked at the waveform and realised it was quite difficult to extract the temporal locations of the grinding noises from the volume alone. As a good physicist I therefore had another look in the frequency domain, making a spectrogram.]]></description>
<dc:subject>mathematical-recreations looking-to-see data-analysis visualization physics nonlinear-dynamics amusing</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:0a14d13aad10/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:mathematical-recreations"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:looking-to-see"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:visualization"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:physics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:nonlinear-dynamics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:amusing"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://www.stat.columbia.edu/~gelman/research/published/objectivityr5.pdf">
    <title>Beyond subjective and objective in statistics [PDF]</title>
    <dc:date>2017-07-09T11:47:51+00:00</dc:date>
    <link>http://www.stat.columbia.edu/~gelman/research/published/objectivityr5.pdf</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Decisions in statistical data analysis are often justified, criticized, or avoided using concepts of objectivity and subjectivity. We argue that the words “objective” and “subjective” in statistics discourse are used in a mostly unhelpful way, and we propose to replace each of them with broader collections of attributes, with objectivity replaced by transparency, consensus, impartiality, and correspondence to observable reality, and subjectivity replaced by awareness of multiple perspectives and context dependence. Together with stability, these make up a collection of virtues that we think is helpful in discussions of statistical foundations and practice. The advantage of these reformulations is that the replacement terms do not oppose each other and that they give more specific guidance about what statistical science strives to achieve. Instead of debating over whether a given statistical method is subjective or objective (or normatively debating the relative merits of subjectivity and objectivity in statistical practice), we can recognize desirable attributes such as transparency and acknowledgment of multiple perspectives as complementary goals. We demonstrate the implications of our proposal with recent applied examples from pharmacology, election polling, and socioeconomic stratification. The aim of this paper is to push users and developers of statistical methods toward more effective use of diverse sources of information and more open acknowledgement of assumptions and goals.]]></description>
<dc:subject>statistics philosophy-of-science data-analysis looking-to-see hypothesis-testing learning to-read</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:0800407fc8b7/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:philosophy-of-science"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:looking-to-see"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:hypothesis-testing"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:learning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:to-read"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://en.wikipedia.org/wiki/Abstract_simplicial_complex">
    <title>Abstract simplicial complex - Wikipedia</title>
    <dc:date>2017-06-03T11:23:57+00:00</dc:date>
    <link>https://en.wikipedia.org/wiki/Abstract_simplicial_complex</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[In mathematics, an abstract simplicial complex is a purely combinatorial description of the geometric notion of a simplicial complex, consisting of a family of non-empty finite sets closed under the operation of taking non-empty subsets.[1] In the context of matroids and greedoids, abstract simplicial complexes are also called independence systems.[2]

]]></description>
<dc:subject>topology data-analysis data-structures nudge consider:adding-as-primitive hypergraphs to-write-about</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:63bc99374ef1/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:topology"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-structures"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:nudge"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:consider:adding-as-primitive"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:hypergraphs"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:to-write-about"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://github.com/appliedtopology/javaplex/wiki/Tutorial">
    <title>Tutorial · appliedtopology/javaplex Wiki</title>
    <dc:date>2017-06-03T11:23:11+00:00</dc:date>
    <link>https://github.com/appliedtopology/javaplex/wiki/Tutorial</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Javaplex is a Java software package for computing the persistent homology of filtered simplicial complexes (or more generally, filtered chain complexes), with special emphasis on applications arising in topological data analysis (Tausz et al. 2014). The main author is Andrew Tausz. Javaplex is a re-write of the JPlex package, which was written by Harlan Sexton and Mikael Vejdemo-Johansson. The main motivation for the development of Javaplex was the need for a flexible platform that supported new directions of research in topological data analysis and computational persistent homology. The website for Javaplex is http://appliedtopology.github.io/javaplex/, the documentation overview is at https://github.com/appliedtopology/javaplex/wiki/Overview, and the javadoc tree for the library is at http://appliedtopology.github.io/javaplex/doc/.

]]></description>
<dc:subject>topology data-analysis software tutorial feature-extraction</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:9807774dd637/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:topology"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:software"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:tutorial"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:feature-extraction"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://en.wikipedia.org/wiki/Topological_data_analysis">
    <title>Topological data analysis - Wikipedia</title>
    <dc:date>2017-06-03T11:20:47+00:00</dc:date>
    <link>https://en.wikipedia.org/wiki/Topological_data_analysis</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[In applied mathematics, topological data analysis (TDA) is an approach to the analysis of datasets using techniques from topology. Extraction of information from datasets that are high-dimensional, incomplete and noisy is generally challenging. TDA provides a general framework to analyze such data in a manner that is insensitive to the particular metric chosen and provides dimensionality reduction and robustness to noise. Beyond this, it inherits functoriality, a fundamental concept of modern mathematics, from its topological nature, which allows it to adapt to new mathematical tools.

The initial motivation is to study the shape of data. TDA has combined algebraic topology and other tools from pure mathematics to allow mathematically rigorous study of "shape". The main tool is persistent homology, an adaptation of homology to point cloud data. Persistent homology has been applied to many types of data across many fields. Moreover, its mathematical foundation is also of theoretical importance. The unique features of TDA make it a promising bridge between topology and geometry.

]]></description>
<dc:subject>data-analysis reference to-write-about topology pattern-discovery</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:11787d5ca831/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:reference"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:to-write-about"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:topology"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:pattern-discovery"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1104.5557">
    <title>[1104.5557] Randomized algorithms for matrices and data</title>
    <dc:date>2017-05-09T16:02:22+00:00</dc:date>
    <link>https://arxiv.org/abs/1104.5557</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Randomized algorithms for very large matrix problems have received a great deal of attention in recent years. Much of this work was motivated by problems in large-scale data analysis, and this work was performed by individuals from many different research communities. This monograph will provide a detailed overview of recent work on the theory of randomized matrix algorithms as well as the application of those ideas to the solution of practical problems in large-scale data analysis. An emphasis will be placed on a few simple core ideas that underlie not only recent theoretical advances but also the usefulness of these tools in large-scale data applications. Crucial in this context is the connection with the concept of statistical leverage. This concept has long been used in statistical regression diagnostics to identify outliers; and it has recently proved crucial in the development of improved worst-case matrix algorithms that are also amenable to high-quality numerical implementation and that are useful to domain scientists. Randomized methods solve problems such as the linear least-squares problem and the low-rank matrix approximation problem by constructing and operating on a randomized sketch of the input matrix. Depending on the specifics of the situation, when compared with the best previously-existing deterministic algorithms, the resulting randomized algorithms have worst-case running time that is asymptotically faster; their numerical implementations are faster in terms of clock-time; or they can be implemented in parallel computing environments where existing numerical algorithms fail to run at all. Numerous examples illustrating these observations will be described in detail.
]]></description>
<dc:subject>via:arthegall data-analysis matrices feature-extraction learning-from-data data-mining rather-interesting to-read to-understand</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:39273ba6e6fb/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:via:arthegall"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:matrices"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:feature-extraction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:learning-from-data"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:rather-interesting"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:to-understand"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1501.01573">
    <title>[1501.01573] The Temporal Dimension of Risk</title>
    <dc:date>2017-05-09T11:17:35+00:00</dc:date>
    <link>https://arxiv.org/abs/1501.01573</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Multi-period measures of risk account for the path that the value of an investment portfolio takes. In the context of probabilistic risk measures, the focus has traditionally been on the magnitude of investment loss and not on the dimension associated with the passage of time. In this paper, the concept of temporal path-dependent risk measure is mathematically formalized to capture the risk associated with the temporal dimension of a stochastic process. We discuss the properties of temporal measures of risk and show that they can never be coherent. We then study the temporal dimension of investment drawdown, its duration, which measures the length of excursions below a running maximum. Its properties in the context of risk measures are analyzed both theoretically and empirically. In particular, we show that duration captures serial correlation in the returns of two major asset classes. We conclude by discussing the challenges of path-dependent temporal risk estimation in practice.
]]></description>
<dc:subject>portfolio-theory risk risk-management define-your-terms financial-engineering to-write-about time-series data-analysis planning</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:9ea56b22b38a/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:portfolio-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:risk"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:risk-management"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:define-your-terms"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:financial-engineering"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:to-write-about"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:time-series"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:planning"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1608.04048">
    <title>[1608.04048] Ultra High-Dimensional Nonlinear Feature Selection for Big Biological Data</title>
    <dc:date>2017-04-17T10:35:26+00:00</dc:date>
    <link>https://arxiv.org/abs/1608.04048</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Machine learning methods are used to discover complex nonlinear relationships in biological and medical data. However, sophisticated learning models are computationally unfeasible for data with millions of features. Here we introduce the first feature selection method for nonlinear learning problems that can scale up to large, ultra-high dimensional biological data. More specifically, we scale up the novel Hilbert-Schmidt Independence Criterion Lasso (HSIC Lasso) to handle millions of features with tens of thousand samples. The proposed method is guaranteed to find an optimal subset of maximally predictive features with minimal redundancy, yielding higher predictive power and improved interpretability. Its effectiveness is demonstrated through applications to classify phenotypes based on module expression in human prostate cancer patients and to detect enzymes among protein structures. We achieve high accuracy with as few as 20 out of one million features --- a dimensionality reduction of 99.998%. Our algorithm can be implemented on commodity cloud computing platforms. The dramatic reduction of features may lead to the ubiquitous deployment of sophisticated prediction models in mobile health care applications.
]]></description>
<dc:subject>feature-selection data-analysis big-data bioinformatics statistics algorithms to-write-about</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:ef2072225004/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:feature-selection"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:big-data"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:bioinformatics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:algorithms"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:to-write-about"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://www.sciencedirect.com/science/article/pii/S1751157712001034">
    <title>Archetypal scientists</title>
    <dc:date>2017-02-27T01:37:18+00:00</dc:date>
    <link>http://www.sciencedirect.com/science/article/pii/S1751157712001034</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[We introduce archetypal analysis as a tool to describe and categorize scientists. This approach identifies typical characteristics of extreme (‘archetypal’) values in a multivariate data set. These positive or negative contextual attributes can be allocated to each scientist under investigation. In our application, we use a sample of seven bibliometric indicators for 29,083 economists obtained from the RePEc database and identify six archetypes. These are mainly characterized by ratios of published work and citations. We discuss applications and limitations of this approach. Finally, we assign relative shares of the identified archetypes to each economist in our sample.

]]></description>
<dc:subject>archetypal-analysis amusing data-analysis machine-learning statistics clustering to-write-about</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:69a1035ab556/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:archetypal-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:amusing"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:machine-learning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:clustering"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:to-write-about"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1609.08827">
    <title>[1609.08827] Anytime Discovery of a Diverse Set of Patterns with Monte Carlo Tree Search</title>
    <dc:date>2016-12-17T14:18:02+00:00</dc:date>
    <link>https://arxiv.org/abs/1609.08827</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Discovering patterns that strongly distinguish one class label from another is a challenging data-mining task. The unsupervised discovery of such patterns would enable the construction of intelligible classifiers and to elicit interesting hypotheses from the data. Subgroup Discovery (SD) is one framework that formally defines this pattern mining task. However, SD still faces two major issues: (i) how to define appropriate quality measures to characterize the uniqueness of a pattern; (ii) how to select an accurate heuristic search technique when exhaustive enumeration of the pattern space is unfeasible. The first issue has been tackled by the Exceptional Model Mining (EMM) framework. This general framework aims to find patterns that cover tuples that locally induce a model that substantially differs from the model of the whole dataset. The second issue has been studied in SD and EMM mainly with the use of beam-search strategies and genetic algorithms for discovering a pattern set that is non-redundant, diverse and of high quality. In this article, we argue that the greedy nature of most of these approaches produces pattern sets that lack diversity. Consequently, we propose to formally define pattern mining as a single-player game, as in a puzzle, and to solve it with a Monte Carlo Tree Search (MCTS), a recent technique mainly used for artificial intelligence and planning problems. The exploitation/exploration trade-off and the power of random search of MCTS lead to an any-time mining approach which tends towards an exhaustive search if given enough time and memory. Given a reasonable time and memory budget, MCTS quickly drives the search towards a diverse pattern set of high quality. MCTS does not need any knowledge of the pattern quality measure, and we show to what extent it is agnostic to the pattern language. We assess our claims with an exhaustive set of experiments.
]]></description>
<dc:subject>feature-extraction feature-construction algorithms information-theory to-understand data-structures data-analysis performance-measure optimization updated</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:8602d396c8d2/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:feature-extraction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:feature-construction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:algorithms"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:information-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:to-understand"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-structures"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:performance-measure"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:optimization"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:updated"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://research.googleblog.com/2016/12/open-sourcing-embedding-projector-tool.html">
    <title>Research Blog: Open sourcing the Embedding Projector: a tool for visualizing high dimensional data</title>
    <dc:date>2016-12-11T13:09:25+00:00</dc:date>
    <link>https://research.googleblog.com/2016/12/open-sourcing-embedding-projector-tool.html</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Recent advances in Machine Learning (ML) have shown impressive results, with applications ranging from image recognition, language translation, medical diagnosis and more. With the widespread adoption of ML systems, it is increasingly important for research scientists to be able to explore how the data is being interpreted by the models. However, one of the main challenges in exploring this data is that it often has hundreds or even thousands of dimensions, requiring special tools to investigate the space. 

To enable a more intuitive exploration process, we are open-sourcing the Embedding Projector, a web application for interactive visualization and analysis of high-dimensional data recently shown as an A.I. Experiment, as part of TensorFlow. We are also releasing a standalone version at projector.tensorflow.org, where users can visualize their high-dimensional data without the need to install and run TensorFlow.
]]></description>
<dc:subject>visualization dimension-reduction data-analysis tools open-source google</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:d2236ad05cc3/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:visualization"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:dimension-reduction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:tools"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:open-source"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:google"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1606.00856">
    <title>[1606.00856] Sequential Principal Curves Analysis</title>
    <dc:date>2016-10-18T11:26:18+00:00</dc:date>
    <link>https://arxiv.org/abs/1606.00856</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[This work includes all the technical details of the Sequential Principal Curves Analysis (SPCA) in a single document. SPCA is an unsupervised nonlinear and invertible feature extraction technique. The identified curvilinear features can be interpreted as a set of nonlinear sensors: the response of each sensor is the projection onto the corresponding feature. Moreover, it can be easily tuned for different optimization criteria; e.g. infomax, error minimization, decorrelation; by choosing the right way to measure distances along each curvilinear feature. Even though proposed in [Laparra et al. Neural Comp. 12] and shown to work in multiple modalities in [Laparra and Malo Frontiers Hum. Neuro. 15], the SPCA framework has its original roots in the nonlinear ICA algorithm in [Malo and Gutierrez Network 06]. Later on, the SPCA philosophy for nonlinear generalization of PCA originated substantially faster alternatives at the cost of introducing different constraints in the model. Namely, the Principal Polynomial Analysis (PPA) [Laparra et al. IJNS 14], and the Dimensionality Reduction via Regression (DRR) [Laparra et al. IEEE TGRS 15]. This report illustrates the reasons why we developed such family and is the appropriate technical companion for the missing details in [Laparra et al., NeCo 12, Laparra and Malo, Front.Hum.Neuro. 15]. See also the data, code and examples in the dedicated sites this http URL and this http URL effects.html
]]></description>
<dc:subject>data-analysis dimension-reduction machine-learning algorithms to-understand feature-extraction feature-construction</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:a64ff0c226aa/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:dimension-reduction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:machine-learning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:algorithms"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:to-understand"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:feature-extraction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:feature-construction"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1602.03926">
    <title>[1602.03926] Modelling the level of adoption of analytical tools; An implementation of multi-criteria evidential reasoning</title>
    <dc:date>2016-09-14T13:18:49+00:00</dc:date>
    <link>http://arxiv.org/abs/1602.03926</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[In the future, competitive advantages will be given to organisations that can extract valuable information from massive data and make better decisions. In most cases, this data comes from multiple sources. Therefore, the challenge is to aggregate them into a common framework in order to make them meaningful and useful. This paper will first review the most important multi-criteria decision analysis methods (MCDA) existing in current literature. We will offer a novel, practical and consistent methodology based on a type of MCDA, to aggregate data from two different sources into a common framework. Two datasets that are different in nature but related to the same topic are aggregated to a common scale by implementing a set of transformation rules. This allows us to generate appropriate evidence for assessing and finally prioritising the level of adoption of analytical tools in four types of companies. A numerical example is provided to clarify the form for implementing this methodology. A six-step process is offered as a guideline to assist engineers, researchers or practitioners interested in replicating this methodology in any situation where there is a need to aggregate and transform multiple source data.
]]></description>
<dc:subject>multiobjective-optimization decision-making management data-science data-analysis ergonomics user-experience to-write-about</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:96a68401117f/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:multiobjective-optimization"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:decision-making"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:management"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-science"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:ergonomics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:user-experience"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:to-write-about"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1607.06274">
    <title>[1607.06274] Topological Data Analysis with Bregman Divergences</title>
    <dc:date>2016-08-15T12:31:56+00:00</dc:date>
    <link>http://arxiv.org/abs/1607.06274</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Given a finite set in a metric space, the topological analysis generalizes hierarchical clustering using a 1-parameter family of homology groups to quantify connectivity in all dimensions. The connectivity is compactly described by the persistence diagram. One limitation of the current framework is the reliance on metric distances, whereas in many practical applications objects are compared by non-metric dissimilarity measures. Examples are the Kullback-Leibler divergence, which is commonly used for comparing text and images, and the Itakura-Saito divergence, popular for speech and sound. These are two members of the broad family of dissimilarities called Bregman divergences. 
We show that the framework of topological data analysis can be extended to general Bregman divergences, widening the scope of possible applications. In particular, we prove that appropriately generalized Cech and Delaunay (alpha) complexes capture the correct homotopy type, namely that of the corresponding union of Bregman balls. Consequently, their filtrations give the correct persistence diagram, namely the one generated by the uniformly growing Bregman balls. Moreover, we show that unlike the metric setting, the filtration of Vietoris-Rips complexes may fail to approximate the persistence diagram. We propose algorithms to compute the thus generalized Cech, Vietoris-Rips and Delaunay complexes and experimentally test their efficiency. Lastly, we explain their surprisingly good performance by making a connection with discrete Morse theory.]]></description>
<dc:subject>data-analysis topology metrics to-understand algorithms representation statistics probability-theory models-and-modes</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:6f2fa3f0f89f/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:topology"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:metrics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:to-understand"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:algorithms"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:representation"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:probability-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:models-and-modes"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1605.08749">
    <title>[1605.08749] Visual Model Validation via Inline Replication</title>
    <dc:date>2016-08-06T13:26:57+00:00</dc:date>
    <link>http://arxiv.org/abs/1605.08749</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Data visualizations typically show retrospective views of an existing dataset with little or no focus on repeatability. However, consumers of these tools often use insights gleaned from retrospective visualizations as the basis for decisions about future events. In this way, visualizations often serve as visual predictive models despite the fact that they are typically designed to present historical views of the data. This "visual predictive model" approach, however, can lead to invalid inferences. In this paper, we describe an approach to visual model validation called Inline Replication (IR) which, similar to the cross-validation technique used widely in machine learning, provides a nonparametric and broadly applicable technique for visual model assessment and repeatability. This paper describes the overall IR process and outlines how it can be integrated into both traditional and emerging "big data" visualization pipelines. Examples are provided showing IR integrated within common visualization techniques (such as bar charts and linear regression lines) as well as a more fully-featured visualization system designed for complex exploratory analysis tasks.
]]></description>
<dc:subject>visualization data-analysis user-interface user-experience statistics exploratory-data-analysis nudge-targets consider:gp-approach</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:283cd372dd4b/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:visualization"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:user-interface"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:user-experience"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:exploratory-data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:nudge-targets"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:consider:gp-approach"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1603.04626">
    <title>[1603.04626] TAPER: query-aware, partition-enhancement for large, heterogenous, graphs</title>
    <dc:date>2016-07-25T12:08:22+00:00</dc:date>
    <link>http://arxiv.org/abs/1603.04626</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Graph partitioning has long been seen as a viable approach to address Graph DBMS scalability. A partitioning, however, may introduce extra query processing latency unless it is sensitive to a specific query workload, and optimised to minimise inter-partition traversals for that workload. Additionally, it should also be possible to incrementally adjust the partitioning in reaction to changes in the graph topology, the query workload, or both. Because of their complexity, current partitioning algorithms fall short of one or both of these requirements, as they are designed for offline use and as one-off operations. The TAPER system aims to address both requirements, whilst leveraging existing partitioning algorithms. TAPER takes any given initial partitioning as a starting point, and iteratively adjusts it by swapping chosen vertices across partitions, heuristically reducing the probability of inter-partition traversals for a given pattern matching queries workload. Iterations are inexpensive thanks to time and space optimisations in the underlying support data structures. We evaluate TAPER on two different large test graphs and over realistic query workloads. Our results indicate that, given a hash-based partitioning, TAPER reduces the number of inter-partition traversals by around 80%; given an unweighted METIS partitioning, by around 30%. These reductions are achieved within 8 iterations and with the additional advantage of being workload-aware and usable online.
]]></description>
<dc:subject>data-analysis databases graph-theory algorithms nudge-targets consider:performance-measures</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:13968eee8c32/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:databases"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:graph-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:algorithms"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:nudge-targets"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:consider:performance-measures"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1605.07030">
    <title>[1605.07030] Isotropic Dynamic Hierarchical Clustering</title>
    <dc:date>2016-07-24T01:43:02+00:00</dc:date>
    <link>http://arxiv.org/abs/1605.07030</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[We face a need of discovering a pattern in locations of a great number of points in a high-dimensional space. Goal is to group the close points together. We are interested in a hierarchical structure, like a B-tree. B-Trees are hierarchical, balanced, and they can be constructed dynamically. B-Tree approach allows to determine the structure without any supervised learning or a priori knowledge. The space is Euclidean and isotropic. Unfortunately, there are no B-Tree implementations processing indices in a symmetrical and isotropical way. Some implementations are based on constructing compound asymmetrical indices from point coordinates; and the others split the nodes along the coordinate hyper-planes. We need to process tens of millions of points in a thousand-dimensional space. The application has to be scalable. Ideally, a cluster should be an ellipsoid, but it would require to store O(n^2) ellipse axes. So, we are using multi-dimensional balls defined by the centers and radii. Calculation of statistical values like the mean and the average deviation, can be done in an incremental way. While adding a point to a tree, the statistical values for nodes recalculated in O(1) time. We support both, brute force O(2^n) and greedy O(n^2) split algorithms. Statistical and aggregated node information also allows to manipulate (to search, to delete) aggregated sets of closely located points. Hierarchical information retrieval. When searching, the user is provided with the highest appropriate nodes in the tree hierarchy, with the most important clusters emerging in the hierarchy automatically. Then, if interested, the user may navigate down the tree to more specific points. The system is implemented as a library of Java classes representing Points, Sets of points with aggregated statistical information, B-tree, and Nodes with a support of serialization and storage in a MySQL database.
]]></description>
<dc:subject>clustering data-analysis dimension-reduction algorithms machine-learning statistics nudge-targets performance-measure rather-interesting consider:rediscovery</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:6468b8561d6b/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:clustering"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:dimension-reduction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:algorithms"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:machine-learning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:nudge-targets"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:performance-measure"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:rather-interesting"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:consider:rediscovery"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1606.01081">
    <title>[1606.01081] Implementing graph grammars for intelligence analysis in OCaml</title>
    <dc:date>2016-06-28T22:38:24+00:00</dc:date>
    <link>http://arxiv.org/abs/1606.01081</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[We report on implementing graph grammars for intelligence analysis in OCaml. Graph grammars are represented as elements of an algebraic data type in OCaml. In addition to algebraic data types, we use other concepts from functional programming languages to implement features of graph grammars. We use type checking to perform graph pattern matching. Graph transformations are defined as implicit coercions derived from structural subtyping proofs, subset types, lambda abstractions, and analytics. An analytic is a general-purpose OCaml function whose output is required to match a graph pattern described by an element of an algebraic data type. By using a strongly-typed language for representing graphs, we can ensure graphs produced from a graph transformation will match a specific schema. This is a high priority requirement for intelligence analysis.
]]></description>
<dc:subject>representation data-analysis OCaml graphs data-mining nudge-targets consider:rewriting-rules rather-interesting</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:82fcb555c5b3/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:representation"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:OCaml"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:graphs"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:nudge-targets"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:consider:rewriting-rules"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:rather-interesting"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1511.05271">
    <title>[1511.05271] Enhanced detectability of community structure in multilayer networks through layer aggregation</title>
    <dc:date>2016-05-14T13:09:51+00:00</dc:date>
    <link>http://arxiv.org/abs/1511.05271</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Many systems are naturally represented by a multilayer network in which edges exist in multiple layers that encode different, but potentially related, types of interactions, and it is important to understand limitations on the detectability of community structure in these networks. Using random matrix theory, we analyze detectability limitations for multilayer (specifically, multiplex) stochastic block models (SBMs) in which L layers are derived from a common SBM. We study the effect of layer aggregation on detectability for several aggregation methods, including summation of the layers' adjacency matrices for which we show the detectability limit vanishes as O(L^{-1/2}) with increasing number of layers, L. Importantly, we find a similar scaling behavior when the summation is thresholded at an optimal value, providing insight into the common - but not well understood - practice of thresholding pairwise-interaction data to obtain sparse network representations.
]]></description>
<dc:subject>network-theory data-analysis community-detection data-fusion algorithms rather-interesting probability-theory nudge-targets consider:looking-to-see</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:dcca59014be6/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:network-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:community-detection"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-fusion"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:algorithms"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:rather-interesting"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:probability-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:nudge-targets"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:consider:looking-to-see"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1605.03373">
    <title>[1605.03373] Nonlinear decoding of a complex movie from the mammalian retina</title>
    <dc:date>2016-05-13T11:13:54+00:00</dc:date>
    <link>http://arxiv.org/abs/1605.03373</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Retinal circuitry transforms spatiotemporal patterns of light into spiking activity of ganglion cells, which provide the sole visual input to the brain. Recent advances have led to a detailed characterization of retinal activity and stimulus encoding by large neural populations. The inverse problem of decoding, where the stimulus is reconstructed from spikes, has received less attention, in particular for complex input movies that should be reconstructed "pixel-by-pixel". We recorded around a hundred neurons from a dense patch in a rat retina and decoded movies of multiple small discs executing mutually-avoiding random motions. We constructed nonlinear (kernelized) decoders that improved significantly over linear decoding results, mostly due to their ability to reliably separate between neural responses driven by locally fluctuating light signals, and responses at locally constant light driven by spontaneous or network activity. This improvement crucially depended on the precise, non-Poisson temporal structure of individual spike trains, which originated in the spike-history dependence of neural responses. Our results suggest a general paradigm in which downstream neural circuitry could discriminate between spontaneous and stimulus-driven activity on the basis of higher-order statistical structure intrinsic to the incoming spike trains.
]]></description>
<dc:subject>neurology physiology experiment inverse-problems rather-interesting cognition nudge-targets consider:looking-to-see data-analysis</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:9b587cbf7067/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:neurology"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:physiology"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:experiment"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:inverse-problems"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:rather-interesting"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:cognition"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:nudge-targets"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:consider:looking-to-see"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://thenewinquiry.com/essays/view-from-nowhere/">
    <title>View From Nowhere – The New Inquiry</title>
    <dc:date>2016-05-09T11:09:20+00:00</dc:date>
    <link>http://thenewinquiry.com/essays/view-from-nowhere/</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Positivism’s intensity has waxed and waned over time, but it never entirely dies out, because its rewards are too seductive. The fantasy of a simple truth that can transcend the divisions that otherwise fragment a society riven by power and competing agendas is too powerful, and too profitable. To be able to assert convincingly that you have modeled the social world accurately is to know how to sell anything from a political position, a product, to one’s own authority. Big Data sells itself as a knowledge that equals power. But in fact, it relies on pre-existing power to equate data with knowledge.

]]></description>
<dc:subject>data-analysis big-data sociology science-criticism</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:6e01b3283960/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:big-data"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:sociology"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:science-criticism"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1601.07996">
    <title>[1601.07996] Feature Selection: A Data Perspective</title>
    <dc:date>2016-05-06T17:59:31+00:00</dc:date>
    <link>http://arxiv.org/abs/1601.07996</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Feature selection, as a data preprocessing strategy, has been proven to be effective and efficient in preparing high-dimensional data for data mining and machine learning problems. The objectives of feature selection include: building simpler and more comprehensible models, improving data mining performance, and preparing clean, understandable data. The recent proliferation of big data has presented some substantial challenges and opportunities of feature selection algorithms. In this survey, we provide a comprehensive and structured overview of recent advances in feature selection research. Motivated by current challenges and opportunities in the big data age, we revisit feature selection research from a data perspective, and review representative feature selection algorithms for generic data, structured data, heterogeneous data and streaming data. Methodologically, to emphasize the differences and similarities of most existing feature selection algorithms for generic data, we generally categorize them into four groups: similarity based, information theoretical based, sparse learning based and statistical based methods. Finally, to facilitate and promote the research in this community, we also present an open-source feature selection repository that consists of most of the popular feature selection algorithms (this http URL). At the end of this survey, we also have a discussion about some open problems and challenges that need to be paid more attention in future research.
]]></description>
<dc:subject>feature-selection data-analysis machine-learning statistics nudge-targets consider:ParetoGP</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:d007a149cf29/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:feature-selection"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:machine-learning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:nudge-targets"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:consider:ParetoGP"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1206.1386">
    <title>[1206.1386] Robust subspace recovery by Tyler's M-estimator</title>
    <dc:date>2016-05-01T12:07:15+00:00</dc:date>
    <link>http://arxiv.org/abs/1206.1386</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[This paper considers the problem of robust subspace recovery: given a set of N points in ℝ^D, if many lie in a d-dimensional subspace, then can we recover the underlying subspace? We show that Tyler's M-estimator can be used to recover the underlying subspace, if the percentage of the inliers is larger than d/D and the data points lie in general position. Empirically, Tyler's M-estimator compares favorably with other convex subspace recovery algorithms in both simulations and experiments on real data sets.
]]></description>
<dc:subject>approximation statistics algorithms inference rather-interesting data-analysis nudge-targets consider:performance-measures consider:rediscovery</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:6c56b073a82e/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:approximation"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:algorithms"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:inference"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:rather-interesting"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:nudge-targets"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:consider:performance-measures"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:consider:rediscovery"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1503.03332">
    <title>[1503.03332] On the long-term correlations and multifractal properties of electric arc furnace time series</title>
    <dc:date>2016-03-26T20:53:25+00:00</dc:date>
    <link>http://arxiv.org/abs/1503.03332</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[In this paper, we study long-term correlations and multifractal properties elaborated from time series of three-phase current signals coming from an industrial electric arc furnace plant. Implicit sinusoidal trends are suitably detected by considering the scaling of the fluctuation functions. Time series are then filtered via a Fourier-based analysis, removing hence such strong periodicities. In the filtered time series we detected long-term, positive correlations. The presence of positive correlations is in agreement with the typical V--I characteristic (hysteresis) of the electric arc furnace, providing thus a sound physical justification for the memory effects found in the current time series. The multifractal signature is strong enough in the filtered time series to be effectively classified as multifractal.
]]></description>
<dc:subject>time-series data-analysis prediction industrial-design nudge-targets consider:pareto-GP symbolic-regression</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:7bd70675e16f/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:time-series"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:prediction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:industrial-design"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:nudge-targets"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:consider:pareto-GP"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:symbolic-regression"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1505.05211">
    <title>[1505.05211] Principles of Dataset Versioning: Exploring the Recreation/Storage Tradeoff</title>
    <dc:date>2016-01-29T11:12:35+00:00</dc:date>
    <link>http://arxiv.org/abs/1505.05211</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[The relative ease of collaborative data science and analysis has led to a proliferation of many thousands or millions of versions of the same datasets in many scientific and commercial domains, acquired or constructed at various stages of data analysis across many users, and often over long periods of time. Managing, storing, and recreating these dataset versions is a non-trivial task. The fundamental challenge here is the storage-recreation trade-off: the more storage we use, the faster it is to recreate or retrieve versions, while the less storage we use, the slower it is to recreate or retrieve versions. Despite the fundamental nature of this problem, there has been a surprisingly little amount of work on it. In this paper, we study this trade-off in a principled manner: we formulate six problems under various settings, trading off these quantities in various ways, demonstrate that most of the problems are intractable, and propose a suite of inexpensive heuristics drawing from techniques in delay-constrained scheduling, and spanning tree literature, to solve these problems. We have built a prototype version management system, that aims to serve as a foundation to our DATAHUB system for facilitating collaborative data science. We demonstrate, via extensive experiments, that our proposed heuristics provide efficient solutions in practical dataset versioning scenarios.
]]></description>
<dc:subject>data-analysis data-science collaboration computational-complexity rather-interesting models cultural-dynamics cultural-artifacts reproducibility</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:b5b7f31e9a9d/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-science"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:collaboration"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:computational-complexity"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:rather-interesting"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:models"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:cultural-dynamics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:cultural-artifacts"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:reproducibility"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://github.com/donnemartin/data-science-ipython-notebooks">
    <title>donnemartin/data-science-ipython-notebooks</title>
    <dc:date>2016-01-12T20:29:31+00:00</dc:date>
    <link>https://github.com/donnemartin/data-science-ipython-notebooks</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Continually updated data science Python notebooks: Deep learning (TensorFlow, Theano, Caffe), scikit-learn, Kaggle, big data (Spark, Hadoop MapReduce, HDFS), matplotlib, pandas, NumPy, SciPy, Python essentials, AWS, and various command lines. https://bit.ly/data-notes
]]></description>
<dc:subject>via? Jupyter open-science data-analysis deep-learning machine-learning scientific-computing</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:0a1ebb94b7e7/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:via?"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:Jupyter"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:open-science"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:deep-learning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:machine-learning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:scientific-computing"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1511.01343">
    <title>[1511.01343] A Family of Blockwise One-Factor Distributions for Modelling High-Dimensional Binary Data</title>
    <dc:date>2015-12-14T12:22:49+00:00</dc:date>
    <link>http://arxiv.org/abs/1511.01343</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[We introduce a new family of one factor distributions for high-dimensional binary data. The model provides an explicit probability for each event, thus avoiding the numeric approximations often made by existing methods. Model interpretation is easy since each variable is described by two continuous parameters (corresponding to its marginal probability and to its strength of dependency with the other variables) and by one binary parameter (defining if the dependencies are positive or negative). An extension of this new model is proposed by assuming that the variables are split into independent blocks which follow the new one factor distribution. Parameter estimation is performed by the inference margin procedure where the second step is achieved by an expectation-maximization algorithm. Model selection is carried out by a deterministic approach which strongly reduces the number of competing models. This approach uses a hierarchical ascendant classification of the variables based on the empirical version of Cramer's V for selecting a narrow subset of models. The consistency of such procedure is shown. The new model is evaluated on numerical experiments and on a real data set. The procedure is implemented in the R package MvBinary available on CRAN.
]]></description>
<dc:subject>statistics feature-construction probability-theory data-analysis algorithms performance-measure nudge-targets consider:feature-discovery</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:e83a283b1fc3/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:feature-construction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:probability-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:algorithms"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:performance-measure"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:nudge-targets"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:consider:feature-discovery"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1505.01866">
    <title>[1505.01866] DART: Dropouts meet Multiple Additive Regression Trees</title>
    <dc:date>2015-09-19T12:18:44+00:00</dc:date>
    <link>http://arxiv.org/abs/1505.01866</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Multiple Additive Regression Trees (MART), an ensemble model of boosted regression trees, is known to deliver high prediction accuracy for diverse tasks, and it is widely used in practice. However, it suffers an issue which we call over-specialization, wherein trees added at later iterations tend to impact the prediction of only a few instances, and make negligible contribution towards the remaining instances. This negatively affects the performance of the model on unseen data, and also makes the model over-sensitive to the contributions of the few, initially added trees. We show that the commonly used tool to address this issue, that of shrinkage, alleviates the problem only to a certain extent and the fundamental issue of over-specialization still remains. In this work, we explore a different approach to address the problem that of employing dropouts, a tool that has been recently proposed in the context of learning deep neural networks. We propose a novel way of employing dropouts in MART, resulting in the DART algorithm. We evaluate DART on ranking, regression and classification tasks, using large scale, publicly available datasets, and show that DART outperforms MART in each of the tasks, with a significant margin. We also show that DART overcomes the issue of over-specialization to a considerable extent.
]]></description>
<dc:subject>machine-learning data-analysis metaheuristics stochastic-resonance performance-measure parsimony nudge-targets Pareto-GP</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:226c5ca42030/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:machine-learning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:metaheuristics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:stochastic-resonance"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:performance-measure"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:parsimony"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:nudge-targets"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:Pareto-GP"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1507.06988">
    <title>[1507.06988] A Binary Data Stream Scripting Language</title>
    <dc:date>2015-09-14T11:16:23+00:00</dc:date>
    <link>http://arxiv.org/abs/1507.06988</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Any file is fundamentally a binary data stream. A practical solution was achieved to interpret binary data stream. A new scripting language named Data Format Scripting Language (DFSL) was developed to describe the physical layout of the data in a structural, more intelligible way. On the basis of the solution, a generic software application was implemented; it parses various binary data streams according to their respective DFSL scripts and generates human-readable result and XML document for data sharing. Our solution helps eliminate the error-prone low-level programming, especially in the hardware devices or network protocol development/debugging processes.
]]></description>
<dc:subject>rather-interesting computer-science programming-language data-analysis specification domain-specific-languages parsing</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:ca286c56e8be/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:rather-interesting"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:computer-science"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:programming-language"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:specification"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:domain-specific-languages"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:parsing"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1506.07800">
    <title>[1506.07800] Deducing self-interaction in eye movement data using sequential spatial point processes</title>
    <dc:date>2015-08-22T19:29:09+00:00</dc:date>
    <link>http://arxiv.org/abs/1506.07800</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Eye movement data are outputs of an analyser tracking the gaze when a person is inspecting a scene. This kind of data are of increasing importance in scientific research as well as in applications, e.g. in marketing and man-machine interface planning. Our research objective is to model eye movement sequences using sequential spatial point processes with self-interaction, which describes the effect of the past to the current movement of the gaze. 
We consider three elements of an eye movement sequence: heterogeneity of the target space, contextuality between subsequent movements and time-dependent behaviour describing self-interaction. We propose two constructions: one is based on history-dependent rejection of transitions in a random walk and the other makes use of history-adapted transition kernel. Both models are recognized as inhomogeneous self-interacting random walks. For such processes, statistical inference based on the likelihood is suggested and experimented. This study enlarges the likelihood inference to sequential spatial point processes which have been considered complex.
]]></description>
<dc:subject>statistics modeling rather-interesting data-analysis inference</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:05faab474990/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:modeling"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:rather-interesting"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:inference"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1408.3600">
    <title>[1408.3600] Uncovering the nutritional landscape of food</title>
    <dc:date>2015-08-07T11:58:33+00:00</dc:date>
    <link>http://arxiv.org/abs/1408.3600</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Recent progresses in data-driven analysis methods, including network-based approaches, are revolutionizing many classical disciplines. These techniques can also be applied to food and nutrition, which must be studied to design healthy diets. Using nutritional information from over 1,000 raw foods, we systematically evaluated the nutrient composition of each food in regards to satisfying daily nutritional requirements. The nutrient balance of a food was quantified herein as nutritional fitness, using the food's frequency of occurrence in nutritionally adequate food combinations. Nutritional fitness offers prioritization of recommendable foods within a global network of foods, in which foods are connected based on the similarities of their nutrient compositions. We identified a number of key nutrients, such as choline and alpha-linolenic acid, whose levels in foods can critically affect the foods' nutritional fitness. Analogously, pairs of nutrients can have the same effect. In fact, two nutrients can impact the nutritional fitness synergistically, although the individual nutrients alone may not. This result, involving the tendency among nutrients to show correlations in their abundances across foods, implies a hidden layer of complexity when exploring for foods whose balance of nutrients within pairs holistically helps meet nutritional requirements. Interestingly, foods with high nutritional fitness successfully maintain this nutrient balance. This effect expands our scope to a diverse repertoire of nutrient-nutrient correlations, integrated under a common network framework that yields unexpected yet coherent associations between nutrients. Our nutrient-profiling approach combined with a network-based analysis provides a more unbiased, global view of the relationships between foods and nutrients, and can be extended towards nutritional policies, food marketing, and personalized nutrition.
]]></description>
<dc:subject>nutrition data-analysis rather-interesting visualization exploratory-data-analysis looking-to-see pattern-discovery</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:e4e5165f09c9/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:nutrition"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:rather-interesting"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:visualization"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:exploratory-data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:looking-to-see"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:pattern-discovery"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1311.1911">
    <title>[1311.1911] Visualizing the Effects of a Changing Distance on Data Using Continuous Embeddings</title>
    <dc:date>2015-07-05T20:18:31+00:00</dc:date>
    <link>http://arxiv.org/abs/1311.1911</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Most ML methods, from clustering to classification, rely on a distance function to describe relationships between datapoints. For complex datasets it is hard to avoid making some arbitrary choices when defining a distance function. To compare images, one must choose a spatial scale, for signals, a temporal scale. The right scale is hard to pin down and it is preferable when results do not depend too tightly on the exact value one picked. Topological data analysis seeks to address this issue by focusing on the notion of neighbourhood instead of distance. Here, we show that in some cases a simpler solution is available. One can check how strongly distance relationships depend on a hyperparameter using dimensionality reduction. We formulate a variant of dynamical multi-dimensional scaling (MDS), which embeds datapoints as curves. The resulting algorithm is based on the Concave-Convex Procedure (CCCP) and provides a simple and efficient way of visualizing changes and invariances in distance patterns as a hyperparameter is varied. We also present a variant to analyze the dependence on multiple hyperparameters. We provide a cMDS algorithm that is straightforward to implement, use and extend. To illustrate the possibilities of cMDS, we apply cMDS to several real-world data sets.
]]></description>
<dc:subject>clustering metrics data-analysis statistics algorithms horse-races visualization</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:01f80c802cb0/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:clustering"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:metrics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:algorithms"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:horse-races"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:visualization"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1406.7349">
    <title>[1406.7349] Convex Analysis of Mixtures for Separating Non-negative Well-grounded Sources</title>
    <dc:date>2015-07-05T11:35:57+00:00</dc:date>
    <link>http://arxiv.org/abs/1406.7349</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Blind Source Separation (BSS) has proven to be a powerful tool for the analysis of composite patterns in engineering and science. We introduce Convex Analysis of Mixtures (CAM) for separating non-negative well-grounded sources, which learns the mixing matrix by identifying the lateral edges of the convex data scatter plot. We prove a sufficient and necessary condition for identifying the mixing matrix through edge detection, which also serves as the foundation for CAM to be applied not only to the exact-determined and over-determined cases, but also to the under-determined case. We show the optimality of the edge detection strategy, even for cases where source well-groundedness is not strictly satisfied. The CAM algorithm integrates plug-in noise filtering using sector-based clustering, an efficient geometric convex analysis scheme, and stability-based model order selection. We demonstrate the principle of CAM on simulated data and numerically mixed natural images. The superior performance of CAM against a panel of benchmark BSS techniques is demonstrated on numerically mixed gene expression data. We then apply CAM to dissect dynamic contrast-enhanced magnetic resonance imaging data taken from breast tumors and time-course microarray gene expression data derived from in-vivo muscle regeneration in mice, both producing biologically plausible decomposition results.
]]></description>
<dc:subject>machine-learning source-separation classification algorithms image-processing data-analysis nudge-targets performance-measure rather-interesting</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:8fae83fd6c19/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:machine-learning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:source-separation"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:classification"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:algorithms"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:image-processing"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:nudge-targets"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:performance-measure"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:rather-interesting"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://vcg.github.io/upset/about/#">
    <title>UpSet</title>
    <dc:date>2015-06-27T18:24:13+00:00</dc:date>
    <link>http://vcg.github.io/upset/about/#</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[UpSet is an interactive, web based visualization technique designed to analyze set-based data. UpSet visualizes both, set intersections and their properties, and the items (elements) in the dataset.

]]></description>
<dc:subject>set-theory data-analysis visualization software javascript</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:90f4589946df/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:set-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:visualization"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:software"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:javascript"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://biorxiv.org/content/early/2015/01/10/013623">
    <title>Software for the analysis and visualization of deep mutational scanning data | bioRxiv</title>
    <dc:date>2015-05-25T12:18:40+00:00</dc:date>
    <link>http://biorxiv.org/content/early/2015/01/10/013623</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Background Deep mutational scanning is a technique to estimate the impacts of mutations on a gene by using deep sequencing to count mutations in a library of variants before and after imposing a functional selection. The impacts of mutations must be inferred from changes in their counts after selection. Results I describe a software package, dms_tools, to infer the impacts of mutations from deep mutational scanning data using a likelihood-based treatment of the mutation counts. I show that dms_tools yields more accurate inferences on simulated data than the widely used but statistically biased approach of calculating ratios of counts pre- and post-selection. Using dms_tools, one can infer the preference of each site for each amino acid given a single selection pressure, or assess the extent to which these preferences change under different selection pressures. The preferences and their changes can be intuitively visualized with sequence-logo-style plots created using an extension to weblogo. Conclusions dms_tools implements a statistically principled approach for the analysis and subsequent visualization of deep mutational scanning data.

]]></description>
<dc:subject>bioinformatics systems-biology molecular-biology statistics data-analysis rather-interesting consider:adapt-for-GP</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:383893036a98/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:bioinformatics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:systems-biology"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:molecular-biology"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:rather-interesting"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:consider:adapt-for-GP"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1403.6804">
    <title>[1403.6804] A simple modification for improving inference of non-linear dynamical systems</title>
    <dc:date>2015-05-25T12:14:19+00:00</dc:date>
    <link>http://arxiv.org/abs/1403.6804</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Particle and ensemble filters are increasingly utilized for inference, optimization, and forecast; however, both filtering methods use discrete distributions to simulate continuous state space, a drawback that can lead to degraded performance for non-linear dynamical systems. Here we propose a simple modification, applicable to both particle and ensemble filters, that compensates for this problem. The method randomly replaces one or more model variables or parameters within a fraction of simulated trajectories at each filtering cycle. This modification, termed space re-probing, expands the state space covered by the filter through the introduction of outlying trajectories. We apply the space re-probing modification to three particle filters and three ensemble filters, and use these modified filters to model and forecast influenza epidemics. For both filter types, the space re-probing improves simulation of influenza epidemic curves and the prediction of influenza outbreak peak timing. Further, as fewer particles are needed for the particle filters, the proposed modification reduces the computational cost of these filters.
]]></description>
<dc:subject>modeling data-analysis prediction statistics nudge-targets simulation data-cleaning</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:06ac1fdb8efd/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:modeling"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:prediction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:nudge-targets"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:simulation"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-cleaning"/>
</rdf:Bag></taxo:topics>
</item>
</rdf:RDF>