<?xml version="1.0" encoding="UTF-8"?>
 <rdf:RDF xmlns="http://purl.org/rss/1.0/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:cc="http://web.resource.org/cc/" xmlns:syn="http://purl.org/rss/1.0/modules/syndication/" xmlns:admin="http://webns.net/mvcb/">
  <channel rdf:about="http://pinboard.in">
    <title>Pinboard (Vaguery)</title>
    <link>https://pinboard.in/u:Vaguery/public/</link>
    <description>recent bookmarks from Vaguery</description>
    <items>
      <rdf:Seq>	<rdf:li rdf:resource="https://arxiv.org/abs/1802.06095"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1810.02016"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1812.05225"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1709.05725"/>
	<rdf:li rdf:resource="https://arxiv.org/abs/1104.5557"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1511.05933"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1606.01081"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1503.00306"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1503.00310"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1305.1422"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1501.04759"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1301.1218"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1401.1475"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1407.6439"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1411.4952"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1409.1152"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1409.5400"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1310.8341"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1312.0086"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1311.1704"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1311.2100"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1309.0302"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1307.1584"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1305.7074"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1302.5344"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1302.4211"/>
	<rdf:li rdf:resource="http://www.biomedcentral.com/1471-2350/10/6"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1212.5389"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1212.0504"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1211.3497"/>
	<rdf:li rdf:resource="http://www.sciencedirect.com/science/article/pii/S1097276513000956"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1301.3874"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1206.3268"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1206.0217"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1206.1032"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1201.5568"/>
	<rdf:li rdf:resource="http://venturebeat.com/2011/05/16/datameer-funding/?utm_source=feedburner&amp;utm_medium=feed&amp;utm_campaign=Feed%3A+Venturebeat+%28VentureBeat%29"/>
	<rdf:li rdf:resource="http://flowingdata.com/2011/05/20/growing-need-for-data-heads/"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1007.5510"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1006.4968"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1006.5273"/>
	<rdf:li rdf:resource="http://casstools.org/"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1006.4929"/>
	<rdf:li rdf:resource="http://seekingalpha.com/article/210550-a-peek-into-the-future-hft-and-financial-news?source=feed"/>
	<rdf:li rdf:resource="http://radar.oreilly.com/2010/06/what-is-data-science.html"/>
	<rdf:li rdf:resource="http://www.dmg.org/v4-0/GeneralStructure.html"/>
	<rdf:li rdf:resource="http://cscs.umich.edu/~crshalizi/notebooks/model-selection.html"/>
	<rdf:li rdf:resource="http://www.newton.ac.uk/programmes/SCH/seminars/index.html"/>
	<rdf:li rdf:resource="http://polymeme.com/about"/>
	<rdf:li rdf:resource="http://ciir.cs.umass.edu/~strohman/dissertation/"/>
	<rdf:li rdf:resource="http://pyflix.python-hosting.com/"/>
	<rdf:li rdf:resource="http://ideas.repec.org/p/wop/pennin/01-05.html"/>
	<rdf:li rdf:resource="http://www.altsearchengines.com/2008/12/28/how-to-search-for-influencers-with-datanetis/"/>
	<rdf:li rdf:resource="http://www.cs.ucr.edu/~eamonn/TSDMA/tsdma_papers.html"/>
	<rdf:li rdf:resource="http://nltk.sourceforge.net/index.php/Main_Page"/>
	<rdf:li rdf:resource="http://seekingalpha.com/article/99106-where-to-look-for-ideas-in-this-market?source=feed"/>
	<rdf:li rdf:resource="http://www.kddcup2008.com/KDDsite/Challenges.htm"/>
	<rdf:li rdf:resource="http://googleresearch.blogspot.com/2006/08/all-our-n-gram-are-belong-to-you.html"/>
	<rdf:li rdf:resource="http://www.ailab.si/orange/"/>
	<rdf:li rdf:resource="http://www.texturesynthesis.com/links.htm"/>
	<rdf:li rdf:resource="http://theinfo.org/"/>
	<rdf:li rdf:resource="http://precedings.nature.com/documents/1490/version/1"/>
	<rdf:li rdf:resource="http://www.foodpairing.be/"/>
	<rdf:li rdf:resource="http://www.tickdata.com/index.html"/>
	<rdf:li rdf:resource="http://www.vtk.org/"/>
	<rdf:li rdf:resource="http://magia3e.wordpress.com/2007/05/29/semantic-analysis-making-sense-of-the-chaos-of-free-text/"/>
	<rdf:li rdf:resource="http://space4commerce.blogspot.com/2007/05/google-wants-to-know-everything-about.html"/>
	<rdf:li rdf:resource="http://www.iht.com/articles/2007/05/20/news/compute21.php"/>
	<rdf:li rdf:resource="http://labs.google.com/papers/sawzall.html"/>
	<rdf:li rdf:resource="http://rapid-i.com/content/blogcategory/10/21/lang,en/"/>
      </rdf:Seq>
    </items>
  </channel><item rdf:about="https://arxiv.org/abs/1802.06095">
    <title>[1802.06095] Mining Sub-Interval Relationships In Time Series Data</title>
    <dc:date>2021-05-11T19:03:28+00:00</dc:date>
    <link>https://arxiv.org/abs/1802.06095</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Time-series data is being increasingly collected and studied in several areas such as neuroscience, climate science, transportation, and social media. Discovery of complex patterns of relationships between individual time-series, using data-driven approaches can improve our understanding of real-world systems. While traditional approaches typically study relationships between two entire time series, many interesting relationships in real-world applications exist in small sub-intervals of time while remaining absent or feeble during other sub-intervals. In this paper, we define the notion of a sub-interval relationship (SIR) to capture interactions between two time series that are prominent only in certain sub-intervals of time. We propose a novel and efficient approach to find most interesting SIR in a pair of time series. We evaluate our proposed approach on two real-world datasets from climate science and neuroscience domain and demonstrated the scalability and computational efficiency of our proposed approach. We further evaluated our discovered SIRs based on a randomization based procedure. Our results indicated the existence of several such relationships that are statistically significant, some of which were also found to have physical interpretation.
]]></description>
<dc:subject>machine-learning time-series data-mining rather-interesting pattern-discovery representation signal-processing to-write-about</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:cd0b9965a2b0/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:machine-learning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:time-series"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:rather-interesting"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:pattern-discovery"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:representation"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:signal-processing"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:to-write-about"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1810.02016">
    <title>[1810.02016] The Four Point Permutation Test for Latent Block Structure in Incidence Matrices</title>
    <dc:date>2019-10-26T12:42:46+00:00</dc:date>
    <link>https://arxiv.org/abs/1810.02016</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Transactional data may be represented as a bipartite graph G:=(L∪R,E), where L denotes agents, R denotes objects visible to many agents, and an edge in E denotes an interaction between an agent and an object. Unsupervised learning seeks to detect block structures in the adjacency matrix Z between L and R, thus grouping together sets of agents with similar object interactions. New results on quasirandom permutations suggest a non-parametric \textbf{four point test} to measure the amount of block structure in G, with respect to vertex orderings on L and R. Take disjoint 4-edge random samples, order these four edges by left endpoint, and count the relative frequencies of the 4! possible orderings of the right endpoint. When these orderings are equiprobable, the edge set E corresponds to a quasirandom permutation π of |E| symbols. Total variation distance of the relative frequency vector away from the uniform distribution on 24 permutations measures the amount of block structure. Such a test statistic, based on ⌊|E|/4⌋ samples, is computable in O(|E|/p) time on p processors. Possibly block structure may be enhanced by precomputing \textbf{natural orders} on L and R, related to the second eigenvector of graph Laplacians. In practice this takes O(d|E|) time, where d is the graph diameter. Five open problems are described.
]]></description>
<dc:subject>combinatorics counting rather-interesting probability-theory data-analysis data-mining graph-theory network-theory hypergraphs to-write-about to-simulate</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:5d0e64d5a30a/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:combinatorics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:counting"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:rather-interesting"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:probability-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:graph-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:network-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:hypergraphs"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:to-write-about"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:to-simulate"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1812.05225">
    <title>[1812.05225] Finding the origin of noise transients in LIGO data with machine learning</title>
    <dc:date>2019-01-27T12:35:14+00:00</dc:date>
    <link>https://arxiv.org/abs/1812.05225</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Quality improvement of interferometric data collected by gravitational-wave detectors such as Advanced LIGO and Virgo is mission critical for the success of gravitational-wave astrophysics. Gravitational-wave detectors are sensitive to a variety of disturbances of non-astrophysical origin with characteristic frequencies in the instrument band of sensitivity. Removing non-astrophysical artifacts that corrupt the data stream is crucial for increasing the number and statistical significance of gravitational-wave detections and enabling refined astrophysical interpretations of the data. Machine learning has proved to be a powerful tool for analysis of massive quantities of complex data in astronomy and related fields of study. We present two machine learning methods, based on random forest and genetic programming algorithms, that can be used to determine the origin of non-astrophysical transients in the LIGO detectors. We use two classes of transients with known instrumental origin that were identified during the first observing run of Advanced LIGO to show that the algorithms can successfully identify the origin of non-astrophysical transients in real interferometric data and thus assist in the mitigation of instrumental and environmental disturbances in gravitational-wave searches. While the data sets described in this paper are specific to LIGO, and the exact procedures employed were unique to the same, the random forest and genetic programming code bases and means by which they were applied as a dual machine learning approach are completely portable to any number of instruments in which noise is believed to be generated through mechanical couplings, the source of which is not yet discovered.]]></description>
<dc:subject>genetic-programming hey-I-know-this-guy astrophysics data-analysis data-mining to-understand feature-construction classification</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:97dd967c5c54/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:genetic-programming"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:hey-I-know-this-guy"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:astrophysics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:to-understand"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:feature-construction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:classification"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1709.05725">
    <title>[1709.05725] FlashProfile: Interactive Synthesis of Syntactic Profiles</title>
    <dc:date>2018-01-26T12:16:56+00:00</dc:date>
    <link>https://arxiv.org/abs/1709.05725</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[We address the problem of learning comprehensive syntactic profiles for a set of strings. Real-world datasets, typically curated from multiple sources, often contain data in various formats. Thus any data processing task is preceded by the critical step of data format identification. However, manual inspection of data to identify various formats is infeasible in standard big-data scenarios. 
We present a technique for generating comprehensive syntactic profiles in terms of user-defined patterns that also allows for interactive refinement. We define a syntactic profile as a set of succinct patterns that describe the entire dataset. Our approach efficiently learns such profiles, and allows refinement by exposing a desired number of patterns. 
Our implementation, FlashProfile, shows a median profiling time of 0.7s over 142 tasks on 74 real datasets. We also show that access to the generated data profiles allow for more accurate synthesis of programs, using fewer examples in programming-by-example workflows.]]></description>
<dc:subject>pattern-discovery rather-interesting strings data-mining statistics nudge-targets consider:looking-to-see consider:representation consider:performance-measures</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:3b20b6fb4a78/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:pattern-discovery"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:rather-interesting"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:strings"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:nudge-targets"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:consider:looking-to-see"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:consider:representation"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:consider:performance-measures"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://arxiv.org/abs/1104.5557">
    <title>[1104.5557] Randomized algorithms for matrices and data</title>
    <dc:date>2017-05-09T16:02:22+00:00</dc:date>
    <link>https://arxiv.org/abs/1104.5557</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Randomized algorithms for very large matrix problems have received a great deal of attention in recent years. Much of this work was motivated by problems in large-scale data analysis, and this work was performed by individuals from many different research communities. This monograph will provide a detailed overview of recent work on the theory of randomized matrix algorithms as well as the application of those ideas to the solution of practical problems in large-scale data analysis. An emphasis will be placed on a few simple core ideas that underlie not only recent theoretical advances but also the usefulness of these tools in large-scale data applications. Crucial in this context is the connection with the concept of statistical leverage. This concept has long been used in statistical regression diagnostics to identify outliers; and it has recently proved crucial in the development of improved worst-case matrix algorithms that are also amenable to high-quality numerical implementation and that are useful to domain scientists. Randomized methods solve problems such as the linear least-squares problem and the low-rank matrix approximation problem by constructing and operating on a randomized sketch of the input matrix. Depending on the specifics of the situation, when compared with the best previously-existing deterministic algorithms, the resulting randomized algorithms have worst-case running time that is asymptotically faster; their numerical implementations are faster in terms of clock-time; or they can be implemented in parallel computing environments where existing numerical algorithms fail to run at all. Numerous examples illustrating these observations will be described in detail.
]]></description>
<dc:subject>via:arthegall data-analysis matrices feature-extraction learning-from-data data-mining rather-interesting to-read to-understand</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:39273ba6e6fb/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:via:arthegall"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:matrices"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:feature-extraction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:learning-from-data"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:rather-interesting"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:to-read"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:to-understand"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1511.05933">
    <title>[1511.05933] Towards O(1) Seeding of K-Means</title>
    <dc:date>2016-08-15T12:11:46+00:00</dc:date>
    <link>http://arxiv.org/abs/1511.05933</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[K-means is one of the most widely used algorithms for clustering in Data Mining applications, which attempts to minimize the sum of square of Euclidean distance of the points in the clusters from the respective means of the clusters. The simplicity and scalability of K-means makes it very appealing. However, K-means suffers from local minima problem, and comes with no guarantee to converge to the optimal cost. K-means++ tries to address the problem by seeding the means using a distance based sampling scheme. However, seeding the means in K-means++ needs O(K) passes through the entire dataset. This could be very costly in large amount of dataset. Here we propose a method of seeding initial means based on factorizations of higher order moments for bounded data. Our method takes O(1) passes through the entire dataset to extract the initial set of means, and its final cost can be proven to be within O(√K) of the optimal cost. We demonstrate the performance of our algorithm in comparison with the existing algorithms on various benchmark datasets.]]></description>
<dc:subject>data-mining clustering algorithms computational-complexity initialization horse-races nudge-targets consider:looking-to-see</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:92e73486acf8/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:clustering"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:algorithms"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:computational-complexity"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:initialization"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:horse-races"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:nudge-targets"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:consider:looking-to-see"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1606.01081">
    <title>[1606.01081] Implementing graph grammars for intelligence analysis in OCaml</title>
    <dc:date>2016-06-28T22:38:24+00:00</dc:date>
    <link>http://arxiv.org/abs/1606.01081</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[We report on implementing graph grammars for intelligence analysis in OCaml. Graph grammars are represented as elements of an algebraic data type in OCaml. In addition to algebraic data types, we use other concepts from functional programming languages to implement features of graph grammars. We use type checking to perform graph pattern matching. Graph transformations are defined as implicit coercions derived from structural subtyping proofs, subset types, lambda abstractions, and analytics. An analytic is a general-purpose OCaml function whose output is required to match a graph pattern described by an element of an algebraic data type. By using a strongly-typed language for representing graphs, we can ensure graphs produced from a graph transformation will match a specific schema. This is a high priority requirement for intelligence analysis.
]]></description>
<dc:subject>representation data-analysis OCaml graphs data-mining nudge-targets consider:rewriting-rules rather-interesting</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:82fcb555c5b3/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:representation"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:OCaml"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:graphs"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:nudge-targets"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:consider:rewriting-rules"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:rather-interesting"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1503.00306">
    <title>[1503.00306] Fusing Data with Correlations</title>
    <dc:date>2016-03-26T11:27:57+00:00</dc:date>
    <link>http://arxiv.org/abs/1503.00306</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Many applications rely on Web data and extraction systems to accomplish knowledge-driven tasks. Web information is not curated, so many sources provide inaccurate, or conflicting information. Moreover, extraction systems introduce additional noise to the data. We wish to automatically distinguish correct data and erroneous data for creating a cleaner set of integrated data. Previous work has shown that a naïve voting strategy that trusts data provided by the majority or at least a certain number of sources may not work well in the presence of copying between the sources. However, correlation between sources can be much broader than copying: sources may provide data from complementary domains (\emph{negative correlation}), extractors may focus on different types of information (\emph{negative correlation}), and extractors may apply common rules in extraction (\emph{positive correlation, without copying}). In this paper we present novel techniques modeling correlations between sources and applying it in truth finding.
]]></description>
<dc:subject>data-mining text-mining feature-construction natural-language-processing nudge-targets algorithms machine-learning consider:representation</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:0769d0bccce0/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:text-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:feature-construction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:natural-language-processing"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:nudge-targets"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:algorithms"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:machine-learning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:consider:representation"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1503.00310">
    <title>[1503.00310] Data Fusion: Resolving Conflicts from Multiple Sources</title>
    <dc:date>2015-09-22T22:09:12+00:00</dc:date>
    <link>http://arxiv.org/abs/1503.00310</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Many data management applications, such as setting up Web portals, managing enterprise data, managing community data, and sharing scientific data, require integrating data from multiple sources. Each of these sources provides a set of values and different sources can often provide conflicting values. To present quality data to users, it is critical to resolve conflicts and discover values that reflect the real world; this task is called {\em data fusion}. This paper describes a novel approach that finds true values from conflicting information when there are a large number of sources, among which some may copy from others. We present a case study on real-world data showing that the described algorithm can significantly improve accuracy of truth discovery and is scalable when there are a large number of data sources.
]]></description>
<dc:subject>data-fusion data-mining statistics formalization to-read</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:acc8cb6cc562/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-fusion"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:formalization"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:to-read"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1305.1422">
    <title>[1305.1422] Somoclu: An Efficient Parallel Library for Self-Organizing Maps</title>
    <dc:date>2015-02-12T10:23:26+00:00</dc:date>
    <link>http://arxiv.org/abs/1305.1422</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Somoclu is a massively parallel tool for training self-organizing maps on large data sets written in C++. It builds on OpenMP for multicore execution, and on MPI for distributing the workload across the nodes in a cluster. It is also able to boost training by using CUDA if graphics processing units are available. A sparse kernel is included, which is useful for high-dimensional but sparse data, such as the vector spaces common in text mining workflows. Python, R and MATLAB interfaces facilitate interactive use. Apart from fast execution, memory use is highly optimized, enabling training large emergent maps even on a single node.
]]></description>
<dc:subject>self-organization data-mining unsupervised-learning parallel library open-source rather-interesting text-mining nudge-targets consider:feature-discovery</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:176d36e2d995/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:self-organization"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:unsupervised-learning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:parallel"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:library"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:open-source"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:rather-interesting"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:text-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:nudge-targets"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:consider:feature-discovery"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1501.04759">
    <title>[1501.04759] Information Recovery In Behavioral Networks</title>
    <dc:date>2015-02-12T10:01:50+00:00</dc:date>
    <link>http://arxiv.org/abs/1501.04759</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[In the context of agent based modeling and network theory, we focus on the problem of recovering micro behavior-related choice information from aggregate origin-destination data. As a basis for predicting agents' choices we emphasize the connection between adaptive intelligent behavior, causal entropy maximization and self-organized, equilibrium-seeking behavior in a dynamic system. We cast this problem in the form of a binary network and suggest information theoretic, entropy-driven methods to recover estimates of the unknown parameters connecting the behavioral data. Our objective is to recover the unknown behavioral binary parameters analytically, without explicitly sampling the configuration space. In order to do so, we enlarge the set of estimators commonly employed to make optimal use of the available information. More specifically, we consider the Cressie-Read family of entropic functionals and focus on three cases of particular interest. We then apply this information theoretic method to the analysis of both univariate and bivariate data sets.
]]></description>
<dc:subject>inference network-theory agent-based data-mining pattern-discovery rather-interesting algorithms information-theory nudge-targets</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:90563fda7c5f/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:inference"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:network-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:agent-based"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:pattern-discovery"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:rather-interesting"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:algorithms"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:information-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:nudge-targets"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1301.1218">
    <title>[1301.1218] Finding the True Frequent Itemsets</title>
    <dc:date>2015-02-12T09:32:07+00:00</dc:date>
    <link>http://arxiv.org/abs/1301.1218</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Frequent Itemsets (FIs) mining is a fundamental primitive in data mining. It requires to identify all itemsets appearing in at least a fraction θ of a transactional dataset $\mathcal{D}$. Often though, the ultimate goal of mining $\mathcal{D}$ is not an analysis of the dataset \emph{per se}, but the understanding of the underlying process that generated it. Specifically, in many applications $\mathcal{D}$ is a collection of samples obtained from an unknown probability distribution π on transactions, and by extracting the FIs in $\mathcal{D}$ one attempts to infer itemsets that are frequently (i.e., with probability at least θ) generated by π, which we call the True Frequent Itemsets (TFIs). Due to the inherently stochastic nature of the generative process, the set of FIs is only a rough approximation of the set of TFIs, as it often contains a huge number of \emph{false positives}, i.e., spurious itemsets that are not among the TFIs. In this work we design and analyze an algorithm to identify a threshold $\hat{\theta}$ such that the collection of itemsets with frequency at least $\hat{\theta}$ in $\mathcal{D}$ contains only TFIs with probability at least 1−δ, for some user-specified δ. Our method uses results from statistical learning theory involving the (empirical) VC-dimension of the problem at hand. This allows us to identify almost all the TFIs without including any false positive. We also experimentally compare our method with the direct mining of $\mathcal{D}$ at frequency θ and with techniques based on widely-used standard bounds (i.e., the Chernoff bounds) of the binomial distribution, and show that our algorithm outperforms these methods and achieves even better results than what is guaranteed by the theoretical analysis.
]]></description>
<dc:subject>data-mining modeling feature-extraction nudge-targets algorithms the-mangle-in-practice consider:incremental philosophy-of-science</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:19ddbc4fb5cb/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:modeling"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:feature-extraction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:nudge-targets"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:algorithms"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:the-mangle-in-practice"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:consider:incremental"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:philosophy-of-science"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1401.1475">
    <title>[1401.1475] Belief Revision in Structured Probabilistic Argumentation</title>
    <dc:date>2014-12-20T13:42:59+00:00</dc:date>
    <link>http://arxiv.org/abs/1401.1475</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[In real-world applications, knowledge bases consisting of all the information at hand for a specific domain, along with the current state of affairs, are bound to contain contradictory data coming from different sources, as well as data with varying degrees of uncertainty attached. Likewise, an important aspect of the effort associated with maintaining knowledge bases is deciding what information is no longer useful; pieces of information (such as intelligence reports) may be outdated, may come from sources that have recently been discovered to be of low quality, or abundant evidence may be available that contradicts them. In this paper, we propose a probabilistic structured argumentation framework that arises from the extension of Presumptive Defeasible Logic Programming (PreDeLP) with probabilistic models, and argue that this formalism is capable of addressing the basic issues of handling contradictory and uncertain data. Then, to address the last issue, we focus on the study of non-prioritized belief revision operations over probabilistic PreDeLP programs. We propose a set of rationality postulates -- based on well-known ones developed for classical knowledge bases -- that characterize how such operations should behave, and study a class of operators along with theoretical relationships with the proposed postulates, including a representation theorem stating the equivalence between this class and the class of operators characterized by the postulates.
]]></description>
<dc:subject>argumentation probability-theory belief-revision probabilistic-programming-(?) learning-by-watching data-mining rule-discovery nudge-targets</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:8fcac50ac07b/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:argumentation"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:probability-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:belief-revision"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:probabilistic-programming-(?)"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:learning-by-watching"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:rule-discovery"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:nudge-targets"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1407.6439">
    <title>[1407.6439] Feature Engineering for Knowledge Base Construction</title>
    <dc:date>2014-12-08T21:58:31+00:00</dc:date>
    <link>http://arxiv.org/abs/1407.6439</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Knowledge base construction (KBC) is the process of populating a knowledge base, i.e., a relational database together with inference rules, with information extracted from documents and structured sources. KBC blurs the distinction between two traditional database problems, information extraction and information integration. For the last several years, our group has been building knowledge bases with scientific collaborators. Using our approach, we have built knowledge bases that have comparable and sometimes better quality than those constructed by human volunteers. In contrast to these knowledge bases, which took experts a decade or more human years to construct, many of our projects are constructed by a single graduate student. 
Our approach to KBC is based on joint probabilistic inference and learning, but we do not see inference as either a panacea or a magic bullet: inference is a tool that allows us to be systematic in how we construct, debug, and improve the quality of such systems. In addition, inference allows us to construct these systems in a more loosely coupled way than traditional approaches. To support this idea, we have built the DeepDive system, which has the design goal of letting the user "think about features---not algorithms." We think of DeepDive as declarative in that one specifies what they want but not how to get it. We describe our approach with a focus on feature engineering, which we argue is an understudied problem relative to its importance to end-to-end quality.
]]></description>
<dc:subject>data-mining big-data statistics modeling modeling-is-not-mathematics rather-interesting the-mangle-in-practice out-of-the-box feature-extraction</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:73a392aed320/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:big-data"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:modeling"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:modeling-is-not-mathematics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:rather-interesting"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:the-mangle-in-practice"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:out-of-the-box"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:feature-extraction"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1411.4952">
    <title>[1411.4952] From Captions to Visual Concepts and Back</title>
    <dc:date>2014-12-08T11:27:56+00:00</dc:date>
    <link>http://arxiv.org/abs/1411.4952</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[This paper presents a novel approach for automatically generating image descriptions: visual detectors and language models learn directly from a dataset of image captions. We use Multiple Instance Learning to train visual detectors for words that commonly occur in captions, including many different parts of speech such as nouns, verbs, and adjectives. The word detector outputs serve as conditional inputs to a maximum-entropy language model. The language model learns from a set of over 400,000 image descriptions to capture the statistics of word usage. We capture global semantics by re-ranking caption candidates using sentence-level features and a deep multimodal similarity model. When human judges compare the system captions to ones written by other people, the system captions have equal or better quality over 23% of the time.
]]></description>
<dc:subject>image-segmentation image-analysis natural-language-processing machine-learning nudge-targets algorithms data-mining</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:3b72bc886ecc/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:image-segmentation"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:image-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:natural-language-processing"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:machine-learning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:nudge-targets"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:algorithms"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1409.1152">
    <title>[1409.1152] FS^3: A Sampling based method for top-k Frequent Subgraph Mining</title>
    <dc:date>2014-12-06T22:15:07+00:00</dc:date>
    <link>http://arxiv.org/abs/1409.1152</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Mining labeled subgraph is a popular research task in data mining because of its potential application in many different scientific domains. All the existing methods for this task explicitly or implicitly solve the subgraph isomorphism task which is computationally expensive, so they suffer from the lack of scalability problem when the graphs in the input database are large. In this work, we propose FS^3, which is a sampling based method. It mines a small collection of subgraphs that are most frequent in the probabilistic sense. FS^3 performs a Markov Chain Monte Carlo (MCMC) sampling over the space of a fixed-size subgraphs such that the potentially frequent subgraphs are sampled more often. Besides, FS^3 is equipped with an innovative queue manager. It stores the sampled subgraph in a finite queue over the course of mining in such a manner that the top-k positions in the queue contain the most frequent subgraphs. Our experiments on database of large graphs show that FS^3 is efficient, and it obtains subgraphs that are the most frequent amongst the subgraphs of a given size.
]]></description>
<dc:subject>graph-theory databases data-mining statistics algorithms nudge-targets</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:75b946feec13/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:graph-theory"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:databases"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:algorithms"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:nudge-targets"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1409.5400">
    <title>[1409.5400] Visual Landmark Recognition from Internet Photo Collections: A Large-Scale Evaluation</title>
    <dc:date>2014-11-13T22:20:51+00:00</dc:date>
    <link>http://arxiv.org/abs/1409.5400</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[The task of a visual landmark recognition system is to identify photographed buildings or objects in query photos and to provide the user with relevant information on them. With their increasing coverage of the world's landmark buildings and objects, Internet photo collections are now being used as a source for building such systems in a fully automatic fashion. This process typically consists of three steps: clustering large amounts of images by the objects they depict; determining object names from user-provided tags; and building a robust, compact, and efficient recognition index. To this date, however, there is little empirical information on how well current approaches for those steps perform in a large-scale open-set mining and recognition task. Furthermore, there is little empirical information on how recognition performance varies for different types of landmark objects and where there is still potential for improvement. With this paper, we intend to fill these gaps. Using a dataset of 500k images from Paris, we analyze each component of the landmark recognition pipeline in order to answer the following questions: How many and what kinds of objects can be discovered automatically? How can we best use the resulting image clusters to recognize the object in a query? How can the object be efficiently represented in memory for recognition? How reliably can semantic information be extracted? And finally: What are the limiting factors in the resulting pipeline from query to semantics? We evaluate how different choices of methods and parameters for the individual pipeline steps affect overall system performance and examine their effects for different query categories such as buildings, paintings or sculptures.
]]></description>
<dc:subject>data-mining machine-learning image-processing metadata nudge-targets</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:422c500c0fd7/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:machine-learning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:image-processing"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:metadata"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:nudge-targets"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1310.8341">
    <title>[1310.8341] Unexpected links reflect the noise in networks</title>
    <dc:date>2014-01-17T14:59:09+00:00</dc:date>
    <link>http://arxiv.org/abs/1310.8341</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Gene regulatory networks are commonly used for modeling biological processes and revealing underlying molecular mechanisms. The reconstruction of gene regulatory networks from observational data is a challenging task, especially, considering the large number of involved players (e.g. genes) and much fewer biological replicates available for analysis. Herein, we proposed a new statistical method of estimating the number of erroneous edges that strongly enhances the commonly used inference approaches. This method is based on special relationship between correlation and causality, and allows to identify and to remove approximately half of erroneous edges. Using the mathematical model of Bayesian networks and positive correlation inequalities we established a mathematical foundation for our method. Analyzing real biological datasets, we found a strong correlation between the results of our method and the commonly used false discovery rate (FDR) technique. Furthermore, the simulation analysis demonstrates that in large networks, our new method provides a more precise estimation of the proportion of erroneous links than FDR.
]]></description>
<dc:subject>GWAS bioinformatics systems-biology data-mining gene-regulatory-networks statistics interestingness nudge-targets</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:7ff6276cf1b2/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:GWAS"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:bioinformatics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:systems-biology"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:gene-regulatory-networks"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:interestingness"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:nudge-targets"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1312.0086">
    <title>[1312.0086] A Framework for Genetic Algorithms Based on Hadoop</title>
    <dc:date>2014-01-14T11:52:13+00:00</dc:date>
    <link>http://arxiv.org/abs/1312.0086</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Genetic Algorithms (GAs) are powerful metaheuristic techniques mostly used in many real-world applications. The sequential execution of GAs requires considerable computational power both in time and resources. Nevertheless, GAs are naturally parallel and accessing a parallel platform such as Cloud is easy and cheap. Apache Hadoop is one of the common services that can be used for parallel applications. However, using Hadoop to develop a parallel version of GAs is not simple without facing its inner workings. Even though some sequential frameworks for GAs already exist, there is no framework supporting the development of GA applications that can be executed in parallel. In this paper is described a framework for parallel GAs on the Hadoop platform, following the paradigm of MapReduce. The main purpose of this framework is to allow the user to focus on the aspects of GA that are specific to the problem to be addressed, being sure that this task is going to be correctly executed on the Cloud with a good performance. The framework has been also exploited to develop an application for Feature Subset Selection problem. A preliminary analysis of the performance of the developed GA application has been performed using three datasets and shown very promising performance.
]]></description>
<dc:subject>genetic-algorithm distributed-processing data-mining having-a-hammer metaheuristics</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:bd985c6bb608/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:genetic-algorithm"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:distributed-processing"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:having-a-hammer"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:metaheuristics"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1311.1704">
    <title>[1311.1704] Scalable Recommendation with Poisson Factorization</title>
    <dc:date>2013-12-04T21:39:19+00:00</dc:date>
    <link>http://arxiv.org/abs/1311.1704</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[We develop a Bayesian Poisson matrix factorization model for forming recommendations from sparse user behavior data. These data are large user/item matrices where each user has provided feedback on only a small subset of items, either explicitly (e.g., through star ratings) or implicitly (e.g., through views or purchases). In contrast to traditional matrix factorization approaches, Poisson factorization implicitly models each user's limited attention to consume items. Moreover, because of the mathematical form of the Poisson likelihood, the model needs only to explicitly consider the observed entries in the matrix, leading to both scalable computation and good predictive performance. We develop a variational inference algorithm for approximate posterior inference that scales up to massive data sets. This is an efficient algorithm that iterates over the observed entries and adjusts an approximate posterior over the user/item representations. We apply our method to large real-world user data containing users rating movies, users listening to songs, and users reading scientific papers. In all these settings, Bayesian Poisson factorization outperforms state-of-the-art matrix factorization methods.
]]></description>
<dc:subject>recommendations data-mining modeling prediction feature-extraction algorithms nudge-targets</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:03dd54eb16f7/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:recommendations"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:modeling"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:prediction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:feature-extraction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:algorithms"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:nudge-targets"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1311.2100">
    <title>[1311.2100] Querying Knowledge Graphs by Example Entity Tuples</title>
    <dc:date>2013-12-04T21:37:05+00:00</dc:date>
    <link>http://arxiv.org/abs/1311.2100</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[We witness an unprecedented proliferation of knowledge graphs that record millions of entities and their relationships. While knowledge graphs are structure-flexible and content rich, they are difficult to use. The challenge lies in the gap between their overwhelming complexity and the limited database knowledge of non-professional users. If writing structured queries over simple tables is difficult, complex graphs are only harder to query. As an initial step toward improving the usability of knowledge graphs, we propose to query such data by example entity tuples, without requiring users to form complex graph queries. Our system, GQBE (Graph Query By Example), automatically derives a weighted hidden maximal query graph based on input query tuples, to capture a user's query intent. It efficiently finds and ranks the top approximate answer tuples. For fast query processing, GQBE only partially evaluates query graphs. We conducted experiments and user studies on the large Freebase and DBpedia datasets and observed appealing accuracy and efficiency. Our system provides a complementary approach to the existing keyword-based methods, facilitating user-friendly graph querying. To the best of our knowledge, there was no such proposal in the past in the context of graphs.
]]></description>
<dc:subject>information-architecture data-mining database representation algorithms</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:24fd7662389a/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:information-architecture"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:database"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:representation"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:algorithms"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1309.0302">
    <title>[1309.0302] Unmixing Incoherent Structures of Big Data by Randomized or Greedy Decomposition</title>
    <dc:date>2013-09-06T15:08:11+00:00</dc:date>
    <link>http://arxiv.org/abs/1309.0302</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Learning big data by matrix decomposition always suffers from expensive computation, mixing of complicated structures and noise. In this paper, we study more adaptive models and efficient algorithms that decompose a data matrix as the sum of semantic components with incoherent structures. We firstly introduce "GO decomposition (GoDec)", an alternating projection method estimating the low-rank part $L$ and the sparse part $S$ from data matrix $X=L+S+G$ corrupted by noise $G$. Two acceleration strategies are proposed to obtain scalable unmixing algorithm on big data: 1) Bilateral random projection (BRP) is developed to speed up the update of $L$ in GoDec by a closed-form built from left and right random projections of $X-S$ in lower dimensions; 2) Greedy bilateral (GreB) paradigm updates the left and right factors of $L$ in a mutually adaptive and greedy incremental manner, and achieve significant improvement in both time and sample complexities. Then we proposes three nontrivial variants of GoDec that generalizes GoDec to more general data type and whose fast algorithms can be derived from the two strategies......
]]></description>
<dc:subject>image-segmentation data-mining data-cleaning algorithms nudge-targets</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:029d6db373a4/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:image-segmentation"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-cleaning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:algorithms"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:nudge-targets"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1307.1584">
    <title>[1307.1584] Comparing Data-mining Algorithms Developed for Longitudinal Observational Databases</title>
    <dc:date>2013-07-22T16:38:57+00:00</dc:date>
    <link>http://arxiv.org/abs/1307.1584</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Longitudinal observational databases have become a recent interest in the post marketing drug surveillance community due to their ability of presenting a new perspective for detecting negative side effects. Algorithms mining longitudinal observation databases are not restricted by many of the limitations associated with the more conventional methods that have been developed for spontaneous reporting system databases. In this paper we investigate the robustness of four recently developed algorithms that mine longitudinal observational databases by applying them to The Health Improvement Network (THIN) for six drugs with well document known negative side effects. Our results show that none of the existing algorithms was able to consistently identify known adverse drug reactions above events related to the cause of the drug and no algorithm was superior.
]]></description>
<dc:subject>data-mining big-data medical-technology bioinformatics pharmaceutical statistics horse-races woopsie algorithms</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:716154c02d5f/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:big-data"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:medical-technology"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:bioinformatics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:pharmaceutical"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:horse-races"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:woopsie"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:algorithms"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1305.7074">
    <title>[1305.7074] Machine Learning of Molecular Electronic Properties in Chemical Compound Space</title>
    <dc:date>2013-07-21T14:05:04+00:00</dc:date>
    <link>http://arxiv.org/abs/1305.7074</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[The combination of modern scientific computing with electronic structure theory can lead to an unprecedented amount of data amenable to intelligent data analysis for the identification of meaningful, novel, and predictive structure-property relationships. Such relationships enable high-throughput screening for relevant properties in an exponentially growing pool of virtual compounds that are synthetically accessible. Here, we present a machine learning (ML) model, trained on a data base of \textit{ab initio} calculation results for thousands of organic molecules, that simultaneously predicts multiple electronic ground- and excited-state properties. The properties include atomization energy, polarizability, frontier orbital eigenvalues, ionization potential, electron affinity, and excitation energies. The ML model is based on a deep multi-task artificial neural network, exploiting underlying correlations between various molecular properties. The input is identical to \emph{ab initio} methods, \emph{i.e.} nuclear charges and Cartesian coordinates of all atoms. For small organic molecules the accuracy of such a "Quantum Machine" is similar, and sometimes superior, to modern quantum-chemical methods---at negligible computational cost.
]]></description>
<dc:subject>machine-learning data-mining cheminformatics nudge-targets algorithms structure-fnuction-inference</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:202b70abc9fb/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:machine-learning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:cheminformatics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:nudge-targets"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:algorithms"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:structure-fnuction-inference"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1302.5344">
    <title>[1302.5344] From data towards knowledge: Revealing the architecture of signaling systems by unifying knowledge mining and data mining of systematic perturbation data</title>
    <dc:date>2013-04-21T15:12:11+00:00</dc:date>
    <link>http://arxiv.org/abs/1302.5344</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Genetic and pharmacological perturbation experiments, such as deleting a gene and monitoring gene expression responses, are powerful tools for studying cellular signal transduction pathways. However, it remains a challenge to automatically derive knowledge of a cellular signaling system at a conceptual level from systematic perturbation-response data. In this study, we explored a framework that unifies knowledge mining and data mining approaches towards the goal. The framework consists of the following automated processes: 1) applying an ontology-driven knowledge mining approach to identify functional modules among the genes responding to a perturbation in order to reveal potential signals affected by the perturbation; 2) applying a graph-based data mining approach to search for perturbations that affect a common signal with respect to a functional module; and 3) revealing the architecture of a signaling system by organizing signaling units into a hierarchy based on their relationships. Applying this framework to a compendium of yeast perturbation-response data, we have successfully recovered many well-known signal transduction pathways; in addition, our analysis has led to many hypotheses regarding the yeast signal transduction system; finally, our analysis automatically organized perturbed genes as a graph reflecting the architecture of the yeast signaling system. Importantly, this framework transformed molecular findings from a gene level to a conceptual level, which readily can be translated into computable knowledge in the form of rules regarding the yeast signaling system, such as "if genes involved in MAPK signaling are perturbed, genes involved in pheromone responses will be differentially expressed".
]]></description>
<dc:subject>bioinformatics algorithms data-mining nudge-targets inference combined-source-modeling</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:c40db426f677/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:bioinformatics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:algorithms"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:nudge-targets"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:inference"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:combined-source-modeling"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1302.4211">
    <title>[1302.4211] Multivariate varying coefficient model for functional responses</title>
    <dc:date>2013-04-10T11:05:24+00:00</dc:date>
    <link>http://arxiv.org/abs/1302.4211</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Motivated by recent work studying massive imaging data in the neuroimaging literature, we propose multivariate varying coefficient models (MVCM) for modeling the relation between multiple functional responses and a set of covariates. We develop several statistical inference procedures for MVCM and systematically study their theoretical properties. We first establish the weak convergence of the local linear estimate of coefficient functions, as well as its asymptotic bias and variance, and then we derive asymptotic bias and mean integrated squared error of smoothed individual functions and their uniform convergence rate. We establish the uniform convergence rate of the estimated covariance function of the individual functions and its associated eigenvalue and eigenfunctions. We propose a global test for linear hypotheses of varying coefficient functions, and derive its asymptotic distribution under the null hypothesis. We also propose a simultaneous confidence band for each individual effect curve. We conduct Monte Carlo simulation to examine the finite-sample performance of the proposed procedures. We apply MVCM to investigate the development of white matter diffusivities along the genu tract of the corpus callosum in a clinical study of neurodevelopment.]]></description>
<dc:subject>representation subsampling functional-MRI data-mining data-analysis approximation nudge-targets</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:00e51603d1e8/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:representation"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:subsampling"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:functional-MRI"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:approximation"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:nudge-targets"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://www.biomedcentral.com/1471-2350/10/6">
    <title>BMC Medical Genetics | Full text | An Open Access Database of Genome-wide Association Results</title>
    <dc:date>2013-03-25T12:38:30+00:00</dc:date>
    <link>http://www.biomedcentral.com/1471-2350/10/6</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[We collected available results from 118 GWAS articles into a database of 56,411 significant SNP-phenotype associations and accompanying information, making this database freely available here. In doing so, we met and describe here a number of challenges to creating an open access database of GWAS results. Through preliminary analyses and characterization of available GWAS, we demonstrate the potential to gain new insights by querying a database across GWAS.

]]></description>
<dc:subject>GWAS database SNP bioinformatics nudge-targets data-mining</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:c68275ec42d6/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:GWAS"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:database"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:SNP"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:bioinformatics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:nudge-targets"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1212.5389">
    <title>[1212.5389] Relationship-aware sequential pattern mining</title>
    <dc:date>2013-03-15T11:22:21+00:00</dc:date>
    <link>http://arxiv.org/abs/1212.5389</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Relationship-aware sequential pattern mining is the problem of mining frequent patterns in sequences in which the events of a sequence are mutually related by one or more concepts from some respective hierarchical taxonomies, based on the type of the events. Additionally events themselves are also described with a certain number of taxonomical concepts. We present RaSP an algorithm that is able to mine relationship-aware patterns over such sequences; RaSP follows a two stage approach. In the first stage it mines for frequent type patterns and {\em all} their occurrences within the different sequences. In the second stage it performs hierarchical mining where for each frequent type pattern and its occurrences it mines for more specific frequent patterns in the lower levels of the taxonomies. We test RaSP on a real world medical application, that provided the inspiration for its development, in which we mine for frequent patterns of medical behavior in the antibiotic treatment of microbes and show that it has a very good computational performance given the complexity of the relationship-aware sequential pattern mining problem.]]></description>
<dc:subject>ontology data-analysis data-mining pattern-discovery nudge-targets They're-all-there-Eddie-now-we've-got-to-get-them-in-the-right-order</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:610ed161cc3f/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:ontology"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:pattern-discovery"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:nudge-targets"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:They're-all-there-Eddie-now-we've-got-to-get-them-in-the-right-order"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1212.0504">
    <title>[1212.0504] Machine learning prediction of cancer cell sensitivity to drugs based on genomic and chemical properties</title>
    <dc:date>2013-03-07T00:53:19+00:00</dc:date>
    <link>http://arxiv.org/abs/1212.0504</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Predicting the response of a specific cancer to a therapy is a major goal in modern oncology that should ultimately lead to a personalised treatment. High-throughput screenings of potentially active compounds against a panel of genomically heterogeneous cancer cell lines have unveiled multiple relationships between genomic alterations and drug responses. Various computational approaches have been proposed to predict sensitivity based on genomic features, while others have used the chemical properties of the drugs to ascertain their effect. In an effort to integrate these orthogonal but complementary approaches, we developed machine learning models to predict the response of cancer cell lines to drug treatment, quantified through IC50 values, based on both the genomic features of the cell lines and the chemical properties of the considered drugs. Models predicted IC50 values in a 8-fold cross-validation and an independent blind test with Pearson correlations of 0.85 and 0.79 respectively. Furthermore, models were able to predict with comparable accuracy (Pearson correlation 0.79) IC50s of cell lines from a tissue not used in the training stage. As they stand, our in silico models can be used to optimise the experimental design of drug-cell screenings by accurately predicting a large proportion of missing IC50 values rather than experimentally measure them. The implications of our results go beyond virtual drug screening design: thousands of drugs could be probed in silico to systematically test their potential efficacy as anti-tumour agents based on their structure, thus providing a computational framework for identifying new drug repositioning opportunities.]]></description>
<dc:subject>biomedicine machine-learning prediction healthcare nudge-targets data-mining</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:49e24809865e/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:biomedicine"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:machine-learning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:prediction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:healthcare"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:nudge-targets"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1211.3497">
    <title>[1211.3497] Ontology Based Information Extraction for Disease Intelligence</title>
    <dc:date>2013-03-03T13:08:38+00:00</dc:date>
    <link>http://arxiv.org/abs/1211.3497</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Disease Intelligence (DI) is based on the acquisition and aggregation of fragmented knowledge of diseases at multiple sources all over the world to provide valuable information to doctors, researchers and information seeking community. Some diseases have their own characteristics changed rapidly at different places of the world and are reported on documents as unrelated and heterogeneous information which may be going unnoticed and may not be quickly available. This research presents an Ontology based theoretical framework in the context of medical intelligence and country/region. Ontology is designed for storing information about rapidly spreading and changing diseases with incorporating existing disease taxonomies to genetic information of both humans and infectious organisms. It further maps disease symptoms to diseases and drug effects to disease symptoms. The machine understandable disease ontology represented as a website thus allows the drug effects to be evaluated on disease symptoms and exposes genetic involvements in the human diseases. Infectious agents which have no known place in an existing classification but have data on genetics would still be identified as organisms through the intelligence of this system. It will further facilitate researchers on the subject to try out different solutions for curing diseases.]]></description>
<dc:subject>natural-language-processing data-mining text-mining ontology formalization domain-knowledge nudge-targets epidemiology</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:e6a58b286e16/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:natural-language-processing"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:text-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:ontology"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:formalization"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:domain-knowledge"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:nudge-targets"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:epidemiology"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://www.sciencedirect.com/science/article/pii/S1097276513000956">
    <title>ScienceDirect.com - Molecular Cell - New Insights from Existing Sequence Data: Generating Breakthroughs without a Pipette</title>
    <dc:date>2013-02-21T21:58:33+00:00</dc:date>
    <link>http://www.sciencedirect.com/science/article/pii/S1097276513000956</link>
    <dc:creator>Vaguery</dc:creator><dc:subject>via:mymarkup genetics bioinformatics data-mining scientific-computing</dc:subject>
<dc:identifier>https://pinboard.in/u:Vaguery/b:55fbec130308/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:via:mymarkup"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:genetics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:bioinformatics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:scientific-computing"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1301.3874">
    <title>[1301.3874] Risk Agoras: Dialectical Argumentation for Scientific Reasoning</title>
    <dc:date>2013-02-17T12:47:44+00:00</dc:date>
    <link>http://arxiv.org/abs/1301.3874</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[We propose a formal framework for intelligent systems which can reason about scientific domains, in particular about the carcinogenicity of chemicals, and we study its properties. Our framework is grounded in a philosophy of scientific enquiry and discourse, and uses a model of dialectical argumentation. The formalism enables representation of scientific uncertainty and conflict in a manner suitable for qualitative reasoning about the domain.]]></description>
<dc:subject>collective-intelligence reasoning agent-based data-mining nudge-targets logic-prograaming toulminesque</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:4f9798977742/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:collective-intelligence"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:reasoning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:agent-based"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:nudge-targets"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:logic-prograaming"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:toulminesque"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1206.3268">
    <title>[1206.3268] Feature Selection via Block-Regularized Regression</title>
    <dc:date>2012-06-22T12:03:24+00:00</dc:date>
    <link>http://arxiv.org/abs/1206.3268</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA["In this paper, we considered the problem of finding a subset of covariates in a high-dimensional space that affect the output variable when there is a block struc- ture in the covariates. In the context of association mapping, we proposed a regression-based model with a Markov chain prior that encodes the information in the correlation structure such as distance and re- combination rate between adjacent SNP markers. We demonstrated on the simulated and mouse data that our proposed algorithm can be used to identify groups of SNP markers as a relevant block of causal SNPs.

The idea of representing the correlation structure as a Markov chain in a variable selection method to learn grouped relevant variables can be generalized to use a graphical model as a prior in a variable selection prob- lem to represent an arbitrary correlation structure in variables in a high-dimensional space. Another inter- esting extension of the model is to model a structure in output variables as well when measurements of mul- tiple output variables are available."]]></description>
<dc:subject>statistics bioinformatics algorithms data-mining feature-extraction</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:5472508df8bf/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:bioinformatics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:algorithms"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:feature-extraction"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1206.0217">
    <title>[1206.0217] Efficient techniques for mining spatial databases</title>
    <dc:date>2012-06-09T10:36:09+00:00</dc:date>
    <link>http://arxiv.org/abs/1206.0217</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA["Clustering is one of the major tasks in data mining. In the last few years, Clustering of spatial data has received a lot of research attention. Spatial databases are components of many advanced information systems like geographic information systems VLSI design systems. In this thesis, we introduce several efficient algorithms for clustering spatial data. First, we present a grid-based clustering algorithm that has several advantages and comparable performance to the well known efficient clustering algorithm. The algorithm has several advantages. The algorithm does not require many input parameters. It requires only three parameters, the number of the points in the data space, the number of the cells in the grid and a percentage. The number of the cells in the grid reflects the accuracy that should be achieved by the algorithm. The algorithm is capable of discovering clusters of arbitrary shapes. The computational complexity of the algorithm is comparable to the complexity of the most efficient clustering algorithm. The algorithm has been implemented and tested against different ranges of database sizes. The performance results show that the running time of the algorithm is superior to the most well known algorithms (CLARANS [23]). The results show also that the performance of the algorithm do not degrade as the number of the data points increases."]]></description>
<dc:subject>GIS statistics clustering context-sensitive-data nudge-targets data-mining</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:5baf6ef23848/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:GIS"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:clustering"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:context-sensitive-data"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:nudge-targets"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1206.1032">
    <title>[1206.1032] Frequent Patterns mining in time-sensitive Data Stream</title>
    <dc:date>2012-06-07T20:27:12+00:00</dc:date>
    <link>http://arxiv.org/abs/1206.1032</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA["Mining frequent itemsets through static Databases has been extensively studied and used and is always considered a highly challenging task. For this reason it is interesting to extend it to data streams field. In the streaming case, the frequent patterns' mining has much more information to track and much greater complexity to manage. Infrequent items can become frequent later on and hence cannot be ignored. The output structure needs to be dynamically incremented to reflect the evolution of itemset frequencies over time. In this paper, we study this problem and specifically the methodology of mining time-sensitive data streams. We tried to improve an existing algorithm by increasing the temporal accuracy and discarding the out-of-date data by adding a new concept called the "Shaking Point". We presented as well some experiments illustrating the time and space required."]]></description>
<dc:subject>pattern-discovery time-series data-mining algorithms trading nudge-targets</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:d63d1c5d2134/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:pattern-discovery"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:time-series"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:algorithms"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:trading"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:nudge-targets"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1201.5568">
    <title>[1201.5568] Dynamic trees for streaming and massive data contexts</title>
    <dc:date>2012-01-30T21:11:20+00:00</dc:date>
    <link>http://arxiv.org/abs/1201.5568</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA["Data collection at a massive scale is becoming ubiquitous in a wide variety of settings, from vast offline databases to streaming real-time information. Learning algorithms deployed in such contexts must rely on single-pass inference, where the data history is never revisited. In streaming contexts, learning must also be temporally adaptive to remain up-to-date against unforeseen changes in the data generating mechanism. Although rapidly growing, the online Bayesian inference literature remains challenged by massive data and transient, evolving data streams. Non-parametric modelling techniques can prove particularly ill-suited, as the complexity of the model is allowed to increase with the sample size. In this work, we take steps to overcome these challenges by porting standard streaming techniques, like data discarding and downweighting, into a fully Bayesian framework via the use of informative priors and active learning heuristics. We showcase our methods by augmenting a modern non-parametric modelling framework, dynamic trees, and illustrate its performance on a number of practical examples. The end product is a powerful streaming regression and classification tool, whose performance compares favourably to the state-of-the-art."]]></description>
<dc:subject>data-analysis learning-from-data algorithms drinking-from-the-firehose nudge data-mining</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:3f6d28022889/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:learning-from-data"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:algorithms"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:drinking-from-the-firehose"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:nudge"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://venturebeat.com/2011/05/16/datameer-funding/?utm_source=feedburner&amp;utm_medium=feed&amp;utm_campaign=Feed%3A+Venturebeat+%28VentureBeat%29">
    <title>Datameer snags $9.25M more to analyze massive amounts of data | VentureBeat</title>
    <dc:date>2011-06-10T12:30:44+00:00</dc:date>
    <link>http://venturebeat.com/2011/05/16/datameer-funding/?utm_source=feedburner&amp;utm_medium=feed&amp;utm_campaign=Feed%3A+Venturebeat+%28VentureBeat%29</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA["Datameer, a company that allows users to analyze massive amounts of data without technical know-how, today announced a second round of funding for $9.25 million. The money will be used to hire additional employees for its engineering, sales, and marketing teams."]]></description>
<dc:subject>data-analysis data-mining startups funding bubblicious</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:788fe01685c9/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:startups"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:funding"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:bubblicious"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://flowingdata.com/2011/05/20/growing-need-for-data-heads/">
    <title>Growing need for data heads</title>
    <dc:date>2011-05-22T12:03:27+00:00</dc:date>
    <link>http://flowingdata.com/2011/05/20/growing-need-for-data-heads/</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA["I've said it before, but if digging into data is your idea of fun, there's a whole mess of excitement and adventure headed your way. There are lots of opportunities already out there in marketing, journalism, tech, the Web, government, and pretty much everywhere you look. And more importantly, there are lots of opportunities that you can make for yourself. This is a great time for data heads."]]></description>
<dc:subject>data-science data-mining statistics jobs advice</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:Vaguery/b:9fe30b81aad3/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-science"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:jobs"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:advice"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1007.5510">
    <title>[1007.5510] An algorithm for the principal component analysis of large data sets</title>
    <dc:date>2010-08-02T21:11:53+00:00</dc:date>
    <link>http://arxiv.org/abs/1007.5510</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA["Recently popularized randomized methods for principal component analysis (PCA) efficiently and reliably produce nearly optimal accuracy - even on parallel processors - unlike the classical (deterministic) alternatives. We adapt one of these randomized methods for use with data sets that are too large to be stored in random-access memory (RAM). (The traditional terminology is that our procedure works efficiently "out-of-core.") We illustrate the performance of the algorithm via several numerical examples. For example, we report on the PCA of a data set stored on disk that is so large that less than a hundredth of it can fit in our computer's RAM."
]]></description>
<dc:subject>algorithms big-data-will-lead-to-big-inference statistics data-mining exploratory-data-analysis</dc:subject>
<dc:identifier>https://pinboard.in/u:Vaguery/b:33563458c28d/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:algorithms"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:big-data-will-lead-to-big-inference"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:exploratory-data-analysis"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1006.4968">
    <title>[1006.4968] Validation of credit default probabilities via multiple testing procedures</title>
    <dc:date>2010-06-29T15:08:25+00:00</dc:date>
    <link>http://arxiv.org/abs/1006.4968</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA["We apply multiple testing procedures to the validation of estimated default probabilities in credit rating systems. The goal is to identify rating classes for which the probability of default is estimated inaccurately, while still maintaining a predefined level of committing type I errors as measured by the familywise error rate (FWER) and the false discovery rate (FDR). For FWER, we also consider procedures that take possible discreteness of the data resp. test statistics into account. The performance of these methods is illustrated in a simulation setting and for empirical default data."
]]></description>
<dc:subject>finance prediction data-mining models statistics machine-learning nudge-targets</dc:subject>
<dc:identifier>https://pinboard.in/u:Vaguery/b:e24f99824c0c/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:finance"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:prediction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:models"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:machine-learning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:nudge-targets"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1006.5273">
    <title>[1006.5273] Linear Detrending Subsequence Matching in Time-Series Databases</title>
    <dc:date>2010-06-29T14:14:27+00:00</dc:date>
    <link>http://arxiv.org/abs/1006.5273</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA["Each time-series has its own linear trend, the directionality of a timeseries, and removing the linear trend is crucial to get the more intuitive matching results. Supporting the linear detrending in subsequence matching is a challenging problem due to a huge number of possible subsequences. In this paper we define this problem the linear detrending subsequence matching and propose its efficient index-based solution. To this end, we first present a notion of LD-windows (LD means linear detrending), which is obtained as follows: we eliminate the linear trend from a subsequence rather than each window itself and obtain LD-windows by dividing the subsequence into windows. Using the LD-windows we then present a lower bounding theorem for the index-based matching solution and formally prove its correctness.…"
]]></description>
<dc:subject>time-series data-mining data-analysis prediction statistics nudge-targets</dc:subject>
<dc:identifier>https://pinboard.in/u:Vaguery/b:0609124189e2/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:time-series"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:prediction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:nudge-targets"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://casstools.org/">
    <title>CASS</title>
    <dc:date>2010-06-29T14:09:36+00:00</dc:date>
    <link>http://casstools.org/</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA["In the social sciences, it is useful to understand the relative similarities of concepts that are embedded in a particular text (from a particular group or a particular person). For example, in trying to estimate conservative bias in FoxNews, one might estimate its tendency to associate conservative concepts (conservative, republican) and good concepts (good, positive, etc.), compared to conservative and bad concepts. The output would indicate conservative favoritism. This comparison could be further refined by taking into account important "baseline" information about the valences associated with liberal, namely liberal and good in comparison to liberal and bad.…"
]]></description>
<dc:subject>text-mining natural-language-processing data-mining machine-learning Ruby library</dc:subject>
<dc:identifier>https://pinboard.in/u:Vaguery/b:a4126fce7108/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:text-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:natural-language-processing"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:machine-learning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:Ruby"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:library"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1006.4929">
    <title>[1006.4929] Detecting epistasis via Markov bases</title>
    <dc:date>2010-06-29T13:35:48+00:00</dc:date>
    <link>http://arxiv.org/abs/1006.4929</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Specifically: "Genome-wide association study of hair length in dogs"
]]></description>
<dc:subject>nudge-targets epistasis bioinformatics genomics data-mining firehose-drinking phenotype-genotype-stuff</dc:subject>
<dc:identifier>https://pinboard.in/u:Vaguery/b:370651310c07/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:nudge-targets"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:epistasis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:bioinformatics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:genomics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:firehose-drinking"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:phenotype-genotype-stuff"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://seekingalpha.com/article/210550-a-peek-into-the-future-hft-and-financial-news?source=feed">
    <title>A Peek Into the Future: HFT and Financial News -- Seeking Alpha</title>
    <dc:date>2010-06-19T21:22:53+00:00</dc:date>
    <link>http://seekingalpha.com/article/210550-a-peek-into-the-future-hft-and-financial-news?source=feed</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA["A still more realistic and subtle, but much more troublesome scenario: Financial Undetectable Journalistic Engineering (FUJE). Financial news journalists could word the reports differently and send very different signals to the robot army. Here're two actual news headlines re. the May NFP number (incidentally, both are from the same outlet, same day, different reporter -- just a random google search):

US adds 431,000 jobs in May, unemployment down to 9.7 pct
vs.

Despite Adding 431K Jobs, May Non-Farm Payroll Figures Disappoint
The first is factual; the second contains more in-depth analysis. It takes an experienced human to parse and reconcile the two. You can see how robot readers may assign opposite signs to each."
]]></description>
<dc:subject>data-mining high-frequency-trading trading news learning-from-data boy-am-I-glad-we-folded-the-startup</dc:subject>
<dc:identifier>https://pinboard.in/u:Vaguery/b:8c9c88f571b5/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:high-frequency-trading"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:trading"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:news"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:learning-from-data"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:boy-am-I-glad-we-folded-the-startup"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://radar.oreilly.com/2010/06/what-is-data-science.html">
    <title>What is data science? - O'Reilly Radar</title>
    <dc:date>2010-06-04T10:40:04+00:00</dc:date>
    <link>http://radar.oreilly.com/2010/06/what-is-data-science.html</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA["We've all heard it: according to Hal Varian, statistics is the next sexy job. Five years ago, in What is Web 2.0, Tim O'Reilly said that "data is the next Intel Inside." But what does that statement mean? Why do we suddenly care about statistics and about data?

In this post, I examine the many sides of data science -- the technologies, the companies and the unique skill sets."
]]></description>
<dc:subject>data-analysis data-mining learning-from-data statistics futurism drinking-from-the-firehose nudge via:tsuomela</dc:subject>
<dc:identifier>https://pinboard.in/u:Vaguery/b:21c1a71ad17e/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:learning-from-data"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:futurism"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:drinking-from-the-firehose"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:nudge"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:via:tsuomela"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://www.dmg.org/v4-0/GeneralStructure.html">
    <title>Data Mining Group - PMML 4.0 - General Structure of a PMML Document</title>
    <dc:date>2009-10-22T13:56:21+00:00</dc:date>
    <link>http://www.dmg.org/v4-0/GeneralStructure.html</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA["PMML uses XML to represent mining models. The structure of the models is described by an XML Schema. One or more mining models can be contained in a PMML document. A PMML document is an XML document with a root element of type PMML. The general structure of a PMML document is:..."
]]></description>
<dc:subject>data-mining models learning-from-data machine-learning standards XML Nudge</dc:subject>
<dc:identifier>https://pinboard.in/u:Vaguery/b:79573e332bd5/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:models"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:learning-from-data"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:machine-learning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:standards"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:XML"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:Nudge"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://cscs.umich.edu/~crshalizi/notebooks/model-selection.html">
    <title>Model Selection</title>
    <dc:date>2009-09-28T15:05:19+00:00</dc:date>
    <link>http://cscs.umich.edu/~crshalizi/notebooks/model-selection.html</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA["In statistics and machine learning, "model selection" is the problem of picking among different mathematical models which all purport to describe the same data set. This notebook will not (for now) give advice on it; as usual, it's more of a place to organize my thoughts and references..."
]]></description>
<dc:subject>Cosma-R-Shalizi Nudge reference statistics data-mining theory</dc:subject>
<dc:identifier>https://pinboard.in/u:Vaguery/b:ebfb67a011a3/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:Cosma-R-Shalizi"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:Nudge"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:reference"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:theory"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://www.newton.ac.uk/programmes/SCH/seminars/index.html">
    <title>&quot;Statistical Theory and Methods for Complex, High-Dimensional Data&quot;</title>
    <dc:date>2009-06-20T17:32:55+00:00</dc:date>
    <link>http://www.newton.ac.uk/programmes/SCH/seminars/index.html</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[To read in context of current practices of Pareto-GP model discovery: are there any cultural similarities <i>at all</i> between these people and the GP practitioners' approach?
]]></description>
<dc:subject>via:cshalizi data-mining models model-discovery heuristics statistics fat-data</dc:subject>
<dc:identifier>https://pinboard.in/u:Vaguery/b:538a57921f3a/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:via:cshalizi"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:models"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:model-discovery"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:heuristics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:fat-data"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://polymeme.com/about">
    <title>About Us | Polymeme</title>
    <dc:date>2009-03-04T21:51:54+00:00</dc:date>
    <link>http://polymeme.com/about</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA["Polymeme helps you navigate the new networked public sphere and keep your fingers on the intellectual pulse of the blogosphere.

Polymeme helps you discover intelligent content that lies beyond the usual echo chambers of tech news, celebrity gossip or American politics.

Our site uses a unique buzz-tracking approach to identify what's currently hot in 20 areas, ranging from economics to evolution, and present it to the reader along with all sources that are currently talking about it. Thus, you can track how ideas – or memes – propagate through this new emerging networked public sphere. We would consider our mission a success if we expose you to the maximum number of new ideas on every 100 news items you read!"
]]></description>
<dc:subject>social-software social-networks marketing madness-of-crowds blogging media data-mining trends aggregation</dc:subject>
<dc:identifier>https://pinboard.in/u:Vaguery/b:1211c4c477d8/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:social-software"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:social-networks"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:marketing"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:madness-of-crowds"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:blogging"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:media"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:trends"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:aggregation"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://ciir.cs.umass.edu/~strohman/dissertation/">
    <title>CIIR: Trevor Strohman: Dissertation</title>
    <dc:date>2009-01-05T19:54:44+00:00</dc:date>
    <link>http://ciir.cs.umass.edu/~strohman/dissertation/</link>
    <dc:creator>Vaguery</dc:creator><dc:subject>research thesis MapReduce databases data-mining analytics architecture statistics</dc:subject>
<dc:identifier>https://pinboard.in/u:Vaguery/b:b9f1a6f01920/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:research"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:thesis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:MapReduce"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:databases"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:analytics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:architecture"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:statistics"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://pyflix.python-hosting.com/">
    <title>Pyflix - Trac</title>
    <dc:date>2009-01-05T12:19:07+00:00</dc:date>
    <link>http://pyflix.python-hosting.com/</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA["Pyflix is a small package written in Python that provides an easy entry point for getting up and running in the Netflix Prize competition. It combines an efficient storage scheme with an intuitive high-level API that allows contestants to focus on the real problem, the recommendation system algorithm. To get started with Pyflix, keep reading."
]]></description>
<dc:subject>via:jhofman data-mining prediction analytics recommendations modeling learning-from-data competition programming library python scripting netflix</dc:subject>
<dc:identifier>https://pinboard.in/u:Vaguery/b:ec84dc4bb248/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:via:jhofman"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:prediction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:analytics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:recommendations"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:modeling"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:learning-from-data"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:competition"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:programming"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:library"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:python"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:scripting"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:netflix"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://ideas.repec.org/p/wop/pennin/01-05.html">
    <title>Variable Selection in Data Mining: Building a Predictive Model for Bankruptcy</title>
    <dc:date>2009-01-05T11:56:00+00:00</dc:date>
    <link>http://ideas.repec.org/p/wop/pennin/01-05.html</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[reproduce this using Pareto-GP?
]]></description>
<dc:subject>data-mining prediction modeling variable-selection regression analytics Nudge</dc:subject>
<dc:identifier>https://pinboard.in/u:Vaguery/b:c070fe9bfa2b/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:prediction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:modeling"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:variable-selection"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:regression"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:analytics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:Nudge"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://www.altsearchengines.com/2008/12/28/how-to-search-for-influencers-with-datanetis/">
    <title>AltSearchEngines » Blog Archive » How to Search for Influencers with Datanetis</title>
    <dc:date>2008-12-29T15:35:21+00:00</dc:date>
    <link>http://www.altsearchengines.com/2008/12/28/how-to-search-for-influencers-with-datanetis/</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Be braced:

"For someone that has been working building software for the marketing automation industry over 8 years now and is familiar with multiple solutions for finding the right prospect out of many, it was an eye opener. I’m evidencing the progression from mass email campaigns through marketing to target individuals with a matching/relevant offers (data mining, behavioral pattern, collaborate filtering, recommendation engines) to finding customers that can market for you - agents."
]]></description>
<dc:subject>social-networks marketing influence advertising data-mining networks search-engines</dc:subject>
<dc:identifier>https://pinboard.in/u:Vaguery/b:01107b40e78f/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:social-networks"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:marketing"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:influence"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:advertising"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:networks"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:search-engines"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://www.cs.ucr.edu/~eamonn/TSDMA/tsdma_papers.html">
    <title>The UCR Time Series Data Mining Archive - Papers | Keogh, E. &amp; Folias, T. (2002)</title>
    <dc:date>2008-11-25T11:37:20+00:00</dc:date>
    <link>http://www.cs.ucr.edu/~eamonn/TSDMA/tsdma_papers.html</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[aka "Nothing is Miscellaneous"
]]></description>
<dc:subject>via:arsyed time-series data-mining models prediction visualization learning-from-data papers archive</dc:subject>
<dc:identifier>https://pinboard.in/u:Vaguery/b:cc1dfc71b590/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:via:arsyed"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:time-series"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:models"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:prediction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:visualization"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:learning-from-data"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:papers"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:archive"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://nltk.sourceforge.net/index.php/Main_Page">
    <title>Main Page - NLTK</title>
    <dc:date>2008-10-13T04:04:44+00:00</dc:date>
    <link>http://nltk.sourceforge.net/index.php/Main_Page</link>
    <dc:creator>Vaguery</dc:creator><dc:subject>Nudge toolkit library data-analysis data-mining natural-language-processing NLP python</dc:subject>
<dc:identifier>https://pinboard.in/u:Vaguery/b:4dffe6ab04b9/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:Nudge"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:toolkit"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:library"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:natural-language-processing"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:NLP"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:python"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://seekingalpha.com/article/99106-where-to-look-for-ideas-in-this-market?source=feed">
    <title>Where to Look for Ideas in This Market - Seeking Alpha</title>
    <dc:date>2008-10-11T00:49:03+00:00</dc:date>
    <link>http://seekingalpha.com/article/99106-where-to-look-for-ideas-in-this-market?source=feed</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA["Last year, more than 16,000 companies filed 10-Ks or 10KSBs with the SEC. Assuming they come in evenly (they don't, but we'll say this for simplicity's sake), that would be more than 4,000 annual reports a quarter, or roughly 44 a day, each and every day of the year. Forget holidays, vacations, or your kids' birthdays — you've got annual reports to read!"
]]></description>
<dc:subject>Nudge sentiment prediction mining data-mining datasets genetic-programming training validation</dc:subject>
<dc:identifier>https://pinboard.in/u:Vaguery/b:7e4c8ebd3daa/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:Nudge"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:sentiment"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:prediction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:datasets"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:genetic-programming"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:training"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:validation"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://www.kddcup2008.com/KDDsite/Challenges.htm">
    <title>Siemens KDD Cup 2008 - Registration</title>
    <dc:date>2008-08-06T11:12:51+00:00</dc:date>
    <link>http://www.kddcup2008.com/KDDsite/Challenges.htm</link>
    <dc:creator>Vaguery</dc:creator><dc:subject>KDD machine-learning Nudge data-mining feature-detection classification challenge competition contest conferences dataset</dc:subject>
<dc:identifier>https://pinboard.in/u:Vaguery/b:1818db56ca75/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:KDD"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:machine-learning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:Nudge"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:feature-detection"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:classification"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:challenge"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:competition"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:contest"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:conferences"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:dataset"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://googleresearch.blogspot.com/2006/08/all-our-n-gram-are-belong-to-you.html">
    <title>Official Google Research Blog: All Our N-gram are Belong to You</title>
    <dc:date>2008-07-01T19:27:05+00:00</dc:date>
    <link>http://googleresearch.blogspot.com/2006/08/all-our-n-gram-are-belong-to-you.html</link>
    <dc:creator>Vaguery</dc:creator><dc:subject>data-analysis data-mining n-grams Google nudge analytics dataset language linguistics machine-learning genetic-programming learning-from-data</dc:subject>
<dc:identifier>https://pinboard.in/u:Vaguery/b:26c2335741d0/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:n-grams"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:Google"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:nudge"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:analytics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:dataset"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:language"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:linguistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:machine-learning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:genetic-programming"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:learning-from-data"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://www.ailab.si/orange/">
    <title>Orange</title>
    <dc:date>2008-04-28T14:02:53+00:00</dc:date>
    <link>http://www.ailab.si/orange/</link>
    <dc:creator>Vaguery</dc:creator><dc:subject>visualization visual-programming algorithms analysis analytics classification data-mining learning machine-learning mining modeling prediction Python open-source GPL</dc:subject>
<dc:identifier>https://pinboard.in/u:Vaguery/b:81ee06cec103/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:visualization"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:visual-programming"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:algorithms"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:analytics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:classification"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:learning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:machine-learning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:modeling"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:prediction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:Python"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:open-source"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:GPL"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://www.texturesynthesis.com/links.htm">
    <title>Texture Synthesis Links</title>
    <dc:date>2008-03-21T12:53:03+00:00</dc:date>
    <link>http://www.texturesynthesis.com/links.htm</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Various potentially useful resources for texture synthesis and image analysis applications of genetic programming.
]]></description>
<dc:subject>resources library machine-learning datasets data-analysis data-mining test-cases</dc:subject>
<dc:identifier>https://pinboard.in/u:Vaguery/b:0fd1fb6f045a/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:resources"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:library"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:machine-learning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:datasets"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:test-cases"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://theinfo.org/">
    <title>(theinfo)</title>
    <dc:date>2008-01-16T21:54:30+00:00</dc:date>
    <link>http://theinfo.org/</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA["This is a site for large data sets and the people who love them: the scrapers and crawlers who collect them, the academics and geeks who process them, the designers and artists who visualize them. It's a place where they can exchange tips and tricks, dev
]]></description>
<dc:subject>via:arthegall algorithms analytics collaboration collection data data-analysis data-mining hacking open research tools</dc:subject>
<dc:identifier>https://pinboard.in/u:Vaguery/b:56bc745fcac7/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:via:arthegall"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:algorithms"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:analytics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:collaboration"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:collection"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:hacking"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:open"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:research"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:tools"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://precedings.nature.com/documents/1490/version/1">
    <title>Understanding Hydrogen-Bond Patterns in Proteins using a Novel Statistical Model</title>
    <dc:date>2008-01-06T14:45:38+00:00</dc:date>
    <link>http://precedings.nature.com/documents/1490/version/1</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Bond motifs.
]]></description>
<dc:subject>protein-folding crystallography machine-learning pattern-discovery data-mining bioinformatics structural-biology</dc:subject>
<dc:identifier>https://pinboard.in/u:Vaguery/b:aee500c2adfa/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:protein-folding"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:crystallography"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:machine-learning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:pattern-discovery"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:bioinformatics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:structural-biology"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://www.foodpairing.be/">
    <title>FOODPAIRING</title>
    <dc:date>2007-12-02T01:27:39+00:00</dc:date>
    <link>http://www.foodpairing.be/</link>
    <dc:creator>Vaguery</dc:creator><description><![CDATA[Some kind of network of interchangeable and complementary food ingredients. Somewhat questionably vague.
]]></description>
<dc:subject>food flavor cookery networks data-mining visualization recommendations recipes</dc:subject>
<dc:identifier>https://pinboard.in/u:Vaguery/b:5f8d173fb44a/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:food"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:flavor"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:cookery"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:networks"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:visualization"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:recommendations"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:recipes"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://www.tickdata.com/index.html">
    <title>Tick Data - Data Solutions for Investment Professionals</title>
    <dc:date>2007-09-07T21:14:11+00:00</dc:date>
    <link>http://www.tickdata.com/index.html</link>
    <dc:creator>Vaguery</dc:creator><dc:subject>trading data data-mining finance markets database data-cleaning stocks equities</dc:subject>
<dc:identifier>https://pinboard.in/u:Vaguery/b:c1a957833e70/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:trading"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:finance"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:markets"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:database"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-cleaning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:stocks"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:equities"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://www.vtk.org/">
    <title>VTK Home Page</title>
    <dc:date>2007-06-08T21:42:50+00:00</dc:date>
    <link>http://www.vtk.org/</link>
    <dc:creator>Vaguery</dc:creator><dc:subject>visualization open-source research analytics software MacOS data-mining data-analysis dynamics 3d</dc:subject>
<dc:identifier>https://pinboard.in/u:Vaguery/b:4d81b974ab64/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:visualization"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:open-source"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:research"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:analytics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:software"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:MacOS"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:dynamics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:3d"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://magia3e.wordpress.com/2007/05/29/semantic-analysis-making-sense-of-the-chaos-of-free-text/">
    <title>Semantic analysis: Making sense of the chaos of free text « Matt’s Musings</title>
    <dc:date>2007-06-08T12:16:49+00:00</dc:date>
    <link>http://magia3e.wordpress.com/2007/05/29/semantic-analysis-making-sense-of-the-chaos-of-free-text/</link>
    <dc:creator>Vaguery</dc:creator><dc:subject>via:tsuomela semantic analysis text mining data-mining summary machine-learning analytics web2.0</dc:subject>
<dc:identifier>https://pinboard.in/u:Vaguery/b:decbeceeb1e3/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:via:tsuomela"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:semantic"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:analysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:text"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:summary"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:machine-learning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:analytics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:web2.0"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://space4commerce.blogspot.com/2007/05/google-wants-to-know-everything-about.html">
    <title>Space For Commerce, by Brian Dunbar: Google wants to know everything about you</title>
    <dc:date>2007-05-30T21:36:27+00:00</dc:date>
    <link>http://space4commerce.blogspot.com/2007/05/google-wants-to-know-everything-about.html</link>
    <dc:creator>Vaguery</dc:creator><dc:subject>transparency future data openness government Google sharing information-overload data-mining self-definition Privacy society</dc:subject>
<dc:identifier>https://pinboard.in/u:Vaguery/b:58f10f170b6e/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:transparency"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:future"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:openness"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:government"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:Google"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:sharing"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:information-overload"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:self-definition"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:Privacy"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:society"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://www.iht.com/articles/2007/05/20/news/compute21.php">
    <title>Data-mining moves into the mainstream, in search of profit - International Herald Tribune</title>
    <dc:date>2007-05-21T13:58:02+00:00</dc:date>
    <link>http://www.iht.com/articles/2007/05/20/news/compute21.php</link>
    <dc:creator>Vaguery</dc:creator><dc:subject>via:logista data-mining analytics decision-support automation policy applications</dc:subject>
<dc:identifier>https://pinboard.in/u:Vaguery/b:b124c3b4afcb/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:via:logista"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:analytics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:decision-support"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:automation"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:policy"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:applications"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://labs.google.com/papers/sawzall.html">
    <title>Google Research Publication: Sawzall</title>
    <dc:date>2007-04-28T16:21:34+00:00</dc:date>
    <link>http://labs.google.com/papers/sawzall.html</link>
    <dc:creator>Vaguery</dc:creator><dc:subject>data-mining language analytics computing Google distributed-processing</dc:subject>
<dc:identifier>https://pinboard.in/u:Vaguery/b:7d59c70779ff/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:language"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:analytics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:computing"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:Google"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:distributed-processing"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://rapid-i.com/content/blogcategory/10/21/lang,en/">
    <title>YALE for machine learning</title>
    <dc:date>2007-04-03T20:32:12+00:00</dc:date>
    <link>http://rapid-i.com/content/blogcategory/10/21/lang,en/</link>
    <dc:creator>Vaguery</dc:creator><dc:subject>machine-learning science scientific-computing engineering data-mining software open-source</dc:subject>
<dc:identifier>https://pinboard.in/u:Vaguery/b:a50a8340b08f/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:machine-learning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:science"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:scientific-computing"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:engineering"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:data-mining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:software"/>
	<rdf:li rdf:resource="https://pinboard.in/u:Vaguery/t:open-source"/>
</rdf:Bag></taxo:topics>
</item>
</rdf:RDF>