<?xml version="1.0" encoding="UTF-8"?>
 <rdf:RDF xmlns="http://purl.org/rss/1.0/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:cc="http://web.resource.org/cc/" xmlns:syn="http://purl.org/rss/1.0/modules/syndication/" xmlns:admin="http://webns.net/mvcb/">
  <channel rdf:about="http://pinboard.in">
    <!-- Feed header; the rdf:Seq lists the rdf:about URI of every item in this document, in document order. -->
    <title>Pinboard (jm)</title>
    <link>https://pinboard.in/u:jm/public/</link>
    <description>recent bookmarks from jm</description>
    <items>
      <rdf:Seq>
        <rdf:li rdf:resource="http://blog.cloudera.com/blog/2015/10/how-to-index-scanned-pdfs-at-scale-using-fewer-than-50-lines-of-code/"/>
        <rdf:li rdf:resource="http://muratbuffalo.blogspot.co.uk/2015/03/paper-review-simple-testing-can-prevent.html"/>
        <rdf:li rdf:resource="http://highscalability.com/blog/2011/11/29/datasift-architecture-realtime-datamining-at-120000-tweets-p.html"/>
        <rdf:li rdf:resource="http://developer.yahoo.com/blogs/ydn/posts/2013/02/storm-and-hadoop-convergence-of-big-data-and-low-latency-processing/"/>
        <rdf:li rdf:resource="http://blog.markedup.com/2013/02/cassandra-hive-and-hadoop-how-we-picked-our-analytics-stack/"/>
        <rdf:li rdf:resource="http://blog.sematext.com/2012/04/22/hbase-real-time-analytics-rollbacks-via-append-based-updates/"/>
        <rdf:li rdf:resource="http://perspectives.mvdirona.com/2011/10/25/StorageInfrastructureBehindFacebookMessages.aspx"/>
        <rdf:li rdf:resource="http://www.cloudera.com/blog/2011/03/avoiding-full-gcs-in-hbase-with-memstore-local-allocation-buffers-part-3/"/>
        <rdf:li rdf:resource="http://highscalability.com/blog/2011/3/22/facebooks-new-realtime-analytics-system-hbase-to-process-20.html"/>
      </rdf:Seq>
    </items>
  </channel>
<item rdf:about="http://blog.cloudera.com/blog/2015/10/how-to-index-scanned-pdfs-at-scale-using-fewer-than-50-lines-of-code/">
  <!-- Bookmark entry; dc:subject and the taxo:topics bag carry the same 10 tags in the same order. -->
  <title>How-to: Index Scanned PDFs at Scale Using Fewer Than 50 Lines of Code</title>
  <dc:date>2015-10-21T09:36:31+00:00</dc:date>
  <link>http://blog.cloudera.com/blog/2015/10/how-to-index-scanned-pdfs-at-scale-using-fewer-than-50-lines-of-code/</link>
  <dc:creator>jm</dc:creator>
  <description><![CDATA[using Spark, Tesseract, HBase, Solr and Leptonica.  Actually pretty feasible]]></description>
  <dc:subject>spark tesseract hbase solr leptonica pdfs scanning cloudera hadoop architecture</dc:subject>
  <dc:source>https://pinboard.in/</dc:source>
  <dc:identifier>https://pinboard.in/u:jm/b:2b695d958d5f/</dc:identifier>
  <taxo:topics>
    <rdf:Bag>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:spark"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:tesseract"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:hbase"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:solr"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:leptonica"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:pdfs"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:scanning"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:cloudera"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:hadoop"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:architecture"/>
    </rdf:Bag>
  </taxo:topics>
</item>
<item rdf:about="http://muratbuffalo.blogspot.co.uk/2015/03/paper-review-simple-testing-can-prevent.html">
  <!-- Bookmark entry; dc:subject and the taxo:topics bag carry the same 15 tags in the same order. -->
  <title>Paper review: &quot;Simple Testing Can Prevent Most Critical Failures: An Analysis of Production Failures in Distributed Data-Intensive Systems&quot;</title>
  <dc:date>2015-03-27T09:36:04+00:00</dc:date>
  <link>http://muratbuffalo.blogspot.co.uk/2015/03/paper-review-simple-testing-can-prevent.html</link>
  <dc:creator>jm</dc:creator>
  <description><![CDATA[Race conditions, and errors at startup, seem to be particularly problematic]]></description>
  <dc:subject>race-conditions startup bugs failure fault-tolerance hbase redis reliability ops papers concurrency exception-handling cassandra hdfs mapreduce</dc:subject>
  <dc:source>https://pinboard.in/</dc:source>
  <dc:identifier>https://pinboard.in/u:jm/b:3dd7b48e5fed/</dc:identifier>
  <taxo:topics>
    <rdf:Bag>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:race-conditions"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:startup"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:bugs"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:failure"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:fault-tolerance"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:hbase"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:redis"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:reliability"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:ops"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:papers"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:concurrency"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:exception-handling"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:cassandra"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:hdfs"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:mapreduce"/>
    </rdf:Bag>
  </taxo:topics>
</item>
<item rdf:about="http://highscalability.com/blog/2011/11/29/datasift-architecture-realtime-datamining-at-120000-tweets-p.html">
  <!-- Bookmark entry; dc:subject and the taxo:topics bag carry the same 9 tags in the same order. -->
  <title>DataSift Architecture: Realtime Datamining at 120,000 Tweets Per Second</title>
  <dc:date>2013-04-23T13:03:14+00:00</dc:date>
  <link>http://highscalability.com/blog/2011/11/29/datasift-architecture-realtime-datamining-at-120000-tweets-p.html</link>
  <dc:creator>jm</dc:creator>
  <!-- The CDATA body spans several lines; its continuation lines are text content and must stay unindented. -->
  <description><![CDATA[250 million tweets per day, 30-node HBase cluster, 400TB of storage, Kafka and 0mq.

This is from 2011, hence this dated line: 'for a distributed application they thought AWS was too limited, especially in the network. AWS doesn’t do well when nodes are connected together and they need to talk to each other. Not low enough latency network. Their customers care about latency.'  (Nowadays, it would be damn hard to build a lower-latency network than that attached to a cc2.8xlarge instance.)]]></description>
  <dc:subject>datasift architecture scalability data twitter firehose hbase kafka zeromq</dc:subject>
  <dc:source>https://pinboard.in/</dc:source>
  <dc:identifier>https://pinboard.in/u:jm/b:5c07ab4273cd/</dc:identifier>
  <taxo:topics>
    <rdf:Bag>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:datasift"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:architecture"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:scalability"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:data"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:twitter"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:firehose"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:hbase"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:kafka"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:zeromq"/>
    </rdf:Bag>
  </taxo:topics>
</item>
<item rdf:about="http://developer.yahoo.com/blogs/ydn/posts/2013/02/storm-and-hadoop-convergence-of-big-data-and-low-latency-processing/">
  <!-- Bookmark entry; dc:subject and the taxo:topics bag carry the same 10 tags in the same order. -->
  <title>Storm and Hadoop: Convergence of Big-Data and Low-Latency Processing</title>
  <dc:date>2013-02-28T09:57:37+00:00</dc:date>
  <link>http://developer.yahoo.com/blogs/ydn/posts/2013/02/storm-and-hadoop-convergence-of-big-data-and-low-latency-processing/</link>
  <dc:creator>jm</dc:creator>
  <!-- The CDATA body spans several lines; its continuation lines are text content and must stay unindented. -->
  <description><![CDATA[Yahoo! are going big with Storm for their next-generation internal cloud platform:

'Yahoo! engineering teams are developing technologies to enable Storm applications and Hadoop applications to be hosted on a single cluster.

• We have enhanced Storm to support Hadoop style security mechanism (including Kerberos authentication), and thus enable Storm applications authorized to access Hadoop datasets on HDFS and HBase.
• Storm is being integrated into Hadoop YARN for resource management. Storm-on-YARN enables Storm applications to utilize the computation resources in our tens of thousands of Hadoop computation nodes. YARN is used to launch Storm application master (Nimbus) on demand, and enables Nimbus to request resources for Storm application slaves (Supervisors).']]></description>
  <dc:subject>yahoo yarn cloud-computing private-clouds big-data latency storm hadoop elastic-computing hbase</dc:subject>
  <dc:source>https://pinboard.in/</dc:source>
  <dc:identifier>https://pinboard.in/u:jm/b:350773902d3f/</dc:identifier>
  <taxo:topics>
    <rdf:Bag>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:yahoo"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:yarn"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:cloud-computing"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:private-clouds"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:big-data"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:latency"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:storm"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:hadoop"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:elastic-computing"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:hbase"/>
    </rdf:Bag>
  </taxo:topics>
</item>
<item rdf:about="http://blog.markedup.com/2013/02/cassandra-hive-and-hadoop-how-we-picked-our-analytics-stack/">
  <!-- Bookmark entry; dc:subject and the taxo:topics bag carry the same 12 tags in the same order. -->
  <title>Cassandra, Hive, and Hadoop: How We Picked Our Analytics Stack</title>
  <dc:date>2013-02-25T15:35:01+00:00</dc:date>
  <link>http://blog.markedup.com/2013/02/cassandra-hive-and-hadoop-how-we-picked-our-analytics-stack/</link>
  <dc:creator>jm</dc:creator>
  <description><![CDATA[reasonably good whole-stack performance testing and analysis; HBase, Riak, MongoDB, and Cassandra compared.  Riak did pretty badly :(]]></description>
  <dc:subject>riak mongodb cassandra hbase performance analytics hadoop hive big-data storage databases nosql</dc:subject>
  <dc:source>https://pinboard.in/</dc:source>
  <dc:identifier>https://pinboard.in/u:jm/b:b335abec7a75/</dc:identifier>
  <taxo:topics>
    <rdf:Bag>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:riak"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:mongodb"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:cassandra"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:hbase"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:performance"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:analytics"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:hadoop"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:hive"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:big-data"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:storage"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:databases"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:nosql"/>
    </rdf:Bag>
  </taxo:topics>
</item>
<item rdf:about="http://blog.sematext.com/2012/04/22/hbase-real-time-analytics-rollbacks-via-append-based-updates/">
  <!-- Bookmark entry; dc:subject and the taxo:topics bag carry the same 7 tags in the same order. -->
  <title>HBase Real-time Analytics &amp; Rollbacks via Append-based Updates</title>
  <dc:date>2012-12-17T13:59:22+00:00</dc:date>
  <link>http://blog.sematext.com/2012/04/22/hbase-real-time-analytics-rollbacks-via-append-based-updates/</link>
  <dc:creator>jm</dc:creator>
  <!-- The CDATA body holds literal HTML (a blockquote) and trailing blank lines; all of it is text content and is kept verbatim. -->
  <description><![CDATA[Interesting concept for scaling up the write rate on massive key-value counter stores:

<blockquote>'Replace update (Get+Put) operations at write time with simple append-only writes and defer processing of updates to periodic jobs or perform aggregations on the fly if user asks for data earlier than individual additions are processed. The idea is simple and not necessarily novel, but given the specific qualities of HBase, namely fast range scans and high write throughput, this approach works very well.'</blockquote>

]]></description>
  <dc:subject>counters analytics hbase append sematext aggregation big-data</dc:subject>
  <dc:source>https://pinboard.in/</dc:source>
  <dc:identifier>https://pinboard.in/u:jm/b:158d78e4914f/</dc:identifier>
  <taxo:topics>
    <rdf:Bag>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:counters"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:analytics"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:hbase"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:append"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:sematext"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:aggregation"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:big-data"/>
    </rdf:Bag>
  </taxo:topics>
</item>
<item rdf:about="http://perspectives.mvdirona.com/2011/10/25/StorageInfrastructureBehindFacebookMessages.aspx">
  <!-- Bookmark entry; dc:subject and the taxo:topics bag carry the same 10 tags in the same order. -->
  <title>Storage Infrastructure Behind Facebook Messages</title>
  <dc:date>2011-10-25T22:35:16+00:00</dc:date>
  <link>http://perspectives.mvdirona.com/2011/10/25/StorageInfrastructureBehindFacebookMessages.aspx</link>
  <dc:creator>jm</dc:creator>
  <description><![CDATA[HBase and Haystack; all data LZO-compressed; very interesting approach to testing -- they 'shadow the real production workload into the test cluster to test before going into production'. This catches a 'high percentage' of issues before production.  nice]]></description>
  <dc:subject>testing shadowing haystack hbase facebook scalability lzo messaging sms via:james-hamilton</dc:subject>
  <dc:source>https://pinboard.in/</dc:source>
  <dc:identifier>https://pinboard.in/u:jm/b:ad64e79b1478/</dc:identifier>
  <taxo:topics>
    <rdf:Bag>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:testing"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:shadowing"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:haystack"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:hbase"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:facebook"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:scalability"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:lzo"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:messaging"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:sms"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:via:james-hamilton"/>
    </rdf:Bag>
  </taxo:topics>
</item>
<item rdf:about="http://www.cloudera.com/blog/2011/03/avoiding-full-gcs-in-hbase-with-memstore-local-allocation-buffers-part-3/">
  <!-- Bookmark entry; dc:subject and the taxo:topics bag carry the same 9 tags in the same order. -->
  <title>Avoiding Full GCs in HBase with MemStore-Local Allocation Buffers</title>
  <dc:date>2011-10-22T21:20:06+00:00</dc:date>
  <link>http://www.cloudera.com/blog/2011/03/avoiding-full-gcs-in-hbase-with-memstore-local-allocation-buffers-part-3/</link>
  <dc:creator>jm</dc:creator>
  <description><![CDATA[Fascinating. Evading the Java GC by reimplementing a slab allocator, basically]]></description>
  <dc:subject>memory allocation java gc jvm hbase memstore via:dehora slab-allocator</dc:subject>
  <dc:source>https://pinboard.in/</dc:source>
  <dc:identifier>https://pinboard.in/u:jm/b:76e4aad99f52/</dc:identifier>
  <taxo:topics>
    <rdf:Bag>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:memory"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:allocation"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:java"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:gc"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:jvm"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:hbase"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:memstore"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:via:dehora"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:slab-allocator"/>
    </rdf:Bag>
  </taxo:topics>
</item>
<item rdf:about="http://highscalability.com/blog/2011/3/22/facebooks-new-realtime-analytics-system-hbase-to-process-20.html">
  <!-- Bookmark entry; dc:subject and the taxo:topics bag carry the same 11 tags in the same order. -->
  <!-- NOTE(review): every other item in this feed has a dc:source element and this one does not; confirm whether the generator omitted it on purpose before adding one. -->
  <title>Facebook's New Realtime Analytics System: HBase to Process 20 Billion Events Per Day</title>
  <dc:date>2011-03-28T21:11:31+00:00</dc:date>
  <link>http://highscalability.com/blog/2011/3/22/facebooks-new-realtime-analytics-system-hbase-to-process-20.html</link>
  <dc:creator>jm</dc:creator>
  <description><![CDATA[Scribe logs events, "ptail" (parallel tail presumably) tails logs from Scribe stores, Puma batch-aggregates, writes to HBase.  Java and Thrift on the backend, PHP in front]]></description>
  <dc:subject>facebook hbase scalability performance hadoop scribe events analytics architecture tail append</dc:subject>
  <dc:identifier>https://pinboard.in/u:jm/b:4f62efcb61b3/</dc:identifier>
  <taxo:topics>
    <rdf:Bag>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:facebook"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:hbase"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:scalability"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:performance"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:hadoop"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:scribe"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:events"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:analytics"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:architecture"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:tail"/>
      <rdf:li rdf:resource="https://pinboard.in/u:jm/t:append"/>
    </rdf:Bag>
  </taxo:topics>
</item>
</rdf:RDF>