<?xml version="1.0" encoding="UTF-8"?>
 <rdf:RDF xmlns="http://purl.org/rss/1.0/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:cc="http://web.resource.org/cc/" xmlns:syn="http://purl.org/rss/1.0/modules/syndication/" xmlns:admin="http://webns.net/mvcb/">
  <channel rdf:about="http://pinboard.in">
    <title>Pinboard (rybesh)</title>
    <link>https://pinboard.in/u:rybesh/public/</link>
    <description>recent bookmarks from rybesh</description>
    <items>
      <rdf:Seq>	<rdf:li rdf:resource="http://ai.stanford.edu/blog/text-causal-inference/"/>
	<rdf:li rdf:resource="http://graphics.cs.wisc.edu/Vis/EmbVis/"/>
	<rdf:li rdf:resource="http://digitalethics.org/essays/using-facebook-identify-potential-problem-drinkers/"/>
	<rdf:li rdf:resource="http://www.aclweb.org/anthology/P14-2050.pdf"/>
	<rdf:li rdf:resource="http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0137041"/>
	<rdf:li rdf:resource="http://web.stanford.edu/class/cs124/kwc-unix-for-poets.pdf"/>
	<rdf:li rdf:resource="http://onlinelibrary.wiley.com/doi/10.1002/asi.23510/full"/>
	<rdf:li rdf:resource="http://www.markhneedham.com/blog/2015/03/22/python-simplifying-the-creation-of-a-stop-word-list-with-defaultdict/"/>
	<rdf:li rdf:resource="https://code.google.com/p/word2vec/"/>
	<rdf:li rdf:resource="http://onlinelibrary.wiley.com/doi/10.1002/widm.1071/abstract"/>
	<rdf:li rdf:resource="http://aclweb.org/anthology-new/P/P12/P12-1078.pdf"/>
	<rdf:li rdf:resource="http://aclweb.org/anthology-new/P/P12/P12-1091.pdf"/>
	<rdf:li rdf:resource="http://aclweb.org/anthology-new/P/P12/P12-1012.pdf"/>
	<rdf:li rdf:resource="http://www.morganclaypool.com/doi/abs/10.2200/S00436ED1V01Y201207HLT017"/>
	<rdf:li rdf:resource="http://www.zora.uzh.ch/32532/4/gir-2010v.pdf"/>
	<rdf:li rdf:resource="http://students.washington.edu/climb/forum/viewtopic.php?p=39389"/>
	<rdf:li rdf:resource="http://software-and-algorithms.blogspot.com/2012/09/damerau-levenshtein-edit-distance.html"/>
	<rdf:li rdf:resource="http://www.scottbot.net/HIAL/?page_id=21794"/>
	<rdf:li rdf:resource="https://doucet.users.greyc.fr/StructureExtraction2009/"/>
	<rdf:li rdf:resource="http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.191.105"/>
	<rdf:li rdf:resource="http://d2i.indiana.edu/htrc/uncamp2012/"/>
	<rdf:li rdf:resource="http://www.nd.edu/~mwilkens/Wilkens_DH_Syllabus_Init.pdf"/>
	<rdf:li rdf:resource="http://www.chilton-computing.org.uk/acl/applications/cocoa/p001.htm"/>
	<rdf:li rdf:resource="http://turing.cs.washington.edu/papers/akbc-wekex12-balasubramanian.pdf"/>
	<rdf:li rdf:resource="http://dl.acm.org/citation.cfm?id=2160165&amp;picked=prox"/>
	<rdf:li rdf:resource="http://old-site.clsp.jhu.edu/~ves/papers/hltcoe_akbc_wekex.pdf"/>
	<rdf:li rdf:resource="https://blogs.princeton.edu/hrc/2010/05/how_to_ocr_and_save_pdfs_as_text_files_using_batch_processing.html"/>
	<rdf:li rdf:resource="http://aclweb.org/anthology-new/N/N12/N12-1064.pdf"/>
	<rdf:li rdf:resource="http://aclweb.org/anthology-new/N/N12/N12-1001.pdf"/>
	<rdf:li rdf:resource="http://aclweb.org/anthology-new/N/N12/N12-1022.pdf"/>
	<rdf:li rdf:resource="http://digitalhumanitiesnow.org/2012/06/editors-choice-quantitative-approaches-to-nineteenth-century-literary-and-intellectual-history/"/>
	<rdf:li rdf:resource="http://llc.oxfordjournals.org/content/early/2012/06/01/llc.fqs017.short"/>
	<rdf:li rdf:resource="http://www.journalism.org/commentary_backgrounder/About+Campaign+2012+in+the+Media+"/>
	<rdf:li rdf:resource="http://earlymodernonlinebib.wordpress.com/2012/05/13/neh-digital-humanities-startup-grants-funding-the-future/"/>
	<rdf:li rdf:resource="http://www.youtube.com/watch?v=CE68-lKtVE0#t=11m31s"/>
	<rdf:li rdf:resource="http://bigthink.com/users/aditimuralidharan2"/>
	<rdf:li rdf:resource="http://tedunderwood.wordpress.com/2012/05/13/its-the-data-a-plan-of-action/"/>
	<rdf:li rdf:resource="http://www.jstor.org/stable/10.1086/663350"/>
	<rdf:li rdf:resource="http://www.scottbot.net/HIAL/?p=16713"/>
	<rdf:li rdf:resource="http://www.cs.princeton.edu/~blei/papers/ChaneyBlei2012.pdf"/>
	<rdf:li rdf:resource="http://gking.harvard.edu/data?dvn_subpage=/faces/study/StudyPage.xhtml?globalId=hdl:1902.1/FYXLAWZRIA"/>
	<rdf:li rdf:resource="https://github.com/timjurka/RTextTools"/>
	<rdf:li rdf:resource="http://coursework.stanford.edu/homepage/Sp12/Sp12-ENGLISH-162-01.html"/>
	<rdf:li rdf:resource="http://jmlr.csail.mit.edu/proceedings/papers/v17/sudhahar11a/sudhahar11a.pdf"/>
	<rdf:li rdf:resource="http://arxiv.org/abs/1003.0783"/>
	<rdf:li rdf:resource="http://www.cs.princeton.edu/~chongw/slda/"/>
	<rdf:li rdf:resource="http://www.cs.princeton.edu/~blei/lda-c/"/>
	<rdf:li rdf:resource="http://stanford.edu/~jgrimmer/tad2.pdf"/>
	<rdf:li rdf:resource="http://vis.stanford.edu/papers/designing-model-driven-vis"/>
	<rdf:li rdf:resource="http://aclweb.org/anthology/J/J79/J79-1070.pdf"/>
	<rdf:li rdf:resource="http://blog.semantic-web.at/2012/02/02/automatic-text-analytics-using-dbpedia-and-poolparty-a-live-demo/"/>
	<rdf:li rdf:resource="http://www.dictionsoftware.com/"/>
	<rdf:li rdf:resource="http://post45.research.yale.edu/archives/1944"/>
	<rdf:li rdf:resource="http://post45.research.yale.edu/archives/1805"/>
	<rdf:li rdf:resource="http://post45.research.yale.edu/archives/574"/>
	<rdf:li rdf:resource="http://mininghumanities.com/2011/12/07/beautiful-in-shakespeare/"/>
	<rdf:li rdf:resource="http://dl.acm.org/citation.cfm?id=1600193.1600237"/>
	<rdf:li rdf:resource="http://lingpipe-blog.com/2011/05/27/price-is-right-binary-search-suffix-array-document/"/>
	<rdf:li rdf:resource="http://docs.python.org/release/2.6.1/library/difflib.html#sequencematcher-objects"/>
	<rdf:li rdf:resource="http://pypi.python.org/pypi/python-Levenshtein/"/>
	<rdf:li rdf:resource="http://metaoptimize.com/qa"/>
	<rdf:li rdf:resource="http://lingpipe-blog.com/2010/01/06/blegging-for-help-web-scraping-for-content/#comments"/>
	<rdf:li rdf:resource="http://cran.r-project.org/web/packages/lda/index.html"/>
      </rdf:Seq>
    </items>
  </channel><item rdf:about="http://ai.stanford.edu/blog/text-causal-inference/">
    <title>Text Feature Selection for Causal Inference | SAIL Blog</title>
    <dc:date>2019-12-14T20:51:56+00:00</dc:date>
    <link>http://ai.stanford.edu/blog/text-causal-inference/</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[Identifying the linguistic features that cause people to act a certain way after reading a text, regardless of confounding variables, is something people do all the time without even realizing it. For example,

Consider university course catalogues. Students peruse these each semester before signing up. What’s the magic 200-word blurb that jives with students enough to sign up? What kind of writing style recommendations could you give to any professor, regarding any subject?
Consider crowdfunding campaigns [1]. We want to know which writing styles pull in the most money, but the effect of language is confounded by the subject of the campaign – a campaign for someone’s medical bills will be written differently than a campaign for building wells. We want to find writing styles that could help any campaign.
Consider comments on reddit, where each post has a popularity score. Say that we’re interested in finding what writing styles will help posts become popular. Some authors list their genders on reddit, and a user’s gender may also affect popularity through tone, style, or topic choices [2]. How do you decide what kind of language to recommend to any person, regardless of their gender?]]></description>
<dc:subject>textanalysis inls201 inference</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:e64bdbe9009f/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:inls201"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:inference"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://graphics.cs.wisc.edu/Vis/EmbVis/">
    <title>Interactive Analysis of Word Vector Embeddings</title>
    <dc:date>2018-08-03T04:06:27+00:00</dc:date>
    <link>http://graphics.cs.wisc.edu/Vis/EmbVis/</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[Word vector embeddings are an emerging tool for natural language processing. They have proven beneficial for a wide variety of language processing tasks. Their utility stems from the ability to encode word relationships within the vector space. Applications range from components in natural language processing systems to tools for linguistic analysis in the study of language and literature. In many of these applications, interpreting embeddings and understanding the encoded grammatical and semantic relations between words is useful, but challenging. Visualization can aid in such interpretation of embeddings. In this paper, we examine the role for visualization in working with word vector embeddings. We provide a literature survey to catalogue the range of tasks where the embeddings are employed across a broad range of applications. Based on this survey, we identify key tasks and their characteristics. Then, we present visual interactive designs that address many of these tasks. The designs integrate into an exploration and analysis environment for embeddings. Finally, we provide example use cases for them and discuss domain user feedback.]]></description>
<dc:subject>textanalysis infoviz visualization vectorspace</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:8330dd841ea0/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:infoviz"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:visualization"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:vectorspace"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://digitalethics.org/essays/using-facebook-identify-potential-problem-drinkers/">
    <title>Using Facebook to Identify Potential Problem Drinkers, Is It Ever Justified? - Center for Digital Ethics and Policy | Loyola University Chicago</title>
    <dc:date>2017-08-22T16:21:16+00:00</dc:date>
    <link>http://digitalethics.org/essays/using-facebook-identify-potential-problem-drinkers/</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[A study, “College Students’ Drinking and Posting About Alcohol: Forwarding a Model of Motivations, Behaviors, and Consequences,” by researchers at the University of North Carolina and Ohio University, reveals an interesting correlation between drinking alcohol and posting about it on social media. Specifically, the study of 364 college students, which was also published in the Journal of Health Communication, found that those who had an “alcohol identity” were more likely to post about their alcohol consumption on social media.]]></description>
<dc:subject>inls201 classification textanalysis</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:b0996e03334f/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:inls201"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:classification"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://www.aclweb.org/anthology/P14-2050.pdf">
    <title>Dependency-Based Word Embeddings</title>
    <dc:date>2015-11-03T16:30:46+00:00</dc:date>
    <link>http://www.aclweb.org/anthology/P14-2050.pdf</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[While continuous word embeddings are gaining popularity, current models are based solely on linear contexts. In this work, we generalize the skip-gram model with negative sampling introduced by Mikolov et al. to include arbitrary contexts. In particular, we perform experiments with dependency-based contexts, and show that they produce markedly different embeddings. The dependency-based embeddings are less topical and exhibit more functional similarity than the original skip-gram embeddings.]]></description>
<dc:subject>textanalysis embedding</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:122917a55984/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:embedding"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0137041">
    <title>PLOS ONE: Characterizing the Google Books Corpus: Strong Limits to Inferences of Socio-Cultural and Linguistic Evolution</title>
    <dc:date>2015-10-14T12:37:45+00:00</dc:date>
    <link>http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0137041</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[It is tempting to treat frequency trends from the Google Books data sets as indicators of the “true” popularity of various words and phrases. Doing so allows us to draw quantitatively strong conclusions about the evolution of cultural perception of a given topic, such as time or gender. However, the Google Books corpus suffers from a number of limitations which make it an obscure mask of cultural popularity. A primary issue is that the corpus is in effect a library, containing one of each book. A single, prolific author is thereby able to noticeably insert new phrases into the Google Books lexicon, whether the author is widely read or not. With this understood, the Google Books corpus remains an important data set to be considered more lexicon-like than text-like. Here, we show that a distinct problematic feature arises from the inclusion of scientific texts, which have become an increasingly substantive portion of the corpus throughout the 1900s. The result is a surge of phrases typical to academic articles but less common in general, such as references to time in the form of citations. We use information theoretic methods to highlight these dynamics by examining and comparing major contributions via a divergence measure of English data sets between decades in the period 1800–2000. We find that only the English Fiction data set from the second version of the corpus is not heavily affected by professional texts. Overall, our findings call into question the vast majority of existing claims drawn from the Google Books corpus, and point to the need to fully characterize the dynamics of the corpus before using these data sets to draw broad conclusions about cultural and linguistic evolution.]]></description>
<dc:subject>digitalhumanities textanalysis libraries bigdata</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:820c79b26249/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:digitalhumanities"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:libraries"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:bigdata"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://web.stanford.edu/class/cs124/kwc-unix-for-poets.pdf">
    <title>Unix for Poets</title>
    <dc:date>2015-07-30T14:00:05+00:00</dc:date>
    <link>http://web.stanford.edu/class/cs124/kwc-unix-for-poets.pdf</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[• Text is available like never before
• Dictionaries, corpora, etc.
• Data Collection Efforts:
ACL/DCI, BNC, CLR, ECI, EDR, ICAME, LDC
• Information Super Highway Roadkill: email, bboards, faxes
• Billions and billions of words
• What can we do with it all?]]></description>
<dc:subject>unix education textanalysis</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:178d50fd4c1c/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:unix"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:education"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://onlinelibrary.wiley.com/doi/10.1002/asi.23510/full">
    <title>Text representation strategies: An example with the State of the union addresses - Savoy - 2015 - Journal of the Association for Information Science and Technology - Wiley Online Library</title>
    <dc:date>2015-05-27T12:07:28+00:00</dc:date>
    <link>http://onlinelibrary.wiley.com/doi/10.1002/asi.23510/full</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[Based on State of the Union addresses from 1790 to 2014 (225 speeches delivered by 42 presidents), this paper describes and evaluates different text representation strategies. To determine the most important words of a given text, the term frequencies (tf) or the tf-idf weighting scheme can be applied. Recently, latent Dirichlet allocation (LDA) has been proposed to define the topics included in a corpus. As another strategy, this study proposes to apply a vocabulary specificity measure (Z score) to determine the most significantly overused word-types or short sequences of them. Our experiments show that the simple term frequency measure is not able to discriminate between specific terms associated with a document or a set of texts. Using the tf-idf or LDA approach, the selection requires some arbitrary decisions. Based on the term-specific measure (Z score), the term selection has a clear theoretical basis. Moreover, the most significant sentences for each presidency can be determined. As another facet, we can visualize the dynamic evolution of usage of some terms associated with their specificity measures. Finally, this technique can be employed to define the most important lexical leaders introducing terms overused by the k following presidencies.]]></description>
<dc:subject>textanalysis representation topicmodels</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:a4f29473e844/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:representation"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:topicmodels"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://www.markhneedham.com/blog/2015/03/22/python-simplifying-the-creation-of-a-stop-word-list-with-defaultdict/">
    <title>Python: Simplifying the creation of a stop word list with defaultdict at Mark Needham</title>
    <dc:date>2015-03-23T13:49:17+00:00</dc:date>
    <link>http://www.markhneedham.com/blog/2015/03/22/python-simplifying-the-creation-of-a-stop-word-list-with-defaultdict/</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[A good heuristic for identifying such words is to remove those that occur in more than 5-10% of documents (most common) and those that occur fewer than 5-10 times in the entire corpus (least common).]]></description>
<dc:subject>textanalysis howto python</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:f56aab9c2e89/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:howto"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:python"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://code.google.com/p/word2vec/">
    <title>word2vec - Tool for computing continuous distributed representations of words. - Google Project Hosting</title>
    <dc:date>2013-08-16T04:10:02+00:00</dc:date>
    <link>https://code.google.com/p/word2vec/</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[The word2vec tool takes a text corpus as input and produces the word vectors as output. It first constructs a vocabulary from the training text data and then learns vector representation of words. The resulting word vector file can be used as features in many natural language processing and machine learning applications.

A simple way to investigate the learned representations is to find the closest words for a user-specified word. The distance tool serves that purpose. For example, if you enter 'france', distance will display the most similar words and their distances to 'france'.]]></description>
<dc:subject>nlp similarity textanalysis semantics inls520</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:766699a08a09/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:nlp"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:similarity"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:semantics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:inls520"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://onlinelibrary.wiley.com/doi/10.1002/widm.1071/abstract">
    <title>Seeing beyond reading: a survey on visual text analytics - Alencar - 2012 - Wiley Interdisciplinary Reviews: Data Mining and Knowledge Discovery - Wiley Online Library</title>
    <dc:date>2013-06-03T19:20:59+00:00</dc:date>
    <link>http://onlinelibrary.wiley.com/doi/10.1002/widm.1071/abstract</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[We review recent visualization techniques aimed at supporting tasks that require the analysis of text documents, from approaches targeted at visually summarizing the relevant content of a single document to those aimed at assisting exploratory investigation of whole collections of documents. Techniques are organized considering their target input material—either single texts or collections of texts—and their focus, which may be at displaying content, emphasizing relevant relationships, highlighting the temporal evolution of a document or collection, or helping users to handle results from a query posed to a search engine. We describe the approaches adopted by distinct techniques and briefly review the strategies they employ to obtain meaningful text models, discuss how they extract the information required to produce representative visualizations, the tasks they intend to support and the interaction issues involved, and strengths and limitations. Finally, we show a summary of techniques, highlighting their goals and distinguishing characteristics. We also briefly discuss some open problems and research directions in the fields of visual text mining and text analytics. ]]></description>
<dc:subject>infoviz textanalysis</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:ce5223c24c51/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:infoviz"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://aclweb.org/anthology-new/P/P12/P12-1078.pdf">
    <title>ACL 2012/Historical Analysis of Legal Opinions with a Sparse Mixed-Effects Latent Variable Model</title>
    <dc:date>2012-10-06T23:51:16+00:00</dc:date>
    <link>http://aclweb.org/anthology-new/P/P12/P12-1078.pdf</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[We show that the joint learning scheme of our sparse mixed-effects model improves on other state-of-the-art generative and discriminative models on the region and time period identification tasks.]]></description>
<dc:subject>textanalysis law periodization place machinelearning history opinion datamining</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:6c3b2a080cb1/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:law"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:periodization"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:place"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:machinelearning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:history"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:opinion"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:datamining"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://aclweb.org/anthology-new/P/P12/P12-1091.pdf">
    <title>ACL 2012/Modeling Sentences in the Latent Space</title>
    <dc:date>2012-10-06T23:48:23+00:00</dc:date>
    <link>http://aclweb.org/anthology-new/P/P12/P12-1091.pdf</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[Sentence Similarity is the process of computing a similarity score between two sentences. Previous sentence similarity work finds that latent semantics approaches to the problem do not perform well due to insufficient information in single sentences. In this paper, we show that by carefully handling words that are not in the sentences (missing words), we can train a reliable latent variable model on sentences. In the process, we propose a new evaluation framework for sentence similarity: Concept Definition Retrieval. The new framework allows for large scale tuning and testing of Sentence Similarity models. Experiments on the new task and previous data sets show significant improvement of our model over baselines and other traditional latent variable models. Our results indicate comparable and even better performance than current state of the art systems addressing the problem of sentence similarity.]]></description>
<dc:subject>similarity nlp textanalysis datamining</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:eb476f9cb3b3/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:similarity"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:nlp"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:datamining"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://aclweb.org/anthology-new/P/P12/P12-1012.pdf">
    <title>ACL 2012/Temporally Anchored Relation Extraction</title>
    <dc:date>2012-10-06T23:47:06+00:00</dc:date>
    <link>http://aclweb.org/anthology-new/P/P12/P12-1012.pdf</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[Although much work on relation extraction has aimed at obtaining static facts, many of the target relations are actually fluents, as their validity is naturally anchored to a certain time period. This paper proposes a methodological approach to temporally anchored relation extraction. Our proposal performs distant supervised learning to extract a set of relations from a natural language corpus, and anchors each of them to an interval of temporal validity, aggregating evidence from documents supporting the relation. We use a rich graph-based document-level representation to generate novel features for this task. Results show that our implementation for temporal anchoring is able to achieve a 69% of the upper bound performance imposed by the relation extraction step. Compared to the state of the art, the overall system achieves the highest precision reported.]]></description>
<dc:subject>relationships datamining textanalysis temporal nlp</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:90afa4d0126e/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:relationships"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:datamining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:temporal"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:nlp"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://www.morganclaypool.com/doi/abs/10.2200/S00436ED1V01Y201207HLT017">
    <title>Morgan &amp; Claypool Publishers - Synthesis Lectures on Human Language Technologies - 5(2):1 - Abstract</title>
    <dc:date>2012-09-28T01:02:21+00:00</dc:date>
    <link>http://www.morganclaypool.com/doi/abs/10.2200/S00436ED1V01Y201207HLT017</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[More and more historical texts are becoming available in digital form. Digitization of paper documents is motivated by the aim of preserving cultural heritage and making it more accessible, both to laypeople and scholars. As digital images cannot be searched for text, digitization projects increasingly strive to create digital text, which can be searched and otherwise automatically processed, in addition to facsimiles. Indeed, the emerging field of digital humanities heavily relies on the availability of digital text for its studies.

Together with the increasing availability of historical texts in digital form, there is a growing interest in applying natural language processing (NLP) methods and tools to historical texts. However, the specific linguistic properties of historical texts -- the lack of standardized orthography, in particular -- pose special challenges for NLP.

This book aims to give an introduction to NLP for historical texts and an overview of the state of the art in this field. The book starts with an overview of methods for the acquisition of historical texts (scanning and OCR), discusses text encoding and annotation schemes, and presents examples of corpora of historical texts in a variety of languages. The book then discusses specific methods, such as creating part-of-speech taggers for historical languages or handling spelling variation. A final chapter analyzes the relationship between NLP and the digital humanities.

Certain recently emerging textual genres, such as SMS, social media, and chat messages, or newsgroup and forum postings share a number of properties with historical texts, for example, nonstandard orthography and grammar, and profuse use of abbreviations. The methods and techniques required for the effective processing of historical texts are thus also of interest for research in other domains.]]></description>
<dc:subject>nlp history textanalysis</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:c37199cac926/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:nlp"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:history"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://www.zora.uzh.ch/32532/4/gir-2010v.pdf">
    <title>Towards Mapping of Alpine Route Descriptions</title>
    <dc:date>2012-09-25T01:30:47+00:00</dc:date>
    <link>http://www.zora.uzh.ch/32532/4/gir-2010v.pdf</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[We describe a corpus of historic mountaineering accounts and ongoing work on geocoding toponyms and route descriptions in these accounts. Mountaineering accounts contain a wealth of geographic information but its extraction for purposes of geographic information retrieval poses specific challenges, in particular the distinction between toponyms pertinent to route descriptions and those mentioned in descriptions of panoramas. We describe some preliminary considerations for natural language cues to distinguish between these two types of occurrences.]]></description>
<dc:subject>textanalysis discourse mapping digitalhumanities language narrative history events</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:9d935f71d21d/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:discourse"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:mapping"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:digitalhumanities"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:language"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:narrative"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:history"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:events"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://students.washington.edu/climb/forum/viewtopic.php?p=39389">
    <title>The Climbing Club • View topic - Alpinism &amp; Natural Language Processing</title>
    <dc:date>2012-09-25T01:29:20+00:00</dc:date>
    <link>http://students.washington.edu/climb/forum/viewtopic.php?p=39389</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[A large portion of the corpus consists of alpine literature, a literary genre of its own, which includes both fiction (poetry and prose) and non-fiction work on mountaineering and general alpine topics. Most of the non-fiction works are mountaineering accounts, i.e., reports of ascents or expeditions; these texts are the ones that are most interesting for researchers, as they reflect the reality of the time and its contemporary perception. While the accounts are factual reports, their style is nevertheless frequently more literary and narrative than expository. For example, interspersed between the descriptions of the legs of the itinerary are often passionate digressions on the majesty of the mountains, the beauty of nature, and the value of friendship. The literary style is also evident in the use of analepses, recounting previous expeditions or events.]]></description>
<dc:subject>digitalhumanities textanalysis nlp history events</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:647e13207c3d/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:digitalhumanities"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:nlp"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:history"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:events"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://software-and-algorithms.blogspot.com/2012/09/damerau-levenshtein-edit-distance.html">
    <title>Damerau-Levenshtein Edit Distance | Software and Algorithms</title>
    <dc:date>2012-09-22T21:58:07+00:00</dc:date>
    <link>http://software-and-algorithms.blogspot.com/2012/09/damerau-levenshtein-edit-distance.html</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[The Damerau-Levenshtein distance admits all of the operations from the Levenshtein distance and further allows for swapping of adjacent characters, with the caveat that cost of two adjacent character swaps be at least the cost of a character deletion plus the cost of a character insertion (this caveat enables a fast dynamic programming solution to the problem). ]]></description>
<dc:subject>algorithms textanalysis similarity metrics</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:1ffa8df31fdc/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:algorithms"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:similarity"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:metrics"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://www.scottbot.net/HIAL/?page_id=21794">
    <title>curated syllabi » the scottbot irregular</title>
    <dc:date>2012-09-22T03:37:34+00:00</dc:date>
    <link>http://www.scottbot.net/HIAL/?page_id=21794</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[There are a lot of cool courses out there about digital humanities. Many exist as part of library programs, and are focused on preservation and access, or as part of archival and editorial programs, with a focus on digitization. I like all of those things, but most of all I like courses which focus on teaching humanists how to computationally augment their research. The focus can be on the use of pre-packaged tools or online services, learning to program, algorithmic theory as applied to the humanities, statistics and visualizations, or whatever else might serve the goal of teaching humanities students to apply and think critically about computational methods.]]></description>
<dc:subject>digitalhumanities syllabus quantitative textanalysis algorithmic computational</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:64aae475e028/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:digitalhumanities"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:syllabus"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:quantitative"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:algorithmic"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:computational"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://doucet.users.greyc.fr/StructureExtraction2009/">
    <title>Structure Extraction competition @ ICDAR 2009</title>
    <dc:date>2012-09-11T13:34:35+00:00</dc:date>
    <link>https://doucet.users.greyc.fr/StructureExtraction2009/</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[The goal of this task is to test and compare automatic techniques for deriving structural information from digitized books in order to build a hyperlinked table of contents that could then be used to navigate inside the books.]]></description>
<dc:subject>textanalysis data cleaning ocr layout</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:88cb77df2c11/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:data"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:cleaning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:ocr"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:layout"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.191.105">
    <title>CiteSeerX — Unsupervised Method to Generate Page Templates</title>
    <dc:date>2012-09-11T13:33:17+00:00</dc:date>
    <link>http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.191.105</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[In this paper, we propose a method for automatically inferring the different page templates used to layout the document content. The first step of the method consists in performing a logical analysis of the document. Depending of the coverage of this step, a given number of document elements will be labeled. Then geometric relations are computed between these labeled elements, and page templates candidates are generated using frequent related elements. A fuzzy matching operation allows for selecting the most frequent and relevant page templates for a given document. Such page templates can be used to correct errors produced during the different previous steps of the document analysis: zoning, OCR, and logical analysis. Evaluation has been performed using the INEX book track collection.]]></description>
<dc:subject>textanalysis cleaning layout data ocr</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:cf339bb8b852/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:cleaning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:layout"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:data"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:ocr"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://d2i.indiana.edu/htrc/uncamp2012/">
    <title>HTRC UnCamp2012 | Data to Insight Center</title>
    <dc:date>2012-09-10T16:49:13+00:00</dc:date>
    <link>http://d2i.indiana.edu/htrc/uncamp2012/</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[HTRC UnCamp. HTRC is hosting its first annual HTRC UnCamp in September 2012 at Indiana University in Bloomington. The UnCamp is different: it is part hands-on coding and demonstration, part inspirational use-cases, part community building, and a part informational, all structured in the dynamic setting of an un-conference programming format. It has visionary speakers mixed with boot-camp activities and hands-on sessions with HTRC infrastructure and tools. Through the HTRC Data API, attendees will be able to browse and run applications (yours or ours) against the full 2.8M volumes of the public domain corpus of HathiTrust. Bloomington is lovely in September and the IU campus is noted as one of the most beautiful public university campuses in the nation.]]></description>
<dc:subject>events digital libraries textanalysis tools</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:d3a1746ed31a/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:events"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:digital"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:libraries"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:tools"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://www.nd.edu/~mwilkens/Wilkens_DH_Syllabus_Init.pdf">
    <title>Matthew Wilkens - Fall 2012 Digital Humanities Syllabus</title>
    <dc:date>2012-09-09T23:49:25+00:00</dc:date>
    <link>http://www.nd.edu/~mwilkens/Wilkens_DH_Syllabus_Init.pdf</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[A graduate-level introduction to problems and methods in digital humanities with an emphasis on computational and quantitative literary studies.]]></description>
<dc:subject>digitalhumanities textanalysis syllabus</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:c76a3c2c3116/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:digitalhumanities"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:syllabus"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://www.chilton-computing.org.uk/acl/applications/cocoa/p001.htm">
    <title>COCOA - A Word-Count and Concordance Generator</title>
    <dc:date>2012-09-04T02:11:42+00:00</dc:date>
    <link>http://www.chilton-computing.org.uk/acl/applications/cocoa/p001.htm</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[COCOA is a system which allows users to generate word-counts and concordances from literary (or other) texts. It was written originally for Atlas after consultation with various British Universities, and is currently being implemented for System 4-75 at Edinburgh.

The output from a COCOA word-count consists of a table containing every word in the author's vocabulary for that particular text, together with a number indicating how many times that word was used. This table is output three times in different orders: frequency ordering, with the most popular words first; alphabetic ordering, as in a conventional dictionary; and rhyme ordering, which is alphabetic on word endings. In addition a frequency profile table is produced showing how many words were used once each, twice each and so on.

The output from a COCOA concordance contains, for every occurrence of every word (or of a selected group of words), a line giving: a reference, e.g. HAMLET 1: 1: 173; and a limited amount of the context in which the word appears, i.e. a line, a sentence, or as much as possible. The printing of the context is adjusted on the line so that the indexed word appears in a column at the centre of the page.

It is clear that a word-count is a compression process, that is, the amount of output is perhaps a tenth of the amount of input, whereas a full concordance is an expansion process producing output some ten times as large as the input. Thus a user is likely to want to be selective when concording lest he drown himself in output.]]></description>
<dc:subject>digitalhumanities textanalysis history standards documents</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:d38351ed1643/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:digitalhumanities"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:history"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:standards"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:documents"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://turing.cs.washington.edu/papers/akbc-wekex12-balasubramanian.pdf">
    <title>Rel-grams: A Probabilistic Model of Relations in Text</title>
    <dc:date>2012-07-16T18:45:25+00:00</dc:date>
    <link>http://turing.cs.washington.edu/papers/akbc-wekex12-balasubramanian.pdf</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[We introduce the Rel-grams language model, which is analogous to an n-grams model, but is computed over relations rather than over words. The model encodes the conditional probability of observing a relational tuple R , given that R was observed in a window of prior relational tuples. We build a database of Rel-grams co-occurrence statistics from ReVerb extractions over 1.8M news wire documents and show that a graphical model based on these statistics is useful for automatically discovering event templates. We make this database freely available and hope it will prove a useful resource for a wide variety of NLP tasks.]]></description>
<dc:subject>nlp extraction events relationships textanalysis</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:a127986cfeb8/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:nlp"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:extraction"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:events"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:relationships"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://dl.acm.org/citation.cfm?id=2160165&amp;picked=prox">
    <title>Journal on Computing and Cultural Heritage (JOCCH)</title>
    <dc:date>2012-07-01T12:46:41+00:00</dc:date>
    <link>http://dl.acm.org/citation.cfm?id=2160165&amp;picked=prox</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[The articles in this issue make two complementary assertions: first, language and linguistic sources are a key element of human cultural heritage and, second, we need to integrate the ancient goals of philology with rapidly emerging methods from fields such as Corpus and Computational Linguistics. The first 15,000,000 volumes digitized by Google contained data from more than 400 languages covering more than four thousand years of the human record. We need to develop methods to explore linguistic changes and the ideas that languages encode as these evolve and circulate over millennia and on a global scale.]]></description>
<dc:subject>linguistics history textanalysis digitalhumanities nlp</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:a29798aa99e3/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:linguistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:history"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:digitalhumanities"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:nlp"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://old-site.clsp.jhu.edu/~ves/papers/hltcoe_akbc_wekex.pdf">
    <title>A Context-Aware Approach to Entity Linking</title>
    <dc:date>2012-06-21T21:32:25+00:00</dc:date>
    <link>http://old-site.clsp.jhu.edu/~ves/papers/hltcoe_akbc_wekex.pdf</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[Entity linking refers to the task of assigning mentions in documents to their corresponding knowledge base entities. Entity linking is a central step in knowledge base population. Current entity linking systems do not explicitly model the discourse context in which the communication occurs. Nevertheless, the notion of shared context is central to the linguistic theory of pragmatics and plays a crucial role in Grice's cooperative communication principle. Furthermore, modeling context facilitates joint resolution of entities, an important problem in entity linking yet to be addressed satisfactorily. This paper describes an approach to context-aware entity linking.]]></description>
<dc:subject>entitydetection textanalysis context discourse nlp</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:982684aba6b5/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:entitydetection"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:context"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:discourse"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:nlp"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://blogs.princeton.edu/hrc/2010/05/how_to_ocr_and_save_pdfs_as_text_files_using_batch_processing.html">
    <title>How to OCR and Save PDFs as Text Files Using Batch Processing - HRC Blog</title>
    <dc:date>2012-06-21T16:15:28+00:00</dc:date>
    <link>https://blogs.princeton.edu/hrc/2010/05/how_to_ocr_and_save_pdfs_as_text_files_using_batch_processing.html</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[Here's a walk through on how to OCR PDFs and save them as text files using the Batch Processing feature and ClearScan feature in Adobe Acrobat 9 Pro.]]></description>
<dc:subject>pdf digitization textanalysis ocr</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:881b96a08fc4/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:pdf"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:digitization"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:ocr"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://aclweb.org/anthology-new/N/N12/N12-1064.pdf">
    <title>How Text Segmentation Algorithms Gain from Topic Models</title>
    <dc:date>2012-06-14T17:09:25+00:00</dc:date>
    <link>http://aclweb.org/anthology-new/N/N12/N12-1064.pdf</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[This paper introduces a general method to incorporate the LDA Topic Model into text segmentation algorithms. We show that semantic information added by Topic Models significantly improves the performance of two word-based algorithms, namely TextTiling and C99. Additionally, we introduce the new TopicTiling algorithm that is designed to take better advantage of topic information. We show consistent improvements over word-based methods and achieve state-of-the art performance on a standard dataset.]]></description>
<dc:subject>nlp topicmodels textanalysis</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:025a56fe2181/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:nlp"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:topicmodels"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://aclweb.org/anthology-new/N/N12/N12-1001.pdf">
    <title>Multiple Narrative Disentanglement: Unraveling Infinite Jest</title>
    <dc:date>2012-06-14T17:06:25+00:00</dc:date>
    <link>http://aclweb.org/anthology-new/N/N12/N12-1001.pdf</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[Many works (of both fiction and non-fiction) span multiple, intersecting narratives, each of which constitutes a story in its own right. In this work I introduce the task of multiple narrative disentanglement (MND), in which the aim is to tease these narratives apart by assigning passages from a text to the sub-narratives to which they belong. The motivating example I use is David Foster Wallace's fictional text Infinite Jest. I selected this book because it contains multiple, interweaving narratives within its sprawling 1,000-plus pages. I propose and evaluate a novel unsupervised approach to MND that is motivated by the theory of narratology. This method achieves strong empirical results, successfully disentangling the threads in Infinite Jest and significantly outperforming baseline strategies in doing so.]]></description>
<dc:subject>narrative textanalysis nlp</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:3e276a5a0cbd/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:narrative"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:nlp"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://aclweb.org/anthology-new/N/N12/N12-1022.pdf">
    <title>Topical Segmentation: a Study of Human Performance and a New Measure of Quality</title>
    <dc:date>2012-06-14T16:55:42+00:00</dc:date>
    <link>http://aclweb.org/anthology-new/N/N12/N12-1022.pdf</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[In a large-scale study of how people find topical shifts in written text, 27 annotators were asked to mark topically continuous segments in 20 chapters of a novel. We analyze the resulting corpus for inter-annotator agreement and examine disagreement patterns. The results suggest that, while the overall agreement is relatively low, the annotators show high agreement on a subset of topical breaks --places where most prominent topic shifts occur. We recommend taking into account the prominence of topical shifts when evaluating topical segmentation, effectively penalizing more severely the errors on more important breaks. We propose to account for this in a simple modification of the windowDiff metric. We discuss the experimental results of evaluating several topical segmenters with and without considering the importance of the individual breaks, and emphasize the more insightful nature of the latter analysis.]]></description>
<dc:subject>nlp textanalysis annotation</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:b810a5d5fb07/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:nlp"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:annotation"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://digitalhumanitiesnow.org/2012/06/editors-choice-quantitative-approaches-to-nineteenth-century-literary-and-intellectual-history/">
    <title>Editors’ Choice: Quantitative Approaches to Nineteenth Century Literary and Intellectual History : Digital Humanities Now</title>
    <dc:date>2012-06-11T08:33:30+00:00</dc:date>
    <link>http://digitalhumanitiesnow.org/2012/06/editors-choice-quantitative-approaches-to-nineteenth-century-literary-and-intellectual-history/</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[Two new publications using quantitative methods to study the literary and intellectual history of nineteenth century Britain have been released. The first by Ryan Heuser and Long Le-Khac from the Stanford Literary Lab, and the second from Dan Cohen and Fred Gibbs. Excerpts and links to the original texts are included below.]]></description>
<dc:subject>literature digitalhumanities textanalysis</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:bc6c3e12cbb9/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:literature"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:digitalhumanities"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://llc.oxfordjournals.org/content/early/2012/06/01/llc.fqs017.short">
    <title>Experiments in 17th century English: manual versus automatic conceptual history</title>
    <dc:date>2012-06-02T10:05:57+00:00</dc:date>
    <link>http://llc.oxfordjournals.org/content/early/2012/06/01/llc.fqs017.short</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[Previous research in conceptual history, the study of change over time of key terms and value systems, has been carried out manually using a restricted number of pre-identified texts. We propose that a method combining techniques from corpus and computational linguistics can be exploited to support conceptual history with semantic searches on a vast sample of texts. To exemplify this method, we focus on a fundamental concept in modern science, the experimental method, in order to trace when the pre-existing and primarily religious concept of experiment (or experience) took on its modern, scientific meaning. We contrast a manual approach using the existing Early English Books Online search interface with an automatic method using corpus linguistics software and methods to turn the transcribed portion of the same dataset into a corpus. Both approaches allow us to separate the religious and scientific senses and plot their change over time. We observe a rapid change in the meaning of experimental from overwhelmingly religious to largely scientific within the 1660s. However, the automatic corpus method is much more efficient and will support future scholars in carrying out iterative studies in a matter of minutes rather than through weeks of painstaking work. Such methodological innovation has the potential to support the formation of new research questions, which could not have been considered previously.]]></description>
<dc:subject>textanalysis concepts history</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:2ba889538dae/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:concepts"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:history"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://www.journalism.org/commentary_backgrounder/About+Campaign+2012+in+the+Media+">
    <title>About Campaign 2012 in the Media | Project for Excellence in Journalism (PEJ)</title>
    <dc:date>2012-06-01T12:03:06+00:00</dc:date>
    <link>http://www.journalism.org/commentary_backgrounder/About+Campaign+2012+in+the+Media+</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[To arrive at the results regarding the tone of coverage, PEJ employed computer coding software developed by Crimson Hexagon along with PEJ's traditional media research methods.

The technology for Crimson Hexagon is rooted in an algorithm created by Gary King, a professor at Harvard University's Institute for Quantitative Social Science. (Click here to view the study explaining the algorithm.)

According to Crimson Hexagon, the purpose of computer coding is to "take as data a potentially large set of text documents, of which a small subset is hand coded into an investigator-chosen set of mutually exclusive and exhaustive categories. As output, the methods give approximately unbiased and statistically consistent estimates of the proportion of all documents in each category."]]></description>
<dc:subject>news textanalysis sentiment machinelearning classification</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:81e5928599f7/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:news"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:sentiment"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:machinelearning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:classification"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://earlymodernonlinebib.wordpress.com/2012/05/13/neh-digital-humanities-startup-grants-funding-the-future/">
    <title>NEH Digital Humanities Startup Grants: Funding the Future « Early Modern Online Bibliography</title>
    <dc:date>2012-05-17T20:05:56+00:00</dc:date>
    <link>http://earlymodernonlinebib.wordpress.com/2012/05/13/neh-digital-humanities-startup-grants-funding-the-future/</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[The video “How Natural Language Processing is Changing Research” provides a more extended look at WordSeer’s usefulness for analyzing slave narratives, but its purpose is also to underscore how such a tool can benefit humanities scholars. In this video the discussion veers toward presenting reading as a chore from which humanities scholars seek relief. On that note, a student in Dr. Michael Ullyot’s undergraduate ENG 203 course, “Hamlet in the Humanities Lab” at the University of Calgary offers some pertinent comments. In her penultimate blog post for the course, Stephanie Vandework devotes a section to “The Pros and Cons of Exploratory Analysis” and examines more closely the claims in the WordSeer Shakespeare demo, finding some to suffer from overgeneralization. (For a view of the course from the instructor’s perspective, see Dr. Ullyot’s presentation, Teaching Hamlet in the Humanities Lab, for the Renaissance Society of America conference this past March 2012.)]]></description>
<dc:subject>nlp digitalhumanities textanalysis</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:4d3327b879c1/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:nlp"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:digitalhumanities"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://www.youtube.com/watch?v=CE68-lKtVE0#t=11m31s">
    <title>NEH Digital Humanities Lightning Round 2011 Part 2 - YouTube</title>
    <dc:date>2012-05-17T20:00:58+00:00</dc:date>
    <link>http://www.youtube.com/watch?v=CE68-lKtVE0#t=11m31s</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[NEH DH Lightning Round on Wordseer.]]></description>
<dc:subject>nlp digitalhumanities textanalysis</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:29036e35ca77/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:nlp"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:digitalhumanities"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://bigthink.com/users/aditimuralidharan2">
    <title>Aditi Muralidharan | &quot;How NLP is Changing Research&quot;</title>
    <dc:date>2012-05-17T19:59:49+00:00</dc:date>
    <link>http://bigthink.com/users/aditimuralidharan2</link>
    <dc:creator>rybesh</dc:creator><dc:subject>nlp digitalhumanities textanalysis</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:3039db992694/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:nlp"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:digitalhumanities"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://tedunderwood.wordpress.com/2012/05/13/its-the-data-a-plan-of-action/">
    <title>It’s the data: a plan of action. | The Stone and the Shell</title>
    <dc:date>2012-05-16T17:56:14+00:00</dc:date>
    <link>http://tedunderwood.wordpress.com/2012/05/13/its-the-data-a-plan-of-action/</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[What we need are collections in the 5,000 – 500,000 volume range, cleaned up to at least (say) 95% recall and 99% precision. Precision is more important than recall, because false negatives drop out of many kinds of analysis — as long as they’re randomly distributed (i.e. you can’t just ignore the f/s problem in the 18c). Collections of that kind are going to generate insights that we can’t glimpse as individual readers. They’ll be especially valuable once we enrich the metadata with information about (for instance) genre, gender, and nationality. I’m not confident that we can crowdsource OCR correction (it’s an awful lot of work), but I am confident that we could crowdsource some light enrichment of metadata.]]></description>
<dc:subject>digitalhumanities ocr digitization textanalysis</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:48d638d2b41a/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:digitalhumanities"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:ocr"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:digitization"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://www.jstor.org/stable/10.1086/663350">
    <title>JSTOR: The Journal of Modern History, Vol. 84, No. 1 (March 2012), pp. 116-144</title>
    <dc:date>2012-05-11T20:34:03+00:00</dc:date>
    <link>http://www.jstor.org/stable/10.1086/663350</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[ by using multiple databases and keyword variants, the historian may gain confidence in a particular chronological intervention. Large databases, the result of scanned microfilm collections or mass digitization initiatives across multiple libraries, provide enough texts to bridge generation and genre, incorporating authors from a variety of backgrounds. Sheer number of texts is important here: ECCO indexes 200,000 works from eighteenth- and nineteenth-century Britain with 33 million pages of text; Google Books Search has 42 million books from all periods. If the historian’s goal is to show a shift in common word usage, the size of a database is more important than its genre specificity; in the case examined in the present article, for instance, Google Book Search and ECCO were superior to the available poetry databases. Iterative visitation of multiple databases provided another potential source of richness for extracting meaning from these tools.]]></description>
<dc:subject>textanalysis search digitalhumanities</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:23f9249eb078/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:search"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:digitalhumanities"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://www.scottbot.net/HIAL/?p=16713">
    <title>The Myth of Text Analytics and Unobtrusive Measurement » the scottbot irregular</title>
    <dc:date>2012-05-07T17:10:23+00:00</dc:date>
    <link>http://www.scottbot.net/HIAL/?p=16713</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[Text analytics are often used in the social sciences as a way of unobtrusively observing people and their interactions. Humanists tend to approach the supporting algorithms with skepticism, and with good reason. This post is about the difficulties of using words or counts as a proxy for some secondary or deeper meaning.]]></description>
<dc:subject>digitalhumanities textanalysis</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:ccd2b3d0479c/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:digitalhumanities"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://www.cs.princeton.edu/~blei/papers/ChaneyBlei2012.pdf">
    <title>Visualizing Topic Models</title>
    <dc:date>2012-03-25T16:43:33+00:00</dc:date>
    <link>http://www.cs.princeton.edu/~blei/papers/ChaneyBlei2012.pdf</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[Managing large collections of documents is an important problem for many areas of science, industry, and culture. Probabilistic topic modeling offers a promising solution. Topic modeling is an unsupervised machine learning method that learns the underlying themes in a large collection of otherwise unorganized documents. This discovered structure summarizes and organizes the documents. However, topic models are high-level statistical tools--a user must scrutinize numerical distributions to understand and explore their results. In this paper, we present a method for visualizing topic models. Our method creates a navigator of the documents, allowing users to explore the hidden structure that a topic model discovers. These browsing interfaces reveal meaningful patterns in a collection, helping end-users explore and understand its contents in new ways. We provide open source software of our method. Understanding and navigating large collections of documents has become an important activity in many spheres. However, many document collections are not coherently organized and organizing them by hand is impractical. We need automated ways to discover and visualize the structure of a collection in order to more easily explore its contents. Probabilistic topic modeling is a set of machine learning tools that may provide a solution (Blei and Lafferty 2009). Topic modeling algorithms discover a hidden thematic structure in a collection of documents; they find salient themes and represent each document as a combination of themes. However, topic models are high-level statistical tools. A user must scrutinize numerical distributions to understand and explore their results; the raw output of the model is not enough to create an easily explored corpus. We propose a method for using a fitted topic model to organize, summarize, visualize, and interact with a corpus. 
With our method, users can explore the corpus, moving between high level discovered summaries (the "topics") and the documents themselves, as Figure 1 illustrates.]]></description>
<dc:subject>topicmodels textanalysis infoviz visualization</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:f9e8f5c5e2b9/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:topicmodels"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:infoviz"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:visualization"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://gking.harvard.edu/data?dvn_subpage=/faces/study/StudyPage.xhtml?globalId=hdl:1902.1/FYXLAWZRIA">
    <title>10 MILLION INTERNATIONAL DYADIC EVENTS</title>
    <dc:date>2012-03-21T22:47:23+00:00</dc:date>
    <link>http://gking.harvard.edu/data?dvn_subpage=/faces/study/StudyPage.xhtml?globalId=hdl:1902.1/FYXLAWZRIA</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[When the Palestinians launch a mortar attack into Israel, the Israeli army does not wait until the end of the calendar year to react. Yet, most modern data collections are aggregated to the month or year. The data available here include almost 10 million individual events, each coded to the exact day they occur or become known. Each event is summarized in the data as "Actor A does something to Actor B", with Actors A and B recording about 450 countries and other (within-country) actors and "does something to" coded in an ontology of about 200 types of actions. The data are coded by computer from millions of Reuters news reports. The software system (produced by VRA) that performs this task has been independently evaluated by King and Lowe (2003). This article found that for the numbers of events it was possible to convince humans (trained Harvard undergraduates) to code by hand, the machine did as well as the humans. For much larger numbers of events for which no expert coder could keep up, the machine dominates.]]></description>
<dc:subject>events politicalscience data machinelearning textanalysis</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:ea4e26b4c1c8/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:events"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:politicalscience"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:data"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:machinelearning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="https://github.com/timjurka/RTextTools">
    <title>timjurka/RTextTools</title>
    <dc:date>2012-03-16T20:41:17+00:00</dc:date>
    <link>https://github.com/timjurka/RTextTools</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[RTextTools is a free, open source machine learning package for automatic text classification that makes it simple for both novice and advanced users to get started with supervised learning. The package includes nine algorithms for ensemble classification (svm, slda, boosting, bagging, random forests, glmnet, decision trees, neural networks, maximum entropy), comprehensive analytics, and thorough documentation.]]></description>
<dc:subject>textanalysis classification tools research</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:84457eba5643/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:classification"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:tools"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:research"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://coursework.stanford.edu/homepage/Sp12/Sp12-ENGLISH-162-01.html">
    <title>Sp12-ENGLISH-162-01 : Critical Methods: Introduction to Digital Humanities</title>
    <dc:date>2012-03-16T15:49:55+00:00</dc:date>
    <link>http://coursework.stanford.edu/homepage/Sp12/Sp12-ENGLISH-162-01.html</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[Digital texts and digital libraries offer us new opportunities for searching and accessing literary material. But more interesting and exciting than the mere searching of digital texts is the ability to leverage computation in order to process and analyze textual data, to provide new methods for reading, analyzing, and understanding literature.

This course provides an introduction to the field of humanities computing with a special emphasis on literary text-analysis. Students learn about the preparation and processing of digital texts while exploring literary methods which help us explain and interpret literary texts, genres, and movements. The course includes units dealing with "stylometry" (computer based stylistic analysis), authorship attribution, gender detection, text encoding, and the visualization of literary information using such open source tools as R and Gephi.

Throughout the course we consider the theoretical issues associated with employing quantitative methodologies in a traditionally qualitative discipline; we read and discuss landmark essays in the field; and we end with an informed discussion of how digital libraries and computation are taking literary scholarship "beyond the book." Students will develop basic coding skills in an environment in which understanding literature is the only prerequisite. No programming experience is required; students will develop fluency in XML and R through exercises and work on a collaborative text-analysis project.]]></description>
<dc:subject>digitalhumanities syllabus textanalysis</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:007e145c55bb/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:digitalhumanities"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:syllabus"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://jmlr.csail.mit.edu/proceedings/papers/v17/sudhahar11a/sudhahar11a.pdf">
    <title>Automating Quantitative Narrative Analysis of News Data</title>
    <dc:date>2012-03-07T17:45:56+00:00</dc:date>
    <link>http://jmlr.csail.mit.edu/proceedings/papers/v17/sudhahar11a/sudhahar11a.pdf</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[We present a working system for large scale quantitative narrative analysis (QNA) of news corpora, which includes various recent ideas from text mining and pattern analysis in order to solve a problem arising in computational social sciences. The task is that of identifying the key actors in a body of news, and the actions they perform, so that further analysis can be carried out. This step is normally performed by hand and is very labour intensive.  We then characterise the actors by: studying their position in the overall network of actors and actions; studying the time series associated with some of their properties; generating scatter plots describing the subject/object bias of each actor; and investigating the types of actions each actor is most associated with. The system is demonstrated on a set of 100,000 articles about crime appeared on the New York Times between 1987 and 2007.  As an example, we find that Men were most commonly responsible for crimes against the person, while Women and Children were most often victims of those crimes.]]></description>
<dc:subject>textanalysis textmining events sociology news</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:cca208ccd2c0/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textmining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:events"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:sociology"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:news"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://arxiv.org/abs/1003.0783">
    <title>[1003.0783] Supervised Topic Models</title>
    <dc:date>2012-03-06T19:58:14+00:00</dc:date>
    <link>http://arxiv.org/abs/1003.0783</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[We introduce supervised latent Dirichlet allocation (sLDA), a statistical model of labelled documents. The model accommodates a variety of response types. We derive an approximate maximum-likelihood procedure for parameter estimation, which relies on variational methods to handle intractable posterior expectations. Prediction problems motivate this research: we use the fitted model to predict response values for new documents. We test sLDA on two real-world problems: movie ratings predicted from reviews, and the political tone of amendments in the U.S. Senate based on the amendment text. We illustrate the benefits of sLDA versus modern regularized regression, as well as versus an unsupervised LDA analysis followed by a separate regression.]]></description>
<dc:subject>slda classification lda topicmodels textanalysis machinelearning</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:9c900b5fec08/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:slda"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:classification"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:lda"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:topicmodels"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:machinelearning"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://www.cs.princeton.edu/~chongw/slda/">
    <title>Supervised latent Dirichlet allocation for classification</title>
    <dc:date>2012-03-06T19:57:23+00:00</dc:date>
    <link>http://www.cs.princeton.edu/~chongw/slda/</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[This is a C++ implementation of supervised latent Dirichlet allocation (sLDA) for classification.]]></description>
<dc:subject>c++ slda classification topicmodels lda machinelearning textanalysis</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:83cdb66b3f94/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:c++"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:slda"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:classification"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:topicmodels"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:lda"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:machinelearning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://www.cs.princeton.edu/~blei/lda-c/">
    <title>Latent Dirichlet Allocation in C</title>
    <dc:date>2012-03-06T19:49:03+00:00</dc:date>
    <link>http://www.cs.princeton.edu/~blei/lda-c/</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[This is a C implementation of variational EM for latent Dirichlet allocation (LDA), a topic model for text or other discrete data. LDA allows you to analyze a corpus, and extract the topics that combined to form its documents. For example, click here to see the topics estimated from a small corpus of Associated Press documents. LDA is fully described in Blei et al. (2003).

This code contains:

an implementation of variational inference for the per-document topic proportions and per-word topic assignments
a variational EM procedure for estimating the topics and exchangeable Dirichlet hyperparameter]]></description>
<dc:subject>lda c linguistics machinelearning textanalysis textmining</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:2469cf74384a/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:lda"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:c"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:linguistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:machinelearning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textmining"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://stanford.edu/~jgrimmer/tad2.pdf">
    <title>Text as Data: The Promise and Pitfalls of Automatic Content Analysis Methods for Political Texts</title>
    <dc:date>2012-03-06T14:30:00+00:00</dc:date>
    <link>http://stanford.edu/~jgrimmer/tad2.pdf</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[Politics and political conflict often occur in the written and spoken word. Scholars have long recognized this, but the massive costs of analyzing even moderately sized collections of texts have hindered their use in political science research. Here lies the promise of automated text analysis: it substantially reduces the costs of analyzing large collections of text. We provide a guide to this exciting new area of research and show how, in many instances, the methods have already obtained part of their promise. But there are pitfalls to using automated methods: they are no substitute for careful thought and close reading and require extensive and problem-specific validation. We survey a wide range of new methods, provide guidance on how to validate the output of the models, and clarify misconceptions and errors in the literature. To conclude, we argue that for automated text methods to become a standard tool for political scientists, methodologists must contribute new methods and new methods of validation.]]></description>
<dc:subject>textanalysis politicalscience socialscience digitalhumanities</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:b343cc6110b9/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:politicalscience"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:socialscience"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:digitalhumanities"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://vis.stanford.edu/papers/designing-model-driven-vis">
    <title>Stanford Vis Group | Interpretation and Trust: Designing Model-Driven Visualizations for Text Analysis</title>
    <dc:date>2012-02-24T16:17:38+00:00</dc:date>
    <link>http://vis.stanford.edu/papers/designing-model-driven-vis</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[Statistical topic models can help analysts discover patterns in large text corpora by identifying recurring sets of words and enabling exploration by topical concepts. However, understanding and validating the output of these models can itself be a challenging analysis task. In this paper, we offer two design considerations - interpretation and trust - for designing visualizations based on data-driven models. Interpretation refers to the facility with which an analyst makes inferences about the data through the lens of a model abstraction. Trust refers to the actual and perceived accuracy of an analyst's inferences. These considerations derive from our experiences developing the Stanford Dissertation Browser, a tool for exploring over 9,000 Ph.D. theses by topical similarity, and a subsequent review of existing literature. We contribute a novel similarity measure for text collections based on a notion of "word-borrowing" that arose from an iterative design process. Based on our experiences and a literature review, we distill a set of design recommendations and describe how they promote interpretable and trustworthy visual analysis tools.]]></description>
<dc:subject>infoviz textanalysis topicmodels</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:f46ea8bc547f/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:infoviz"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:topicmodels"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://aclweb.org/anthology/J/J79/J79-1070.pdf">
    <title>Robert Young - Text Understanding: A Survey</title>
    <dc:date>2012-02-17T14:48:40+00:00</dc:date>
    <link>http://aclweb.org/anthology/J/J79/J79-1070.pdf</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[The goal of the study is to examine work that has something to offer toward the construction of a computable model of text understanding. It focuses on those aspects of meaning that are conveyed only by groups of connected sentences—texts. Additionally, only work that attempts to deal with the semantics or understanding of texts, as opposed to statistical or syntactic analysis, is considered.]]></description>
<dc:subject>nlp textanalysis semantics</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:a55c2bf489cf/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:nlp"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:semantics"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://blog.semantic-web.at/2012/02/02/automatic-text-analytics-using-dbpedia-and-poolparty-a-live-demo/">
    <title>Automatic text analytics using DBpedia and PoolParty – A Live Demo |The Semantic Puzzle</title>
    <dc:date>2012-02-03T20:34:46+00:00</dc:date>
    <link>http://blog.semantic-web.at/2012/02/02/automatic-text-analytics-using-dbpedia-and-poolparty-a-live-demo/</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[Let me show you which steps have to be taken to generate a high-quality text mining application, ready to be used to annotate and to categorize any kind of text or documents covering nearly any domain. With our approach of thesaurus based text mining your documents can also be linked to the world of linked (open) data; enrich your documents with data from the LOD cloud!]]></description>
<dc:subject>inls520 semweb textanalysis classification skos tools inls620</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:6124a14c0d7b/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:inls520"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:semweb"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:classification"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:skos"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:tools"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:inls620"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://www.dictionsoftware.com/">
    <title>Diction Software - Home</title>
    <dc:date>2012-01-18T14:39:37+00:00</dc:date>
    <link>http://www.dictionsoftware.com/</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[Diction 6.0 uses dictionaries (word-lists) to search a text for these qualities:

· Certainty - Language indicating resoluteness, inflexibility, and completeness and a tendency to speak ex cathedra.

· Activity - Language featuring movement, change, the implementation of ideas and the avoidance of inertia.

· Optimism - Language endorsing some person, group, concept or event, or highlighting their positive entailments.

· Realism - Language describing tangible, immediate, recognizable matters that affect people's everyday lives.

· Commonality - Language highlighting the agreed-upon values of a group and rejecting idiosyncratic modes of engagement.]]></description>
<dc:subject>textanalysis sentiment digitalhumanities</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:6c552483697a/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:sentiment"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:digitalhumanities"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://post45.research.yale.edu/archives/1944">
    <title>Scale and Method: A Reply to Jeremy Rosen « Post45</title>
    <dc:date>2012-01-02T20:59:11+00:00</dc:date>
    <link>http://post45.research.yale.edu/archives/1944</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[The piece had two aims, namely to advocate for the addition of computational methods to our critical repertoire and to give a sample of recent computational work of the sort I find useful. I mention these goals up front because I think some of Rosen’s criticisms follow from the failure (mine, to be sure) to specify exactly what my essay was and was not doing and arguing. So to be clear: it was an argument for methodological expansion, especially for those of us working with contemporary sources, and a high-level synopsis of the results of that expansion.]]></description>
<dc:subject>literarystudies textanalysis digitalhumanities</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:d5f8daec4f92/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:literarystudies"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:digitalhumanities"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://post45.research.yale.edu/archives/1805">
    <title>Combining Close and Distant, or, the Utility of Genre Analysis: A Response to Matthew Wilkens’s “Contemporary Fiction by the Numbers” « Post45</title>
    <dc:date>2012-01-02T20:58:17+00:00</dc:date>
    <link>http://post45.research.yale.edu/archives/1805</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[Wilkens neglects other equally pressing problems with the computational practices he advocates—limitations that reveal themselves in the very analysis he proffers as a sample of the kind of scholarship such practices might enable. Two problems with Wilkens’s method strike me as most urgent. First and most glaringly, he inadvertently demonstrates how easily data may be misinterpreted to serve conclusions that are sought by the analyst. And second, though he and others doing similar work purport to offer analysis of neutral data sets—say, all the fiction published in a given year—by working with existing bibliographies they perpetuate the selection criteria that governed the initial compilation. Doing so artificially reifies bodies of texts that might in fact be far more heterogeneous and unruly.]]></description>
<dc:subject>literarystudies digitalhumanities textanalysis</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:b86982e74731/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:literarystudies"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:digitalhumanities"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://post45.research.yale.edu/archives/574">
    <title>Contemporary Fiction by the Numbers « Post45</title>
    <dc:date>2012-01-02T20:57:18+00:00</dc:date>
    <link>http://post45.research.yale.edu/archives/574</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[A short illustration of the underlying problem of literary and cultural abundance, a quick tour of several techniques that we might use to expand our analytical repertoire so as to deal with that problem more effectively, and, finally, a consideration of the substantial challenges these methods face in the short-to-medium term.]]></description>
<dc:subject>literarystudies textanalysis digitalhumanities</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:b5fe7c0d197a/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:literarystudies"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:digitalhumanities"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://mininghumanities.com/2011/12/07/beautiful-in-shakespeare/">
    <title>“Beautiful” in Shakespeare « Text Mining and the Digital Humanities</title>
    <dc:date>2011-12-15T19:59:07+00:00</dc:date>
    <link>http://mininghumanities.com/2011/12/07/beautiful-in-shakespeare/</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[Great, clear example of text mining using Wordseer.]]></description>
<dc:subject>digitalhumanities textmining textanalysis nlp infoviz examples</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:515bc4908744/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:digitalhumanities"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textmining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:nlp"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:infoviz"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:examples"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://dl.acm.org/citation.cfm?id=1600193.1600237">
    <title>A panlingual anomalous text detector</title>
    <dc:date>2011-10-30T21:31:41+00:00</dc:date>
    <link>http://dl.acm.org/citation.cfm?id=1600193.1600237</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[In a large-scale book scanning operation, material can vary widely in language, script, genre, domain, print quality, and other factors, giving rise to a corresponding variability in the OCRed text. It is often desirable to automatically detect errorful and otherwise anomalous text segments, so that they can be filtered out or appropriately flagged, for such applications as indexing, mining, analyzing, displaying, and selectively re-processing such data. Moreover, it is advantageous to require that the automated detector be independent of the underlying OCR engine (or engines), that it work over a broad range of languages, that it seamlessly handle mixed-language material, and that it accommodate documents that contain domain-specific and otherwise rare terminology. A technique is presented that satisfies these requirements, using an adaptive mixture of character-level N-gram language models. Its design, training, implementation, and evaluation are described within the context of high-volume book scanning.]]></description>
<dc:subject>ocr textanalysis textmining evalulation</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:115cd34b297b/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:ocr"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textmining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:evalulation"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://lingpipe-blog.com/2011/05/27/price-is-right-binary-search-suffix-array-document/">
    <title>Price-is-Right Binary Search (for Suffix Arrays of Documents) « LingPipe Blog</title>
    <dc:date>2011-06-01T15:53:12+00:00</dc:date>
    <link>http://lingpipe-blog.com/2011/05/27/price-is-right-binary-search-suffix-array-document/</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[Suffix arrays are useful if you’re looking for anything from plagiarized passages in a pile of writing assignments, cut-and-paste code blocks in a large project, or just commonly repeated phrases on Twitter.]]></description>
<dc:subject>search textanalysis textmining</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:0c2e4071747c/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:search"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textmining"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://docs.python.org/release/2.6.1/library/difflib.html#sequencematcher-objects">
    <title>difflib — SequenceMatcher</title>
    <dc:date>2011-05-17T04:11:26+00:00</dc:date>
    <link>http://docs.python.org/release/2.6.1/library/difflib.html#sequencematcher-objects</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[SequenceMatcher is a flexible class for comparing pairs of sequences of any type, so long as the sequence elements are hashable. ]]></description>
<dc:subject>python textanalysis</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:a417be4d872c/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:python"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://pypi.python.org/pypi/python-Levenshtein/">
    <title>Python Package Index : python-Levenshtein 0.10.2</title>
    <dc:date>2011-05-17T04:03:40+00:00</dc:date>
    <link>http://pypi.python.org/pypi/python-Levenshtein/</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[Python extension computing string distances and similarities.

]]></description>
<dc:subject>python textanalysis search</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:13682adb90b1/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:python"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:search"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://metaoptimize.com/qa">
    <title>Training Examples Q&amp;A - machine learning, natural language processing, artificial intelligence, text analysis, information retrieval, search, data mining, statistical modeling, and data visualization</title>
    <dc:date>2010-06-30T00:53:11+00:00</dc:date>
    <link>http://metaoptimize.com/qa</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[Where data geeks ask and answer questions on machine learning, natural language processing, artificial intelligence, text analysis, information retrieval, search, data mining, statistical modeling, and data visualization!]]></description>
<dc:subject>ai machinelearning nlp textanalysis ir datamining search statistics infoviz reference</dc:subject>
<dc:source>https://pinboard.in/</dc:source>
<dc:identifier>https://pinboard.in/u:rybesh/b:8360af74d3ea/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:ai"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:machinelearning"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:nlp"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:ir"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:datamining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:search"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:statistics"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:infoviz"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:reference"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://lingpipe-blog.com/2010/01/06/blegging-for-help-web-scraping-for-content/#comments">
    <title>Blegging for Help: Web Scraping for Content? « LingPipe Blog</title>
    <dc:date>2010-01-07T05:01:50+00:00</dc:date>
    <link>http://lingpipe-blog.com/2010/01/06/blegging-for-help-web-scraping-for-content/#comments</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[In search of a good general-purpose method of pulling the content out of arbitrary web pages and leaving the boilerplate, advertising, navigation, etc. behind. See also http://bit.ly/4SFOIH
]]></description>
<dc:subject>web nlp html parsing textanalysis</dc:subject>
<dc:identifier>https://pinboard.in/u:rybesh/b:4904bb4ca396/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:web"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:nlp"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:html"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:parsing"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
</rdf:Bag></taxo:topics>
</item>
<item rdf:about="http://cran.r-project.org/web/packages/lda/index.html">
    <title>lda: Collapsed Gibbs sampling methods for topic models</title>
    <dc:date>2009-11-24T04:32:02+00:00</dc:date>
    <link>http://cran.r-project.org/web/packages/lda/index.html</link>
    <dc:creator>rybesh</dc:creator><description><![CDATA[This package implements latent Dirichlet allocation (LDA) and related models. This includes (but is not limited to) sLDA, corrLDA, and the mixed-membership stochastic blockmodel.
]]></description>
<dc:subject>clustering textanalysis datamining R topicmodels</dc:subject>
<dc:identifier>https://pinboard.in/u:rybesh/b:90b9dd8919f3/</dc:identifier>
<taxo:topics><rdf:Bag>	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:clustering"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:textanalysis"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:datamining"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:R"/>
	<rdf:li rdf:resource="https://pinboard.in/u:rybesh/t:topicmodels"/>
</rdf:Bag></taxo:topics>
</item>
</rdf:RDF>