<?xml-stylesheet type="text/xsl" href="./local.xsl"?> 
<!DOCTYPE TEI.2 PUBLIC '-//C. M. Sperberg-McQueen//DTD
          TEI Lite 1.0 plus SWeb (XML)//EN'
          './../../../lib/swebxml.dtd' [
<!--* Local attribute declarations.  The internal subset is read
    * before the external DTD, and the first declaration of an
    * attribute is the binding one, so these take precedence over
    * any declarations of the same attributes in the external DTD. *-->
<!ATTLIST list type CDATA 'bullets' >
<!ATTLIST seg  rend CDATA 'incremental' >
<!ATTLIST xref href CDATA '' >

<!--* Optional id attributes of type ID on item and div; each
    * attribute is declared once. *-->
<!ATTLIST item id ID #IMPLIED >
<!ATTLIST div id ID #IMPLIED >

<!--* Date string; referenced from docDate on the title page. *-->
<!ENTITY date.last.touched '7 August 2013'>

<!--* The letters S and A, each wrapped in hi rend='ital'. *-->
<!ENTITY S "<hi rend='ital'>S</hi>">
<!ENTITY A "<hi rend='ital'>A</hi>">


<!--* Logic and set-theory symbols.  Each name is declared exactly
    * once: when an entity is declared more than once, an XML
    * processor uses the first declaration and ignores the others,
    * so repeated declarations have no effect. *-->
<!ENTITY equiv  "&#x2261;" ><!--/equiv R: =identical with-->
<!ENTITY exist  "&#x2203;" ><!--/exists =at least one exists-->
<!ENTITY forall "&#x2200;" ><!--/forall =for all-->
<!--* iff is U+2194 (left right arrow); a second declaration giving
    * U+21D4 (left right double arrow) followed this one and was
    * therefore never effective. *-->
<!ENTITY iff    "&#x2194;" ><!--/iff =if and only if-->
<!ENTITY isin   "&#x2208;" ><!--/in R: =set membership-->
<!ENTITY not    "&#xAC;" ><!--/neg /lnot =not sign-->
<!ENTITY rArr   "&#x21D2;" ><!--/Rightarrow A: =implies-->
<!ENTITY rarr   "&#x2192;" ><!--/rightarrow /to A: =rightward arrow-->
<!ENTITY cap    "&#x2229;" ><!--/cap B: =intersection-->
<!ENTITY cup    "&#x222A;" ><!--/cup B: =union or logical sum-->
<!ENTITY empty  "&#x2205;" ><!--empty set-->


<!ENTITY Sigma  "&#x03A3;"><!--=capital Sigma, Greek-->

<!--* Accented Latin letters, punctuation, arrows and relations.
    * equiv, forall, rArr and rarr are not repeated in this group:
    * they are declared earlier in this subset, and only the first
    * declaration of an entity is binding (so a later plain-text
    * 'for all' value for forall never took effect). *-->
<!ENTITY Acirc   "&#194;" ><!-- capital A, circumflex accent -->
<!ENTITY Ecirc   "&#202;" ><!-- capital E, circumflex accent -->
<!ENTITY Icirc   "&#206;" ><!-- capital I, circumflex accent -->
<!ENTITY Ocirc   "&#212;" ><!-- capital O, circumflex accent -->
<!ENTITY Ucirc   "&#219;" ><!-- capital U, circumflex accent -->
<!ENTITY Uuml    "&#220;" ><!-- capital U, dieresis or umlaut mark -->
<!ENTITY acirc   "&#226;" ><!-- small a, circumflex accent -->
<!ENTITY aelig   "&#230;" ><!-- small ae diphthong (ligature) -->
<!--* ap is U+2248 ALMOST EQUAL TO; U+2249 is the negated sign
    * (NOT ALMOST EQUAL TO) and does not match the name. *-->
<!ENTITY ap     "&#x2248;" ><!--/approx R: =approximate-->
<!ENTITY approx "&#x2245;" ><!-- approximately equal to -->
<!ENTITY auml    "&#228;" ><!-- small a, dieresis or umlaut mark -->
<!ENTITY ccedil  "&#231;" ><!-- small c, cedilla -->
<!ENTITY darr   "&#x2193;" ><!--/downarrow A: =downward arrow-->
<!ENTITY eacute  "&#233;" ><!-- small e, acute accent -->
<!ENTITY ecirc   "&#234;" ><!-- small e, circumflex accent -->
<!ENTITY ge     "&#x2265;" ><!--/geq /ge R: =greater-than-or-equal-->
<!ENTITY hellip "&#x2026;" ><!--=ellipsis (horizontal)-->
<!ENTITY icirc   "&#238;" ><!-- small i, circumflex accent -->
<!ENTITY iquest "&#xBF;" ><!--=inverted question mark-->
<!ENTITY larr   "&#x2190;" ><!--/leftarrow /gets A: =leftward arrow-->
<!ENTITY ldquo  "&#x201C;" ><!--=double quotation mark, left-->
<!ENTITY le     "&#x2264;" ><!--/leq /le R: =less-than-or-equal-->
<!ENTITY lsquo  "&#x2018;" ><!--=single quotation mark, left-->
<!ENTITY mdash  "&#x2014;" ><!--=em dash-->
<!ENTITY nbsp   "&#160;" ><!--=no break (required) space-->
<!ENTITY ne     "&#x2260;" ><!--/ne /neq R: =not equal-->
<!ENTITY ntilde  "&#241;" ><!-- small n, tilde -->
<!ENTITY ocirc   "&#244;" ><!-- small o, circumflex accent -->
<!ENTITY ouml    "&#246;" ><!-- small o, dieresis or umlaut mark -->
<!ENTITY quest  "?" ><!--=question mark-->
<!ENTITY rdquo  "&#x201D;" ><!--=double quotation mark, right-->
<!ENTITY rsquo  "&#x2019;" ><!--=single quotation mark, right-->
<!ENTITY sim    "&#x223C;" ><!--/sim R: =similar-->
<!ENTITY sime   "&#x2243;" ><!--/simeq R: =similar, equals-->
<!ENTITY szlig   "&#223;" ><!-- small sharp s, German (sz ligature) -->
<!ENTITY times  "&#215;" ><!--/times B: =multiply sign-->
<!ENTITY uarr   "&#x2191;" ><!--/uparrow A: =upward arrow-->
<!ENTITY ucirc   "&#251;" ><!-- small u, circumflex accent -->
<!ENTITY uuml    "&#252;" ><!-- small u, dieresis or umlaut mark -->

<!--* implies expands to a reference to rarr, which is resolved
    * where implies is used. *-->
<!ENTITY implies "&rarr;" ><!-- my generic implication -->

<!--* The letters P and Q, each tagged as ident. *-->
<!ENTITY P "<ident>P</ident>" >
<!ENTITY Q "<ident>Q</ident>" >

<!--* Notations for graphic formats, in alphabetical order; the
    * unparsed graphic entities in this subset name one of these
    * after NDATA.  Each system identifier is the MIME type of the
    * format. *-->
<!NOTATION BMP  SYSTEM "image/bmp">
<!NOTATION JPEG SYSTEM "image/jpeg">
<!NOTATION PNG  SYSTEM "image/png">
<!NOTATION SVG  SYSTEM "image/svg+xml">

<!--* Graphics used by the figure elements, declared as unparsed
    * entities.  A figure selects one by giving the entity name in
    * its entity attribute; entity names are case-sensitive, so the
    * spelling here must match the figure exactly. *-->
<!ENTITY biblFull.bf.gluschkov.dot SYSTEM "images/biblFull.bf.gluschkov.dot.png" NDATA PNG>
<!ENTITY biblFull.i5.gluschkov.dot SYSTEM "images/biblFull.i5.gluschkov.dot.png" NDATA PNG>
<!ENTITY biblFull.bf.sfdp SYSTEM "images/biblFull.bf.gluschkov.sfdp.png" NDATA PNG>
<!--* NOTE(review): the figure on the slide headed 'Weighted automata'
    * names this entity.  The file name below is assumed from the
    * naming pattern of the other declarations; confirm that the
    * image exists under this name. *-->
<!ENTITY biblFull.i5.color.dot SYSTEM "images/biblFull.i5.color.dot.png" NDATA PNG>

<!ENTITY p.i5.gluschkov.dot SYSTEM "images/p.i5.gluschkov.dot.svg" NDATA SVG>
<!--* <!ENTITY p.i5.gluschkov.neato SYSTEM "images/p.i5.gluschkov.neato.svg" NDATA SVG> *-->
<!ENTITY p.i5.gluschkov.circo SYSTEM "images/p.i5.gluschkov.circo.svg" NDATA SVG>
<!ENTITY p.i5.colored.circo SYSTEM "images/p.i5.colored.circo.svg" NDATA SVG>

<!--* The neato layout is declared as PNG; its SVG declaration is
    * commented out above. *-->
<!ENTITY p.i5.gluschkov.neato SYSTEM "images/p.i5.gluschkov.neato.png" NDATA PNG>

<!--* PNG alternatives to the SVG declarations above, disabled:
<!ENTITY p.i5.gluschkov.dot SYSTEM "images/p.i5.gluschkov.dot.png" NDATA PNG>
<!ENTITY p.i5.gluschkov.circo SYSTEM "images/p.i5.gluschkov.circo.png" NDATA PNG>
<!ENTITY p.i5.colored.circo SYSTEM "images/p.i5.colored.circo.png" NDATA PNG>
*-->

<!ENTITY p.i5.min.dot SYSTEM "images/p.i5.min.dot.png" NDATA PNG>

]>
<TEI.2>
<teiHeader>
<fileDesc>
<titleStmt>
<title type="main">Igel</title>
<title type="sub">Comparing document grammars using XQuery</title>
</titleStmt>
<publicationStmt>
<pubPlace>Espa&ntilde;ola, New Mexico</pubPlace>
<publisher>Black Mesa Technologies LLC</publisher>
<date>2013</date>
</publicationStmt>
<sourceDesc>
<p>No source; created in electronic form.</p>
</sourceDesc>
</fileDesc>
</teiHeader>
<text>
<front>
<titlePage>
<docTitle>
<titlePart>Igel</titlePart>
<titlePart>Comparing document grammars using XQuery</titlePart>
</docTitle>

<docAuthor>C. M. Sperberg-McQueen, Black Mesa Technologies LLC</docAuthor>
<docAuthor>Oliver Schonefeld, Marc Kupietz, Harald L&uuml;ngen, Andreas
	  Witt, Institut f&uuml;r Deutsche Sprache</docAuthor>
<docDate>&date.last.touched;</docDate>

</titlePage>

<div id="navbar" type="navbar">
<head>Nearby documents</head>
<divGen type="toc"/>
<list>
<!--* <item><xref href="online.html">Online interface to Thutmose II</xref></item>
      <item><xref href="progdoc.xml">Programmers' documentation</xref></item>
    *-->
<item id="siteroot"><xref href="./../../..">Home</xref></item>
</list>
</div>
</front>
<body>

<div>
<head>Overview</head>
<list>
  <item>Context</item>
  <item>Current state</item>
  <item>Future work</item>
</list>
</div>

<div>
<head>Context</head>
<list>
<item>Institut f&uuml;r Deutsche Sprache (IDS), Mannheim</item>
<item>DeReKo</item>
<item>I5 (IDS flavor of TEI P5)</item>
<item>problems of document grammar comparison</item>
</list>

<div>
<head>IDS (1)</head>
<list>
<item>Institut f&uuml;r Deutsche Sprache (= Institute for the German language), founded 1964</item>
<item>independent research institute</item>
<item>Member of Leibniz-Gemeinschaft (= Leibniz Association)</item>
<item>Mission: "Research and document the German language in its use at present and in modern history (= ~1950++)"</item>
<item>Funded by federal government and the state Baden-W&uuml;rttemberg</item>
</list>
</div>
<div>
<head>IDS (2)</head>                  
<p>Four research departments
<list>
<item>Grammar</item>
<item>Lexis</item>
<item>Pragmatics</item>
<item>Zentrale Forschung (= central research)
<list>
<item>Korpuslinguistik (= corpus linguistics)</item>
<item>Forschungsinfrastrukturen (= research infrastructure)</item>
</list>
</item>
<item>support units: administration, public relations, library, (small) computing center</item>
</list>
</p>
</div>
<div>
<head>DeReKo</head>
<p>IDS hosts several large (and unique) collections of German language resources</p>
<p>Amongst them: DeReKo
<list>
<item>world's largest archive of corpora of contemporary written German</item>
<item>6.1 billion word tokens</item>
<item>contains fiction, scientific texts, newspaper articles,
and a wide variety of other text types</item>
<item>legal agreements with text donors (e.g. publishers)</item>
<item rend="incremental">... therefore not available for download, but searchable through custom search engine</item>
</list>
</p>
</div>
<div>
<head>I5</head>
<p>DeReKo uses a number of formats for representing corpora
including a customized version of XCES (IDS-XCES)
<list>
<item>recent efforts: converting DeReKo to TEI P5</item>
<item>... of course customized, we call it I5</item>
<item>no immediate benefit for "outside" parties,
but IDS hopes I5 will help with internal work flows
<list>
<item>ease building and maintenance of quality assurance tools</item>
<item>abandon the older in-house annotation format</item>
<item>enable new project members to familiarize themselves more quickly and
easily with the model</item>
</list>
</item>
</list>
</p>
</div>
<div>
<head>IDS collaborations</head>
<p>IDS collaborates with several other research institutes in various projects.
Among them:
<list>
<item>Berlin-Brandenburgische Akademie der Wissenschaften (BBAW)</item>
<item>also has a large collection of corpora in a variant TEI P5</item>
<item rend="incremental">... but it's not quite the same as ours</item>
</list>
</p>
</div>
<div>
<head>Finding common ground in the TEI</head>
<p>How does <emph>our</emph> subset of TEI
compare to <emph>their</emph> subset?</p>
<list>
<item>Elements
<list>
<item>What elements do we have that they don't?</item>
<item>What elements do they have that we don't?</item>
<item>If we both have the element, does it have compatible content
models?</item>
</list></item>
<item>Attributes?</item>
<item>Parameter entities?</item>
<item>How do the two document grammars compare to
what is actually in the data?</item>
</list>
</div>
</div>
<div>
<head>Current state</head>
<list>
<item>system overview</item>
<item>demo<list>
<item>loading DTDs</item>
<item>component lists</item>
<item>declaration display</item>
</list></item>
</list>

<div>
<head>Data flow</head>
<p>
<list>
<item>DTD </item>
<item>&rarr; <ident>dpp</ident> (DTD pre-processor) </item>
<item>&rarr; XML</item>
<item>&rarr; XML database (BaseX)</item>
</list>
</p>
</div>
<div>
<head>System structure</head>
<list>
<item>HTML client (COTS browser)</item>
<item>HTTP server (Apache)</item>
<item>XQuery server (BaseX)</item>
<item>intermediary bop.php (BaseX-over-PHP)</item>
</list>
</div>
<div>
<head>Demo</head>
<p>Watch this one ...</p>
</div>

</div>

<div>
<head>Future work</head>
<list>
<item>deeper queries</item>
<item>content-model visualization</item>
<item>weighted automata</item>
</list>

<div>
<head>Deeper queries</head>
<p>How do these grammars relate?<list>
<item>What is their union (valid against <emph>either</emph>)?</item>
<item>What is their intersection (valid against <emph>both</emph>)?</item>
</list>
</p>
<p>How do these grammars differ?<list>
<item>A minus B (valid against A, invalid against B)?</item>
<item>B minus A (valid against B, invalid against A)?</item>
</list>
</p>
<p rend="incremental">But wait, isn't that hopeless?</p>
</div>

<div>
<head>Example:  <gi>biblFull</gi></head>
<p>Grammar A:  <code>( (titleStmt, editionStmt?, extent?,
publicationStmt), sourceDesc*)</code></p>
<p>Grammar B:  <code>( (titleStmt, editionStmt?, extent?,
publicationStmt, seriesStmt?, notesStmt?), sourceDesc*)</code></p>
<p rend="incremental">A &cup; B: <seg>&equiv; B</seg></p>
<p rend="incremental">A &cap; B: <seg>&equiv; A</seg></p>
<p rend="incremental">A - B: <seg>&empty;</seg></p>
<p rend="incremental">B - A: <code>( (titleStmt, editionStmt?, extent?,
publicationStmt, ((seriesStmt, notesStmt?) | (notesStmt))),
sourceDesc*)</code></p>
<p rend="incremental">But wait, this one really IS hopeless!</p>
</div>

<div>
<head>Visualization</head>
<p>Content models are just regular expressions.</p>
<p>So they can be shown as finite
state automata.</p>
<p>E.g. 
<figure rend="15%" entity="biblFull.i5.gluschkov.dot">
</figure>
</p>
</div>
<div>
<head>The other grammar</head>
<p>E.g. 
<figure rend="15%" entity="biblFull.bf.gluschkov.dot">
</figure>
</p>
</div>
<div>
<head>Another view</head>
<p>E.g.
<figure rend="85%" entity="biblFull.bf.sfdp">
</figure>
</p>
<!--* biblFull:
good: bf.dot i5.dot

quirky (maybe good) bf.sfdp i5.circo i5.sfdp
i5.fdp 

ng: bf.circo fdp neato twopi
*-->
</div>

<div>
<head>A complication</head>
<p>This doesn't work well for all elements.  Here is <gi>p</gi> (paragraph):
<!--* <figure rend="85%" entity="p.i5.gluschkov.dot"> *-->
<figure rend="85%" entity="p.i5.gluschkov.dot">
</figure>
</p>
</div>
<div>
<head>A complication (2)</head>
<p>Fortunately, GraphViz has multiple layout algorithms.  Here is another:
<figure rend="65%" entity="p.i5.gluschkov.neato">
</figure>
</p>
</div>

<div>
<head>Handling mixed content</head>
<p>Mixed content (in XML-DTD style) requires special handling:
<!--*
<figure rend="85%" entity="p.i5.gluschkov.circo">
</figure>
*-->
<figure rend="50%" entity="p.i5.gluschkov.circo">
</figure>
</p>
</div>

<div>
<head>Minimized automata</head>
<p>Minimized FSA are smaller:
<figure rend="85%" entity="p.i5.min.dot">
</figure>
</p>
</div>

<div>
<head>Weighted automata </head>
<p>But how much of the content model is cruft?</p>
<p>Let's color-code for frequency:

<figure rend="15%" entity="biblFull.i5.color.dot">
</figure></p>
</div>
<div>
<head>What's actually in paragraphs?</head>
<p>
<figure rend="50%" entity="p.i5.colored.circo">
</figure>
</p>
</div>
<div>
<head>Limitations, regrets</head>
<p><list>
<item>No db in the cloud; you have to install.</item>
<item>Currently DTD syntax only; other schema languages
non-zero priority.</item>
<item>UI currently primitive; XForms will help.</item>
</list>
</p>
</div>
<div>
<head>Show me the code, ...</head>
<p>
<xref href="https://github.com/BlackMesaTechnologies/Igel.git"
>https://github.com/BlackMesaTechnologies/Igel.git</xref></p>
</div>


</div>

<div>
<head>Acknowledgements</head>
<list>
  <!--*
  <item>Foto: <xref
	      href="http://www.flickr.com/photos/larry1732/5499394899/">Black
	      Mesa: San Ildefonso Pueblo</xref>, von Larry Lamsa
	    (<xref
	      href="http://creativecommons.org/licenses/by/2.0/deed.en">CC
	      BY 2.0</xref>)</item>
  *-->
  <item>Photo: Detail from <xref
	      href="http://www.flickr.com/photos/wmichary/2843673873/">Black
	      Mesa</xref>, by Marcin Wichary, 9 September 2008
	    (<xref
	      href="http://creativecommons.org/licenses/by/2.0/deed.en">CC
	      BY 2.0</xref>)</item>

</list>
</div>


</body>
</text>
</TEI.2>
<!-- Keep this comment at the end of the file
Local variables:
mode: xml
sgml-default-dtd-file:"/Library/SGML/Public/Emacs/sweb.ced"
sgml-omittag:t
sgml-shorttag:t
End:
-->
