diff --git a/interactive-mining-angular-frontend/ng-package.json b/interactive-mining-angular-frontend/ng-package.json
index 4bd9b32..aaf355a 100755
--- a/interactive-mining-angular-frontend/ng-package.json
+++ b/interactive-mining-angular-frontend/ng-package.json
@@ -4,7 +4,8 @@
   "ngPackage": {
     "lib": {
       "entryFile": "public_api.ts",
-      "comments" : "none"
+      "comments" : "none",
+      "externals": {"ng2-nouislider": "ng2-nouislider"}
     },
     "dest": "../../dist/interactive-mining-lib"
   }
diff --git a/interactive-mining-angular-frontend/src/app/configuration/configuration.component.ts b/interactive-mining-angular-frontend/src/app/configuration/configuration.component.ts
index 1c24b69..c420656 100755
--- a/interactive-mining-angular-frontend/src/app/configuration/configuration.component.ts
+++ b/interactive-mining-angular-frontend/src/app/configuration/configuration.component.ts
@@ -32,7 +32,7 @@ export class ConfigurationComponent implements OnInit, AfterViewInit {
       localStorage.setItem('contextprev', '10');
     }
     if (!localStorage.getItem('contextnext') || localStorage.getItem('contextnext') === 'undefined') {
-      localStorage.setItem('contextnext', '5');
+      localStorage.setItem('contextnext', '20');
     }
     if (!localStorage.getItem('wordssplitnum') || localStorage.getItem('wordssplitnum') === 'undefined') {
       localStorage.setItem('wordssplitnum', '1');
diff --git a/interactive-mining-angular-frontend/src/app/configuration/resultspreview/resultspreview.component.html b/interactive-mining-angular-frontend/src/app/configuration/resultspreview/resultspreview.component.html
index c570090..2843584 100755
--- a/interactive-mining-angular-frontend/src/app/configuration/resultspreview/resultspreview.component.html
+++ b/interactive-mining-angular-frontend/src/app/configuration/resultspreview/resultspreview.component.html
@@ -40,7 +40,7 @@
       Upload your documents
-
+
diff --git a/interactive-mining-angular-frontend/src/app/configuration/settings/settings.component.html b/interactive-mining-angular-frontend/src/app/configuration/settings/settings.component.html
index 23fedfc..f94c30e 100755
--- a/interactive-mining-angular-frontend/src/app/configuration/settings/settings.component.html
+++ b/interactive-mining-angular-frontend/src/app/configuration/settings/settings.component.html
@@ -10,7 +10,7 @@
- Hight
precision
+ High
precision
@@ -71,7 +71,7 @@ {{negativePhrasesArray.length}} phrase{{negativePhrasesArray.length===1?'':'s'}}
-

Add negative phrases that reduce the possibility for a match to be a match. You can use different weights to divide between important and less important phrases.

+

Add negative phrases. If these phrases are found around a match, it is possibly a false positive. You can use different weights to assign importance.

@@ -121,7 +121,7 @@

Select among the following text preprocessing steps.

- +
@@ -130,7 +130,7 @@
- +
@@ -142,7 +142,7 @@ after: {{settings.contextnext}}
-

You may edit the length of the text area that the algorithm uses to decide if a match is a true positive.
This is mainly for advanced users, so you ‘ll rarely need to change these values.

+

You may edit the length of the text area that the algorithm uses to decide if a match is a true positive.
For advanced users only.

diff --git a/interactive-mining-angular-frontend/src/app/contents/contents.component.html b/interactive-mining-angular-frontend/src/app/contents/contents.component.html index 14f74b2..e66f639 100755 --- a/interactive-mining-angular-frontend/src/app/contents/contents.component.html +++ b/interactive-mining-angular-frontend/src/app/contents/contents.component.html @@ -2,7 +2,7 @@
-

Use the table or the upload form to add your concepts

+

Use the table or the upload form to add your concepts

@@ -56,7 +56,6 @@
 TSV file type, maximum 50kB
-
diff --git a/interactive-mining-angular-frontend/src/app/manageprofiles/manageprofiles.service.ts b/interactive-mining-angular-frontend/src/app/manageprofiles/manageprofiles.service.ts
index 1a516aa..77ca7f7 100755
--- a/interactive-mining-angular-frontend/src/app/manageprofiles/manageprofiles.service.ts
+++ b/interactive-mining-angular-frontend/src/app/manageprofiles/manageprofiles.service.ts
@@ -56,7 +56,7 @@ export class ManageprofilesService {
   }

   loadExampleProfile(name: string): Observable {
-    return this.http.get(this.backendServerAddress + this.loadExampleProfileUrl + `?user=${this.userId}`)
+    return this.http.post(this.backendServerAddress + this.loadExampleProfileUrl, {user: this.userId, name: name})
       .catch(this.util.handleError);
   }

diff --git a/interactive-mining-angular-frontend/src/assets/css/interactive-mining.css b/interactive-mining-angular-frontend/src/assets/css/interactive-mining.css
index 79273b3..4b6ed3d 100755
--- a/interactive-mining-angular-frontend/src/assets/css/interactive-mining.css
+++ b/interactive-mining-angular-frontend/src/assets/css/interactive-mining.css
@@ -9,7 +9,8 @@
   background-color: #FFF !important;
 }

-@import "~nouislider/distribute/nouislider.min.css";
+/*NoUiSlider CSS*/
+/*! nouislider - 11.0.3 - 2018-01-21 14:04:07 */.noUi-target,.noUi-target *{-webkit-touch-callout:none;-webkit-tap-highlight-color:transparent;-webkit-user-select:none;-ms-touch-action:none;touch-action:none;-ms-user-select:none;-moz-user-select:none;user-select:none;-moz-box-sizing:border-box;box-sizing:border-box}.noUi-target{position:relative;direction:ltr}.noUi-base,.noUi-connects{width:100%;height:100%;position:relative;z-index:1}.noUi-connects{overflow:hidden;z-index:0}.noUi-connect,.noUi-origin{will-change:transform;position:absolute;z-index:1;top:0;left:0;height:100%;width:100%;-webkit-transform-origin:0 0;transform-origin:0 0}html:not([dir=rtl]) .noUi-horizontal .noUi-origin{left:auto;right:0}.noUi-vertical .noUi-origin{width:0}.noUi-horizontal .noUi-origin{height:0}.noUi-handle{position:absolute}.noUi-state-tap .noUi-connect,.noUi-state-tap .noUi-origin{-webkit-transition:transform .3s;transition:transform .3s}.noUi-state-drag *{cursor:inherit!important}.noUi-horizontal{height:18px}.noUi-horizontal .noUi-handle{width:34px;height:28px;left:-17px;top:-6px}.noUi-vertical{width:18px}.noUi-vertical .noUi-handle{width:28px;height:34px;left:-6px;top:-17px}html:not([dir=rtl]) .noUi-horizontal .noUi-handle{right:-17px;left:auto}.noUi-target{background:#FAFAFA;border-radius:4px;border:1px solid #D3D3D3;box-shadow:inset 0 1px 1px #F0F0F0,0 3px 6px -5px #BBB}.noUi-connects{border-radius:3px}.noUi-connect{background:#3FB8AF}.noUi-draggable{cursor:ew-resize}.noUi-vertical .noUi-draggable{cursor:ns-resize}.noUi-handle{border:1px solid #D9D9D9;border-radius:3px;background:#FFF;cursor:default;box-shadow:inset 0 0 1px #FFF,inset 0 1px 7px #EBEBEB,0 3px 6px -3px #BBB}.noUi-active{box-shadow:inset 0 0 1px #FFF,inset 0 1px 7px #DDD,0 3px 6px -3px #BBB}.noUi-handle:after,.noUi-handle:before{content:"";display:block;position:absolute;height:14px;width:1px;background:#E8E7E6;left:14px;top:6px}.noUi-handle:after{left:17px}.noUi-vertical .noUi-handle:after,.noUi-vertical .noUi-handle:before{width:14px;height:1px;left:6px;top:14px}.noUi-vertical .noUi-handle:after{top:17px}[disabled] .noUi-connect{background:#B8B8B8}[disabled] .noUi-handle,[disabled].noUi-handle,[disabled].noUi-target{cursor:not-allowed}.noUi-pips,.noUi-pips *{-moz-box-sizing:border-box;box-sizing:border-box}.noUi-pips{position:absolute;color:#999}.noUi-value{position:absolute;white-space:nowrap;text-align:center}.noUi-value-sub{color:#ccc;font-size:10px}.noUi-marker{position:absolute;background:#CCC}.noUi-marker-large,.noUi-marker-sub{background:#AAA}.noUi-pips-horizontal{padding:10px 0;height:80px;top:100%;left:0;width:100%}.noUi-value-horizontal{-webkit-transform:translate(-50%,50%);transform:translate(-50%,50%)}.noUi-rtl .noUi-value-horizontal{-webkit-transform:translate(50%,50%);transform:translate(50%,50%)}.noUi-marker-horizontal.noUi-marker{margin-left:-1px;width:2px;height:5px}.noUi-marker-horizontal.noUi-marker-sub{height:10px}.noUi-marker-horizontal.noUi-marker-large{height:15px}.noUi-pips-vertical{padding:0 10px;height:100%;top:0;left:100%}.noUi-value-vertical{-webkit-transform:translate(0,-50%);transform:translate(0,-50%,0);padding-left:25px}.noUi-rtl .noUi-value-vertical{-webkit-transform:translate(0,50%);transform:translate(0,50%)}.noUi-marker-vertical.noUi-marker{width:5px;height:2px;margin-top:-1px}.noUi-marker-vertical.noUi-marker-sub{width:10px}.noUi-marker-vertical.noUi-marker-large{width:15px}.noUi-tooltip{display:block;position:absolute;border:1px solid #D9D9D9;border-radius:3px;background:#fff;color:#000;padding:5px;text-align:center;white-space:nowrap}.noUi-horizontal .noUi-tooltip{-webkit-transform:translate(-50%,0);transform:translate(-50%,0);left:50%;bottom:120%}.noUi-vertical .noUi-tooltip{-webkit-transform:translate(0,-50%);transform:translate(0,-50%);top:50%;right:120%}

 /* COLOR PALETTE */
 :root {
@@ -1057,6 +1058,9 @@
   border-radius: 2px;
   margin-right: 5px;
 }
+.cm-label-docs {
+  background: var(--cm-theme-7);
+}
 /* Custom text
 ========================================================================== */
 .cm-text-muted {
diff --git a/interactive-mining-backend/madoap/src/madserverv3.py b/interactive-mining-backend/madoap/src/madserverv3.py
index d16d66a..0aeff72 100755
--- a/interactive-mining-backend/madoap/src/madserverv3.py
+++ b/interactive-mining-backend/madoap/src/madserverv3.py
@@ -158,8 +158,8 @@ def loadProfileDocs(user_id, profile_id):
     if os.path.isfile(unique_profile_docs_file_name):
         copyfile(unique_profile_docs_file_name, docs_file_name)

-def loadExampleDocs(user_id):
-    sample_file = open("static/exampleDocs.txt", 'r')
+def loadExampleDocs(docsLocation, user_id):
+    sample_file = open(docsLocation, 'r')
     # write data to physical file
     cname = "users_files/docs{0}.json".format(user_id)
     fh = open(cname, 'w')
@@ -169,7 +169,7 @@
             break
         fh.write(copy_buffer)
     fh.close()
-    lines_num = sum(1 for line in open(cname))
+    return sum(1 for line in open(cname))

 def loadExampleProfile(user_id):
     return loadProfile("static/exampleProfile.oamp", user_id)
@@ -592,6 +592,7 @@ class GetExampleProfilesHandler(BaseHandler):
         try:
             data = {}
             example_profiles = []
+            example_profiles.append({'name': 'Clarin', 'contents': 11, 'documents': 7})
             example_profiles.append({'name': 'Communities', 'contents': 25, 'documents': 104})
             example_profiles.append({'name': 'AOF', 'contents': 66, 'documents': 1023})
             example_profiles.append({'name': 'RCUK', 'contents': 263, 'documents': 140})
@@ -643,28 +644,40 @@ class LoadExampleProfileHandler(BaseHandler):
     def set_default_headers(self):
         self.set_header("Access-Control-Allow-Origin", "*")
         self.set_header("Access-Control-Allow-Headers", "Origin, X-Requested-With, Content-Type, Accept")
-        self.set_header('Access-Control-Allow-Methods', 'GET, OPTIONS')
+        self.set_header('Access-Control-Allow-Methods', 'POST, OPTIONS')
         self.set_header('Access-Control-Allow-Credentials', 'true')
         self.set_header('Content-Type', 'application/json')
     def options(self):
         # no body
         self.set_status(204)
         self.finish()
-    def get(self):
+    def post(self):
         try:
-            # get user id from arguments. Must have
-            if 'user' not in self.request.arguments or self.request.arguments['user'][0] == '':
+            # get user id from body. Must have
+            request_arguments = json.loads(self.request.body)
+            if 'user' not in request_arguments or request_arguments['user'] == '':
                 self.set_status(400)
                 self.write("Missing user's id parameter")
                 return
-            user_id = self.request.arguments['user'][0]
+            user_id = request_arguments['user']
+            # get data
+            if 'name' not in request_arguments or request_arguments['name'] == '':
+                self.set_status(400)
+                self.write("Missing example profiles name parameter")
+                return
+            example_name = request_arguments['name']
             # reset everything
             deleteAllUserFiles(user_id)
-            # load example data
-            loadExampleDocs(user_id)
-            data = loadExampleProfile(user_id)
-            data['docname'] = 'Example'
-            data['docsnumber'] = '26'
+            data = {}
+            if example_name == 'Clarin':
+                data = loadProfile("static/example{0}Profile.oamp".format(example_name), user_id)
+                data['docname'] = example_name
+                data['docsnumber'] = loadExampleDocs("static/example{0}Docs.json".format(example_name), user_id)
+            else:
+                # load example data
+                data = loadExampleProfile(user_id)
+                data['docname'] = 'Example'
+                data['docsnumber'] = loadExampleDocs("static/exampleDocs.txt", user_id)
             self.write(json.dumps(data))
             self.finish()
         except Exception as ints:
@@ -899,7 +912,7 @@ class GetDocSamplesHandler(BaseHandler):
             data = {}
             doc_samples = []
             doc_samples.append({'name': 'Egi', 'documents': 104})
-            doc_samples.append({'name': 'AOF', 'documents': 1023})
+            doc_samples.append({'name': 'Clarin', 'documents': 1023})
             doc_samples.append({'name': 'SNSF', 'documents': 140})
             doc_samples.append({'name': 'ARIADNE', 'documents': 502})
             doc_samples.append({'name': 'RCUK', 'documents': 104})
@@ -1029,6 +1042,8 @@ class ChooseDocSampleHandler(BaseHandler):
         sample_file_name = ""
         if doc_sample == "Egi":
             sample_file_name = "static/egi_sample.tsv"
+        elif doc_sample == "Clarin":
+            sample_file_name = "static/clarin_docs.json"
         elif doc_sample == "Rcuk":
             sample_file_name = "static/rcuk_sample.tsv"
         elif doc_sample == "Arxiv":
@@ -1228,21 +1243,26 @@ class RunMiningHandler(BaseHandler):
         if numberOfDocsUploaded(user_id) != 0:
             doc_filters = "comprspaces(regexpr('[\n|\r]',d2,' '))"
+        grant_filters = "stripchars(comprspaces(regexpr(\"\\'\", p1,'')))"
         ackn_filters = "comprspaces(regexpr(\"\\'\", p2,''))"
         if 'punctuation' in mining_parameters and mining_parameters['punctuation'] == 1:
             doc_filters = 'keywords('+doc_filters+')'
+            grant_filters = 'keywords('+grant_filters+')'
             ackn_filters = 'keywords('+ackn_filters+')'
         if 'lowercase' in mining_parameters and mining_parameters['lowercase'] == 1:
             doc_filters = 'lower('+doc_filters+')'
+            grant_filters = 'lower('+grant_filters+')'
             ackn_filters = 'lower('+ackn_filters+')'
         if 'stopwords' in mining_parameters and mining_parameters['stopwords'] == 1:
             doc_filters = 'filterstopwords('+doc_filters+')'
+            grant_filters = 'filterstopwords('+grant_filters+')'
             ackn_filters = 'filterstopwords('+ackn_filters+')'
         if 'stemming' in mining_parameters and mining_parameters['stemming'] == 1:
             doc_filters = 'stem('+doc_filters+')'
+            grant_filters = 'stem('+grant_filters+')'
             ackn_filters = 'stem('+ackn_filters+')'
         list(cursor.execute("drop table if exists grantstemp"+user_id, parse=False))
-        query_pre_grants = "create temp table grantstemp{0} as select stripchars(p1) as gt1, case when p2 is null then null else {1} end as gt2 from (setschema 'p1,p2' file 'users_files/p{0}.tsv' dialect:tsv)".format(user_id, ackn_filters)
+        query_pre_grants = "create temp table grantstemp{0} as select {1} as gt1, case when p2 is null then null else {2} end as gt2 from (setschema 'p1,p2' file 'users_files/p{0}.tsv' dialect:tsv)".format(user_id, grant_filters, ackn_filters)
         cursor.execute(query_pre_grants)
         query00get = "select * from grantstemp{0}".format(user_id)
         results00get = [r for r in cursor.execute(query00get)]
diff --git a/interactive-mining-backend/madoap/src/static/exampleClarinDocs.json b/interactive-mining-backend/madoap/src/static/exampleClarinDocs.json
new file mode 100644
index 0000000..3e6e3d7
--- /dev/null
+++ b/interactive-mining-backend/madoap/src/static/exampleClarinDocs.json
@@ -0,0 +1,7 @@
+{"id":"1410.0286","text":"LAF-Fabric: a data analysis tool for Linguistic Annotation Framework with an application to the Hebrew Bible Dirk Roorda\u2217 \u2217\u2217 Gino Kalkman\u2217\u2217\u2217 Martijn Naaijer\u2217\u2217\u2217 Andreas van Cranenburgh\u2217\u2217\u2217\u2217 \u2217\u2217\u2217\u2217\u2217 dirk.roorda@dans.knaw.nl g.j.kalkman@vu.nl m.naaijer@vu.nl andreas.van.cranenburgh@huygens.knaw.nl \u2217 Data Archiving and Networked Services - Royal Netherlands Academy of Arts and Sciences, Anna van Saksenlaan 10; 2593 HT Den Haag, Netherlands \u2217\u2217 The Language Archive - Max Planck Institute for Psycholinguistics, Wundtlaan 1; 6525 XD Nijmegen, Netherlands arXiv:1410.0286v1 [cs.CL] 1 Oct 2014 \u2217\u2217\u2217 Eep Talstra Centre for Bible and Computing - VU University, Faculteit der Godgeleerdheid; De Boelelaan 1105; 1081 HV Amsterdam, Netherlands \u2217\u2217\u2217\u2217 Huygens Institute for the History of the Netherlands - Royal Netherlands Academy of Arts and Sciences, P.O. box 90754; 2509 LT; Den Haag, Netherlands \u2217\u2217\u2217\u2217\u2217 Institute for Logic Language and Computation - University of Amsterdam, FNWI ILLC Universiteit van Amsterdam; P.O. Box 94242; 1090 GE Amsterdam, Netherlands Abstract The Linguistic Annotation Framework (LAF) provides a general, extensible stand-off markup system for corpora. This paper discusses LAF-Fabric, a new tool to analyse LAF resources in general with an extension to process the Hebrew Bible in particular. We first walk through the history of the Hebrew Bible as text database in decennium-wide steps. Then we describe how LAF-Fabric may serve as an analysis tool for this corpus. Finally, we describe three analytic projects/workflows that benefit from the new LAF representation: 1) the study of linguistic variation: extract cooccurrence data of common nouns between the books of the Bible (Martijn Naaijer); 2) the study of the grammar of Hebrew poetry in the Psalms: extract clause typology (Gino Kalkman); 3) construction of a parser of classical Hebrew by Data Oriented Parsing: generate tree structures from the database (Andreas van Cranenburgh). 1. The Hebrew Bible The Hebrew Bible is written in old forms of Hebrew and Aramaic, which are dead languages now. Written manuscripts are our only source for studying these languages. The study of such a body of historical texts involves research questions from different disciplines.
Linguistic analysis is a stepping stone which must be followed by questions at higher levels of abstraction, such as literary questions: how did authors use the system of the language to craft their design: i.e. style, literary effect, focus, and all those features of the text that are not dictated by the language system (van Peursen et al. 2010)? Another line of questions falls into historical linguistics: systematically charting linguistic variation in the biblical linguistic corpus can help addressing the question as to whether the variation reflects diachronic development (van Peursen and van Keulen 2006). 1.1 Bible and Computer Naturally, a research program as mentioned above seeks to employ digital tools. In fact, a group of researchers in Amsterdam started compiling a text database in the 1970s: the Werkgroep Informatica Vrije Universiteit (WIVU). This resulted in a database of text and markup, the so-called WIVU database (Talstra and Sikkel 2000), which became widely known because it became incorporated in Bible study software packages. The WIVU markup is based on observable characteristics of the texts and refrains from being committed to a particular linguistic theory or framework. There is no explicit grammar to which the marked-up text material has to conform. One of the consequences is that the data cannot conveniently be described in one hierarchical structure, although hierarchy plays a role. There are several, incompatible hierarchies implicit in the data. In the 1990s the ground work has been laid for an analytical tool operating on the WIVU data. In his PhD. thesis, Doedens (1994) defined a database model and a pair of query languages to deal with text databases, i.e. databases in which the essential structural characteristics of text, sequence and embedding, are catered for in a natural way. One of the query languages, QL1 , he characterised as topographic meaning that a query instruction is isomorphic to its results with respect to sequence and embedding (Doedens 1994, pp. 108).2 QL has been turned into an efficient implementation by Petersen (2004, 2006), allbeit in a somewhat restricted form, named MQL (Mini-QL). The resulting database system, EMDROS (Petersen 2002-2014), has become part of the WIVU tool set and is still maintained by its author. Between the initial work of the WIVU group then and the activities of the ETCBC group now3 , the web has arrived and the practice of information sharing and standardizing has changed dramatically. Now, in the 2010s, the need is increasingly felt to preserve and share the results of the ETCBC with not only theologians but also with linguists and researchers in the digital humanities. Conversely, the methods developed in the sphere of computational linguistics hold promises for new kinds of studies of the Hebrew texts. A first step has been the deposition of the database in its EMDROS form at DANS (Talstra et al. 2012). 1.2 Text, markup, and annotation In 2013 the CLARIN-NL4 project SHEBANQ5 started to convert the Hebrew Text Database from its EMDROS representation into Linguistic Annotation Framework (LAF) (Ide and Romary 2012). Linguistic Annotation Framework defines a fairly generic data model for annotated data. The assumption is that there is an immutable data resource with identifiable regions in it. Regions can be attached to nodes, nodes can be linked with edges, and both nodes and edges can be the targets of annotations, which contain feature structures. 
This annotated graph, anchored to primary data, can be expressed in XML, by means of the GrAF6 schemas, suggested by the LAF standard. This model has turned out to be a very good fit for the EMDROS data, which consists of objects carrying features, where features define properties or point to other objects. In EMDROS, objects are linked to arbitrary subsets of words, and the concatenation of all words is the text of the Hebrew Bible according to the Biblia Hebraica Stuttgartensia Edition (Kittel 1968-1997), which can be regarded as an immutable resource in practice. Figure 1 shows the mapping in a nutshell. Note that this kind of markup is stand-off markup, as opposed to inline markup, of which TEI7 is the prime example. 1. QL just means Query Language. 2. The other query language, LL, is a logical one, introduced for theoretical purposes (Doedens 1994, pp. 199). 3. With the retirement of founding father Eep Talstra in 2013, WIVU has been rebaptized as Eep Talstra Centre for Bible and Computing. 4. http://www.clarin.nl 5. System for HEBrew text: ANnotations for Queries and Markup, http://www.godgeleerdheid.vu.nl/nl/ nieuws-en-agenda/nieuwsarchief/2013/jan-mrt/clarin-subsidie-voor-hebreeuwse-database-op-internet. asp 6. Graph Annotation Format, http://www.xces.org/ns/GrAF/1.0/ 7. Text Encoding Initiative, http://www.tei-c.org/index.xml sentence clause_atom_number=1 clause_atom_relation=0 clause_atom_type=xQtl indentation=0 clause annotations (features) determination=determined phrase_function=Objc phrase_type=PP Linguistic Annotation Framework phrase parents subphrase labeled edges mother link to regions annotations (empty) lexeme_utf8=\u202b\u05e8\u05d0\ufb2a\u05d9\u05ea\u202c surface_consonants_utf8=\u202b\u05e8\u05d0\ufb2a\u05d9\u05ea\u202c r11 r10 r9 word r11 r10 r9 92 72-91 n3 n2 nodes 6-23 0-5 regions \u202b\ufb2c\u0596\u05b7\u05de\u05b4\u05d9\u05dd \u05b0\u05d5\u05a5\u05b5\u05d0\u05ea \u05b8\u05d4\u05bd\u05b8\u05d0\u05b6\u05e8\u05e5\u05c3\u202c \u05b8 \u202b\u05d4\u05d9\u05dd \u05a5\u05b5\u05d0\u05ea \u05b7\u05d4\u202c \u0591 \u05b4 \u202b\u05d0\u05b9\u05dc\u202c \u05b1 \u202b\u05e8\u05d0\u202c \u05a3 \u05b8 \u202b\u05b0\ufb31\u05b5\u05e8\u05d0\u0596\u05b4\ufb2a\u05d9\u05ea \u05b8\ufb31\u202c primary data Figure 1: The Hebrew text database in LAF 2. LAF-Fabric The primary result of the conversion of the Hebrew Text Database to LAF (van Peursen and Roorda 2014) is the fact that now a standard representation of the data can be archived. Moreover, we can preserve queries on the database as well, following an idea expressed in (Roorda and Heuvel 2012). But as soon as one turns to the LAF representation, the question comes to mind: are there tools with which this LAF resource can be processed? 2.1 LAF tools There are emerging tools to deal with stand-off annotations and markup. But here the versatility of stand-off markup must be paid for: there are no tools that are applicable to all stand-off resources. Even if we restrict to LAF, there are no mature tools that deal with LAF in full generality. There are candidates, though. Here are some options and experiences. 1. The eXist database engine8 . Initial experiments showed clearly that eXist is not designed to handle large LAF resources well in its default configuration. Apart from an initial load time of more than an hour, even simple queries took dozens of minutes. Surely this could have been improved by setting up indexes, but it is by no means obvious which indexes are needed for all possible queries. 
We decided not to pursue this path. 8. http://www.exist-db.org 2. ELAN9 is a tool for annotating audio and video material. In theory it could also be used for annotating plain text streams, but it is not designed for that. There are modelling issues, and probably performance issues as well. Discussions with the ELAN people at the The Language Archive10 made clear that ELAN works well with big primary data (audio and video) and sparse annotation data. Our case is the exact opposite: small primary data (plain text) and very rich annotation data. 3. Graf-python (Bouda 2013-2014) (component of POIO (Bouda et al. 2012)) is a Python library designed to analyse generic LAF resources, such as the Open American National Corpus11 . However, in its current form it does not scale up to match the size of the Hebrew Text Database. The load time is half an hour, the memory footprint is 20 GB, and these costs are incurred every time before you run even the simplest analysis script. However, graf-python is appealing, and the first author decided to implement it in a new way with performance in mind. Whereas graf-python is a clean translation of the LAF concepts into object-oriented Python code, an efficient processor would need to compile LAF concepts into efficient data structures, such as Python arrays, which have C-like performance. 2.2 Computing with LAF-Fabric A new, graf-python-like library was needed, one that could handle the full size of the Hebrew text database (> 400K words, > 1.4M nodes, > 1.5M edges, > 33M features) with ease. This is not, by today\u2019s standards, a large corpus, but as it represents a fixed cultural artefact, it is all the data we have got and we want to examine all of them very closely. That is why have invested in a system that allows us to manipulate the full data in main memory: LAF-Fabric (Roorda 2013-2014), a Python package that compiles LAF into a compact, binary form that loads fast. It has an API that supports walking over the node set along edges or in various orders, absorbing the features of the nodes in passing. This kind of processing could also be done with a graph database, such as NEO4J12 . The advantages of LAF-Fabric are the ease of installation, compactness of the compiled data, and the integration in the Python world of scientific computing, e.g. the IPython notebook13 (P\u00b4erez and Granger 2007). The baseline mode of data access is to walk over node sets and use edges to explore the neighbourhood of nodes. Each node is loaded with features, from which their text and linguistic properties may be read off. From this graph one can populate tables, vectors, trees and graphs with the data one wants to focus on. Then, leaving LAF-Fabric, but still in the IPython notebook, one can use the facilities of the Python ecosystem to do data analysis and visualization. All these steps are shown in a tutorial notebook that charts the frequency of masculine and feminine gender for all chapters in the Hebrew Bible14 . LAF-Fabric does not introduce a new query language, so the user of LAF-Fabric does not have to learn the ins and outs of a new formalism. Instead, it offers programmatic access to the nodes, edges and features in a LAF resource, in such a way that the programmer is not burdened with the technicalities of the LAF data representation in XML. But LAF-Fabric is not an end user tool. End user tools are usually built on the basis of a relatively fixed concept of what results users typically 9. 
Eudico Linguistic ANnotator is an annotation tool that allows the creation, editing, visualizing and searching of annotations for video and audio data. Software developed at Max Planck Institute for Psycholinguistics, Nijmegen. http://tla.mpi.nl/tools/tla-tools/elan/. 10. Max Planck Institute for Psycholinguistics, Nijmegen, https://tla.mpi.nl 11. The OANC is the corpus that drove much of the specifications of LAF. See http://www.americannationalcorpus. org/OANC/index.html 12. http://www.neo4j.org 13. A quick introduction of IPython can be found at http://ipython.org/ipython-doc/stable/notebook/notebook. html 14. http://nbviewer.ipython.org/github/ETCBC/laf-fabric/blob/master/examples/gender.ipynb want to achieve, and those results are then offered by the tool in a user-friendly manner. As soon as the user\u2019s needs move outside the scope of the tool in question, the user-friendliness is over. In contrast, the approach of LAF-Fabric is that it just makes the data available, leaving it to the user how to extract that data and what to do with it. Clearly, LAF-Fabric is not a tool for the non-programming end user, it is best used by a team in a laboratory context, where some members define the research needs and another member translates those needs into working IPython notebooks. 2.3 Beyond pure LAF LAF is a rather loose standard. There are many ways to model specific language resources, and it is not obvious which choices will work out best for data processing. In particular, the semantics of edges in LAF is completely open. Specific resources may have more structure than can be exploited by a generic LAF tool. That is why LAF-Fabric has hooks for third-party modules to exploit additional regularities. We ourselves developed a package called etcbc that provides extra functionality for the Hebrew Text Database in its LAF representation: 1. Node ordering: the Hebrew Bible data has object types by which nodes can be ordered more extensively than on the basis of the generic LAF data only. LAF-Fabric can be instructed to use the etcbc sorting instead of its default sorting. 2. Data entry: etcbc contains facilities to generate forms for data entry, read them back and convert the results to proper LAF files, which can then be added to the original resource. 3. MQL queries: etcbc has a facility to run queries on the EMDROS version of the data, and collect the results as node sets in LAF. This gives the best of two worlds: topographic queries intermixed with node walking. The github repository laf-fabric-nbs15 contains quite a few examples of Hebrew-LAF data processing in various degrees of maturity. 2.4 Notes on the choice for LAF When the SHEBANQ project was submitted, CLARIN required that a standard format be selected from a list16 . LAF was the only obvious choice from that list. But has it been a good choice? FoLiA (van Gompel 2013), which is not on the CLARIN list, comes to mind. So far, LAF is serving us well. Because the data model is very general, we can easily translate our data into LAF, even where they are organized in multiple hierarchies or no hierarchy at all. There are also features that do not represent linguistic information, but e.g. orthographic information17 and numbering information, some of which are clearly ad-hoc. All this data can be represented easily in LAF without introducing tricks and devices to work around the specifics of the LAF data model. We are confident that we will be able to represent the future data output of biblical scholars as well. 
It must be said, however, that because of the very abstractness of LAF, it was not obvious at first how to choose between the numerous ways in which one can represent annotation data in LAF. FoLiA has the characteristics of an attractive ecosystem of data, formats and tools for linguistic analysis of corpora. There are, however, a few things to be wary about when making a choice: 15. https://github.com/ETCBC/laf-fabric-nbs 16. http://www.clarin.eu/sites/default/files/Standards%20for%20LRT-v6.pdf 17. The Hebrew script poses complexities: the basic information is in the consonants, the vowels have been added later as diacritics, and there are also prosodic diacritics present. The database provides a number of different representations for each word: with and without diacritics, in UNICODE Hebrew or in Latin transcription. 1. the ETCBC data is not the product of main-stream linguists, and there are other concerns than linguistic ones; 2. biblical scholars are continuously producing new data, in the form of new annotations. Within the stand-off paradigm it is easy to incorporate these data in a controlled way, indicating the provenance. Any format that relies on inline markup is at a disadvantage here; 3. the ETCBC data is essentially one document, and must fit in memory, together with a large subset of its annotations. It seems that FoLiA is geared to corpora with multiple, smaller documents. Nevertheless, it is an interesting exercise to convert the LAF version of the Hebrew Bible into FoLiA, but we leave it to one of our readers. Graf-python has an advantage over LAF-Fabric: it can deal with feature structures in full generality. LAF-Fabric only deals with feature structures that are sets of key-value pairs. 2.5 Preprocessing for research The next sections describe several lines of research that benefit from the incarnation of the data in LAF and from LAF-Fabric as a preprocessing tool. They are examples of historical, literary and linguistic research lines and in that way they serve to indicate the breadth of the landscape of biblical scholarship. Rather than pursuing those lines in full depth, the purpose of this paper is to convey the importance of having good preprocessing tools, based on a standard format. If that is in place, research efforts and results get a boost. 3. Linguistic variation: extracting cooccurrences For a long time scholars have held the consensus that most of the linguistic variation in Biblical Hebrew can be explained by assuming that there is an early variety (Early Biblical Hebrew or EBH, in use before the Babylonian exile in the 6th century BCE) and a late variety (Late Biblical Hebrew, or LBH, in use after the exile)18 . EBH can be found mainly in the books of the Pentateuch and the Former Prophets, whereas LBH can be found mainly in the undisputed late books (Esther, Daniel, Ezra, Nehemiah and Chronicles). To this diachronic model some have added dialectical variation (Rendsburg 1990b) and other kinds of variation (Rendsburg 1990a). However, in the past two decades serious challenges have been brought forward by several scholars. Biblical Hebrew seems to be pretty homogeneous in general and several of the methods used by scholars to study linguistic variation in Biblical Hebrew have become questionable now it has been shown that many linguistic features of which it was thought earlier that they are characteristic of LBH occur throughout the Hebrew Bible. 
An important methodological problem is the linguistic-literary circularity involved in many studies applying the diachronic model: a feature occurs mainly in late texts, therefore the feature is late and this proves that the texts in which it occurs are late and so on (Young et al. 2008, vol. 1, ch. 3,4). These issues have led some to propose a new model, namely that the variation in Biblical Hebrew fits a situation in which two styles were used, a more conservative one (formerly called EBH) and a freer one (formerly called LBH). According to this model, both styles were used before and after the exile (Young et al. 2008, vol. 2, ch. 2). In the NWO-funded project Does Syntactic Variation reflect Language Change? Tracing Syntactic Diversity in Biblical Hebrew Texts19 , another approach is advocated. Instead of starting with assumptions about where or when a specific biblical text was written, first the distribution of a large quantity of syntactic features and the way they vary throughout the Hebrew Bible will be mapped 18. An early voice propagating this view is Gesenius (1813); nowadays one of the most influential authors using the diachronic model is Avi Hurvitz, for instance (Hurvitz 1974, pp. 17-34). 19. See http://www.nwo.nl/en/research-and-results/research-projects/19/2300177219.html and only after this is done the question arises to what historical, geographical or cultural factors the resulted variation may be related. Using LAF-Fabric, the first and third authors did a pilot to get a general impression of the linguistic variation in Biblical Hebrew. First a list was made of the lexemes of all the verbs and common nouns in the Hebrew Bible and with this list a table was made in which the presence or absence of these lexemes in the separate biblical books is registered. With the data in this table a graph was made using the Force Atlas algorithm20 implemented in Gephi. The result is shown in figure 2. The figure shows several expected linguistic results, such as the relatively close relationship between the LBH books21 . Although this is only a preliminary result, with the help of LAF-Fabric in combination with high level tools for statistical analysis and visualization like Gephi or Matplotlib (an implementation of various visualization algorithms in Python) it is possible to analyse the use and distribution of large quantities of data, instead of focusing exclusively on details in separate features, as is usual in biblical studies. It may be expected that in the coming years such an approach will lead to many new results, insights and research ideas in the study of Biblical Hebrew. Canticum Leviticus Ezechiel Deuteronomium Iob Psalmi Threni Exodus Jesaia Proverbia Genesis Hosea Nahum Numeri Judices Josua Daniel Chronica_II Esra Jeremia Micha Reges_I Samuel_I Habakuk Esther Reges_II Samuel_II Amos Sacharia Nehemia Chronica_I Joel Maleachi Obadia Ecclesiastes Zephania Haggai Ruth Jona Figure 2: Gephi force atlas of distribution of common nouns and verbs in the books of the Hebrew Bible 4. Grammar of Hebrew Poetry: extracting clause typology For centuries Hebraists have been studying the verbal forms used in Biblical Hebrew. Though many have tried to provide a coherent description of verbal functions in Hebrew, consensus has never been reached. This is especially true for the functioning of the verbal forms in the poetic parts of the 20. This is a so called force-directed algorithm, which assigns attractive and repulsive forces to the edges. 
The algorithm performs a simulation of a physical system, resulting in intuitively understandable graphs. 21. This approach is derived from the author\u2019s MA thesis (Naaijer 2012). Hebrew Bible, which has repeatedly been characterized as completely irregular. Illustrative in this regard is the comment made by the grammarian Bergstr\u00a8asser (1926, 1986, pp. 29-35), who speaks of a v\u00a8 olligen Verwischung der Bedeutungsunterschiede der Tempora (\u2019a complete blurring of functional distinctions between the tenses\u2019) in Hebrew poetry and identifies the poetic use of verbal forms as regellos (\u2019random\u2019) and ohne ersichtlichen Grund (\u2019without apparent motivation\u2019). In a PhD-research project by the second author started in 2011, this rather desperate view on the verbal system of Hebrew poetry is considered unacceptable and the search for a linguistic system regulating the poetic use of verbal forms has been taken up again. A central assumption in the project is that the meaning of verbal forms in Hebrew is not to be described in terms of the traditional categories of tense, aspect and mood, but has more to do with the structuring of discourse. Therefore, one should not focus on the bare verbal forms, but rather on the clauses in which they are embedded and the position of these clauses in the whole of the text. In this type of approach, which is usually defined as text-linguistic, special attention is paid to the patterns constituted by subsequent verbal forms (and their clauses). Our research has shown that the connection of mother and daughter clause, in particular, has a strong influence on the exact functions adopted by the verbal forms in Hebrew. Though the forms can be assigned certain default functionalities, the exact concretization of these basic functions can only be determined on the basis of a detailed analysis of the broader clause patterns in which a specific clause takes its position. In this research project we reject the tendency among Hebraists to assume a gap between the use of verbal forms in prose and poetry. Instead of identifying different verbal systems for the two genres or even characterizing the use of verbs in Hebrew poetry as being devoid of any system, we claim that the two genres make use of a single verbal system, but differ in their preferences for certain parts of that system. More specifically, we assume that all types of clauses and clause patterns known in Biblical Hebrew can be used (with the same functionalities) in both prose and poetry, but that different clause types and patterns are dominant in the two genres. LAF-Fabric offers an excellent opportunity to test these assumptions as it provides direct access to the ETCBC database from which the clause patterns attested in the Hebrew texts can be easily extracted, as it contains syntactic hierarchies for each chapter of the Hebrew Bible. As an initial experiment we have created an IPython Notebook in which we have sorted and counted all asyndetic sequences of a mother clause and a daughter clause attested in selected prosaic, poetic and prophetic sections of the Hebrew Bible. LAF-Fabric enables us to iterate over all clauses in the preselected texts. 
For each clause, we have retrieved the value of its clause atom relation22 feature, which is coded as a three-digit value identifying the type of relation between that clause and its mother (i.e.: asyndetic, parallel, syndetic, coordinate, subordinate, etc.), the tense of the verbal predicate of the clause (i.e.: imperfect, perfect, imperative, etc.), and the tense of the verbal predicate of the mother. Further details on the use of specific functionalities provided by LAF-Fabric and the Python code written for this task can be found in the notebook entitled AsyndeticClauseFunctions by Kalkman (2013). In table 1, we present a top-10 of most frequently attested asyndetic clause patterns in the current ETCBC data analysed by our LAF-Fabric task. As the table shows, these ten patterns account for over 70% of all 11,111 patterns that have been found in the prosaic, poetic and prophetic texts. Several interesting observations can be made. First of all, this top-10 does not contain patterns that are strongly attested in one genre, while being virtually absent in another. Instead, all of the types of sequences do have quite a number of occurrences in each of the three genres. (though the rather low number of patterns analysed for poetry forces us to adopt a cautious attitude at this point). On the other hand, the differences should not be overlooked. Another visualization of the results may help us in this regard. In fig 3, the same statistical data are presented in a bar graph. As the graph shows, the pattern perfect perfect, imperative nominal clause, and, to a lower extent, 22. See the comprehensive documentation of the ETCBC features at http://shebanq-doc.readthedocs.org/en/ latest/features/index/code.html. CARnumber Prose nominal nominal imperfect imperfect perfect perfect nominal imperfect perfect nominal imperfect nominal perfect imperfect nominal perfect imperative nominal nominal imperative Totals 429 371 120 232 161 116 145 145 54 128 1901 % 14.64 12.66 4.09 7.92 5.49 3.96 4.95 4.95 1.84 4.37 64.86 Poetry 493 544 392 250 213 328 187 204 270 123 3004 % 12.07 13.32 9.6 6.12 5.21 8.03 4.58 4.99 6.61 3.01 73.54 Prophecy % Totals 328 332 555 244 340 249 273 242 212 74 2849 1250 1247 1067 726 714 693 605 591 536 325 7754 8.01 8.11 13.55 5.96 8.3 6.08 6.67 5.91 5.18 1.81 69.57 % 11.25 11.22 9.60 6.53 6.43 6.24 5.45 5.32 4.82 2.93 69.79 Table 1: Frequency table of attestations of asyndetic clause patterns in Biblical Hebrew prose, poetry and prophecy. The percentage column is the fraction for the Clause-Atom-Relation-number within the genre. 16 14 12 10 8 6 4 2 0 prose poetry prophecy Figure 3: Bar graph of frequencies of asyndetic clause patterns in Biblical Hebrew prose, poetry and prophecy. the pattern imperfect nominal clause, are far better attested in poetry and prophecy than in prose. Conversely, sequences of two nominal clauses play an important role in each of the genres, but are most strongly attested in prosaic texts. These observations can be explained by referring to the important claim made by several text linguists studying the Hebrew verbal system that the marking of the mode of communication, which can be either narrative or discursive (i.e. direct speech) is an important function of verbal forms in Hebrew (Weinrich 1964, pp. 45-48, 51-52, 55), (Schneider 1974, pp. 182-183), (Niccacci 1990, pp. 29-34). Narrative discourse is hardly attested in poetry and prophecy, while it is a dominant mode of communication in prose. 
It is therefore not surprising that the patterns perfect perfect, imperfect nominal clause and imperative nominal clause, which are characteristic of direct speech (instead of narrative) discourse, have higher relative frequencies in poetry and prophecy than in prose. All in all, we can draw the preliminary conclusion that, while certain patterns are more strongly attested in one genre than in the others, the differences between the genres in the relative numbers of occurrences of the asyndetic patterns are not extreme. This suggests that indeed one linguistic system underlies the functioning of verbal forms and clause types in the different genres attested in the Hebrew Bible. Moreover, the main criterion on the basis of which the forms and constructions belonging to the Hebrew verbal system can be further categorized is not so much that of genre (prose vs. poetry vs. prophecy), but rather that of mode of communication (narrative vs. discursive). To summarize, the use of verbal forms in Hebrew poetry may not at all be as chaotic as grammars, commentaries and Bible translations seem tot suggest. This type of experiments conducted with the help of LAF-Fabric has constituted the basis for more profound research into the Biblical Hebrew verbal system (Kalkman to appear 2014, Kalkman to be published 2015). As part of our research project, we have developed a Java program in which use is made of the current (May 2014) version of the data included in the ETCBC database. By concentrating on the syntactic patterns that are identified in the actual data, the program calculates the default and inherited functions that are to be assigned to each verbal form in the book of Psalms. Based on these calculations, the program also offers a translation of the verbal forms (and other basic constituents, such as subjects and objects) attested in each of the 150 Psalms. The translations and the results of the calculations made by our program are presented on a website (Kalkman to be published 2015). This website also provides a description of our methodology and theories. All in all, LAF-Fabric proves itself to be an indispensable tool for obtaining new insights in the grammar of the Biblical Hebrew verb, as it enables us to systematically extract and collect for further analysis the linguistic patterns that have a decisive impact on the functioning of verbal forms in Biblical Hebrew. 5. Data Oriented Parsing of classical Hebrew: generating trees There are many unresolved questions concerning the history of composition and transmission of the Hebrew Bible. One line of research in this area is to compare and cluster the texts with a view to classifying them along the dimensions of historical time, geography, and religious context. Often these classifications are based on intuition and implicit characteristics. The comparison of syntactic trees could provide this method with a more objective underpinning and deliver stronger results. For a start, the LAF data has been exported to syntactic trees, in effect turning the Hebrew Bible into a treebank for natural language processing (NLP).23 This has led to two applications: 1. extraction of recurring syntactic patterns (tree fragments) 2. construction and evaluation of a Data-Oriented Parsing grammar from said patterns. Since classical Hebrew has a relatively free word order, we make use of discontinuous constituents in the syntactic trees, inspired by the Negra corpus annotation (Skut et al. 1997). 
See figure 4 for an example of a sentence with such a discontinuous constituent. Table 2 describes the syntactic categories and Part-of-Speech tags that appear in the trees. It is possible to extract recurring patterns from a collection of tree structures using an algorithm first described in Sangati et al. (2010); we use a faster method that is also able to handle discontinuous constituents (van Cranenburgh 2014). The algorithm compares each pair of tree structures and extracts the largest fragments they have in common, along with their occurrence counts. Tree fragments may consist of phrases with or without words, and the words do not have to form a contiguous phrase. Since the fragments are found in pairs of trees, this count will always be at least two. In this way idioms and linguistic constructions can be detected in syntactic corpora. Figure 5 shows a sample of fragments extracted in this way. 23. For details on the conversion of the ETCBC data to trees, refer to the following IPython notebook: http: //nbviewer.ipython.org/github/ETCBC/laf-fabric-nbs/blob/master/trees_bhs.ipynb Syntactic categories S C CP VP SU NP PrNP PP Attr Part-of-Speech tags sentence clause conjunctive phrase verbal phrase subphrase nominal phrase proper noun phrase prepositional phrase attributive clause cj vb aj n pp dt conjunction verb adjective noun personal pronoun determiner Table 2: The syntactic categories and Part-of-Speech tags used in the Hebrew Bible trees. S C Attr CP VP cj vb PP n dt CP n \u202b\u05d5\u202c \u202b\u05d9\u05d1\u05d3\u05dc\u202c \u202b\u05d1\u05d9\u05df\u202c \u202b\u05d4\u202c \u202b\u05de\u05d9\u05dd\u202c and-he-divided between the-waters Attr PP cj pp n pp \u202b\u05d0\ufb2a\u05e8\u202c REL \u202b\u05de\u202c \u202b\u05ea\u05d7\u05ea\u202c under \u202b\u05dc\u202c CP dt n \u202b\u05e8\u05e7\u05d9\u05e2\u202c the-expanse cj PP n dt CP n \u202b\u05d5\u202c \u202b\u05d1\u05d9\u05df\u202c \u202b\u05d4\u202c \u202b\u05de\u05d9\u05dd\u202c and-between the-waters PP cj pp pp pp \u202b\u05d0\ufb2a\u05e8\u202c REL \u202b\u05de\u202c \u202b\u05e2\u05dc\u202c above \u202b\u05dc\u202c dt Figure 4: A sentence with a discontinuous constituent. Genesis chapter 1 verse 7. Aside from corpus analysis and stylometry, the extracted fragments can also be used as a grammar that assigns a syntactic analysis to a given sentence (parsing). Since classical Hebrew is a dead language, this may at first sight appear to be a pointless exercise. However, training a probabilistic grammar and evaluating it gives an impression of how well statistical patterns in a corpus can be exploited to extrapolate the syntactic structure of new sentences. The use of tree fragments as grammar was first proposed in the Data-Oriented Parsing (DOP) framework (Scha 1990, Bod 1992). We evaluate the performance of parsing classical Hebrew by taking the first 50,000 sentences as training data, and evaluate the resulting grammar on a heldout set containing the next 2,000 sentences. In this experiment, the parser is supplied with both words and part-of-speech tags. We use the implementation presented in van Cranenburgh and Bod (2013), which supports trees with discontinuous constituents. For the results, see Table 3. In the evaluation, the f-measure is the harmonic mean of the precision and recall of correctly identified constituents (Collins 1997); the exact match is the percentage of trees where all constituents are correct. The results are encouraging, although it should be noted that the sentences in this held-out set are short. 
number of sentences: longest sentence: labelled f-measure: exact match: 2,000 19 90.0 75.3 Table 3: DOP parsing results with the Hebrew Bible. n \u202b\u05e8\u05e7\u05d9\u05e2\u202c the-expanse S S C C S NP CP VP SU PP SU cj vb aj \u202b\u05d9\u05d0\u05de\u05e8\u05d5 \u05d5\u202c \u202b\u05d6\u05e7\u05e0\u05d9\u202c and said the elders Attr \u2026 CP VP PrNP cj vb \u202b\u05d5\u202c \u2026 and n \u2026 SU SU NP pp \u2026 \u2026 n n \u2026 \u202b\u05d9\ufb2b\u05e8\u05d0\u05dc\u202c Israel SU CP VP C cj vb \u2026 \u202b\u05d0\ufb2a\u05e8\u202c \u202b\u05e2\ufb2b\u05ea\u05d4\u202c REL you-have-made C PP CP Attr PP SU cj pp n dt \u202b\u05e2\u05dc \u05d0\ufb2a\u05e8\u202c \u202b\u05d4 \u05db\u05e3\u202c REL over hand the CP VP n \u2026 cj vb PrNP pp \u202b\u05d5\u202c \u2026 \u2026 \u202b\u05d1\u202c and on SU dt \u2026 SU n dt pr PP \u202b\u05d4 \u05d9\u05d5\u05dd\u202c \u202b\u2026 \u05d4\u05d5\u05d0\u202c day this Figure 5: Some example fragments extracted from the Old Testament annotations. Acknowledgements The authors are indebted to Wido van Peursen and Rens Bod for setting the scene for the meeting between theology and computational linguistics, and to Constantijn Sikkel for enlightening conversations about the ETCBC data. References Bergstr\u00a8 asser, Gotthelf (1926, 1986), Hebr\u00a8 aische Grammatik I/II (\u2019Hebrew Grammar\u2019), Georg Olms Verlag, Hildesheim, Germany. https://archive.org/details/hebrischegramm00gese. Bod, Rens (1992), A computational model of language performance: Data-Oriented Parsing, Proceedings COLING, pp. 855\u2013859. http://aclweb.org/anthology/C92-3126. Bouda, Peter et al. (2013-2014), graf-python. Python software on Github. https://github.com/ cidles/graf-python. Bouda, Peter, Vera Ferreira, and Ant\u00b4 onio Lopes (2012), POIO API - An annotation framework to bridge language documentation and natural language processing., Proceedings of The Second Workshop on Annotation of Corpora for Research in the Humanities, Lisbon, 2012, Lisbon, Portugal. ISBN: 978-989-689-273-9, http://alfclul.clul.ul.pt/crpc/acrh2/ACRH-2_papers/ Bouda-Ferreira-Lopes.pdf. Collins, Michael (1997), Three generative, lexicalised models for statistical parsing, Proceedings of ACL, pp. 16\u201323. http://aclweb.org/anthology/P97-1003. Doedens, Crist-Jan (1994), Text Databases. One Database Model and Several Retrieval Languages, number 14 in Language and Computers, Editions Rodopi, Amsterdam, Netherlands and Atlanta, USA. ISBN: 90-5183-729-1, http://books.google.nl/books?id=9ggOBRz1dO4C. Gesenius, Wilhelm (1813), Geschichte der Hebr\u00a8 aischen Sprache und Schrift: Eine philologischhistorische Einleitung in die Sprachlehren und W\u00a8 orterb\u00a8 ucher der Hebr\u00a8 aischen Sprache (\u2019History of the Hebrew language and script: a philologico-historical introduction to the grammar and dictionaries of the Hebrew language\u2019), Vogel, Leipzig. https://archive.org/details/ geschichtederheb00geseuoft. Hurvitz, Avi (1974), The date of the prose-tale of Job linguistically reconsidered, Harvard Theological Review. http://www.ericlevy.com/Revel/Avi%20Hurvitz%20-%20The%20Date%20of% 20the%20Prose%20Tale%20of%20Job.PDF. Ide, Nancy and Laurent Romary (2012), Linguistic Annotation Framework. ISO standard 24612:2012. Edition 1, 2012-06-15. http://www.iso.org/iso/home/store/catalogue_tc/ catalogue_detail.htm?csnumber=37326. Kalkman, Gino J. (2013), Functions of asyndetic clause relations in Biblical Hebrew. IPython Notebook. 
http://nbviewer.ipython.org/github/ETCBC/Biblical_Hebrew_Analysis/blob/ master/Miscellaneous/AsyndeticClauseFunctions.ipynb. Kalkman, Gino J. (to appear 2014), In search of a verbal system in Biblical Hebrew poetry; a computer-assisted analysis of syntactic patterns, Digital Humanities Quarterly, Alliance of Digital Humanities Organizations. Kalkman, Gino J. (to be published 2015), Verbal Forms in Biblical Hebrew Poetry: Poetical Freedom or Linguistic System?, PhD thesis, VU University, Amsterdam. http://nbviewer.ipython. org/github/ETCBC/Biblical_Hebrew_Analysis/blob/master/PhD/Introduction.ipynb. Kittel, Rudolf, editor (1968-1997), Biblia Hebraica Stuttgartensia, Deutsche Bibelgesellschaft, Stuttgart, Germany. http://www.bibelwissenschaft.de/startseite/ wissenschaftliche-bibelausgaben/biblia-hebraica/bhs/. Naaijer, Martijn (2012), The common nouns in the book of Esther. A new quantitative approach to the linguistic relationships of Biblical books, Master\u2019s thesis, Radboud University, Nijmegen, Netherlands. Niccacci, Alviero (1990), The Syntax of the Verb in Classical Hebrew Prose, Vol. 86 of Journal for the Study of the Old Testament, Supplement Series, Sheffield Academic Press, Sheffield. ISBN 1-85075-226-5, http://books.google.nl/books?id=LdbsaZ7di5YC. P\u00b4erez, Fernando and Brian E. Granger (2007), IPython: a system for interactive scientific computing, Computing in Science and Engineering 9 (3), pp. 21\u201329, IEEE Computer Society. http:// ipython.org, ISSN: 1521-9615, DOI: 10.1109/MCSE.2007.53. Petersen, Ulrik (2002-2014), EMDROS. Text database engine for analyzed or annotated text. Open Source software. http://emdros.org. Petersen, Ulrik (2004), EMDROS - a text database engine for analyzed or annotated text, Proceedings of COLING 2004, p. 11901193. http://emdros.org/petersen-emdros-COLING-2004.pdf. Petersen, Ulrik (2006), Principles, Implementation Strategies, and Evaluation of a Corpus Query System, Vol. 4002, Springer, p. 215226. http://link.springer.com/chapter/10.1007% 2F11780885_21. Rendsburg, Gary (1990a), Diglossia in Ancient Hebrew, Vol. 72 of American Oriental Society, Eisenbrauns, New Haven. ISBN-13: 978-0940490727, http://books.google.nl/books?id= hRliAAAAMAAJ. Rendsburg, Gary (1990b), Linguistic Evidence for the Northern Origin of Selected Psalms, Vol. 43 of Society of Biblical Literature Monograph Series, Scholars Press, Atlanta. http://books. google.nl/books?id=xbiDQgAACAAJ. Roorda, Dirk (2013-2014), LAF-Fabric. Workbench for analysing LAF resources. Python software on Github. https://github.com/ETCBC/laf-fabric. Roorda, Dirk and van den Charles M.J.M. Heuvel (2012), Annotation as a new paradigm in research archiving, Proceedings of ASIS&T 2012 Annual Meeting. Final Papers, Panels and Posters. https://www.asis.org/asist2012/proceedings/Submissions/84.pdf (author\u2019s version: http://annotation-paradigm.readthedocs.org/en/latest/_downloads/ ASIST2012-Annot-DR-ChvdH-final-submission.pdf). Sangati, Federico, Willem Zuidema, and Rens Bod (2010), Efficiently extract recurring tree fragments from large treebanks, Proceedings of LREC, pp. 219\u2013226. http://dare.uva.nl/record/371504. Scha, Remko (1990), Language theory and language technology; competence and performance, in de Kort, Q.A.M. and G.L.J. Leerdam, editors, Computertoepassingen in de Neerlandistiek, LVVN, Almere, the Netherlands, pp. 7\u201322. Original title: Taaltheorie en taaltechnologie; competence en performance. Translation available at http://iaaa.nl/rs/LeerdamE.html. 
The Hebrew Bible as Data: Laboratory Sharing Experiences
Dirk Roorda, dirk.roorda@dans.knaw.nl
Data Archiving and Networked Services - Royal Netherlands Academy of Arts and Sciences, Anna van Saksenlaan 10, 2593 HT Den Haag, Netherlands; The Language Archive - Max Planck Institute for Psycholinguistics, Wundtlaan 1, 6525 XD Nijmegen, Netherlands
arXiv:1501.01866v1 [cs.CL] 8 Jan 2015

Abstract. The systematic study of ancient texts, including their production, transmission and interpretation, is greatly aided by the digital methods that started taking off in the 1970s.
But how is that research in turn transmitted to new generations of researchers? We tell a story of Bible and computer across the decades and then point out the current challenges: (1) finding a stable data representation for changing methods of computation; (2) sharing results in inter- and intra-disciplinary ways, for reproducibility and cross-fertilization. We report recent developments in meeting these challenges. The scene is the text database of the Hebrew Bible, constructed by the Eep Talstra Centre for Bible and Computer (ETCBC), which is still growing in detail and sophistication. We show how a subtle mix of computational ingredients enables scholars to research the transmission and interpretation of the Hebrew Bible in new ways: (1) a standard data format, Linguistic Annotation Framework (LAF); (2) the methods of scientific computing, made accessible by (interactive) Python and its associated ecosystem. Additionally, we show how these efforts have culminated in the construction of a new, publicly accessible search engine, SHEBANQ, where the text of the Hebrew Bible and its underlying data can be queried in a simple yet powerful query language, MQL, and where those queries can be saved and shared.

1. Introduction

The Hebrew Bible is a collection of ancient texts resulting from a tradition spanning ten centuries. It is one of the most studied texts in human culture. Information processing by machines is less than two centuries old, but since its inception its capabilities have grown exponentially (Gleick 2011). We are interested in what happens when the Hebrew Bible as an object of study is brought under the scope of the current methods of information processing. The Eep Talstra Centre for Bible and Computer (ETCBC), formerly known as Werkgroep Informatica Vrije Universiteit (WIVU), has been involved in just this since the 1970s, and its members are dedicated to this approach. The combination of a relatively stable set of data and a rapidly evolving set of methods calls for reflection. Add to that a growing set of ambitious research questions, and it becomes clear that not only reflection is needed but also action. Methods from computational linguistics and the wider digital humanities are to be used, hence people from different disciplines have to be involved. How can the ETCBC share its data and way of working productively with people who are used to a wide variety of computational approaches? In this article we tell a story of reflection and action, and the characters are databases, data formats, query languages, annotations, computer languages, archives, repositories and social media. This story has a beginning in February 2012, when a group of biblical scholars convened at the Lorentz Center in Leiden for the workshop Biblical Scholarship and Humanities Computing: Data Types, Text, Language and Interpretation (Roorda et al. 2012). They searched for new ways to obtain computational tools that matched their research interests. The author was part of that meeting and had prepared a demo application: a query saver. It was an attempt to improve the sharing of knowledge. It is a craft to write successful queries for the ETCBC Hebrew text database, and by publishing their queries, researchers might teach each other how to do it.
In the years that followed, this idea materialized in the SHEBANQ project (System for HEBrew text: ANnotations for Queries and markup), a curation and demonstrator project funded by CLARIN-NL, the Dutch branch of the Common LAnguage Resource INfrastructure in Europe (http://www.clarin.eu). We have chosen a modern standard format for the data, Linguistic Annotation Framework (LAF), and have built a web application for saving queries. During this project we have also built LAF-Fabric, a tool to analyze and manipulate LAF resources. Now, in 2014, we can say that we have a modern data laboratory for historico-linguistic data, plus ways to share results, not only among a small circle of theological experts, but also among computational linguists on the one hand and students and interested lay people on the other. Of course, every beginning of such a story is arbitrary. There is always so much more that happened before. In order to provide the reader with enough context, we shall also relate key moments of that greater story. Moreover, we cannot tell the whole story: our perspective is biased towards the computational side. We shall not delve into the intricacies of manuscript research, but focus on the data models and computational methods that help analyze a rather fixed body of transcribed text. Yet, we believe that this simplified context is rich enough material for a good story. Whereas this paper deliberately scratches only the surface of the computational methods, a joint paper with researchers (Roorda et al. to appear 2015) contains a more technical account.

2. Ground work: WIVU and ETCBC

Since the 1970s, Eep Talstra, Constantijn Sikkel and a group of researchers at the VU University Amsterdam have been compiling a text database of the Hebrew Bible. This database started as a set of files, containing the transliterated Hebrew text of the Bible according to the Biblia Hebraica Stuttgartensia edition (Kittel 1968-1997). To this text, they added files with their observations of linguistic patterns in it as coded annotations, anchored to the individual words, phrases, clauses, and sentences. They tested tentative patterns against the data, refined them, and added manual exceptions. This led to a complex web of files, containing the base text and a set of semi-automatically generated annotations. They refrained from shaping these annotations in a hierarchical, linguistic model, because they wanted to represent observations, not theory (Talstra and Sikkel 2000). The result of this work is a database in the sense of being observational data on which theories can be based. It is not a database in the sense of a modern relational database system. The advantages of a proper (in the sense of computer science) database are obvious indeed, but the relational model does not represent textual data in a natural way, and does not facilitate queries that are linguistically meaningful. In the 1990s there were promising efforts to define the notion of a text database. In his Ph.D. thesis, Crist-Jan Doedens (Doedens 1994) defined a data model for texts and the notion of a topographic query language (QL) to retrieve linguistic results. He identified the relations of sequence and embedding as the key structures to store and retrieve texts. A query is topographic if its internal structure exhibits the same sequence and embedding relations as the results it is meant to retrieve. Interestingly, he did not postulate that a text is one hierarchy.
In his data model, textual data may be organized by means of multiple, overlapping hierarchies. The definition of a data model and a query language is not yet a working database system. In the 2000s, Ulrik Petersen undertook to create an implementation of Doedens's ideas. This led to the Emdros database system with the MQL (Mini-QL) query language (Petersen 2004, 2006, 2002-2014). Emdros consists of a front-end, which is an MQL interpreter, and a back-end, which is an existing production-class relational database system such as Postgres or MySQL. Despite the fact that MQL is a concession to practicality, it is still a topographic query language and very convenient for expressing real-life textual queries without invoking programming skills. Since then, an Emdros export of the current Hebrew text database has been maintained by the ETCBC team. Emdros is open source software and the data model is very clear, so this export is a communication device: the intricacies of the internal annotation creation of the ETCBC workflow are largely left behind, and users of the export have a well-defined dataset at their disposal.

3. Idea: Queries As Annotations

During the aforementioned Lorentz workshop (Roorda et al. 2012), an international group of experts reflected on how to bring biblical data resources to better fruition in the digital age. The ETCBC database had been incorporated in Bible study software, but developments there were not being driven by agendas set by academic research. Yet those Bible study applications offered attractive interfaces to browse the text, look up words and more. The problem was: how can theologians, with limited ICT resources, regain control over the development of software that works with their data? The workshop offered no concrete solutions, but some ingredients of potential long-term solutions did get mentioned: open up the data and develop open source tools. Theologians can only hope to keep up with ICT developments if they allow people to build on each other's accomplishments. A very concrete articulation of this statement was made by Eep Talstra himself, when he deposited the ETCBC database into EASY, the research archive of DANS (Talstra et al. 2012). It must be admitted that barriers remained: the data was not Open Access, and the format in which it was deposited was MQL, which is not a very well-known format, so the experimenting theological programmer still has a hard time doing meaningful work with this data. But it was definitely a step towards increased sharing of resources.

Figure 1: Queries/Features as Annotations

In that same workshop, the author showed a demo application (Roorda 2012) (see Figure 1) by which the user could browse the Hebrew text and highlight a number of linguistic features. The idea to highlight features, which are essentially annotations to the text, triggered another idea: to view queries as annotations to the passages that contain their results (Roorda and van den Heuvel 2012). If researchers can save their carefully crafted queries as annotations, and if those annotations are centrally stored, then other researchers have access to them and may encounter them when they are reading a passage. Just as readers encounter ordinary annotations by other scholars in printed books, they will encounter results of queries of others when they are browsing a chapter of the Hebrew Bible in their web browser.
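To give a flavor of what such a saved query could look like, here is a minimal sketch of a topographic MQL query wrapped in Python; the feature names (phrase_type, part_of_speech) and the save_query helper are illustrative assumptions, not the actual ETCBC feature inventory or the SHEBANQ storage API.

    # A sketch of a topographic MQL query as it might be saved and shared.
    # The nesting of blocks mirrors the embedding of the intended results:
    # a clause containing a phrase that contains a verb. The feature names
    # below are illustrative assumptions, not the real ETCBC features.
    mql_query = """
    SELECT ALL OBJECTS WHERE
    [clause
      [phrase phrase_type = 'VP'
        [word part_of_speech = 'verb']
      ]
    ]
    """

    # Hypothetical helper: store a query with its provenance, so that other
    # researchers encounter it as an annotation on passages with results.
    def save_query(store, name, query, description):
        store[name] = {'query': query, 'description': description}

    queries = {}
    save_query(queries, 'verbal-clauses', mql_query,
               'All clauses containing a phrase with a verb.')

Even in this toy query the topographic idea is visible: the sequence and embedding of the query blocks match the sequence and embedding of the results it retrieves.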
With a single click they are led not only to the query instruction itself but also to a description of the provenance and motivation of the query. This could be the basis of interesting scenarios for cross-fertilization. It is interesting to note the stack of computational tools needed to write this demo. Its construction involved a data preparation tool for transforming the contents of the ETCBC database into a relational database for driving a website. The web app itself was based on web2py, a lightweight Python-based web-application framework (Di Pierro 2015). Table 1 lists the languages used to implement both the data preparation tool and the website, together with the amount of code needed in each formalism. There are several things to note:
1. The numbers of lines of code are very small.
2. The formalisms, while considerable in number, are utterly commonplace.
3. The number of formalisms may be reduced by one by dropping Perl in favor of Python.
It can be concluded that mastering commonplace ICT techniques may generate a good return on investment, in the form of a web application that exposes data on the web through rich interfaces.

Table 1: Amount of lines of code per formalism per application (web app and data prep tool).
  formalisms: sql, python, perl, javascript, html, css, shell script
  line counts as extracted: 90, 250, 80, 650, 300, 50, 60, 280
  (The extraction does not preserve which count belongs to which formalism and application.)

4. Realization: LAF-Fabric and SHEBANQ

In 2013-2014, ETCBC, together with DANS, carried out the CLARIN-NL project SHEBANQ. We seized the opportunity to implement the idea of queries-as-annotations, but to make it possible at all, more work had to be done.

4.1 LAF and LAF-Fabric

First of all, a new representation of the data had to be selected, one that conformed to a standard used in linguistics. Linguistic Annotation Framework, an ISO standard (Ide and Romary 2012), was chosen. LAF defines a data model in which an immutable stream of primary data is annotated by feature structures. The data stream is addressed by means of a graph of nodes and edges, where the nodes may be linked to regions of the primary data, and where edges serve to connect smaller parts to bigger wholes. Both nodes and edges can act as targets of annotations, which contain the feature structures. Finally, all entities, except the primary data, are serialized in XML. In concrete terms, we have extracted the complete text of the Hebrew Bible as a plain Unicode text file. As far as LAF is concerned, this is our primary data. For the books, chapters and verses we have created nodes that are linked to the stretches of text that they correspond to. For every individual word there is a node, linked to a region defined by the character positions of the first and last character of that word. For the phrases, clauses and sentences there are nodes, linked to the regions corresponding to the words they contain. Relationships between constituents correspond to edges. The properties of sectional units, words, and constituents are key-value pairs targeted at the corresponding nodes. The LAF data model shares a lot of structure with the Emdros data model of text, objects and features. We only had to map objects to nodes and features to key-value pairs inside annotations targeting the proper nodes, so this conversion has been a straightforward process with only a few devilish details. The result is a good example of stand-off markup. The primary data is left untouched, and around it is a graph of annotations. It is perfectly possible to add new annotations without interfering with the primary data or the other annotations.
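The stand-off construction just described can be made concrete with a minimal sketch: an immutable primary text, nodes that address it by character regions, edges that connect parts to wholes, and annotations that carry the features. All names and values here are invented for illustration; the real LAF serialization is XML, not Python.

    # Minimal stand-off markup sketch (illustrative names, not the real
    # LAF XML serialization): the primary text stays immutable; nodes
    # address it by character regions; features live in annotations.
    primary_data = "בראשית ברא אלהים"  # immutable primary text

    # Each node links to a region (first char, last char) of the primary data.
    nodes = {
        1: {"otype": "word", "region": (0, 5)},     # first word
        2: {"otype": "word", "region": (7, 9)},     # second word
        3: {"otype": "word", "region": (11, 15)},   # third word
        4: {"otype": "phrase", "region": (7, 15)},  # spans words 2-3
    }

    # Edges connect smaller parts to bigger wholes (word -> phrase).
    edges = [(2, 4), (3, 4)]

    # Annotations target nodes and carry feature structures (key-value pairs).
    annotations = {2: {"part_of_speech": "verb"}}

    def text_of(node_id):
        """Recover the text of a node from the untouched primary data."""
        first, last = nodes[node_id]["region"]
        return primary_data[first:last + 1]

    print(text_of(4))  # the phrase's text, sliced from the primary data

Adding a new annotation is then just another entry in the annotations mapping; neither the primary data nor the existing annotations need to change, which is exactly the cooperation-friendly property argued for below.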
The annotations are like a fabric, into which new threads can be woven, and which can be stitched to other fabrics. In this way, the stand-off way of adding information to sources facilitates cooperation and sharing much better than adding markup inline, as TEI prescribes. This bold assertion must be qualified by two considerations, however:
1. Stand-off markup works best in those cases where the primary sources are immutable. As easy as it is to add new annotations, so difficult it is to insert new primary data.
2. Stand-off markup flourishes in cases where the main access mode to the sources is programmatic. Manual inspection of stand-off data and their annotations quickly becomes overwhelming.
In our case, condition 1 has been satisfied for years in a row. How we will deal with major updates remains to be seen. Table 2 indicates some quantities of the ETCBC data, both in their Emdros form and in their LAF form. These numbers suggest that manual inspection of individual files is so cumbersome that it pays off to invest in programmatic access to the data.

Table 2: Quantities in the ETCBC data.
  quantity                              Emdros              LAF
  words                                 426,555             426,555
  linguistic objects resp. nodes        945,726             945,726
  total number of features              22,622,100          25,504,388
  serialized size (MQL resp. XML)       455 MB in 1 file    1640 MB in 14 files
  compiled size (SQLite3 resp. binary)  126 MB              260 MB

The LAF version of the Hebrew text database has been archived at Data Archiving and Networked Services (DANS), the research archive for the humanities and social sciences in the Netherlands (Peursen and Roorda 2014). As LAF is a relatively new standard, there are few LAF-compatible tools. A LAF resource is represented in XML, but the nature and size of this XML make it difficult to handle with ordinary XML tools. Looking through the surface syntax, a LAF resource is neither a relational database nor a document, but a graph. XML processing works well when the underlying data structure is a single hierarchy, no matter how deep, or a table of records, no matter how large, but it grinds to a halt when the data is a large and intricate web of nodes and edges, i.e., a graph. In order to facilitate productive work with the freshly created LAF representation of the Hebrew Bible, we have developed LAF-Fabric (Roorda 2013-2014b), a LAF compiler and loader. In a typical workflow, a researcher wants to inspect the LAF data, focus on some aspects, sort, collate, link and transform selected data, and finally export results. Without LAF-Fabric, the obvious way to do so is to read the XML data, apply XPATH, XSLT or XQUERY scripts, and collect the results. Reading the XML data means parsing it and building an internal representation in memory, and this alone takes an annoying 15 minutes on an average laptop and uses a prohibitive amount of memory. This is not conducive to interactive, explorative, agile use of the data, and LAF-Fabric remedies this. When first invoked on a LAF resource, it compiles it into efficient data structures and writes those to disk, in such a way that this data can be loaded fast. This one-time compilation process takes roughly 15 minutes, but afterwards the data loads in a matter of seconds every time you want to work with it. Furthermore, LAF-Fabric offers a programmer's interface (API) to the LAF data, by which the programmer can walk over the nodes and edges and collect feature information on the fly. These walks are fast, and can be programmed easily.
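A typical walk might look like the following sketch, loosely modeled on the published LAF-Fabric notebooks; the module path, the load arguments and the feature names are assumptions from memory and may differ from the actual API.

    # A loose sketch of working with LAF-Fabric (module path, load
    # arguments and feature names are assumptions, not verified API).
    from laf.fabric import LafFabric

    fabric = LafFabric()
    # The first call compiles the LAF XML into binary structures on disk
    # (roughly 15 minutes); subsequent calls load in seconds.
    API = fabric.load('etcbc4', '--', 'example', {
        "features": ("otype part_of_speech", ""),  # node features to load
        "primary": False,
    })
    F = API['F']  # feature access object

    # Walk over all word nodes and count parts of speech on the fly.
    counts = {}
    for node in F.otype.s('word'):
        pos = F.part_of_speech.v(node)
        counts[pos] = counts.get(pos, 0) + 1
    print(counts)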
The idea to create LAF-Fabric arose after we tried to use a library called graf-python (Bouda 2013-2014), part of POIO (Bouda et al. 2012), for the biblical LAF data. Unfortunately, the way graf-python was programmed made it unsuitable for dealing with our LAF resource because of its size. Python is a scripting language with a clean syntax and good performance if used judiciously, hence we undertook to write LAF-Fabric in Python as well. We use those parts of Python that perform best for the heavy data lifting, and those parts that are most user-friendly for the programmer's interface. LAF-Fabric is a package that can be imported in any Python script, and it behaves particularly well when invoked in an IPython notebook. The IPython Notebook is an interactive way of writing Python scripts and documentation (Pérez and Granger 2007). A notebook is a document in which the data analyst writes cells with Python code and other cells with documentation. Code cells can be run individually, in any order, while the results of the execution remain in memory. The notebook has powerful capabilities for formatting results. A notebook can be published easily on the web, so that others can download and execute it as well, provided they have the same data and packages installed. The IPython notebook belongs to a branch of computer programming called scientific computing. It is about explorative data analysis by means of computing power. The scientific programmer produces analyses, charts, and documents that account for his data and results. By contrast, the typical software engineer produces applications that perform well-defined tasks for end users. The scientific programmer works close to the researchers, writes special-purpose code fast, and reacts to changing demands in an agile way. The software engineer works at a greater distance from the actual use cases. He uses programming languages that support good software organization at the cost of a much slower development process. He is less prepared to accommodate fast-changing requirements. When LAF-Fabric runs in an IPython notebook, even the few seconds it needs to load data are required only once. The programmer can experiment with his code cells at will, without the need to reload the data all the time. LAF-Fabric has already been used for some significant data extractions. There is a varied and growing set of notebooks (Roorda 2014a) on Github that is testimony to the range of use cases that can be served. Not only data analysis, but also adding new annotations is supported. One of the use cases is the query saver itself.

4.2 SHEBANQ

The actual goal of the SHEBANQ project was to create a demonstrator query saver for the ETCBC data. This has been achieved, and the resulting web application is called SHEBANQ (van Peursen et al. 2014). It went live on 2014-08-01 and now, on 2015-01-06, contains 309 public queries, saved by 42 users. The public part of the application offers users the options to read the Hebrew Bible chapter by chapter, to see the results of public queries as annotations in the margin, and to jump from query annotations to query descriptions and result lists. Figure 2 shows a screenshot of the page of a saved query. No matter how many query results there are, the user is able to navigate through them all, as can be seen in Figure 3. When a user clicks on the verse indicator of a result, he is led to the browsing interface, where other queries show up and can be navigated to, see Figure 4.
When users register and log in, they can write their own queries, have them executed, save them, including the query results, and make them public. In order to execute MQL queries, SHEBANQ communicates with a web service that is wrapped around the Emdros text database. While the underlying idea of SHEBANQ is straightforward, turning it into practice posed several challenges. To begin with, the data had to be modeled in a way suitable for driving web applications. We have programmed a MySQL export in a notebook, invoking LAF-Fabric. Every now and then the functionality of SHEBANQ is extended. For example, it can now show additional linguistic information in layers below the plain text. We first experimented in LAF-Fabric by generating HTML for a visual prototype, then we collected feedback and adapted our notebook. We ran consistency checks whenever we wanted to make use of perceived regularities in the data. After everything had crystallized out satisfactorily, we built the new data representation into SHEBANQ, see Figure 5. The fact that both the notebook and the SHEBANQ website are written in Python turned out to be very convenient. Rendering the Hebrew text turned out to be a problem because of subtle bugs in some platform/browser combinations. On Mac OSX, Chrome and Safari mangled whitespace between particular combinations of consonants and diacritics. We have explored the problems using LAF-Fabric and found a mitigating work-around.

Figure 2: A saved query in SHEBANQ.
Figure 3: Navigating through results of the query for the definite article (12,835 result verses).
Figure 4: Reading a passage and seeing the results of various queries.
Figure 5: Text and underlying data.

If the demonstrator shows one thing, it is that there are many additional desiderata. Whereas SHEBANQ has been designed to fulfill the sharing function, researchers also want to use it as a research tool. It is not easy to write a good MQL query, because many of the linguistic aspects of the data are not shown on the interface. If, for instance, a user wants to use the dictionary entries of the words or the syntactic features of clauses and phrases, he has no immediate, visual clues. So SHEBANQ has been extended again. The user can now click on any word in the text for easy access to lexical information. Other users see SHEBANQ as a preprocessor tool: they need data exports of query results. The next iteration of SHEBANQ is planned to deliver that. Another matter is usability: the number of queries is becoming too large to display them all in the margin. Users want to be able to filter the queries they see on the basis of who wrote them, both in the browsing interface and in the list of public queries. Last but not least, query execution is CPU-hungry. We have already started thinking about measures to prevent excessive processor loads, or ways to distribute the load over multiple servers.

5. Reflection

Back in 2012 we faced the challenge to provide better data models and better programs for biblical scholars. It had become clear that the software companies that were developing the Bible study applications were not interested in building software for researchers. The researchers did not have funds to hire programmers themselves. There seemed to be only one way out: researchers should take their fate into their own hands and write the software themselves, which looked like a daunting proposition at best and an impossible one at worst.
Yet, now in 2014, we have a publicly accessible tool for querying the linguistic data of the Hebrew Bible, with a means to share those queries. We also have a data laboratory where the programming theologian can take control over her data. Collectively, biblical scholars can use the data laboratory to help the query tool evolve according to their needs. Several factors have contributed to this achievement.
1. The existence of the LAF standard, which turned out to be a natural fit for this kind of data.
2. The realization that the plain text of the Hebrew Bible is not subject to copyright, and hence that the ETCBC database of text and annotations can be made available as Open Source.
3. The existence of a research archive, DANS, acting as a data hub; the intellectual heritage of many years of ETCBC work lies deposited there and is open to scrutiny by anyone at any time.
4. The existence of a social medium for program code, Github; all software for LAF-Fabric and SHEBANQ (and even some of the supporting software) lies there ready to be cloned and re-used.
5. The rise of scientific computing and its paraphernalia, such as (interactive) Python and auxiliary packages; it offers an unprecedented level of user-friendliness to novice programmers; it has the potential to draw a much wider range of humanities scholars into the enticing world of computing. A researcher is much closer to a scientific programmer than to a software engineer.
Yet, this is not sufficient to get the job done. The ETCBC is steeped in its own ways; it has an efficient internal data workflow, run with the best tools that were available in the late 1980s. The internet existed then, but had not yet morphed into the world-wide web. Data sharing is not in the genes of the ETCBC. Doing unique things in relative isolation for a prolonged stretch of time tends to make you idiosyncratic. The ETCBC has its own transliteration of Hebrew and its own, locally documented way of coding data into forms that are optimal for local data processing. Opening up to the world poses new requirements on the way the data is coded and documented. While we have archived the existing ETCBC documentation at DANS, we have started publishing a new kind of feature documentation on the web (Roorda et al. 2014). There we document not only the intended meaning of features, but we also provide frequency lists of their complete value sets, things that are easily computed by means of LAF-Fabric. Can we say that we have succeeded in meeting the challenges posed in 2012? It is too early for that. Proof of success would be the adoption of LAF-Fabric by at least some theological researchers, interest in the Hebrew data from the side of computational linguistics and artificial intelligence, and large access logs for the SHEBANQ web application. At the moment of writing, all these indicators are non-zero (Roorda et al. to appear 2015; Kalkman 2013, to be published 2015), which is promising, given the fact that we have just started.

Acknowledgements

The following people contributed significantly to the work described in this paper, in very different ways. My colleagues at DANS: Henk Harmsen and Andrea Scharnhorst for granting additional time for research in these topics; Henk van den Berg and Heleen van de Schraaf for developing fundamental portions of the web service and web application of SHEBANQ.
The ETCBC people: Eep Talstra and Constantijn Sikkel for creating and maintaining the ETCBC database and sharing so much knowledge about it; Wido van Peursen for paving the way for increased sharing of data; Oliver Glanz for showing order in the forest of features of the database; Gino Kalkman and Martijn Naaijer for using the new tools and challenging me; Grietje and Johan Commelin for their efforts to get LAF-Fabric working in the cloud; Reinoud Oosting for finding subtle bugs in SHEBANQ and Janet Dyk for introducing new use cases. Wider digital humanities: Joris van Zundert (HuygensING) for leading an inspiring Interedition bootcamp (van Zundert 2012), which set me on the track of rapid development for the humanities; Rens Bod (Univ. of Amsterdam) and Andreas van Cranenburgh (Univ. of Amsterdam and HuygensING), who asked for the Hebrew data as tree structures in order to try out Data Oriented Parsing for classical Hebrew.

References

Bouda, Peter (2013-2014), graf-python. Python software on Github. https://github.com/cidles/graf-python.
Bouda, Peter, Vera Ferreira, and António Lopes (2012), Poio API - an annotation framework to bridge language documentation and natural language processing, Proceedings of The Second Workshop on Annotation of Corpora for Research in the Humanities, Lisbon, 2012, Lisbon, Portugal. ISBN: 978-989-689-273-9, http://alfclul.clul.ul.pt/crpc/acrh2/ACRH-2_papers/Bouda-Ferreira-Lopes.pdf.
Di Pierro, Massimo (2015), Web2py. Full stack web framework, 6th edition. Online book. http://web2py.com/book.
Doedens, Crist-Jan (1994), Text Databases. One Database Model and Several Retrieval Languages, number 14 in Language and Computers, Editions Rodopi, Amsterdam, Netherlands and Atlanta, USA. ISBN: 90-5183-729-1, http://books.google.nl/books?id=9ggOBRz1dO4C.
Gleick, James (2011), The Information: a History, a Theory, a Flood, HarperCollins. ISBN: 0007225741, http://en.wikipedia.org/wiki/The_Information:_A_History,_a_Theory,_a_Flood.
Ide, Nancy and Laurent Romary (2012), Linguistic Annotation Framework. ISO standard 24612:2012. Edition 1, 2012-06-15. http://www.iso.org/iso/home/store/catalogue_tc/catalogue_detail.htm?csnumber=37326.
Kalkman, Gino J. (2013), Functions of asyndetic clause relations in Biblical Hebrew. IPython Notebook. http://nbviewer.ipython.org/github/ETCBC/Biblical_Hebrew_Analysis/blob/master/Miscellaneous/AsyndeticClauseFunctions.ipynb.
Kalkman, Gino J. (to be published 2015), Verbal Forms in Biblical Hebrew Poetry: Poetical Freedom or Linguistic System?, PhD thesis, VU University, Amsterdam. http://nbviewer.ipython.org/github/ETCBC/Biblical_Hebrew_Analysis/blob/master/PhD/Introduction.ipynb.
Kittel, Rudolf, editor (1968-1997), Biblia Hebraica Stuttgartensia, Deutsche Bibelgesellschaft, Stuttgart, Germany. http://www.bibelwissenschaft.de/startseite/wissenschaftliche-bibelausgaben/biblia-hebraica/bhs/.
Pérez, Fernando and Brian E. Granger (2007), IPython: a system for interactive scientific computing, Computing in Science and Engineering 9 (3), pp. 21–29, IEEE Computer Society. http://ipython.org, ISSN: 1521-9615, DOI: 10.1109/MCSE.2007.53.
Petersen, Ulrik (2002-2014), Emdros. Text database engine for analyzed or annotated text. Open Source software. http://emdros.org.
Petersen, Ulrik (2004), Emdros - a text database engine for analyzed or annotated text, Proceedings of COLING 2004, pp. 1190–1193. http://emdros.org/petersen-emdros-COLING-2004.pdf.
Petersen, Ulrik (2006), Principles, Implementation Strategies, and Evaluation of a Corpus Query System, Vol. 4002, Springer, pp. 215–226. http://link.springer.com/chapter/10.1007%2F11780885_21.
Peursen, Wido Th. van and Dirk Roorda (2014), Hebrew Text Database in Linguistic Annotation Framework. Dataset available online at Data Archiving and Networked Services, Den Haag, Netherlands. PID: urn:nbn:nl:ui:13-048i-71, http://www.persistent-identifier.nl/?identifier=urn:nbn:nl:ui:13-048i-71.
Roorda, Dirk (2012), Queries-as-annotations. Github repository. https://github.com/Dans-labs/annotation-paradigm.
Roorda, Dirk (2013-2014b), LAF-Fabric. Workbench for analysing LAF resources. Python software on Github. https://github.com/ETCBC/laf-fabric.
Roorda, Dirk (2014a), LAF-Fabric notebooks. Examples of data processing for the Hebrew Bible. Python software on Github. https://github.com/ETCBC/laf-fabric-nbs.
Roorda, Dirk and Charles M.J.M. van den Heuvel (2012), Annotation as a new paradigm in research archiving, Proceedings of ASIS&T 2012 Annual Meeting. Final Papers, Panels and Posters. https://www.asis.org/asist2012/proceedings/Submissions/84.pdf (author's version: http://arxiv.org/abs/1412.6069).
Roorda, Dirk, Constantijn Sikkel, and Wido Th. van Peursen (2014), Feature documentation of SHEBANQ. Github repository published on ReadTheDocs. https://github.com/ETCBC/shebanq-doc.
Roorda, Dirk, Jan Krans, Bert-Jan Lietaert-Peerbolte, Wido Th. van Peursen, Ulrik Sandborg-Petersen, and Eep Talstra (2012), Scientific report of the workshop Biblical Scholarship and Humanities Computing: Data Types, Text, Language and Interpretation, held at the Lorentz Center, Leiden, 6-10 February 2012, Technical report, Lorentz Center, Leiden. http://www.lorentzcenter.nl/lc/web/2012/480/report.php3?wsid=480&venue=Oort.
Roorda, Dirk, Martijn Naaijer, Gino J. Kalkman, and Andreas van Cranenburgh (to appear 2015), LAF-Fabric: a data analysis tool for Linguistic Annotation Framework with an application to the Hebrew Bible, Computational Linguistics in the Netherlands Journal. ISSN: 2211-4009, preprint http://arxiv.org/abs/1410.0286.
Talstra, Eep and Constantijn J. Sikkel (2000), Genese und Kategorienentwicklung der WIVU-Datenbank ('Origin and category development of the WIVU database'), in Hardmeier, C. et al., editors, Ad Fontes! Quellen erfassen - lesen - deuten. Was ist Computerphilologie? Ansatzpunkte und Methodologie - Instrument und Praxis, VU University Press, Amsterdam, Netherlands, pp. 33–68.
Talstra, Eep, Constantijn J. Sikkel, Oliver Glanz, Reinoud Oosting, and Janet W. Dyk (2012), Text database of the Hebrew Bible. Dataset available online after permission of the depositor at Data Archiving and Networked Services, Den Haag, Netherlands. http://www.persistent-identifier.nl/?identifier=urn:nbn:nl:ui:13-ukhm-eb.
van Peursen, Wido Th., Dirk Roorda, Henk van den Berg, and Heleen van de Schraaf (2014), SHEBANQ, a search engine and query saver for the ETCBC text database of the Hebrew Bible. Web application. http://shebanq.ancient-data.org.
van Zundert, Joris et al. (2012), Proceedings of the Leuven2012 Interedition workshop. Wiki page.
http://interedition.eu/wiki/index.php/Leuven2012_Proceedings.

The Anatomy of Relevance: Topical, Snippet and Perceived Relevance in Search Result Evaluation
Aleksandr Chuklin* and Maarten de Rijke
University of Amsterdam, Amsterdam, The Netherlands
a.chuklin, derijke@uva.nl
arXiv:1501.06412v1 [cs.IR] 26 Jan 2015
* Now at Google Switzerland.
Copyright is held by the author/owner(s). SIGIR'14 Workshop on Gathering Efficient Assessments of Relevance (GEAR'14), July 11, 2014, Gold Coast, Queensland, Australia.

ABSTRACT

Currently, the quality of a search engine is often determined using so-called topical relevance, i.e., the match between the user intent (expressed as a query) and the content of the document. In this work we want to draw attention to two aspects of retrieval system performance affected by the presentation of results: result attractiveness ("perceived relevance") and immediate usefulness of the snippets ("snippet relevance"). Perceived relevance may influence the discoverability of topically good documents, and seemingly better rankings may in fact be less useful to the user if good-looking snippets lead to irrelevant documents or vice versa. And result items on a search engine result page (SERP) with high snippet relevance may add to the total utility gained by the user even without the need to click those items. We start by motivating the need to collect different aspects of relevance (topical, perceived and snippet relevance) and by showing how these aspects can improve evaluation measures. We then discuss possible ways to collect these relevance aspects using crowdsourcing and the challenges arising from that.

Categories and Subject Descriptors: H.3.3 [Information Storage and Retrieval]: Information Search and Retrieval

1. INTRODUCTION

For decades the main evaluation paradigm for search engines was the Cranfield methodology [7]. In a typical setting of a TREC conference, the documents are evaluated by human raters who assign relevance labels based on their judgement about the relevance of the document to the user's topic of interest, expressed as a query. A graded relevance scale is typically used, with topical relevance labels ranging from 0 to 4 or from irrelevant to highly relevant. These relevance labels can be obtained either from trained experts or using a crowdsourcing approach. Either way, cases of disagreement have to be addressed; those are usually treated as raters' mistakes, but may also arise from different interpretations of the user intent or the notion of relevance. In the traditional evaluation approach a single relevance label is chosen for each document-topic pair. These labels are then aggregated into SERP-level quality measures such as DCG [9] or ERR [2]. By using additional inputs from raters, we can (a) refine these quality measures and (b) better understand the performance of retrieval systems.

2. RELATED WORK

The idea to separate perceived and topical relevance was suggested by [3] while designing the DBN click model. Unlike earlier click models, it suggests that the likelihood of a user clicking a document depends not on the topical relevance of the document, but rather on its perceived relevance, since the user can only judge based on the result snippet. This idea was later picked up by [12], who showed that while topical and perceived relevance are correlated, there is a noticeable discrepancy between them.
They performed a simulated experiment by modeling the user click probability and showed that taking it into account would lead to a substantially different ordering of the systems participating in a TREC Web Track. The idea to separate out snippet relevance appeared after the introduction of good abandonment [10]: cases where users abandon a search result page without clicking any results and yet are satisfied. This may be due to the SERP being rich with instant answers [4], e.g., a weather widget or a dictionary box, or due to the fact that a query has a precise informational need that can easily be answered in a result snippet [5]. In fact, as was shown by [11], a big portion of abandoned searches was due to pre-determined behavior: users came to a search engine with a prior intention to find an answer directly on a SERP. This is especially true when considering mobile search, where the internet connection can be slow or the user interface is less convenient to use. We complement these works by arguing that a good and relevant snippet does not necessarily lead to a complete good abandonment, but rather represents an aspect of utility gained by the user that is currently ignored.

3. APPLICATION TO EVALUATION

As was suggested by [1], many evaluation metrics, including DCG and ERR, may be viewed as based on a click model. This was further refined by [6], where a recipe for converting any click model into a metric was presented:

uMetric = \sum_{k=1}^{N} P(C_k = 1) \cdot R_k,   (1)

where R_k is the (topical) relevance of the k-th document in the ranking, and P(C_k = 1) is the probability that the user will click on that document. Depending on the user model, the click probability may depend on attractiveness parameters. This is where we can use perceived relevance labels A_k (attractiveness). For example, for a metric based on the DCM model [8] we have:¹

uDCM = \sum_{k=1}^{N} a(A_k) \prod_{i=1}^{k-1} (1 - a(A_i) s_i) \cdot R_k,   (2)

where a(A) is a list of parameters, one for each possible value of the perceived relevance label A; s_i is another list of parameters, one for each value of the document position i. Further, if we want to use snippet relevance labels S_k, we introduce a metric of the utility gained from the SERP itself, similar to (1):

uMetric_S = \sum_{k=1}^{N} P(E_k = 1) \cdot S_k,   (3)

where P(E_k = 1) is the probability that the user examines the k-th document. Again, for DCM that would lead us to:

uDCM_S = \sum_{k=1}^{N} \prod_{i=1}^{k-1} (1 - a(A_i) s_i) \cdot S_k.   (4)

To summarize, we showed that by collecting perceived, topical and snippet relevance we can refine system quality measures (eqs. (1), (3)). To estimate the effect of this refinement one can compute correlations with online click metrics, similar to [6], or with side-by-side comparison judgements collected using an independent set of raters.
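As a worked illustration of eqs. (1)-(4), the following sketch computes uDCM and uDCM_S for a toy ranking; the attractiveness parameters a(A) and position parameters s_i are invented numbers, not values estimated from click logs.

    # Toy computation of the DCM-based metrics of eqs. (2) and (4).
    # a_of maps a perceived-relevance label A to an attractiveness
    # probability; s holds one continuation parameter per position.
    # All parameter values below are invented for illustration.

    def u_dcm(R, A, a_of, s):
        """uDCM = sum_k a(A_k) * prod_{i<k} (1 - a(A_i) s_i) * R_k."""
        total, examine = 0.0, 1.0  # examine = prod_{i<k} (1 - a(A_i) s_i)
        for k, (r, lab) in enumerate(zip(R, A)):
            total += a_of[lab] * examine * r
            examine *= 1.0 - a_of[lab] * s[k]
        return total

    def u_dcm_s(S, A, a_of, s):
        """uDCM_S = sum_k prod_{i<k} (1 - a(A_i) s_i) * S_k."""
        total, examine = 0.0, 1.0
        for k, (snip, lab) in enumerate(zip(S, A)):
            total += examine * snip
            examine *= 1.0 - a_of[lab] * s[k]
        return total

    R = [1.0, 0.5, 0.0]         # topical relevance per rank
    S = [0.5, 1.0, 0.0]         # snippet relevance per rank
    A = ["high", "low", "low"]  # perceived relevance labels per rank
    a_of = {"high": 0.8, "low": 0.3}  # invented attractiveness parameters
    s = [0.9, 0.9, 0.9]               # invented position parameters

    print(u_dcm(R, A, a_of, s), u_dcm_s(S, A, a_of, s))

The running product plays the role of the examination probability: each processed position multiplies in the chance that the user did not leave there, which is exactly the structure of the DCM-based formulas above.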
4. GATHERING JUDGEMENTS

Now that we have argued that perceived, topical and snippet relevance are potentially valuable dimensions of assessing system quality, how do we gather the required judgements? Firstly, we believe that the topical relevance definition used by TREC raters is time-tested and hence can be used without modification. Secondly, snippet relevance can be treated as document topical relevance with the document replaced by its snippet. We also need additional messaging for the raters, explaining to them why the "documents" are so short, to avoid undervalued scores. In order to prevent the raters from confusing this task with the perceived relevance judgement, we may hide the fact that they are judging clickable snippets and just refer to them as short summaries.² Similar ratings were collected by [5], where three possible answers were offered to the raters: the snippet "answers the user question," "answers the question only partially," or "does not answer the question." Third and finally, perceived relevance is a new task that has to be formulated by explaining to the rater the story of a web search and asking her if she would click this link in order to find the relevant information in the document. The snippet has to be shown without the context of the other snippets and without its placement on a SERP, to avoid position and presentation biases. These, then, are the challenges of gathering relevance judgements in the multi-aspect setting that we are proposing:
- How do we make sure the raters do not confuse the different tasks (topical, snippet, perceived relevance)?
- How do we treat special SERP items such as images, instant answers or interactive tools?
- What influence does the query category have on the difficulty of the task? For example, snippet relevance does not make sense for navigational queries.

¹ A similar but more involved equation can be obtained for a metric based on the DBN model [3].
² It would be interesting to alter this clause in the rater's instruction and see how the outcomes change.

5. CONCLUSION

This paper advocates the need to review the notion of relevance in order to improve evaluation as well as to understand the anatomy of relevance. We believe that after performing initial experiments and collecting feedback from the raters, we can address the challenges outlined above and derive a judgement procedure that will allow us to collect all three aspects of relevance, refine system performance evaluation, and get deeper insights into the foundation of relevance.

Acknowledgements. This research was partially supported by the European Community's Seventh Framework Programme (FP7/2007-2013) under grant agreements nrs 288024 and 312827, the Netherlands Organisation for Scientific Research (NWO) under project nrs 727.011.005, 612.001.116, HOR-11-10, 640.006.013, the Center for Creation, Content and Technology (CCCT), the QuaMerdes project funded by the CLARIN-nl program, the TROVe project funded by the CLARIAH program, the Dutch national program COMMIT, the ESF Research Network Program ELIAS, the Elite Network Shifts project funded by the Royal Dutch Academy of Sciences (KNAW), the Netherlands eScience Center under project number 027.012.105, the Yahoo! Faculty Research and Engagement Program, the Microsoft Research PhD program, and the HPC Fund.

6. REFERENCES

[1] B. Carterette. System effectiveness, user models, and user utility: a conceptual framework for investigation. In Proceedings of the 34th international ACM SIGIR conference on Research and development in Information Retrieval, pages 903–912. ACM, 2011.
[2] O. Chapelle, D. Metlzer, Y. Zhang, and P. Grinspan. Expected reciprocal rank for graded relevance. In Proceedings of the 18th ACM conference on Information and knowledge management, pages 621–630. ACM, 2009.
[3] O. Chapelle and Y. Zhang. A dynamic bayesian network click model for web search ranking. In Proceedings of the 18th international conference on World wide web, pages 1–10. ACM, 2009.
[4] L. B. Chilton and J. Teevan.
Addressing people's information needs directly in a web search result page. In Proceedings of the 20th international conference on World wide web, pages 27–36. ACM, 2011.
[5] A. Chuklin and P. Serdyukov. Good abandonments in factoid queries. In Proceedings of the 21st international conference companion on World Wide Web, pages 483–484. ACM, 2012.
[6] A. Chuklin, P. Serdyukov, and M. de Rijke. Click model-based information retrieval metrics. In Proceedings of the 36th international ACM SIGIR conference on Research and development in information retrieval, pages 493–502. ACM, 2013.
[7] C. W. Cleverdon and M. Keen. Aslib Cranfield research project: factors determining the performance of indexing systems; volumes 1 and 2. 1966.
[8] F. Guo, C. Liu, and Y. M. Wang. Efficient multiple-click models in web search. In Proceedings of the Second ACM International Conference on Web Search and Data Mining, pages 124–131. ACM, 2009.
[9] K. Järvelin and J. Kekäläinen. Cumulated gain-based evaluation of IR techniques. ACM Transactions on Information Systems (TOIS), 20(4):422–446, 2002.
[10] J. Li, S. Huffman, and A. Tokuda. Good abandonment in mobile and PC internet search. In Proceedings of the 32nd international ACM SIGIR conference on Research and development in information retrieval, pages 43–50. ACM, 2009.
[11] S. Stamou and E. N. Efthimiadis. Interpreting user inactivity on search results. In Advances in Information Retrieval, pages 100–113. Springer, 2010.
[12] A. Turpin, F. Scholer, K. Jarvelin, M. Wu, and J. S. Culpepper. Including summaries in system evaluation. In Proceedings of the 32nd international ACM SIGIR conference on Research and development in information retrieval, pages 508–515. ACM, 2009.

Training in Data Curation as Service in a Federated Data Infrastructure - the FrontOffice–BackOffice Model
Ingrid Dillo, Rene van Horik, and Andrea Scharnhorst
Data Archiving and Networked Services, Anna van Saksenlaan 10, 2593 HT The Hague, The Netherlands
{ingrid.dillo,rene.van.horik,andrea.scharnhorst}@dans.knaw.nl
http://www.dans.knaw.nl
arXiv:1309.2788v1 [cs.DL] 11 Sep 2013

Abstract. The increasing volume and importance of research data leads to the emergence of research data infrastructures in which data management plays an important role. As a consequence, practices at digital archives and libraries change. In this paper, we focus on a possible alliance between archives and libraries around training activities in data curation. We introduce a so-called FrontOffice–BackOffice model and discuss experiences with its implementation in the Netherlands. In this model, an efficient division of tasks relies on a distributed infrastructure in which research institutions (i.e., universities) use centralized storage and data curation services provided by national research data archives. The training activities are aimed at information professionals working at those research institutions, for instance as digital librarians. We describe our experiences with the course DataIntelligence4Librarians. Eventually, we reflect on the international dimension of education and training around data curation and stewardship.
Keywords: data curation, data management, training, data sharing, data archive, digital libraries, education, science policy, documentation

1 Introduction

A research archive can be depicted as a safe haven for research data, carefully selected, documented and stored for future consultation. Accordingly, the core tasks of a data archivist could be imagined to be confined to proper documentation and the care for material preservation. In short: "Our service starts where others drop the data".¹ The current practices of archivists deviate from such an archetype to a large extent. This turn of the tables can best be understood by recalling the history of archival science. In general, the same principles hold for archives of research data as for any other archive. In 1898, in their handbook, one of the foundational texts in archival science [1], Muller, Feith, and Fruin describe the archive as an organic entirety whose function cannot be determined a priori. On the contrary, its function needs to be defined and redefined depending on the development of the institution (i.e., a board or government) whose selected traces it is obliged to archive. In other words, Muller et al. describe a co-evolution of the institution and its archive. When this view is applied to a research data archive, the corresponding institution is none other than the science system. From this viewpoint, it is not surprising that the profound changes in scientific practice [2] and scholarly communication [3] influence the expectations placed on a data archive or, more specifically, a sustainable digital archive (Trusted Digital Repository). The changing modes of scholarly communication and practice alter the form and content of what is seen as worth preserving [5]. Changing research practices require new negotiations on the division of labor. Who is responsible for setting up digital research infrastructures, including virtual research environments: the information service providers, such as Trusted Digital Repositories (TDRs), or the research institutions? Who takes care of the preparation of (meta)data and formats prior to archiving? Who should preserve software tools: the labs which developed them, or the archive, together with the 'data' for which they have been developed? The high volatility of the environment in which archives are currently operating influences their function as a reliable, stable reference point for important information. Open Access, Data Management Plan, Data Stewardship, Data Curation, Trusted Digital Repositories, BigData and SmartData are some of the buzzwords floating around in the last decade. They stand for the struggle to identify and communicate the most urgent trends and to coordinate actions across the different stakeholders in the field of data curation. Important to note here is the reference model for Open Archival Information Systems (in short the OAIS model, ISO 14721:2012), a model foundational for the discussion of the structure and function of any archive. Its key elements are Ingest, Archival Storage, Data Management, Administration, Preservation Planning and Access.

¹ Personal communication, Henk Koning, former Technical Archivist at DANS.

Fig. 1. The federated data infrastructure - a collaborative framework. Scheme designed by Peter Doorn, based on the Collaborative Data Infrastructure as envisioned in [6, p. 31].
Allison emphasizes that the OAIS model is not an architectural model for implementation, but instead offers a shared terminology [4]. Inside our own organization, Data Archiving and Networked Services (DANS), the OAIS model is often used in discussions about internal workflows and their improvement and further development. In this paper, we focus on institutional networks around an archive such as DANS. Hereby we rely on schemata as depicted in Fig. 1, which sketch the complexity of the research data landscape, its stakeholders and infrastructure [6]. Coming back to it later, in a first step we can use this scheme in an exercise to locate a TDR such as DANS. Starting at the bottom of Fig. 1, the basic (technical) infrastructure entails storage. In the Netherlands this level of Basic Infrastructure is provided by SURFsara, the Dutch network of computing facilities, whose services DANS itself uses. The following three levels could be seen as the heart of the activities of an archive of digital research data. They form a kind of back office. The three boxes at the next level, labeled as front office, contain the funding agencies, such as NWO (www.nwo.nl) in the Netherlands, university libraries, and research infrastructures such as CLARIN (www.clarin.eu) or DARIAH (www.dariah.eu), which are in themselves complex organizations. They could be seen as 'clients' of an archive. But actually, DANS is also part of them. The same holds true for the top level of data providers and users. DANS, as part of research infrastructures, harvests information from other data providers. With its own research and development activities it is even part of the data production cycle. In short, DANS plays different roles in different contexts and can therefore be located at many places in this scheme. Correspondingly, a variety of different activities take place at DANS. In the next section, we discuss how, together with this increase in complexity, the need emerges to build alliances and to coordinate actions among the different institutional players in the data landscape. At the core of the paper we propose a specific model to articulate possibilities for collaboration, coordination, and division of labour. We report on steps towards its concrete implementation at the Dutch national level. At the end of the paper we discuss links to international developments.

2 The archivist as a consultant

DANS is one of the national research data archives in the Netherlands. With roots in the social sciences and humanities going back to the 1960s, in its current form it was founded in 2005 as an institute of NWO, the Netherlands Organization for Scientific Research, and the KNAW, the Royal Netherlands Academy of Arts and Sciences. DANS is primarily an information service institute and, despite a small in-house research group, not a research institute. This makes DANS much more comparable to a classical, stand-alone archive. The mission of DANS is to promote sustained access to digital research data. For this purpose, DANS encourages researchers to archive and reuse data in a sustained manner, e.g. through the online (self-)archiving system EASY (easy.dans.knaw.nl). DANS also provides access, via NARCIS.nl (www.narcis.nl), to thousands of scientific datasets, e-publications and other research information in the Netherlands. EASY and NARCIS are two services which form the core of DANS.
In contrast to many other knowledge-domain specific archives, DANS operates cross-disciplinarily, with a focus on the social sciences and humanities. It is also an exclusively digital archive and it is placed - as an institution - outside the Dutch university system. All this together positions DANS as a gateway to the diverse Dutch research data landscape and as a hub in it. Activities and practices at DANS can be ordered along three dimensions:

- Archive: selection, preservation, and description of data collections
- Research and Development: maintenance and development of the ICT infrastructure for seamless access and exploitation and for long-term preservation
- Science Policy: influence on research data policies and data curation strategies at the national and international levels

The first dimension corresponds to a large extent to the image of a traditional research archive. But due to ongoing ICT innovations, both in the area of research and of information services, a digital archive cannot operate without means to adapt its technological backbone to those innovations. The process of adopting and inventing services entails to a large extent what Andrew Prescott called "tinkering", when he compared practices at digital libraries with the craftsmanship needed in labs and workshops in the heyday of industrialization [7]. ICT is usually depicted as an efficiency engine. What is often forgotten is the existence of a transition period during which old and new forms of practice coexist. On the work floor, this means that traditional services of acquisition, community support, and documentation are pursued in parallel to designing new workflows, testing and implementing them. So, before ICT leads to more efficiency, the actual workload often temporarily increases. Project-based work and external funding for projects can only partly buffer this extension of activities at an archive. On top of archiving and related R&D, the changing environment in which the archive operates requires continuous attention. Hence, a third dimension - science policy - appears. Participation in national and international networks of research infrastructures requires substantive investment of time. The point we make is that the current portfolio of activities at information service institutions is much more diverse than in the past. For DANS this changing role of an archive is reflected in its name: Data Archiving and Networked Services. Among the increased portfolio of activities, consultancy plays a special role [8]. It appears in many forms: in the foundation of a Data Seal of Approval for TDRs (www.datasealofapproval.org), in the advisory role in research projects, in contributions to data policy documents, and in training activities. Consultancy contributes to knowledge diffusion around data curation practices and the coordination of data management at a national (and partly also international) level. It also supports the emergence of a distributed network structure, which we describe in the next section.

3 Strategic alliance between archives and libraries - the FrontOffice-BackOffice model

Profound and timely data management, together with sustainable storage of data - during and after the research - are indispensable preconditions for sharing data. It is of great importance that universities and other research institutions develop a clear data policy themselves.
An adequate infrastructure is needed to coordinate and implement those policies. In the Netherlands, with its rich institutional landscape of information service providers and research institutions, we encounter a discussion around a federated data infrastructure. It is quite clear that no single organization will be able to deliver individually tailored support for all possible data depositors. It is also clear that it is not possible for a single organization to provide services across all levels, from storage up to interactions with individual researchers. In order to create a sustainable national infrastructure for data management and curation, it is important to support a network of local data stewards close to the actual scientific practice, combined with centralized services. Fig. 1 depicts such a federated data infrastructure. It introduces at the same time a FrontOffice-BackOffice model (FO-BO model) as part of it.

3.1 Description of the model

The FO-BO model clarifies the interaction between researcher and information service provider concerning research data management. It also clarifies the relation among different information service providers. Front offices should be placed at institutions where research takes place in order to support the research community at those institutions. An example could be a front office as part of a university library. The front office is responsible for raising awareness for data sharing and re-use, for taking care of the local data management, and for organizing training for researchers. Virtual Research Environments (VREs) could also be part of the service at a front office. In particular, temporary data archiving on platforms such as Sharepoint or Dataverse could be part of the VREs. Once a research project is finished, the front office - in consultation with the back office - takes care of the transfer of data to a TDR. So, data acquisition is an inherent part of the front office tasks. The core tasks of the back office consist in the storage and documentation of research data which arrive via the front offices. The back office provides access to data, and possibly enriches and links data. The back office acquires expert knowledge around data management and the long-term, sustainable and persistent archiving of research data. Part of the back office portfolio is to disseminate this expertise by means of training of information professionals, such as data librarians/managers/stewards, working at front offices. The back office acts as an expertise center for the front office and as an innovation center concerning new trends in data curation. Fig. 2 summarizes the benefits of the model for researchers, front offices and back office organizations. By means of the FO-BO model we also try to reduce the complexity of interactions in the data infrastructure. With this model the role of DANS (and of comparable institutions) is restricted to the back office function. In the next subsection we report about one key element of the model: training for front office personnel.

[Fig. 2. Poster about the FrontOffice-BackOffice model. Designed by Carolien van Zuilekom, Fieke Schoots, Madeleine de Smaele and Ingrid Dillo]

3.2 Implementation - the DataIntelligence4Librarians course

In the FO-BO model, training for information professionals is part of the back office portfolio.
The DataIntelligence4Librarians course is an example of such a training. Organized by the 3TU.Datacenter and DANS, it is based on an earlier course of the 3TU.Datacenter developed for data librarians. (The 3TU.Datacenter - a network organization of the university libraries of Delft University of Technology, Eindhoven University of Technology, and the University of Twente - offers facilities for the preservation and the sustained availability of technical research data, similar to the services at DANS; see http://data.3tu.nl/repository/.) The currently envisioned audience ranges from staff at libraries to everybody interested in the topic, independently of their disciplinary background.

[Fig. 3. Snapshot of the website http://dataintelligence.3tu.nl/en/home/ - host of the course "DataIntelligence4Librarians"]

Description of the course. The course design fits into the professional education format. It combines distance learning with four face2face (f2f) sessions and maintains, next to an eLearning environment, also a public website (see Fig. 3, in Dutch) with background material. Google Plus was used as the platform for the eLearning part. Participants are supposed to study theoretical parts as homework. Between the f2f sessions more homework is assigned. The website contains a description of the first practical task. More of them are distributed in the eLearning environment. Both coach and participants give feedback at f2f sessions as well as online. Didactically, feedback and knowledge sharing are used as important elements next to knowledge transfer. During the first f2f session, an introduction into the course and the eLearning environment is given. An introduction into the module Data Management follows, and homework labeled State-of-the-Art Map is assigned. This task starts with reading a report, and continues with a number of search tasks using the phrase research data management across bibliographic databases (Scopus, Web of Science), but also in Twitter and Google. Participants are advised to subscribe to specific mailing lists to get an impression of the current discussion around the topic. At the second f2f session, participants present their resulting map. The module Technical Skills is introduced and tools (3TU.Databrowser, DANS/EASY) are demonstrated. The third f2f meeting starts with the same scheme of sharing homework and getting feedback. Content-wise, the module for this session is Acquisition and Consultation Skills. Specific attention is given to the question of how to overcome barriers to data sharing. The instrument of a Data Interview with possible data depositors is introduced. During the fourth and last session, the acquisition assignment is discussed and the course is evaluated. At the end of the course a certificate is issued under the condition that all sessions have been attended and the tasks have been fulfilled. During the modules, different experts from the organizing institutions give guest lectures. Examples of topics are legal aspects, issues of data selection, audit and certification of TDRs, and the FO-BO model itself.

Experiences with the course. One goal of the course is to sharpen the insight into the role of research data in scientific practice. Eventually, the participant should be able to advise and inform researchers how data curation can enhance data use and re-use. In summary, the goals are rather diverse and broad compared with the limited time of the course.
So far, the course has been run three times: February 2012 to June 2012 (16 participants, organized by the 3TU.Datacenter), September 2012 to December 2012 (16 participants), and February 2013 to May 2013 (13 participants). The last two events were organized in collaboration between the 3TU.Datacenter and DANS. Most of the participants were information professionals, either working at a library or archive, or for one of the network organizations, such as SURF. In the evaluation, the participants named a couple of critical points not unusual for distance learning. Among them are problems with the eLearning environment, and the spreading out of the course over a rather long period. Another critical remark concerns the demonstrations. Evidently the participants did not seek hands-on experience with a tool, platform, or interface. They seemed to be more interested in guidance and factual information in the area of data curation. This also emerges from the positive reactions. Information about current developments from experts involved in the practice of data curation was highly appreciated. Further, a need to get to know each other and to learn from each other's practices was articulated. This holds true even for a small country like the Netherlands. One of the suggestions of the participants was to form a special interest group.

4 Conclusions

In this paper we discussed the changing portfolios of responsibilities of archives and libraries. Data infrastructures emerge in response to data science, open access, and data sharing policies. In the making of a data infrastructure, the division of tasks between different information service providers needs to be re-negotiated. We present a federated data infrastructure with a layered architecture including a FrontOffice-BackOffice model. This model allows us to articulate different roles in the interaction with research communities, the acquisition of expert knowledge, and the provision of data management services. The model is in line with the Data Pyramid [6], which classifies data according to permanence and function. Data management is tailored towards certain classes of data, and specialization in data curation is allocated to different organizations. Front offices, naturally to be placed at academic libraries, take care of data management for transient and cyclic data produced by individuals and research communities. Trusted Digital Repositories such as DANS act as back offices and take care of patrimonial data. They also become expertise centers and knowledge transfer hubs for data curation. Training plays a key role in the FO-BO model. It is a way to disseminate the idea of the model. At the same time, it is an instantiation of the model. The experiences in the Netherlands are encouraging. Several Dutch universities have signaled interest in this approach, and the challenge is now to implement more front offices there. At the same time, coordination among possible back office organizations is needed. DANS recently signed a coalition agreement with the 3TU.Datacenter to cooperate more closely and to foster the FO-BO model. This coalition, Research Data Netherlands, is open to any other Dutch TDR with at least a Data Seal of Approval. Shaping the role of back offices as centers of expertise and innovation is another way to make the model attractive and reliable. To give an example, there is a growing need for auto-ingest of larger data collections.
Another shared issue is the question of a sustainable cost model for data archiving. Exploration of these issues needs to be done locally and shared in collaboration. Returning to the issue of training: as we argue in this paper, in the short run there is an urgent need for education among information professionals. In the mid term, these efforts could be connected to comparable modules in curricula for future information professionals, e.g. at iSchools. The FO-BO model contains training also as a part of front office activities. This is in line with efforts in the framework of digital librarianship to develop modules for information literacy and data stewardship at many universities. The APARSEN project, which aims at establishing a virtual centre of excellence on digital preservation, carried out a survey concerning the European training landscape in this area [10]. The DataIntelligence4Librarians course fits very well with the outcomes of this survey. Coordination between those different training activities will support further professionalization. Shared textbooks, syllabi and best practices guidelines could also help to keep locally provided online material up to date. Our experiences show that a natural alliance between (digital) archives and libraries exists which is worth exploring in daily practice. Current science policies emphasize the role of data and their re-use. The envisioned coupling of funding with data sharing and archiving, the Linked Open Data movement, and the rise of data science will put more pressure on information service institutions, but at the same time also offer new possibilities. Forming alliances and coordinating actions seems to be the only possible answer. Here, libraries and archives are natural partners because, as Ross formulated, "when we reflect on the core of digital libraries we easily observe that they may be libraries by name, but they are archives by nature" [9].

References

1. Muller, S., Feith, J.A., Fruin, R.: Handleiding voor het Ordenen en Beschrijven van Archiven. Erven B. Van Der Kamp, Groningen, 1920, 2nd ed. Reprinted in: Horsman, P.J., Ketelaar, F.C.J., Thomassen, T.H.P.M.: Tekst en Context van de Handleiding voor het Ordenen en Beschrijven van Archiven van 1898. Verloren, Hilversum, 1998.
2. Wouters, P., Beaulieu, A., Scharnhorst, A., Wyatt, S.: Virtual Knowledge: Experimenting in the Humanities and the Social Sciences. MIT Press, Cambridge, Mass., 2012.
3. Borgman, C.: Scholarship in the Digital Age: Information, Infrastructure, and the Internet. MIT Press, Cambridge, Mass., 2007.
4. Allison, J.: OAIS as a reference model for repositories. An evaluation. Report, UKOLN, University of Bath, 2007. http://eprints.whiterose.ac.uk/id/eprint/3464
5. Doorn, P., Tjalsma, H.: Introduction: archiving research data. Archival Science 7(1), 1-20 (2007). DOI 10.1007/s10502-007-9054-6
6. Anonymous: Riding the wave. How Europe can gain from the rising tide of scientific data. Final report of the High Level Expert Group on Scientific Data. A submission to the European Commission. October 2010. http://cordis.europa.eu/fp7/ict/e-infrastructure/docs/hlg-sdi-report
7. Prescott, A.: Made in Sheffield: Industrial Perspectives on the Digital Humanities. Keynote at the Digital Humanities Congress at the University of Sheffield, 6 September 2012. (The text of this keynote lecture can be found at Andrew Prescott's blog http://digitalriffs.blogspot.co.uk/2012/09/made-in-sheffield-industrial.html)
8. Anonymous: Duurzame toegang tot digitale onderzoeksgegevens. Strategienota DANS (in Dutch). DANS, The Hague, 2010. http://www.dans.knaw.nl/sites/default/files/file/Uitgaven/Strategie/DANSSTRATEGIENOTAcompleet_DEF.pdf. A summary in English, entitled "Sustained access to digital research data", can be found at http://www.dans.knaw.nl/sites/default/files/file/jaarverslagenenstrategienota/Samenvattingstrategienota_UK_DEF.pdf
9. Ross, S.: Digital Preservation, Archival Science and Methodological Foundations for Digital Libraries. New Review of Information Networking 17(1), 43-68 (2012). DOI 10.1080/13614576.2012.679446
10. Anonymous: APARSEN report: D43.1 Survey for the assessment of training material. Assessment of digital curation requirements. Available at: http://www.alliancepermanentaccess.org/wp-content/uploads/downloads/2012/12/APARSEN-REP-D43_1-01-4_1.pdf

5 Acknowledgement

The following colleagues have been involved in the DataIntelligence4Librarians course: Nicole Potters, Marina Noordegraaf, Madeleine de Smaele, Ellen Verbakel (from the 3TU.Datacenter) and Rene van Horik, Caroline van Zuilekom, Marion Wittenberg, Ingrid Dillo (from DANS)."} +{"id":"oai:arXiv.org:1310.3370","text":"Talking With Scholars: Developing a Research Environment for Oral History Collections

Max Kemman (1), Stef Scagliola (1), Franciska de Jong (1,2), and Roeland Ordelman (2,3)

(1) Erasmus University Rotterdam, Rotterdam, The Netherlands, {kemman,scagliola}@eshcc.eur.nl
(2) University of Twente, Enschede, The Netherlands, f.m.g.dejong@utwente.nl
(3) Netherlands Institute for Sound and Vision, Hilversum, The Netherlands, rordelman@beeldengeluid.nl

arXiv:1310.3370v1 [cs.DL] 12 Oct 2013

Abstract. Scholars are yet to make optimal use of Oral History collections. For the uptake of digital research tools in the daily working practice of researchers, practices and conventions commonly adhered to in the subfields of the humanities should be taken into account during development. To this end, in the Oral History Today project a research tool for exploring Oral History collections is developed in close collaboration with scholarly researchers. This paper describes four stages of scholarly research and the first steps undertaken to incorporate requirements of these stages in a digital research environment.

Keywords: Oral History, scholarly research, user-centered design, exploration, result presentation, data curation, word cloud, visual facets

1 Introduction

The digital turn has profoundly influenced historical culture and has led to a rise in the creation of audio-visual archives with personal narratives, commonly identified as Oral History. For the general public, searching these archives by making use of standard search tools may be sufficient. Yet for scholars, the full value of this type of data cannot be exploited optimally, as available tools do not enable scholars to engage with the content for the purposes of research. When working with audio-visual content, the availability of annotations is key to the process of digging up interesting fragments. In the past years, a lot of effort has been put into tools for creating manual annotations and generating annotations (semi-)automatically. But to accelerate scholarly research, tools are required that can take available annotation layers as input and provide means for visualization, compression and aggregation of the data, thus allowing the researcher to explore and process the data at fragment, item and collection level.
However, to develop such dedicated data exploration tools, technology specialists and researchers in the humanities have to engage in a process of mutual understanding and joint development. Taking carefully into account the specific set of practices and conventions commonly adhered to within the subfields of the humanities is a minimum requirement for the uptake of the technology in the daily working practice of scholars. In this paper we present a research tool, developed in close collaboration with scholars, that enables searching and exploration of aggregated, heterogeneous Oral History content.

2 Four stages of scholarly research

The user interface development is based upon four stages of scholarly research that were defined on the basis of an investigation of use scenarios reported in [1].

Exploration and selection. In the first stage, the focus is on the exploration and selection of one or more content sets within an archive that may be suitable for addressing a certain scholarly issue. The first steps in content exploration by a researcher often come down to searching for material. Research starts with the search for new or additional data. This stage can take the form of plain browsing, but it can also be strongly purpose-driven (e.g., checking details, searching for complementary sources), item-oriented (e.g., finding the first interview with a specific person), or directed towards patterns in a collection, in which case an entire data set is the focus of attention.

Exploration and investigation. Once the relevant materials have been identified, the focus in the next stage is mostly on the further exploration of the collected materials, the ordering, comparison (by individual researchers or in joint efforts) and analysis, and the documentation of the interpretation. This exploration stage may generate new ideas and perspectives, requiring new searches and inquiries.

Result presentation. After the analysis has been completed, the third stage is the presentation of research results. In the digital realm it has become feasible to link annotations that capture the results of an analytical step to the data on which they are based. Data and annotations can be shared with peers, both during collaboration as well as in publications. Instead of a printed book, one can produce a digital publication with links to audio-visual content.

Data curation. The fourth and final stage of the process is the long-term preservation of the data and the results of the investigation that has been carried out. Especially audio-visual materials that have been processed with digital tools are not the kind of research result that can be stored in a cupboard; they should be deposited in a trusted digital repository [2]. Ideally the depositing of material should be in line with emerging standards for Open Data, as this would allow the data and annotations to be reused by scholars with similar interests. For example, links can then be created to other data sets to place the data in a broader context [3]. Although the actual curation process itself is out of scope in this specific research project, workspaces can provide a form of data curation through the individual collecting of interviews, cutting interesting fragments with a virtual cutter [5] and creating additional manual annotations that can be fed into the existing metadata and thereby enrich the collection even further.

3 Oral History Today research environment

Visual search. The Oral History Today research interface is based upon the four stages described above. As the search process for the exploration and selection and exploration and investigation stages is reminiscent of Shneiderman's Visual Information-Seeking Mantra of "overview first, zoom and filter, then details-on-demand" [6], we developed a visual search interface to provide overview and zooming facilities, as well as to support exploration strategies. Two visualizations were developed to complement the search interface and allow visual searching: word clouds and visual facets. Word clouds provide a textual insight into the material available, with the additional benefit that a better insight is gained into what terminology is used in the collections explored; an issue identified for keyword search interfaces [4]. Visual facets (Figure 1) provide a visual overview of the facets. Facets are shown as graphical bars, where the length of each value represents the number of related search results, as demonstrated previously in Relation Browser++ [7]. A difference with RB++ is that the facet values are stacked into a single bar representing the facet. On mouse-hovering, a tooltip is shown with a textual description and the number of corresponding items. When the user selects a facet value, the facet bar is moved to the top to allow the user to keep a history of selected facets. Visual facets not only give a more visual overview of the search results, but also allow for faster interactions with the facets.

[Fig. 1. Visual Facets]
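To make the stacked-bar idea concrete, here is a minimal, self-contained sketch of how facet values and their result counts could be mapped to proportional bar segments with per-segment tooltip data. This is our illustration only, not the Oral History Today implementation; the facet name and counts are invented.

```python
def facet_bar(facet_name, value_counts, bar_width=60):
    """Render a facet as one stacked bar; segment width ~ result count."""
    total = sum(value_counts.values())
    fills = "█▓▒░"  # one fill pattern per facet value, cycling if needed
    bar, tooltips = "", []
    for i, (value, count) in enumerate(sorted(value_counts.items(),
                                              key=lambda kv: -kv[1])):
        width = max(1, round(bar_width * count / total))
        bar += fills[i % len(fills)] * width
        tooltips.append(f"{value}: {count} items")  # shown on mouse-hover
    return f"{facet_name:12s} |{bar}|  " + "; ".join(tooltips)

# Invented example data: an 'interview decade' facet over search results.
print(facet_bar("decade", {"1940s": 120, "1950s": 75, "1960s": 30, "1970s": 15}))
```

Sorting the segments by descending count mirrors the overview-first principle: dominant values are visible at a glance, while exact counts stay available on hover.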
Evaluation. To allow user feedback to be incorporated in the development process, evaluation is undertaken in multiple cycles. To elicit a broad range of responses with regard to usability as well as applicability to research practices, the first cycle was performed with semi-structured interviews. Five scholars were asked to try research subjects of their own interest. The results of this first evaluation are very positive. Concerning the visualisations described above, it was generally agreed that word clouds enable the searcher to acquire an idea of what material is available. However, the scholars did not think word clouds would provide them with keywords to improve their queries. Visual facets were considered interesting and felt like a very fast way both to acquire an overview of the search results and to refine search results.

Further adjustments. Scholars noted that being able to quickly assess the importance of search results is vital during the exploration and selection stage. To enable fast assessments, we added the ability to expand summary descriptions in the search results, no longer requiring scholars to open each individual search result. After this assessment, scholars need to be able to save important items. Therefore, we developed workspaces, which allow researchers to save interviews in project-specific sets for later analysis, as well as for referencing in publications as needed in the result presentation stage described above.

4 Conclusion

The results of the first evaluation are promising. The positive responses of the scholars indicated that the chosen approach for exploring Oral History data is in the right direction. In the near future, this evaluation will receive a larger follow-up in the final evaluation of the research interface. After this final evaluation, the tool will be released to the Oral History research community, allowing us to investigate how it will eventually be used in daily research practices.
Acknowledgments. The work reported in this paper was funded by the EU Project AXES - Access to Audiovisual Archives (FP7-269980) and the Dutch national program CLARIAH (http://www.clariah.nl/). We thank Dispectu (www.dispectu.com) and Spinque (www.spinque.nl) for their collaboration in the research project Oral History Today.

References

1. F. de Jong, R. Ordelman, and S. Scagliola. Audio-visual collections and the user needs of scholars in the humanities: a case for co-development. In Proceedings of the 2nd Conference on Supporting Digital Humanities (SDH 2011), page 7, Copenhagen, Denmark, 2011. Centre for Language Technology, Copenhagen.
2. R. Jantz and M. J. Giarlo. Digital preservation: Architecture and technology for trusted digital repositories. Microform & Imaging Review, 34(3):135-147, 2005.
3. M. Kemman and M. Kleppe. PoliMedia - Improving Analyses of Radio, TV & Newspaper Coverage of Political Debates. In T. Aalberg, M. Dobreva, C. Papatheodorou, G. Tsakonas, and C. Farrugia, editors, TPDL 2013, Valletta, Malta, September 22-26, 2013. Proceedings, Valletta, Malta, 2013. Springer.
4. T. Mann. Will Google's Keyword Searching Eliminate the Need for LC Cataloging and Classification? Journal of Library Metadata, 8(2):159-168, June 2008.
5. A. Rosendaal and J. Oomen. The Davideon Project: Capitalizing the Possibilities of Streaming Video as Flexible Learning Objects for the Humanities. Innovate: Journal of Online Education, 2(1), 2005.
6. B. Shneiderman. The eyes have it: a task by data type taxonomy for information visualizations. In Proceedings 1996 IEEE Symposium on Visual Languages, pages 336-343. IEEE Computer Society Press, 1996.
7. J. Zhang and G. Marchionini. Evaluation and evolution of a browse and search interface: Relation Browser++. In Proceedings of the 2005 National Conference on Digital Government Research, pages 179-188, 2005."} +{"id":"oai:arXiv.org:1312.3393","text":"Relative Upper Confidence Bound for the K-Armed Dueling Bandit Problem

arXiv:1312.3393v2 [cs.LG] 17 Dec 2013

Masrour Zoghi, ISLA, University of Amsterdam, The Netherlands, m.zoghi@uva.nl
Shimon Whiteson, ISLA, University of Amsterdam, The Netherlands, s.a.whiteson@uva.nl
Remi Munos, INRIA Lille - Nord Europe, Villeneuve d'Ascq, France, remi.munos@inria.fr
Maarten de Rijke, ISLA, University of Amsterdam, The Netherlands, derijke@uva.nl

Abstract. This paper proposes a new method for the K-armed dueling bandit problem, a variation on the regular K-armed bandit problem that offers only relative feedback about pairs of arms. Our approach extends the Upper Confidence Bound algorithm to the relative setting by using estimates of the pairwise probabilities to select a promising arm and applying Upper Confidence Bound with the winner as a benchmark. We prove a finite-time regret bound of order O(log t). In addition, our empirical results using real data from an information retrieval application show that it greatly outperforms the state of the art.

1. Introduction

In this paper, we propose and analyze a new algorithm, called Relative Upper Confidence Bound (RUCB), for the K-armed dueling bandit problem (Yue et al., 2012), a variation on the K-armed bandit problem, where the feedback comes in the form of pairwise preferences. We assess the performance of this algorithm using one of the main current applications of the K-armed dueling bandit problem, ranker evaluation (Hofmann et al., 2013; Joachims, 2002; Yue & Joachims, 2011), which is used in information retrieval, ad placement and recommender systems, among others.
The K-armed dueling bandit problem is part of the general framework of preference learning (Fürnkranz & Hüllermeier, 2010; Fürnkranz et al., 2012), where the goal is to learn not from real-valued feedback, but from relative feedback, which specifies only which of two alternatives is preferred. Developing effective preference learning methods is important for dealing with domains in which feedback is naturally qualitative (e.g., because it is provided by a human) and specifying real-valued feedback instead would be arbitrary or inefficient (Fürnkranz et al., 2012). Other algorithms proposed for this problem are Interleaved Filter (IF) (Yue et al., 2012), Beat the Mean (BTM) (Yue & Joachims, 2011), and SAVAGE (Urvoy et al., 2013). All of these methods were designed for the finite-horizon setting, in which the algorithm requires as input the exploration horizon T, the time by which the algorithm needs to produce the best arm. The algorithm is then judged based upon either the accuracy of the returned best arm or the regret accumulated in the exploration phase. (These terms are formalized in Section 2.) All three of these algorithms use the exploration horizon to set their internal parameters, so for each T, there is a separate algorithm IF_T, BTM_T and SAVAGE_T. By contrast, RUCB does not require this input, making it more useful in practice, since a good exploration horizon is often difficult to guess. Nonetheless, RUCB outperforms these algorithms in terms of the accuracy and regret metrics used in the finite-horizon setting. The main idea of RUCB is to maintain optimistic estimates of the probabilities of all possible pairwise outcomes, and (1) use these estimates to select a potential champion, which is an arm that has a chance of being the best arm, and (2) select an arm to compare to this potential champion by performing regular Upper Confidence Bound (Auer et al., 2002) relative to it. We prove a finite-time high-probability bound of O(log t) on the cumulative regret of RUCB, from which we deduce a bound on the expected cumulative regret. These bounds rely on substantially less restrictive assumptions on the K-armed dueling bandit problem than IF and BTM and have better multiplicative constants than those of SAVAGE. Furthermore, our bounds are the first explicitly non-asymptotic results for the K-armed dueling bandit problem. More importantly, the main distinction of our result is that it holds for all time steps. By contrast, given an exploration horizon T, the results for IF, BTM and SAVAGE bound only the regret accumulated by IF_T, BTM_T and SAVAGE_T in the first T time steps. Finally, we evaluate our method empirically using real data from an information retrieval application. The results show that RUCB can learn quickly and effectively and greatly outperforms BTM and SAVAGE.

The main contributions of this paper are as follows:

- A novel algorithm for the K-armed dueling bandit problem that is more broadly applicable than existing algorithms,
- More comprehensive theoretical results that make less restrictive assumptions than those of IF and BTM, have better multiplicative constants than the results of SAVAGE, and apply to all time steps, and
- Experimental results, based on a real-world application, demonstrating the superior performance of our algorithm compared to existing methods.
2. Problem Setting

The K-armed dueling bandit problem (Yue et al., 2012) is a modification of the K-armed bandit problem (Auer et al., 2002): the latter considers K arms $\{a_1, \ldots, a_K\}$ and at each time-step, an arm $a_i$ can be pulled, generating a reward drawn from an unknown stationary distribution with expected value $\mu_i$. The K-armed dueling bandit problem is a variation where, instead of pulling a single arm, we choose a pair $(a_i, a_j)$ and receive one of the two as the better choice, with the probability of $a_i$ being picked equal to a constant $p_{ij}$ and that of $a_j$ equal to $p_{ji} = 1 - p_{ij}$. We define the preference matrix $P = [p_{ij}]$, whose $ij$ entry is $p_{ij}$. In this paper, we assume that there exists a Condorcet winner (Urvoy et al., 2013): an arm, which without loss of generality we label $a_1$, such that $p_{1i} > \frac{1}{2}$ for all $i > 1$. Given a Condorcet winner, we define regret for each time-step as follows (Yue et al., 2012): if arms $a_i$ and $a_j$ were chosen for comparison at time $t$, then the regret at that time is set to be

$$r_t := \frac{\Delta_i + \Delta_j}{2}, \quad \text{with } \Delta_k := p_{1k} - \tfrac{1}{2} \text{ for all } k \in \{1, \ldots, K\}.$$

Thus, regret measures the average advantage that the Condorcet winner has over the two arms being compared against each other. Given our assumption on the probabilities $p_{1k}$, this implies that $r_t = 0$ if and only if the best arm is compared against itself. We define the cumulative regret up to time $T$ to be $R_T = \sum_{t=1}^{T} r_t$.

The Condorcet winner is different in a subtle but important way from the Borda winner (Urvoy et al., 2013), which is an arm $a_b$ that satisfies $\sum_j p_{bj} \geq \sum_j p_{ij}$ for all $i = 1, \ldots, K$. In other words, when averaged across all other arms, the Borda winner is the arm with the highest probability of winning a given comparison. In the K-armed dueling bandit problem, the Condorcet winner is sought rather than the Borda winner, for two reasons. First, in many applications, including the ranker evaluation problem addressed in our experiments, the eventual goal is to adapt to the preferences of the users of the system. Given a choice between the Borda and Condorcet winners, those users prefer the latter in a direct comparison, so it is immaterial how these two arms fare against the others. Second, in settings where the Borda winner is more appropriate, no special methods are required: one can simply solve the K-armed bandit problem with arms $\{a_1, \ldots, a_K\}$, where pulling $a_i$ means choosing an index $j \in \{1, \ldots, K\}$ randomly and comparing $a_i$ against $a_j$. Thus, research on the K-armed dueling bandit problem focuses on finding the Condorcet winner, for which special methods are required to avoid mistakenly choosing the Borda winner.
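To illustrate the distinction, the following sketch computes both winners for a small invented preference matrix (not taken from the paper) in which the two notions disagree: $a_1$ beats every other arm narrowly, while $a_2$ compensates for a narrow loss against $a_1$ by crushing $a_3$ on average.

```python
import numpy as np

# Invented 3-arm preference matrix (not from the paper): P[i][j] is the
# probability that arm i wins a duel against arm j, with P[i][i] = 0.5.
P = np.array([[0.50, 0.51, 0.51],   # a1 narrowly beats both others
              [0.49, 0.50, 0.95],   # a2 loses to a1 but crushes a3
              [0.49, 0.05, 0.50]])

# Condorcet winner: an arm beating every other arm with probability > 1/2.
condorcet = [i for i in range(len(P))
             if all(P[i][j] > 0.5 for j in range(len(P)) if j != i)]

# Borda winner: the arm with the highest average win probability.
borda = int(np.argmax(P.sum(axis=1)))

print("Condorcet winner:", condorcet)  # [0] -> a1
print("Borda winner:", borda)          # 1   -> a2

# Per-arm gaps used in the regret definition: Delta_k = p_{1k} - 1/2.
deltas = P[0] - 0.5  # comparing (a_i, a_j) costs (deltas[i] + deltas[j]) / 2
print("Deltas:", deltas)
```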
The goal of a bandit algorithm can be formalized in several ways. In this paper, we consider two standard settings:

1. The finite-horizon setting: In this setting, the algorithm is told in advance the exploration horizon T, i.e., the number of time-steps that the evaluation process is given to explore before it has to produce a single arm as the best, which will be exploited thenceforth. In this setting, the algorithm can be assessed on its accuracy, the probability that a given run of the algorithm reports the Condorcet winner as the best arm (Urvoy et al., 2013), which is related to expected simple regret: the regret associated with the algorithm's choice of the best arm, i.e., $r_{T+1}$ (Bubeck et al., 2009). Another measure of success in this setting is the amount of regret accumulated during the exploration phase, as formulated by the explore-then-exploit problem formulation (Yue et al., 2012).

2. The horizonless setting: In this setting, no horizon is specified and the evaluation process continues indefinitely. Thus, it is no longer sufficient for the algorithm to maximize accuracy or minimize regret after a single horizon is reached. Instead, it must minimize regret across all horizons by rapidly decreasing the frequency of comparisons involving suboptimal arms, particularly those that fare worse in comparison to the best arm. This goal can be formulated as minimizing the cumulative regret over time, rather than with respect to a fixed horizon (Lai & Robbins, 1985).

As we describe in Section 3, all existing K-armed dueling bandit methods target the finite-horizon setting. However, we argue that the horizonless setting is more relevant in practice for the following reason: finite-horizon methods require a horizon as input and often behave differently for different horizons. This poses a practical problem because it is typically difficult to know in advance how many comparisons are required to determine the best arm with confidence, and thus how to set the horizon. If the horizon is set too long, the algorithm is too exploratory, increasing the number of evaluations needed to find the best arm. If it is set too short, the best arm remains unknown when the horizon is reached and the algorithm must be restarted with a longer horizon. Moreover, any algorithm that can deal with the horizonless setting can easily be modified to address the finite-horizon setting by simply stopping the algorithm when it reaches the horizon and returning the best arm. By contrast, for the reverse direction, one would have to resort to the "doubling trick" (Cesa-Bianchi & Lugosi, 2006, Section 2.3), which leads to substantially worse regret results: this is because all of the upper bounds proven for methods addressing the finite-horizon setting so far are in $O(\log T)$, and applying the doubling trick to such results would lead to regret bounds of order $(\log T)^2$, with the extra log factor coming from the number of partitions.
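The origin of the squared logarithm can be spelled out in one line; the derivation below is our gloss on the argument, not a computation from the paper.

```latex
% Sketch (our gloss): running a finite-horizon algorithm with O(log T) regret
% on consecutive epochs of lengths 2^1, 2^2, ..., 2^m, m = ceil(log2 T),
% accumulates regret
R_T \;\lesssim\; \sum_{k=1}^{\lceil \log_2 T \rceil} C \log\bigl(2^k\bigr)
      \;=\; C \log 2 \sum_{k=1}^{\lceil \log_2 T \rceil} k
      \;=\; O\bigl((\log T)^2\bigr),
% i.e., the number of epochs contributes the extra log factor noted above.
```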
To the best of our knowledge, RUCB is the first K-armed dueling bandit algorithm that can function in the horizonless setting without resorting to the doubling trick. We show in Section 4 how it can be adapted to the finite-horizon setting.

3. Related Work

In this section, we briefly survey existing methods for the K-armed dueling bandit problem. The first method for the K-armed dueling bandit problem is Interleaved Filter (IF) (Yue et al., 2012), which was designed for a finite-horizon scenario and which proceeds by picking a reference arm to compare against the rest and using it to eliminate other arms, until the reference arm is eliminated by a better arm, in which case the latter becomes the reference arm and the algorithm continues as before. The algorithm terminates either when all other arms are eliminated or when the exploration horizon T is reached. More recently, the Beat the Mean (BTM) algorithm has been shown to outperform IF (Yue & Joachims, 2011), while imposing less restrictive assumptions on the K-armed dueling bandit problem. BTM focuses exploration on the arms that have been involved in the fewest comparisons. When it determines that an arm fares on average too poorly in comparison to the remaining arms, it removes it from consideration. More precisely, BTM considers the performance of each arm against the mean arm by averaging the arm's scores against all other arms, and uses these estimates to decide which arm should be eliminated. Both IF and BTM require the comparison probabilities $p_{ij}$ to satisfy certain conditions that are difficult to verify without specific knowledge about the dueling bandit problem at hand and, moreover, are often violated in practice (see the supplementary material for a more thorough discussion and analysis of these assumptions). Under these conditions, theoretical results have been proven for IF and BTM in (Yue et al., 2012) and (Yue & Joachims, 2011). More precisely, both algorithms take the exploration horizon T as an input, and so for each T there are algorithms IF_T and BTM_T; the results then state the following: for large T, in the case of IF_T, we have the expected regret bound

$$\mathbb{E}\left[R_T^{IF_T}\right] \leq C \, \frac{K \log T}{\min_{j=2}^{K} \Delta_j},$$

and, in the case of BTM_T, the high-probability regret bound

$$R_T^{BTM_T} \leq C' \, \frac{\gamma^7 K \log T}{\min_{j=2}^{K} \Delta_j} \quad \text{with high probability},$$

where arm $a_1$ is assumed to be the best arm, we define $\Delta_j := p_{1j} - \frac{1}{2}$, and $C$ and $C'$ are constants independent of the specific dueling bandit problem. The first bound matches a lower bound proven in (Yue et al., 2012, Theorem 4). However, as pointed out in (Yue & Joachims, 2011), this result holds for a very restrictive class of K-armed dueling bandit problems. In an attempt to remedy this issue, the second bound was proven for BTM, which includes a relaxation parameter $\gamma$ that allows for a broader class of problems, as discussed in the supplementary material. The difficulty with this result is that the parameter $\gamma$, which depends on the probabilities $p_{ij}$ and must be passed to the algorithm, can be very large. Since it is raised to the power of 7, this makes the bound very loose. For instance, in the three ranker-evaluation experiments discussed in Section 6, the values for $\gamma$ are 4.85, 11.6 and 47.3 for the 16-, 32- and 64-armed examples.
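To see just how loose this makes the bound, one can plug in the $\gamma$ values just quoted (our arithmetic, rounded):

```latex
% Our own arithmetic on the gamma values quoted above (not from the paper):
\gamma^7 \approx
\begin{cases}
4.85^7 \;\approx\; 6.3 \times 10^{4}  & \text{(16 arms)}\\
11.6^7 \;\approx\; 2.8 \times 10^{7}  & \text{(32 arms)}\\
47.3^7 \;\approx\; 5.3 \times 10^{11} & \text{(64 arms)}
\end{cases}
% so the multiplicative constant in the BTM bound grows astronomically with K.
```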
In contrast to the above limitations and loosenesses, in Section 5 we provide explicit bounds on the regret accumulated by RUCB that do not depend on $\gamma$ and require only the existence of a Condorcet winner for their validity, which makes them much more broadly applicable. Sensitivity Analysis of VAriables for Generic Exploration (SAVAGE) (Urvoy et al., 2013) is a recently proposed algorithm that outperforms both IF and BTM by a wide margin when the number of arms is of moderate size. Moreover, one version of SAVAGE, called Condorcet SAVAGE, makes the Condorcet assumption and performed the best experimentally (Urvoy et al., 2013). Condorcet SAVAGE compares pairs of arms uniformly randomly until there exists a pair for which one of the arms beats the other by a wide margin, in which case the loser is removed from the pool of arms under consideration. We show in this paper that our proposed algorithm for ranker evaluation substantially outperforms Condorcet SAVAGE. The theoretical result proven for Condorcet SAVAGE has the following form (Urvoy et al., 2013, Theorem 3). First, let us assume that $a_1$ is the Condorcet winner and let $T_{CSAVAGE_T}$ denote the number of iterations the Condorcet SAVAGE algorithm with exploration horizon T requires before terminating and returning the best arm; then, given $\delta > 0$, with probability $1 - \delta$, we have for large T

$$T_{CSAVAGE_T} \leq C \sum_{j=1}^{K-1} \frac{j \cdot \log \frac{KT}{\delta}}{\Delta_{j+1}^2},$$

with the indices $j$ arranged such that $\Delta_2 \leq \cdots \leq \Delta_K$ and $\Delta_j = p_{1j} - \frac{1}{2}$ as before, and $C$ a problem-independent constant. This bound is very similar in spirit to our high-probability result, with the important distinction that, unlike the above bound, the multiplicative factors in our result (i.e., the $D_{ij}$ in Theorem 2 below) do not depend on $\delta$. Moreover, in (Urvoy et al., 2013, Appendix B.1), the authors show that for large T we have the following expected regret bound:

$$\mathbb{E}\left[R_T^{CSAVAGE_T}\right] \leq C \sum_{j=2}^{K} \frac{j \cdot \log (KT^2)}{\Delta_j^2} + 1.$$

This is similar to our expected regret bound in Theorem 3, although for difficult problems where the $\Delta_j$ are small, Theorem 3 yields a tighter bound due to the presence of the $\Delta_j$ in the numerator of the second summand. An important advantage that our result has over the results reviewed here is an explicit expression for the additive constant, which was left out of the analyses of IF, BTM and SAVAGE. Finally, note that all of the above results bound only $R_T$, where T is the predetermined exploration horizon, since IF, BTM and SAVAGE were designed for the finite-horizon setting. By contrast, in Section 5, we bound the cumulative regret of each version of our algorithm for all time steps.

4. Method

We now introduce Relative Upper Confidence Bound (RUCB), which is applicable to any K-armed dueling bandit problem with a Condorcet winner.

Algorithm 1: Relative Upper Confidence Bound
Input: $\alpha > \frac{1}{2}$, $T \in \{1, 2, \ldots\} \cup \{\infty\}$
1: $W = [w_{ij}] \leftarrow 0_{K \times K}$  // 2D array of wins: $w_{ij}$ is the number of times $a_i$ beat $a_j$
2: for $t = 1, \ldots, T$ do
3:   $U := [u_{ij}] = \frac{W}{W + W^T} + \sqrt{\frac{\alpha \ln t}{W + W^T}}$  // all operations are element-wise; $\frac{x}{0} := 1$ for any $x$
4:   $u_{ii} \leftarrow \frac{1}{2}$ for each $i = 1, \ldots, K$
5:   Pick any $c$ satisfying $u_{cj} \geq \frac{1}{2}$ for all $j$. If no such $c$ exists, pick $c$ randomly from $\{1, \ldots, K\}$.
6:   $d \leftarrow \arg\max_j u_{jc}$
7:   Compare arms $a_c$ and $a_d$ and increment $w_{cd}$ or $w_{dc}$ depending on which arm wins.
8: end for
Return: An arm $a_c$ that beats the most arms, i.e., $c$ with the largest count $\#\{\, j \mid \frac{w_{cj}}{w_{cj} + w_{jc}} > \frac{1}{2} \,\}$.

In each time-step, RUCB, shown in Algorithm 1, goes through the following three stages:

(1) RUCB puts all arms in a pool of potential champions. Then, it compares each arm $a_i$ against all other arms optimistically: for all $i \neq j$, we compute the upper bound $u_{ij}(t) = \mu_{ij}(t) + c_{ij}(t)$, where $\mu_{ij}(t)$ is the frequentist estimate of $p_{ij}$ at time $t$ and $c_{ij}(t)$ is an optimism bonus that increases with $t$ and decreases with the number of comparisons between $i$ and $j$ (Line 3). If we have $u_{ij} < \frac{1}{2}$ for any $j$, then $a_i$ is removed from the pool. Next, a champion arm $a_c$ is chosen randomly from the remaining potential champions (Line 5).

(2) Regular UCB is performed using $a_c$ as a benchmark (Line 6), i.e., UCB is performed on the set of arms $a_{1c}, \ldots, a_{Kc}$. Specifically, we select the arm $d = \arg\max_j u_{jc}$. When $c \neq j$, $u_{jc}$ is defined as above. When $c = j$, since $p_{cc} = \frac{1}{2}$, we set $u_{cc} = \frac{1}{2}$ (Line 4).

(3) The pair $(a_c, a_d)$ is compared and the score sheet is updated as appropriate (Line 7).

Note that in stage (1) the comparisons are based on $u_{cj}$, i.e., $a_c$ is compared optimistically to the other arms, making it easier for it to become the champion. By contrast, in stage (2) the comparisons are based on $u_{jc}$, i.e., $a_c$ is compared to the other arms pessimistically, making it more difficult for $a_c$ to be compared against itself. This is important because comparing an arm against itself yields no information. Thus, RUCB strives to avoid auto-comparisons until there is great certainty that $a_c$ is indeed the Condorcet winner.
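For concreteness, the following is a compact Python sketch of Algorithm 1. It is our reading of the pseudocode, not the authors' code; the preference matrix at the bottom is an invented toy example used only to simulate duels, and the algorithm never reads it beyond the outcome of each comparison.

```python
import numpy as np

rng = np.random.default_rng(0)

def rucb(P, T, alpha=0.51):
    """Sketch of Algorithm 1 (RUCB). P[i][j] = prob. that arm i beats arm j;
    P is used only to simulate duels, never read by the algorithm itself."""
    K = len(P)
    W = np.zeros((K, K))  # wins: W[i, j] = number of times a_i beat a_j
    for t in range(1, T + 1):
        N = W + W.T
        with np.errstate(divide="ignore", invalid="ignore"):
            U = W / N + np.sqrt(alpha * np.log(t) / N)
        U[N == 0] = 1.0           # convention x/0 := 1 (fully optimistic)
        np.fill_diagonal(U, 0.5)  # u_ii = 1/2
        # Stage 1: potential champions have u_cj >= 1/2 against every arm.
        champs = [c for c in range(K) if (U[c] >= 0.5).all()]
        c = rng.choice(champs) if champs else rng.integers(K)
        # Stage 2: regular UCB against the champion (column c of U).
        d = int(np.argmax(U[:, c]))
        # Stage 3: simulate the duel a_c vs a_d and update the score sheet.
        if rng.random() < P[c][d]:
            W[c, d] += 1
        else:
            W[d, c] += 1
    # Return the arm that beats the most arms according to the win averages.
    with np.errstate(divide="ignore", invalid="ignore"):
        frac = W / (W + W.T)
    return int(np.argmax((frac > 0.5).sum(axis=1)))

# Invented toy preference matrix with a_0 as Condorcet winner.
P = np.array([[0.50, 0.55, 0.60],
              [0.45, 0.50, 0.55],
              [0.40, 0.45, 0.50]])
print("best arm:", rucb(P, T=2000))  # expected to print 0 on most runs
```

Note how the champion is selected from row c of U (optimistic for $a_c$) while the opponent is selected from column c (optimistic for the challengers), matching the discussion of stages (1) and (2) above.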
Eventually, as more comparisons are conducted, the estimates $\mu_{1j}$ tend to concentrate above $\frac{1}{2}$ and the optimism bonuses $c_{1j}(t)$ become small. Thus, both stages of the algorithm will increasingly select $a_1$, i.e., $a_c = a_d = a_1$. Since comparing $a_1$ to itself is optimal, $r_t$ declines over time. Note that Algorithm 1 is a finite-horizon algorithm if $T < \infty$ and a horizonless one if $T = \infty$, in which case the for loop never terminates.

5. Theoretical Results

In this section, we prove finite-time high-probability and expected regret bounds for RUCB. We first state Lemma 1 and use it to prove a high-probability bound in Theorem 2, from which we deduce an expected regret bound in Theorem 3. To simplify notation, we assume without loss of generality that $a_1$ is the optimal arm in the following. Moreover, given any K-armed dueling bandit algorithm, we define $w_{ij}(t)$ to be the number of times arm $a_i$ has beaten $a_j$ in the first $t$ iterations of the algorithm. We also define

$$u_{ij}(t) := \frac{w_{ij}(t)}{w_{ij}(t) + w_{ji}(t)} + \sqrt{\frac{\alpha \ln t}{w_{ij}(t) + w_{ji}(t)}},$$

for any given $\alpha > 0$, and set $l_{ij}(t) := 1 - u_{ji}(t)$. Moreover, for any $\delta > 0$, define

$$C(\delta) := \left(\frac{(4\alpha - 1)K^2}{(2\alpha - 1)\delta}\right)^{\frac{1}{2\alpha - 1}}.$$

Lemma 1. Let $P := [p_{ij}]$ be the preference matrix of a K-armed dueling bandit problem with arms $\{a_1, \ldots, a_K\}$, satisfying $p_{1j} > \frac{1}{2}$ for all $j > 1$ (i.e. $a_1$ is the Condorcet winner). Then, for any dueling bandit algorithm and any $\alpha > \frac{1}{2}$ and $\delta > 0$, we have

$$P\left(\forall\, t > C(\delta),\ i,\ j,\ \ p_{ij} \in [l_{ij}(t), u_{ij}(t)]\right) > 1 - \delta.$$

Proof. See the supplementary material.

[Figure 1. An illustration of the idea behind Lemma 1, using an example of how the confidence intervals of a single pair of arms $(a_i, a_j)$, and their relation to the comparison probability $p_{ij}$, might evolve over time. The time-step $\tau_m^{ij}$ denotes the $m$-th time when the arms $a_i$ and $a_j$ were chosen by RUCB to be compared against each other. We also define $\mu_m^{ij} := \mu_{ij}(\tau_m^{ij})$. The time $T$ is when the confidence intervals begin to include $p_{ij}$. The lemma then states that with probability $1 - \delta$, we have $T \leq C(\delta)$.]

The idea behind this lemma is depicted in Figure 1, which illustrates the two phenomena that make it possible: first, as long as arms $a_i$ and $a_j$ are not compared against each other, the interval $[l_{ij}(t), u_{ij}(t)]$ grows in length as $\sqrt{\log t}$, hence approaching $p_{ij}$; second, as the number of comparisons between $a_i$ and $a_j$ increases, the estimated means $\mu_{ij}$ approach $p_{ij}$, hence increasing the probability that the interval $[l_{ij}(t), u_{ij}(t)]$ will contain $p_{ij}$.

Let us now turn to our high-probability bound:
Theorem 2. Given a preference matrix $P = [p_{ij}]$ and $\delta > 0$ and $\alpha > \frac{1}{2}$, define

$$C(\delta) := \left(\frac{(4\alpha - 1)K^2}{(2\alpha - 1)\delta}\right)^{\frac{1}{2\alpha - 1}} \quad \text{and} \quad D_{ij} := \frac{4\alpha}{\min\{\Delta_i^2, \Delta_j^2\}}$$

for each $i, j = 1, \ldots, K$ with $i \neq j$, where $\Delta_i := \frac{1}{2} - p_{i1}$, and set $D_{ii} = 0$ for all $i$. Then, if we apply Algorithm 1 to the K-armed dueling bandit problem defined by $P$, given any pair $(i, j) \neq (1, 1)$, the number of comparisons between arms $a_i$ and $a_j$ performed up to time $t$, denoted by $N_{ij}(t)$, satisfies

$$P\left(\forall\, t,\ N_{ij}(t) \leq \max\{C(\delta),\ D_{ij} \ln t\}\right) > 1 - \delta. \quad (1)$$

Moreover, we have the following high-probability bound for the regret accrued by the algorithm:

$$P\left(\forall\, t,\ R_t \leq C(\delta)\Delta^* + \sum_{i > j} D_{ij} \Delta_{ij} \ln t\right) > 1 - \delta, \quad (2)$$

where $\Delta^* := \max_i \Delta_i$ and $\Delta_{ij} := \frac{\Delta_i + \Delta_j}{2}$, while $R_t$ is the cumulative regret as defined in Section 2.

Proof. Given Lemma 1, we know with probability $1 - \delta$ that $p_{ij} \in [l_{ij}(t), u_{ij}(t)]$ for all $t > C(\delta)$. Let us first deal with the easy case when $i = j \neq 1$: when $t > C(\delta)$ holds, $a_i$ cannot be played against itself, since if we get $c = i$ in Algorithm 1, then by Lemma 1 and the fact that $a_1$ is the Condorcet winner we have

$$u_{ii}(t) = \frac{1}{2} < p_{1i} \leq u_{1i}(t),$$

and so $d \neq i$. Now, let us assume that distinct arms $a_i$ and $a_j$ have been compared against each other more than $D_{ij} \ln t$ times and that $t > C(\delta)$. If $s$ is the last time $a_i$ and $a_j$ were compared against each other, we must have

$$u_{ij}(s) - l_{ij}(s) = 2\sqrt{\frac{\alpha \ln s}{N_{ij}(t)}} \leq 2\sqrt{\frac{\alpha \ln t}{N_{ij}(t)}} < 2\sqrt{\frac{\alpha \ln t}{\frac{4\alpha \ln t}{\min\{\Delta_i^2, \Delta_j^2\}}}} = \min\{\Delta_i, \Delta_j\}. \quad (3)$$

On the other hand, for $a_i$ to have been compared against $a_j$ at time $s$, one of the following two scenarios must have happened:

I. In Algorithm 1, we had $c = i$ and $d = j$, in which case both of the following inequalities must hold:

a. $u_{ij}(s) \geq \frac{1}{2}$, since otherwise $c$ could not have been set to $i$ by Line 5 of Algorithm 1, and

b. $l_{ij}(s) = 1 - u_{ji}(s) \leq 1 - p_{1i} = p_{i1}$, since we know that $p_{1i} \leq u_{1i}(t)$, by Lemma 1 and the fact that $t > C(\delta)$, and for $d = j$ to be satisfied, we must have $u_{1i}(t) \leq u_{ji}(t)$ by Line 6 of Algorithm 1.

From these two inequalities, we can conclude that

$$u_{ij}(s) - l_{ij}(s) \geq \frac{1}{2} - p_{i1} = \Delta_i. \quad (4)$$

This inequality is illustrated by the lower right confidence interval in the $(a_i, a_j)$ block of Figure 2, where the interval shows $[l_{ij}(s), u_{ij}(s)]$ and the distance between the dotted lines is $\frac{1}{2} - p_{i1}$.

II. In Algorithm 1, we had $c = j$ and $d = i$, in which case swapping $i$ and $j$ in the above argument gives

$$u_{ji}(s) - l_{ji}(s) \geq \frac{1}{2} - p_{j1} = \Delta_j. \quad (5)$$

Similarly, this is illustrated by the lower left confidence interval in the $(a_j, a_i)$ block of Figure 2, where the interval shows $[l_{ji}(s), u_{ji}(s)]$ and the distance between the dotted lines is $\frac{1}{2} - p_{j1}$.
Moreover, in those blocks, we have included the outcomes of two different runs: one drawn to the left of the dots representing pij and pji , and the other to the right (the horizontal axis in these plots has no other significance). These two outcomes are included to address the dichotomy present in the proof. Note that for a given run, we must have [lji (s), uji (s)] = [1 \u2212 uij (s), 1 \u2212 lij (s)] for any time s, hence the symmetry present in this figure. Putting (4) and (5) together with (3) yields a contradiction, so with probability 1 \u2212 \u03b4 we cannot have Nij be larger than both C(\u03b4) and Dij ln t. This gives us (1), from which (2) follows by allowing for the largest regret, \u2206\u2217 , to occur in each of the first C(\u03b4) steps of the algorithm and adding the regret accrued by Dij ln t comparisons between ai and aj . Next, we prove our expected regret bound: Theorem 3. Given \u03b1 > 1, the expected regret accumulated by RUCB after t iterations is bounded by 1 (4\u03b1 \u2212 1)K 2 2\u03b1\u22121 2\u03b1 \u2212 1 E[Rt ] \u2264 \u2206 2\u03b1 \u2212 1 2\u03b1 \u2212 2 \u2206i + \u2206 j + 2\u03b1 ln t. min{\u22062i , \u22062j } i>j \u2217 (6) Proof. We can obtain the bound in (6) from (2) by integrating with respect to \u03b4 from 0 to 1. This is because given any one-dimensional random variable X with Relative Upper Confidence Bound 1 \u22121 CDF FX , we can use the identity E[X] = 0 FX (q)dq. In our case, X = Rt for a fixed time t and, as illustrated in Figure 3, we can deduce from (2) that FRt (r) > Ht\u22121 (r), which gives the bound FR\u22121 (q) < Ht (q) = C(1 \u2212 q)\u2206\u2217 + t Dij \u2206ij ln t. i>j 6. Experiments Now, assume that \u03b1 > 1. To derive (6) from the above inequality, we need to integrate the righthand side, and since it is only the first term in the summand that depends on q, that is all we need to integrate. To do (4\u03b1\u22121)K 2 (2\u03b1\u22121)\u03b4 so, recall that C(\u03b4) := 1 2\u03b1\u22121 , so to simplify 1 2\u03b1\u22121 2 notation, we define L := (4\u03b1\u22121)K . Now, we 2\u03b1\u22121 can carry out the integration as follows, beginning by using the substitution 1 \u2212 q = \u03b4, dq = \u2212d\u03b4: 1 0 q=0 C(1 \u2212 q)dq = 1 = 0 \u03b4=1 (4\u03b1 \u2212 1)K 2 (2\u03b1 \u2212 1)\u03b4 1 1\u2212 2\u03b1\u22121 \u03b4 =L 1\u2212 1 2\u03b1\u22121 1 = 0 \u2212C(\u03b4)d\u03b4 1 2\u03b1\u22121 1 1 \u03b4 \u2212 2\u03b1\u22121 d\u03b4 d\u03b4 = L 0 (4\u03b1 \u2212 1)K 2\u03b1 \u2212 1 2 1 2\u03b1\u22121 2\u03b1 \u2212 1 . 2\u03b1 \u2212 2 q0 FRt (r) 0 q FRt (r0) 1 Remark 4. Note that RUCB uses the upperconfidence bounds (Line 3 of Algorithm 1) introduced in the original version of UCB (Auer et al., 2002) (up to the \u03b1 factor). Recently refined upper-confidence bounds (such as UCB-V (Audibert et al., 2009) or KL-UCB (Capp\u00b4e et al., 2013)) have improved performance for the regular K-armed bandit problem. However, in our setting the arm distributions are Bernoulli and the comparison value is 1/2. Thus, since we have 2\u22062i \u2264 kl(p1,i , 1/2) \u2264 4\u22062i (where kl(a, b) = Ht\u22121(r), the inverse function of Ht(q) := C(1 \u2212 q)\u2206\u2217 + i>j Dij \u2206ij ln t 0 r0 = Ht(q0) a log ab + (1 \u2212 a) log 1\u2212a 1\u2212b is the KL divergence between Bernoulli distributions with parameters a and b), we deduce that using KL-UCB instead of UCB does not improve the leading constant in the logarithmic term of the regret by a numerical factor of more than 2. t r Figure 3. 
[Figure 3 omitted: a schematic plot of F_{R_t}(r) against r, together with H_t^{-1}(r), the inverse of H_t(q) := C(1−q)Δ* + Σ_{i>j} D_ij Δ_ij ln t, and the point r_0 = H_t(q_0). Caption:] Figure 3. A schematic graph illustrating the proof of Theorem 3. Note that the expression for H_t(q) is extracted from (2), which also implies that H_t^{-1} is necessarily below F_{R_t}: formulated in terms of CDFs, (2) states that F_{R_t}(H_t(q_0)) > q_0 = H_t^{-1}(H_t(q_0)), where q_0 = 1 − δ_0 is a quantile. From this, we can conclude that F_{R_t}(r) > H_t^{-1}(r) for all r.

6. Experiments

To evaluate RUCB, we apply it to the problem of ranker evaluation from the field of information retrieval (IR) (Manning et al., 2008). A ranker is a function that takes as input a user's search query and ranks the documents in a collection according to their relevance to that query. Ranker evaluation aims to determine which among a set of rankers performs best. One effective way to achieve this is to use interleaved comparisons (Radlinski et al., 2008), which interleave the documents proposed by two different rankers and present the resulting list to the user, whose resulting click feedback is used to infer a noisy preference for one of the rankers. Given a set of K rankers, the problem of finding the best ranker can then be modeled as a K-armed dueling bandit problem, with each arm corresponding to a ranker.

Our experimental setup is built on real IR data, namely the LETOR NP2004 dataset (Liu et al., 2007). Using this data set, we create a set of 64 rankers, each corresponding to a ranking feature provided in the data set, e.g., PageRank. The ranker evaluation task thus corresponds to determining which single feature constitutes the best ranker (Hofmann et al., 2013). To compare a pair of rankers, we use probabilistic interleave (PI) (Hofmann et al., 2011), a recently developed method for interleaved comparisons. To model the user's click behavior on the resulting interleaved lists, we employ a probabilistic user model (Craswell et al., 2008; Hofmann et al., 2011) that uses as input the manual labels (classifying documents as relevant or not for given queries) provided with the LETOR NP2004 dataset. Queries are sampled randomly and clicks are generated probabilistically by conditioning on these assessments in a way that resembles the behavior of an actual user (Guo et al., 2009a;b).

Following (Yue & Joachims, 2011), we first used the above approach to estimate the comparison probabilities p_ij for each pair of rankers and then used these probabilities to simulate comparisons between rankers. More specifically, we estimated the full preference matrix by performing 4000 interleaved comparisons on each pair of the 64 feature rankers included in the LETOR dataset.
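Once the matrix is estimated, the comparison oracle reduces to Bernoulli draws. The following Python sketch of this protocol is ours, not the authors'; `interleaved_comparison` stands in for one probabilistic-interleave comparison and is hypothetical.

```python
import random

def estimate_preference_matrix(interleaved_comparison, K, samples=4000):
    """Estimate p_ij by running `interleaved_comparison(i, j)` (returns True
    if ranker i wins) `samples` times for each pair, as described above."""
    P = [[0.5] * K for _ in range(K)]
    for i in range(K):
        for j in range(i + 1, K):
            wins = sum(interleaved_comparison(i, j) for _ in range(samples))
            P[i][j] = wins / samples
            P[j][i] = 1.0 - P[i][j]
    return P

def duel(P, i, j):
    """Simulate one comparison between arms i and j; returns the winner."""
    return i if random.random() < P[i][j] else j
```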
[Figure 4 omitted: six panels (top row: cumulative regret vs. time; bottom row: best ranker rate (accuracy) vs. time) for the LETOR NP2004 dataset with 16, 32 and 64 rankers, comparing RUCB (α = 0.51), Condorcet SAVAGE and BTM. Caption:] Figure 4. Average cumulative regret and accuracy for 100 runs of BTM, Condorcet SAVAGE and RUCB with α = 0.51 applied to three K-armed dueling bandit problems with K = 16, 32, 64. In the top row of plots, both axes use log scales, and the dotted curves signify best and worst regret performances; in the bottom plots, only the x-axis uses a log scale.

We evaluated RUCB, Condorcet SAVAGE and BTM using randomly chosen subsets from the pool of 64 rankers, yielding K-armed dueling bandit problems with K ∈ {16, 32, 64}. For each set of rankers, we performed 100 independent runs of each algorithm for a maximum of 4.5 million iterations. For RUCB we set α = 0.51, which approaches the limit of our high-probability theoretical results, i.e., α > 0.5 as in Theorem 2. We did not include an evaluation of IF, since both BTM and Condorcet SAVAGE were shown to outperform it (Urvoy et al., 2013; Yue & Joachims, 2011).

Since BTM and SAVAGE require the exploration horizon as input, we ran BTM_T and CSAVAGE_T for various horizons T ranging from 1000 to 4.5 million. In the top row of plots in Figure 4, the markers on the green and the blue curves show the regret accumulated by BTM_T and CSAVAGE_T in the first T iterations of the algorithm for each of these horizons. Thus, each marker corresponds, not to the continuation of the runs that produced the previous marker, but to new runs conducted with a larger T. Since RUCB is horizonless, we ran it for 4.5 million iterations and plotted the cumulative regret, as shown using the red curves in the same plots. In the case of all three algorithms, the solid line shows the expected cumulative regret averaged across all 100 runs and the dotted lines show the minimum and the maximum cumulative regret that was observed across runs. Note that these plots are in log-log scale. The bottom plots in Figure 4 show the accuracy of all three algorithms across 100 runs, computed at the same times as the exploration horizons used for BTM and SAVAGE in the regret plots. Note that these plots are in lin-log scale.

These results clearly demonstrate that RUCB identifies the best arm more quickly, since it asymptotically accumulates 5 to 10 times less regret than Condorcet SAVAGE, while reaching higher levels of accuracy in roughly 20% of the time needed by Condorcet SAVAGE, all without knowing the horizon T. The contrast is even more stark when comparing to BTM.

7. Conclusions

This paper proposed a new method called Relative Upper Confidence Bound (RUCB) for the K-armed dueling bandit problem that extends the Upper Confidence Bound (UCB) algorithm to the relative setting by using optimistic estimates of the pairwise probabilities to choose a potential champion and conducting regular UCB with the champion as the benchmark. We proved finite-time high-probability and expected regret bounds of order O(log t) for our algorithm and evaluated it empirically in an information retrieval application.

Unlike existing results, our regret bounds hold for all time steps, rather than just a specific horizon T input to the algorithm. Furthermore, they rely on less restrictive assumptions or have better multiplicative constants than existing methods. Finally, the empirical results showed that RUCB greatly outperforms state-of-the-art methods.

In future work, we will consider two extensions to this research. First, building off extensions of UCB to the continuous bandit setting (Bubeck et al., 2011; de Freitas et al., 2012; Munos, 2011; Srinivas et al., 2010; Valko et al., 2013), we aim to extend RUCB to the continuous dueling bandit setting, without a convexity assumption as in (Yue & Joachims, 2009).
Second, building off Thompson Sampling (Agrawal & Goyal, 2012; Kauffmann et al., 2012; Thompson, 1933), an elegant and effective sampling-based alternative to UCB, we will investigate whether a sampling-based extension to RUCB would be amenable to theoretical analysis. Both these extensions involve overcoming not only the technical difficulties present in the regular bandit setting, but also those that arise from the two-stage nature of RUCB.

Acknowledgments

This research was partially supported by the European Community's Seventh Framework Programme (FP7/2007-2013) under grant agreement nr 288024 (LiMoSINe project), the Netherlands Organisation for Scientific Research (NWO) under project nrs 640.004.802, 727.011.005, 612.001.116, HOR-11-10, the Center for Creation, Content and Technology (CCCT), the QuaMerdes project funded by the CLARIN-nl program, the TROVe project funded by the CLARIAH program, the Dutch national program COMMIT, the ESF Research Network Program ELIAS, the Elite Network Shifts project funded by the Royal Dutch Academy of Sciences (KNAW), the Netherlands eScience Center under project number 027.012.105 and the Yahoo! Faculty Research and Engagement Program.

References

Agrawal, S. and Goyal, N. Analysis of Thompson sampling for the multi-armed bandit problem. In Conference on Learning Theory, pp. 1-26, 2012.

Audibert, J.-Y., Munos, R., and Szepesvári, C. Exploration-exploitation tradeoff using variance estimates in multi-armed bandits. Theor. Comput. Sci., 410(19):1876-1902, 2009.

Auer, P., Cesa-Bianchi, N., and Fischer, P. Finite-time analysis of the multiarmed bandit problem. Machine Learning, 47(2-3):235-256, 2002.

Bubeck, S., Munos, R., and Stoltz, G. Pure exploration in multi-armed bandits problems. In Algorithmic Learning Theory, 2009.

Bubeck, S., Munos, R., Stoltz, G., and Szepesvári, C. X-armed bandits. Journal of Machine Learning Research, 12:1655-1695, 2011.

Cappé, O., Garivier, A., Maillard, O.-A., Munos, R., and Stoltz, G. Kullback-Leibler upper confidence bounds for optimal sequential allocation. Annals of Statistics, 41(3):1516-1541, 2013.

Cesa-Bianchi, N. and Lugosi, G. Prediction, Learning, and Games. Cambridge University Press, New York, 2006.

Craswell, N., Zoeter, O., Taylor, M., and Ramsey, B. An experimental comparison of click position-bias models. In WSDM '08, pp. 87-94, 2008.

de Freitas, N., Smola, A., and Zoghi, M. Exponential regret bounds for Gaussian process bandits with deterministic observations. In ICML, 2012.

Fürnkranz, J. and Hüllermeier, E. (eds.). Preference Learning. Springer-Verlag, 2010.

Fürnkranz, J., Hüllermeier, E., Cheng, W., and Park, S.H. Towards preference-based reinforcement learning. Machine Learning, 2012.

Guo, F., Li, L., and Faloutsos, C. Tailoring click models to user goals. In WSCD '09, pp. 88-92, 2009a.

Guo, F., Liu, C., and Wang, Y. Efficient multiple-click models in web search. In WSDM '09, pp. 124-131, New York, NY, USA, 2009b. ACM.

Hofmann, K., Whiteson, S., and de Rijke, M. A probabilistic method for inferring preferences from clicks. In CIKM '11, pp. 249-258, USA, 2011. ACM.

Hofmann, K., Whiteson, S., and de Rijke, M. Balancing exploration and exploitation in listwise and pairwise online learning to rank for information retrieval. Information Retrieval, 16(1):63-90, 2013.

Joachims, T. Optimizing search engines using clickthrough data. In KDD '02, pp. 133-142, 2002.

Kauffmann, E., Korda, N., and Munos, R. Thompson sampling: an asymptotically optimal finite time analysis. In International Conference on Algorithmic Learning Theory, 2012.

Lai, T. L. and Robbins, H. Asymptotically efficient adaptive allocation rules. Advances in Applied Mathematics, 6(1):4-22, 1985.

Liu, T.-Y., Xu, J., Qin, T., Xiong, W., and Li, H. LETOR: Benchmark dataset for research on learning to rank for information retrieval. In LR4IR '07, in conjunction with SIGIR '07, 2007.

Manning, C., Raghavan, P., and Schütze, H. Introduction to Information Retrieval. Cambridge University Press, 2008.

Munos, R. Optimistic optimization of a deterministic function without the knowledge of its smoothness. In NIPS, 2011.

Radlinski, F., Kurup, M., and Joachims, T. How does clickthrough data reflect retrieval quality? In CIKM '08, pp. 43-52, 2008.

Srinivas, N., Krause, A., Kakade, S. M., and Seeger, M. Gaussian process optimization in the bandit setting: No regret and experimental design. In ICML, 2010.

Thompson, W.R. On the likelihood that one unknown probability exceeds another in view of the evidence of two samples. Biometrika, pp. 285-294, 1933.

Urvoy, T., Clerot, F., Féraud, R., and Naamane, S. Generic exploration and k-armed voting bandits. In ICML, 2013.

Valko, M., Carpentier, A., and Munos, R. Stochastic simultaneous optimistic optimization. In ICML, 2013.

Yue, Y. and Joachims, T. Interactively optimizing information retrieval systems as a dueling bandits problem. In ICML, 2009.

Yue, Y. and Joachims, T. Beat the mean bandit. In ICML, 2011.

Yue, Y., Broder, J., Kleinberg, R., and Joachims, T. The K-armed dueling bandits problem. Journal of Computer and System Sciences, 78(5):1538-1556, September 2012.

8. Appendix

Here we provide some details that were alluded to in the main body of the paper.

8.1. The Condorcet Assumption

As mentioned in Section 3, IF and BTM require the comparison probabilities p_ij to satisfy certain difficult-to-verify conditions. Specifically, IF and BTM require a total ordering {a_1, ..., a_K} of the arms to exist such that p_ij > 1/2 for all i < j. Here we provide evidence that this assumption is often violated in practice. By contrast, the algorithm we propose in Section 4 makes only the Condorcet assumption, which is implied by the total ordering assumption of IF and BTM.

In order to test how stringent an assumption the existence of a Condorcet winner is compared to the total ordering assumption, we estimated the probability of each assumption holding in our ranker evaluation application. Using the same preference matrix as in our experiments in Section 6, we computed for each K = 1, ..., 64 the probability P_K that a given K-armed dueling bandit problem obtained from considering K of our 64 feature rankers would have a Condorcet winner as follows: first, we calculated the number of K-armed dueling bandit problems that have a Condorcet winner by calculating for each feature ranker r how many K-armed dueling bandit problems it can be the Condorcet winner of: for each r, this is equal to $\binom{N_r}{K-1}$, where N_r is the number of rankers that r beats; next, we divided this total number of K-armed dueling bandit problems with a Condorcet winner by $\binom{64}{K}$, which is the number of all K-armed dueling bandit problems that one could construct from these 64 rankers. The probabilities P_K, plotted as a function of K in Figure 5 (the red curve), were all larger than 0.97.
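The counting argument behind P_K is easy to run. Below is our sketch of it (not code from the paper), with the binomial coefficients computed by math.comb; the preference matrix P is assumed given, e.g., the estimated 64 × 64 matrix from Section 6.

```python
import math

def condorcet_probability(P, K):
    """P_K: the fraction of K-element subsets of the rankers with a Condorcet
    winner.  Ranker r is the Condorcet winner of C(N_r, K-1) subsets of size K,
    where N_r = #{s : p_rs > 1/2} is the number of rankers that r beats."""
    M = len(P)
    total = sum(math.comb(sum(P[r][s] > 0.5 for s in range(M) if s != r), K - 1)
                for r in range(M))
    return total / math.comb(M, K)
```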
The same plot also shows an estimate of the probability that the total ordering assumption holds for a given K (the blue curve), which was obtained by randomly selecting 100,000 K-armed dueling bandit problems and searching for ones that satisfy the total ordering assumption. As can be seen from Figure 5, as K grows the probability that the total ordering assumption holds decreases rapidly. This is because there exist cyclical relationships between these feature rankers, and as soon as the chosen subset of feature rankers contains one of these cycles, it fails to satisfy the total ordering condition. By contrast, the Condorcet assumption will still be satisfied as long as the cycle does not include the Condorcet winner. Moreover, because of the presence of these cycles, the probability that the Condorcet assumption holds decreases initially as K increases, but then increases again because the number of all possible K-armed dueling bandit problems decreases as K approaches 64.

[Figure 5 omitted: the probability of satisfying the constraint, plotted against the size of the subset, for the Condorcet (red) and total ordering (blue) assumptions. Caption:] Figure 5. The probability that the Condorcet and the total ordering assumptions hold for subsets of the feature rankers. The probability is shown as a function of the size of the subset.

Furthermore, in addition to the total ordering assumption, IF and BTM each require a form of stochastic transitivity. In particular, IF requires strong stochastic transitivity: for any triple (i, j, k), with i < j < k, the following condition needs to be satisfied:

\[ p_{ik} \ge \max\{p_{ij}, p_{jk}\}. \]

BTM requires the less restrictive relaxed stochastic transitivity, i.e., that there exists a number γ ≥ 1 such that for all pairs (j, k) with 1 < j < k, we have

\[ \gamma\, p_{1k} \ge \max\{p_{1j}, p_{jk}\}. \]

As pointed out in (Yue & Joachims, 2011), strong stochastic transitivity is often violated in practice, a phenomenon also observed in our experiments: for instance, all of the K-armed dueling bandit problems on which we experimented require γ > 1.

Even though BTM permits a broader class of K-armed dueling bandit problems, it requires γ to be explicitly passed to it as a parameter, which poses substantial difficulties in practice. If γ is underestimated, the algorithm can in certain circumstances be misled with high probability into choosing the Borda winner instead of the Condorcet winner, e.g., when the Borda winner has a larger average advantage over the remaining arms than the Condorcet winner. On the other hand, though overestimating γ does not cause the algorithm to choose the wrong arm, it nonetheless results in a severe penalty, since it makes the algorithm much more exploratory, yielding the γ^7 term in the upper bound on the cumulative regret, as discussed in Section 3.
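For a given ordering of the arms (with the Condorcet winner first), the smallest admissible γ can be computed directly. The sketch below is our illustration, not the paper's procedure, and uses 0-based indices, so index 0 plays the role of a_1.

```python
def min_gamma(P):
    """Smallest gamma >= 1 with gamma * p_1k >= max(p_1j, p_jk)
    for all pairs (j, k) with 1 < j < k (0-based: 1 <= j < k)."""
    gamma, K = 1.0, len(P)
    for j in range(1, K):
        for k in range(j + 1, K):
            gamma = max(gamma, max(P[0][j], P[j][k]) / P[0][k])
    return gamma  # gamma == 1.0 means the relaxed condition needs no weakening
```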
[Figure 6 omitted: for a single pair (a_i, a_j), the evolution over time of the estimates µ_ij(t) and the confidence intervals [l_ij(t), u_ij(t)] around p_ij, with the comparison times τ_n^ij marked on the time axis. Caption:] Figure 6. An illustration of the idea behind Lemma 1 using an example of how the confidence intervals of a single pair of arms (a_i, a_j), and their relation to the comparison probability p_ij, might evolve over time. The time-step τ_m^ij denotes the m-th time when the arms a_i and a_j were chosen by RUCB to be compared against each other. We also define µ_m^ij := µ_ij(τ_m^ij). The time T is when the confidence intervals [l_ij(t), u_ij(t)] begin to include p_ij. The lemma then states that with probability 1 − δ, we have T ≤ C(δ). Moreover, for each time-step, the area of the shaded region under the vertical graphs is the bound given by the Chernoff-Hoeffding (CH) bound on the probability that the confidence interval will not contain p_ij. Note that the CH bound has the form e^{−(x−µ_n^ij)²} and so in order for this number to be the area under a graph (hence making it easier to illustrate in a figure), we have drawn the derivative of this function, f_n^ij(x) := |x − µ_n^ij| e^{−(x−µ_n^ij)²}, which is why the graphs are equal to 0 in the middle. Note that this does not mean that µ_n^ij has very low probability of being close to p_ij: the graphs drawn here are not the PDFs of the posteriors, but simply a manifestation of the bound given by the Chernoff-Hoeffding bound. More specifically, the property that they satisfy is that P(p_ij ∉ [l_ij(t), u_ij(t)]) ≤ ∫_{−∞}^{l_ij(t)} f_{N_ij(t)}^ij(x) dx + ∫_{u_ij(t)}^{∞} f_{N_ij(t)}^ij(x) dx.

8.2. Proof of Lemma 1

In this section, we prove Lemma 1, whose statement is repeated here for convenience. Recall from Section 5 that we assume without loss of generality that a_1 is the optimal arm. Moreover, given any K-armed dueling bandit algorithm, we define w_ij(t) to be the number of times arm a_i has beaten a_j in the first t iterations of the algorithm. We also define

\[ u_{ij}(t) := \frac{w_{ij}(t)}{w_{ij}(t)+w_{ji}(t)} + \sqrt{\frac{\alpha \ln t}{w_{ij}(t)+w_{ji}(t)}}, \]

where α is any positive constant, and l_ij(t) := 1 − u_ji(t). Moreover, for any δ > 0, define

\[ C(\delta) := \left( \frac{(4\alpha-1)K^2}{(2\alpha-1)\delta} \right)^{\frac{1}{2\alpha-1}}. \]

Lemma 1. Let P := [p_ij] be the preference matrix of a K-armed dueling bandit problem with arms {a_1, ..., a_K}, satisfying p_1j > 1/2 for all j > 1 (i.e., a_1 is the Condorcet winner). Then, for any dueling bandit algorithm and any α > 1/2 and δ > 0, we have

\[ P\left( \forall\, t > C(\delta),\; i, j,\; p_{ij} \in [l_{ij}(t), u_{ij}(t)] \right) > 1 - \delta. \tag{7} \]

Proof. To decompose the lefthand side of (7), we introduce the notation G_ij(t) for the "good" event that at time t we have p_ij ∈ [l_ij(t), u_ij(t)], which satisfies the following:

(i) G_ij(t) = G_ji(t) because of the triple of equalities (p_ji, l_ji(t), u_ji(t)) = (1 − p_ij, 1 − u_ij(t), 1 − l_ij(t)).

(ii) G_ii(t) always holds, since (p_ii, l_ii(t), u_ii(t)) = (1/2, 1/2, 1/2). Together with (i), this means that we only need to consider G_ij(t) for i < j.

(iii) Define τ_n^ij to be the iteration at which arms i and j were compared against each other for the n-th time. If G_ij(τ_n^ij + 1) holds, then the events G_ij(t) hold for all t ∈ (τ_n^ij, τ_{n+1}^ij], because when t ∈ (τ_n^ij, τ_{n+1}^ij], w_ij and w_ji remain constant and so in the expressions for u_ij(t) and u_ji(t) only the ln t changes, which is a monotonically increasing function of t. So, we have l_ij(t) ≤ l_ij(τ_n^ij + 1) ≤ p_ij ≤ u_ij(τ_n^ij + 1) ≤ u_ij(t). Moreover, the same statement holds with τ_n^ij replaced by any T ∈ (τ_n^ij, τ_{n+1}^ij], i.e., if we know that G_ij(T) holds, then G_ij(t) also holds for all t ∈ [T, τ_{n+1}^ij]. This is illustrated in Figure 6.

The above facts allow us to lower-bound the probability in (7) by

\[ P\left( \forall\, t > T, i, j,\; G_{ij}(t) \right) \ge P\left( \forall\, i < j,\; G_{ij}(T) \text{ and } \forall\, n \text{ s.t. } \tau_n^{ij} > T,\; G_{ij}(\tau_n^{ij}) \right). \tag{8} \]

Let us now flip things around and look at the complement of these events, i.e. the "bad" event B_ij(t) that p_ij ∉ [l_ij(t), u_ij(t)] occurs. Then, subtracting both sides of Equation (8) from 1 and using the union bound gives

\[ P\left( \exists\, t > T, i, j \text{ s.t. } B_{ij}(t) \right) \le \sum_{i<j} P\left( B_{ij}(T) \text{ or } \exists\, n \text{ s.t. } \tau_n^{ij} > T \text{ and } B_{ij}(\tau_n^{ij}) \right). \]

Further decomposing the righthand side using union bounds and making the conditions explicit, we get

\[ P\left( \exists\, t > T, i, j \text{ s.t. } B_{ij}(t) \right) \le \sum_{i<j} \left[ P\Big( \big| p_{ij} - \mu^{ij}_{N_{ij}(T)} \big| > \sqrt{\tfrac{\alpha \ln T}{N_{ij}(T)}} \Big) + P\Big( \exists\, n \le T \text{ s.t. } \tau_n^{ij} > T \text{ and } \big| p_{ij} - \mu^{ij}_n \big| > \sqrt{\tfrac{\alpha \ln \tau_n^{ij}}{n}} \Big) + P\Big( \exists\, n > T \text{ s.t. } \big| p_{ij} - \mu^{ij}_n \big| > \sqrt{\tfrac{\alpha \ln \tau_n^{ij}}{n}} \Big) \right], \]

where µ_n^ij := w_ij(τ_n^ij) / (w_ij(τ_n^ij) + w_ji(τ_n^ij)) is the frequentist estimate of p_ij after n comparisons between arms a_i and a_j. Now, in the above sum, we can upper-bound the first term by looking at the higher probability event that B_ij(T) happens for any possible number of comparisons between a_i and a_j: since N_ij(T) ≤ T, we can replace N_ij(T) with a variable n that can take values between 0 and T. For the second term, we know that τ_n^ij > T, so we can replace τ_n^ij with T, remove the condition τ_n^ij > T, and look at all n ≤ T.
For the third term, since we always have that n < τ_n^ij, we can replace τ_n^ij with n and get a higher probability event. Putting all of this together, we get the looser bound

\[ P\left( \exists\, t > T, i, j \text{ s.t. } B_{ij}(t) \right) \le \sum_{i<j} \left[ 2\, P\Big( \exists\, n \in \{0, \ldots, T\} : \big| p_{ij} - \mu^{ij}_n \big| > \sqrt{\tfrac{\alpha \ln T}{n}} \Big) + P\Big( \exists\, n > T \text{ s.t. } \big| p_{ij} - \mu^{ij}_n \big| > \sqrt{\tfrac{\alpha \ln n}{n}} \Big) \right]. \tag{9} \]

To bound the expression on line (9), we apply the Chernoff-Hoeffding bound, which in its simplest form states that given i.i.d. random variables X_1, ..., X_n, whose support is contained in [0, 1] and whose expectation satisfies E[X_k] = p, and defining µ_n := (X_1 + ··· + X_n)/n, we have P(|µ_n − p| > a) ≤ 2e^{−2na²}. This gives us

\[ P\left( \exists\, t > T, i, j \text{ s.t. } B_{ij}(t) \right) \le \frac{K(K-1)}{2} \left[ \sum_{n=1}^{T} \frac{4}{T^{2\alpha}} + \sum_{n=T+1}^{\infty} \frac{2}{n^{2\alpha}} \right] \le \frac{2K^2}{T^{2\alpha-1}} + K^2 \int_T^{\infty} \frac{dx}{x^{2\alpha}} \quad \Big(\text{since } \tfrac{1}{x^{2\alpha}} \text{ is decreasing}\Big) \]
\[ = \frac{2K^2}{T^{2\alpha-1}} + K^2 \left[ \frac{x^{1-2\alpha}}{1-2\alpha} \right]_T^{\infty} = \frac{2K^2}{T^{2\alpha-1}} + \frac{K^2}{(2\alpha-1)T^{2\alpha-1}} = \frac{(4\alpha-1)K^2}{(2\alpha-1)T^{2\alpha-1}}. \tag{10} \]

Now, since C(δ) = ((4α−1)K²/((2α−1)δ))^{1/(2α−1)} for each δ > 0, the bound in (10) gives us (7).

On Constraint Satisfaction Problems below P

arXiv:1312.4428v2 [cs.CC] 17 Dec 2013

László Egri (Institute for Computer Science and Control, Hungarian Academy of Sciences (MTA SZTAKI), Budapest, Hungary; laszlo.egri@mail.mcgill.ca). Research supported by NSERC, FQRNT, and ERC Starting Grant PARAMTIGHT (No. 280152).

Abstract. Symmetric Datalog, a fragment of the logic programming language Datalog, is conjectured to capture all constraint satisfaction problems (CSP) in L. Therefore developing tools that help us understand whether or not a CSP can be defined in symmetric Datalog is an important task. It is widely known that a CSP is definable in Datalog and linear Datalog if and only if that CSP has bounded treewidth and bounded pathwidth duality, respectively. In the case of symmetric Datalog, Bulatov, Krokhin and Larose ask for such a duality (2008). We provide two such dualities, and give applications. In particular, we give a short and simple new proof of the result of Dalmau and Larose that "Maltsev + Datalog ⇒ symmetric Datalog" (2008).
In the second part of the paper, we provide some evidence for the conjecture of Dalmau (2002) that every CSP in NL is definable in linear Datalog. Our results also show that a wide class of CSPs (CSPs which do not have bounded pathwidth duality, e.g., the P-complete Horn-3Sat problem) cannot be defined by any polynomial size family of monotone read-once nondeterministic branching programs.

1 Introduction

Constraint satisfaction problems (CSP) constitute a unifying framework to study various computational problems arising naturally in various branches of computer science, including artificial intelligence, graph homomorphisms, and database theory. Loosely speaking, an instance of a CSP consists of a list of variables and a set of constraints, each specified by an ordered tuple of variables and a constraint relation over some specified domain. The goal is then to determine whether variables can be assigned domain values such that all constraints are simultaneously satisfied.

Recent efforts have been directed at classifying the complexity of the so-called nonuniform CSP. For a fixed finite set of finite relations Γ, CSP(Γ) denotes the nonuniform CSP corresponding to Γ. The difference between an instance of CSP(Γ) and an instance of the general CSP is that constraints in an instance of CSP(Γ) take the form (x_{i_1}, ..., x_{i_k}) ∈ R for some R ∈ Γ. Examples of nonuniform CSPs include k-Sat, Horn-3Sat, Graph H-Coloring, and many others.

For a relational structure B, the homomorphism problem HOM(B) takes a structure A as input, and the task is to determine if there is a homomorphism from A to B. For instance, consider structures that contain a single symmetric binary relation, i.e., graphs. A homomorphism from a graph G to a graph H is a mapping from V_G to V_H such that any edge of G is mapped to an edge of H. If H is a graph with a single edge then HOM(H) is the set of graphs which are two-colorable. There is a well-known and straightforward correspondence between the CSP and the homomorphism problem. For this reason, from now on we work only with the homomorphism problem instead of the CSP. Nevertheless, we call HOM(B) a CSP and we also write CSP(B) instead of HOM(B), as it is often done in the literature.
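As a concrete (brute-force) illustration of HOM(H), the Python sketch below decides whether a homomorphism exists by enumerating all vertex maps; with H a single symmetric edge it decides two-colorability, as in the example above. This is our illustration, not an algorithm from the paper.

```python
from itertools import product

def hom_exists(G_vertices, G_edges, H_vertices, H_edges):
    """Is there f : V_G -> V_H mapping every edge of G to an edge of H?"""
    H_edge_set = set(H_edges)
    for images in product(H_vertices, repeat=len(G_vertices)):
        f = dict(zip(G_vertices, images))
        if all((f[u], f[v]) in H_edge_set for (u, v) in G_edges):
            return True
    return False

# H = a single (symmetric) edge; a triangle is not two-colorable:
H = ([0, 1], [(0, 1), (1, 0)])
print(hom_exists([0, 1, 2], [(0, 1), (1, 2), (2, 0)], *H))  # False
```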
The CSP is of course NP-complete, and therefore research has focused on identifying "islands" of tractable CSPs. The well-known CSP dichotomy conjecture of Feder and Vardi [13] states that every CSP is either tractable or NP-complete, and progress towards this conjecture has been steady during the last fifteen years. From a complexity-theoretic perspective, the classification of CSP(B) as being in P or being NP-complete is rather coarse and therefore somewhat dissatisfactory. Consequently, understanding the fine-grained complexity of CSPs gained considerable attention during the last few years. Ultimately, one would like to know the precise complexity of a CSP lying in P, i.e., to identify a "standard" complexity class for which a given CSP is complete. Towards this, it was established that Schaefer's P-NP dichotomy for Boolean CSPs [24] can indeed be refined: each CSP over the Boolean domain is either definable in first order logic, or complete for one of the classes L, NL, ⊕L, P or NP under AC⁰-reductions [2]. The question whether some form of this fine-grained classification extends to non-Boolean domains is rather natural.

The two most important tools to study CSPs whose complexity is below P are symmetric Datalog and linear Datalog, syntactic restrictions of the database-inspired logic programming language Datalog. We say that co-CSP(B), the complement of CSP(B), is definable in (linear, symmetric) Datalog if the set of structures that do not homomorphically map to B is accepted by a (linear, symmetric) Datalog program. (The reason we define co-CSP(B) instead of CSP(B) in (linear, symmetric) Datalog is a technicality explained in Section 2.5.) Symmetric Datalog programs can be evaluated in logarithmic space (L), and in fact, it is conjectured that if co-CSP(B) is in L then it can also be defined in symmetric Datalog [11]. There is a considerable amount of evidence supporting this conjecture (see, for example, [11, 10, 9, 20, 6]), and therefore providing tools to show whether co-CSP(B) can be defined in symmetric Datalog is an important task.

It is well known and easy to see that for any structure B, there is a set of structures O, called an obstruction set, such that a structure A homomorphically maps to B if and only if there is no structure in O that homomorphically maps to A. In fact, there are many possible obstruction sets for any structure B. We say that B has duality X if B has an obstruction set which has the special property X. The following two well-known theorems relate definability of co-CSP(B) in Datalog and linear Datalog to B having bounded treewidth and bounded pathwidth duality, respectively:

1. co-CSP(B) is definable in Datalog if and only if B has bounded treewidth duality [13];
2. co-CSP(B) is definable in linear Datalog if and only if B has bounded pathwidth duality [7].

It was stated as an open problem in [4] to find a duality for symmetric Datalog in the spirit of the previous two theorems. We provide two such dualities: symmetric bounded pathwidth duality (SBPD) and piecewise symmetric bounded pathwidth duality (PSBPD). We note that SBPD is a special case of PSBPD. For both bounded treewidth and bounded pathwidth duality, the structures in the obstruction sets are restricted to have some special form. For SBPD and PSBPD the situation is a bit more subtle. In addition to requiring that the obstruction sets contain structures only of a special form (they must have bounded pathwidth), the obstruction sets must also possess a certain "symmetric closure" property. To the best of our knowledge, this is the first instance of a duality where, in addition to the local requirement that each structure must be of a certain form, the set must also satisfy an interesting global requirement.

Using SBPD, we give a short and simple new proof of the main result of [9] that "Maltsev + Datalog ⇒ symmetric Datalog". Considering the simplicity of this proof, we suspect that SBPD (or PSBPD) could be a useful tool in an attempt to prove the algebraic symmetric Datalog conjecture [20], a conjecture that proposes an algebraic characterization of all CSPs lying in L.
An equivalent form of this conjecture is that "Datalog + n-permutability ⇒ symmetric Datalog" (by combining results from [18, 3, 21]), where n-permutability is a generalization of the Maltsev condition.

One way to gain more insight into the dividing line between CSPs in L and NL is through studying the complexity of CSPs corresponding to oriented paths. It is known that all these CSPs are in NL (by combining results from [12, 8, 7]), and it is natural to ask whether there are oriented paths for which the CSP is NL-complete, and ones for which it is L-complete. We provide two classes of oriented paths, C1 and C2, such that for any B1 ∈ C1 the corresponding CSP is NL-complete, and for any B2 ∈ C2 the corresponding CSP is in L. In fact, it can be seen with the help of [20] that for most B2 ∈ C2, CSP(B2) is L-complete. To prove the membership of CSP(B2) in L (for B2 ∈ C2), we use PSBPD in an essential way. One can hope to build on this work to achieve an L-NL dichotomy for oriented paths.

In the second part of the paper, we investigate CSPs in NL. Based on the observation that any CSP known to be in NL is also known to be definable by a linear Datalog program, Dalmau conjectured that every CSP in NL can be defined by a linear Datalog program [7]. Linear Datalog(suc,¬) (linDat(suc,¬)) denotes the extension of linear Datalog in which we allow negation and access to an order over the domain of the input. It is known that any problem in NL can be defined by a linDat(suc,¬) program [7, 15, 19], and therefore one way to prove the above conjecture would be to show that any CSP that can be defined by a linDat(suc,¬) program can also be defined by a linear Datalog program. We consider a restriction of the conjecture because proving it in its full generality would separate NL from P (using [1]).

Read-once linear Datalog(suc) (1-linDat(suc)) is a subclass of linDat(suc,¬), but a subclass that has interesting computational abilities, and for which we are able to find the chink in the armor. We can easily define some NL-complete problems in 1-linDat(suc), such as the CSP directed st-connectivity (st-Conn), and also problems that are not homomorphism-closed, such as determining if the input graph is a clique on 2n vertices, n ≥ 1. Because any problem that can be defined with a linear Datalog program must be homomorphism-closed, it follows that 1-linDat(suc) can define nontrivial problems which are in NL but which are not definable by any linear Datalog program.
More precisely, we show that if co-CSP(B) can be defined by a poly-size family of read-once2 monotone nondeterministic branching programs (mnBP1(poly)) then co-CSP(B) can also be defined by a linear Datalog program.3 Finally, our results can be interpreted as lower-bounds on a wide class of CSPs: if B does not have bounded pathwidth duality, then co-CSP(B) cannot be defined with any 1linDat(suc) program or with any mnBP1(poly). A specific example of such a CSP would be the P-complete Horn-3Sat problem, and more generally, Larose and Tesson showed that any CSP whose associated variety admits the unary, affine or semilattice types does not have bounded pathwidth duality (see [20] for details). 2 Preliminaries 2.1 Basic Definitions A vocabulary (or signature) is a finite set of relation symbols with associated arities. The arity function is denoted with ar(\u00b7). If A is a relational structure over a vocabulary \u03c4 , then RA denotes the relation of A associated with the symbol R \u2208 \u03c4 . The lightface equivalent of the name of the structure denotes the universe of the structure, e.g., the universe of A is A. \u02dc over a vocabulary \u03c4 is a pair (A, \u02dc T ): T is a set of pairs of the form A tuple structure A \u02dc i.e., A\u02dc contains (R, t), where R \u2208 \u03c4 and t is an ar(R)-tuple, and A\u02dc is the domain of A, every element that appears in some tuple t, and possibly some other elements. Slightly \u02dc to mean (R, t) \u2208 T , where A \u02dc = (A, \u02dc T ). Clearly, abusing notation, we write (R, t) \u2208 A tuple structures are equivalent to relational structures. If A is a relational structure, we \u02dc and vice versa. For convenience, we use the denote the equivalent tuple structure with A, two notations interchangeably. We note that all structures in this paper are finite. Let B be a structure of the same signature \u03c4 as A. The union A \u222a B of A and B is the \u03c4 -structure whose universe is A \u222a B, and for each R \u2208 \u03c4 , RA\u222aB is defined as RA \u222a RB . (Note that it is possible that A \u2229 B = \u2205.) A homomorphism from A to B is a map f from A to B such that f (RA ) \u2286 RB for each R \u2208 \u03c4 . If there exists a homomorphism from A to B, we f often denote it with A \u2192 B. If that homomorphism is f , we write A \u2212\u2192 B. A structure is called a core if it has no homomorphism to any of its proper substructures. A retract of a structure B is an induced substructure B of B such that there is a homomorphism g : B \u2192 B with g(b) = b for every b \u2208 B . A retract of B that has minimal size among 2 Our read-once restriction for nondeterministic branching programs is less stringent than the usual restriction because we require the programs to be read-once only on certain inputs. 3 A 1-linDat(suc) can be converted into an mnBP1(poly), so another way to present our results would be to do the proofs in the context of mnBP1s, and then to conclude the parallel result for 1-linDat(suc). 4 all retracts of B is called a core of B. It is well known that all cores of a structure are isomorphic, and so one speaks of the core of a structure B, core(B). We denote by CSP(B) the set {A | A is a \u03c4 -structure such that A \u2192 B}, and by co-CSP(B) the complement of CSP(B), i.e., the set {A | A is a \u03c4 -structure such that A \u2192 B}. If we are given a class of \u03c4 -structures C such that for any A \u2208 C, and any B such that A \u2192 B it holds that B \u2208 C, then we say that C is homomorphism-closed. 
Isomorphism closure is defined in a similar way. An n-ary operation on a set A is a map f : An \u2192 A. Given an h-ary relation R and an n-ary operation f on the same set A, we say that f preserves R or that R is invariant under f if the following holds: given any matrix M of size h \u00d7 n whose columns are in R, applying f to the rows of M produces an h-tuple in R. A polymorphism of a structure B is an operation f that preserves each relation in B. Definition 1 (Maltsev Operation). A ternary operation f : A3 \u2192 A on a finite set A is called a Maltsev operation if it satisfies the following identities: f (x, y, y) = f (y, y, x) = x, \u2200x, y \u2208 A. 2.2 Datalog We provide only an informal introduction to Datalog and its fragments, and the reader can find more details, for example, in [22, 7, 11]. Datalog is a database-inspired query language whose connection with CSP-complexity is now relatively well understood (see, e.g., [3]). Let \u03c4 be some finite vocabulary. A Datalog program over \u03c4 is specified by a finite set of rules of the form h \u2190 b1 \u2227 \u00b7 \u00b7 \u00b7 \u2227 bt , where h and the bi are atomic formulas R(x1 , . . . , xk ). When we specify the variables of an atomic formula, we always list the variables from left to right, or we simply provide a tuple x of variables whose i-th variable is x[i]. We distinguish two types of relational predicates occurring in a Datalog program: predicates that occur at least once in the head of a rule (i.e., its left-hand side) are called intensional database predicates (IDBs) and are not in \u03c4 . The predicates which occur only in the body of a rule (its right-hand side) are called extensional database predicates (EDBs) and must all lie in \u03c4 . A rule that contains no IDB in the body is called a nonrecursive rule, and a rule that contains at least one IDB in the body is called a recursive rule. A Datalog program contains a distinguished IDB of arity 0 which is called the goal predicate; a rule whose head IDB is a goal IDB is called a goal rule. Linear Datalog is a syntactic restriction of Datalog in which there is at most one IDB in the body of each rule. The class of linear Datalog programs that contains only rules with at most k variables and IDBs with at most j \u2264 k variables is denoted by linear (j, k)-Datalog. We say that the width of such a linear Datalog program is (j, k). Symmetric Datalog is a syntactic restriction of linear Datalog. A linear Datalog program \u00af P is symmetric if for any recursive rule I(x) \u2190 J(y) \u2227 E(z) of P (except for goal rules), \u00af where E(z) is a shorthand for the conjunction of the EDBs of the rule over variables in z, \u00af the symmetric pair J(y) \u2190 I(x) \u2227 E(z) of that rule is also in P. The width of a symmetric Datalog program is defined similarly to the width of a linear Datalog program. We explain the semantics of linear (symmetric) Datalog using derivations (it could also be explained with fixed point operators, but that would be inconvenient for the proofs). Let 5 P be a linear Datalog program with vocabulary \u03c4 . A P-derivation with codomain D is a sequence of pairs D = (\u03c11 , \u03bb1 ), . . . , (\u03c1q , \u03bbq ), where \u03c1 is a rule of P, and \u03bb is a function from the variables V of \u03c1 to D, \u2200 \u2208 [q]. The sequence D must satisfy the following properties. Rule \u03c11 is nonrecursive, and \u03c1q is a goal rule. 
For all \u2208 [q \u2212 1], the head IDB I of \u03c1 is the IDB in the body of \u03c1 +1 , and if the variables of I in the head of \u03c1 and the body of \u03c1 +1 are x and y, respectively, then \u03bb (x[i]) = \u03bb +1 (y[i]), \u2200i \u2208 [ar(I)]. Let D be a derivation. Let R(z) be an EDB (with variables z) appearing in some rule \u03c1 of D. We write R(t) to denote that \u03bb (z) = t, i.e., that \u03bb instantiates the variables of R(z) to t. If R(z) appears in some rule \u03c1 of D and \u03bb (z) = t, we say that R(t) appears in \u03c1 , or less specifically, that R(t) appears in D. Given a structure A and a derivation D with codomain A for a program P, we say that D \u02dc The notation is a derivation for A if for every R(t) that appears in a rule of D, (R, t) \u2208 A. for a P-derivation for a structure A will have the form DP (A). A linear (symmetric) Datalog program P accepts an input structure A if there exists a P-derivation for A. Definition 2 (Read-once Derivation). We say that a derivation D is read-once if every R(t) that appears in D appears exactly once in D, except when R is the special EDB suc, first, or last, defined in Section 4. An example is given in Figure 1. The vocabulary is \u03c4 = {E 2 , S 1 , T 1 }, where the superscripts denote the arity of the symbols. Notice that in the symmetric Datalog program P, rules (2) and (3) form a symmetric pair. It is not difficult to see that P accepts a \u03c4 -structure A if and only if there is an oriented path (see Section 3.1) in E A from an element in S A to an element in T A . 2.3 Path Decompositions and Derivations Definition 3. [Path Decomposition] Let S be a \u03c4 -structure. A (j, k)-path decomposition (or path decomposition of width (j, k)) of S is a sequence S0 , . . . , Sn\u22121 of subsets of S such that \u02dc \u2203 \u2208 {0, . . . , n \u2212 1} such that {a1 , . . . , aar(R) } \u2286 S ; 1. For every (R, (a1 , . . . , aar(R) )) \u2208 S, 2. If a \u2208 Si \u2229 Si (i < i ) then a \u2208 S for all i < < i ; 3. \u2200 \u2208 {0, . . . , n \u2212 1}, |S | \u2264 k, and \u2200 \u2208 {0, . . . , n \u2212 2}, |S \u2229 S +1 | \u2264 j. For ease of notation, it will be useful to introduce a concept closely related to path decompositions. Let \u03c4 be a vocabulary. Let S be a \u03c4 -structure that can be expressed as S = S0 \u222a \u00b7 \u00b7 \u00b7 \u222a Sn\u22121 , where the S0 , . . . , Sn\u22121 (the universes of the Si ) satisfy properties 2 and 3 above. We say that S is a (j, k)-path, and that (S0 , . . . , Sn\u22121 ) is a (j, k)-path representation of S. We denote (j, k)-path representations with script letters, e.g., S = (S0 , . . . , Sn\u22121 ). The substructure Si \u222a \u00b7 \u00b7 \u00b7 \u222a Si of S (assuming a (j, k)-representation is fixed) is denoted by S[i,i ] . We call n the length of the representation. Obviously, a structure is a (j, k)-path if and only if it admits a (j, k)-path decomposition. Let D = (\u03c11 , \u03bb1 ), . . . , (\u03c1q , \u03bbq ) be a derivation for some linear or symmetric program P with vocabulary \u03c4 . We can extract from D a \u03c4 -structure Ex(D) such that D is a derivation \u02dc for each R(t) that appears in D for Ex(D). 
Let D = (ρ_1, λ_1), ..., (ρ_q, λ_q) be a derivation for some linear or symmetric program P with vocabulary τ. We can extract from D a τ-structure Ex(D) such that D is a derivation for Ex(D). We specify Ex(D) as a tuple structure Ã: for each R(t) that appears in D (R ∈ τ), we add the pair (R, t) to Ã, and set Ã to be the set of those elements that appear in a tuple.

The program P of Figure 1 is:

  I(x) ← S(x)              (1)
  I(y) ← I(x) ∧ E(x, y)    (2)
  I(x) ← I(y) ∧ E(x, y)    (3)
  G ← I(x) ∧ T(x)          (4)

[Figure 1 omitted: the input structure G with S^G = {a}, T^G = {d}, whose binary relation E^G is the digraph with edges E(a, b), E(c, b), E(c, d) (and further vertices e, f, g), together with a visualization of the derivation. Caption:] Figure 1: Top left: Symmetric Datalog program P. Top right: Input structure G where the binary relation E^G is specified by the digraph. Bottom: Visualization of a P-derivation D_P(G) = (ρ_1, λ_1), ..., (ρ_5, λ_5) for G, where ρ_1 is the nonrecursive rule (1), ρ_2, ρ_4 are rule (2), ρ_3 is rule (3), and ρ_5 is the goal rule (4); the assignments are λ_1(x) = a; λ_2(x) = a, λ_2(y) = b; λ_3(y) = b, λ_3(x) = c; λ_4(x) = c, λ_4(y) = d; λ_5(x) = d. In the diagram, for example, the dashed box corresponds to rule ρ_2, and it is the rule I(y) ← I(x) ∧ E(x, y) of P, where λ_2 assigns a to variable x and b to variable y. Observe that D_P(G) is read-once.

Let D = (ρ_1, λ_1), ..., (ρ_q, λ_q) be a derivation. For each x that is in a rule ρ_ℓ for some ℓ ∈ [q], call x^ℓ the indexed version of x. We define an equivalence relation Eq(D) on the set of indexed variables of D. First we define a graph G = (V, E) as:

- V is the set of all indexed versions of variables in D;
- (x^ℓ, y^{ℓ′}) ∈ E if ℓ′ = ℓ + 1, x is the i-th variable of the head IDB I of ρ_ℓ, and y is the i-th variable of the body IDB I of ρ_{ℓ+1}.

Two indexed variables x^ℓ and y^{ℓ′} are related in Eq(D) if they are connected in G. Observe that if C = {x_1^{ℓ_1}, x_2^{ℓ_2}, ..., x_c^{ℓ_c}} is a connected component of G, then it must be that λ_{ℓ_1}(x_1) = λ_{ℓ_2}(x_2) = ··· = λ_{ℓ_c}(x_c).

Definition 4 (Free Derivation). Let P be a linear Datalog program and D = (ρ_0, λ_0), ..., (ρ_q, λ_q) be a derivation for P. Then D is said to be free if for any two indexed variables x^ℓ and y^{ℓ′} that are not related in Eq(D), we have λ_ℓ(x) ≠ λ_{ℓ′}(y).

Intuitively, this definition says that D is free if any two variables in D which are not "forced" to have the same value are assigned different values.
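To make the preceding definitions concrete, here is a least-fixpoint evaluation of the program P of Figure 1 in Python (our sketch; the paper defines acceptance via derivations, but the two semantics coincide for this program):

```python
def accepts(S, T, E):
    """Evaluate Figure 1's program: seed I with S (rule (1)), close I under
    E-edges in both directions (rules (2) and (3)), and accept if I meets T
    (rule (4)).  This is reachability ignoring edge orientation."""
    adj = {}
    for (u, v) in E:
        adj.setdefault(u, set()).add(v)
        adj.setdefault(v, set()).add(u)
    I, frontier = set(S), list(S)
    while frontier:
        x = frontier.pop()
        for y in adj.get(x, ()):
            if y not in I:
                I.add(y)
                frontier.append(y)
    return bool(I & set(T))

# The structure G of Figure 1: edges a->b, c->b, c->d, with S = {a}, T = {d}.
print(accepts({'a'}, {'d'}, [('a','b'), ('c','b'), ('c','d')]))  # True
```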
2.5 Defining CSPs The following discussion applies not just to Datalog but also to its symmetric and linear fragments. It is easy to see that the class of structures accepted by a Datalog program is homomorphism-closed, and therefore it is not possible to define CSP(B) in Datalog. However, co-CSP(B) is closed under homomorphisms, and in fact, it is often possible to define co-CSP(B) in Datalog. The following definition is key. Definition 5 (Obstruction Set). A set O of \u03c4 -structures is called an obstruction set for B, if for any \u03c4 -structure A, A \u2192 B if and only if there exists S \u2208 O such that S \u2192 A. In other words, an obstruction set defines co-CSP(B) implicitly as A \u2208 co-CSP(B) if and only if there exists S \u2208 O such that S \u2192 A. If O above can be chosen to have property X, then we say that B has X-duality. In the next section we show that co-CSP(B) is definable in symmetric Datalog if and only if B has symmetric bounded pathwidth duality. 3 3.1 On CSPs in symmetric Datalog Definitions An oriented path is a digraph obtained by orienting the edges of an undirected path. In other words, an oriented path has vertices v0 , . . . , vq+1 and edges e0 , . . . , eq , where ei is either (vi , vi+1 ), or (vi+1 , vi ). The length of an oriented path is the number of edges it contains. We call (vi , vi+1 ) a forward edge and (vi+1 , vi ) a backward edge. Oriented paths can be thought of as relational structures over the vocabulary {E 2 }, so we denote them with boldface letters. For an oriented path P, we can find a mapping level : P \u2192 {0, 1, 2, . . . } such that level(b) = level(a)+1 whenever (a, b) is an edge of P. Clearly, there is a unique such mapping with the smallest possible values. The level of an edge (a, b) of P is level(a), i.e., the level of the starting vertex of (a, b). The height(P) of an oriented path P is maxa\u2208P level(a). Let P be an oriented path that has a vertex u with indegree 0 and outdegree 1, and a vertex v with indegree 1 and outdegree 0. We say that P is minimal if u is in the bottommost level and v is in the topmost level, and there are no other vertices of P in the bottommost or the topmost levels. 8 A zigzag operator \u03be takes a (j, k)-path representation S = (S0 , . . . , Sn\u22121 ) of a (j, k)-path S and a minimal oriented path P = e0 , . . . , eq such that height(P) = n, and it returns another (j, k)-path \u03be(S , P). Intuitively, \u03be(S , P) is the (j, k)-path S \u201cmodulated\u201d by P such that the forward and backward edges ei of P are mimicked in \u03be(S , P) by \u201cforward and backward\u201d copies of Slevel(ei ) . Before the formal definition, it could help the reader to look at the right side of Figure 2, where the oriented path used to modulate the (j, k)-path over the vocabulary E 2 (i.e., digraphs) with representation (S0 , S1 , S2 ) is P on the left side. The left side is a more abstract example, and the reader might find it useful after reading the definition. We inductively define the (j, k)-path \u03be(S , P) as (Se0 , Se1 , . . . , Seq ) together with a sequence of isomorphisms \u03d5e0 , \u03d5e1 , . . . , \u03d5eq , where \u03d5ei is an isomorphism from Sei to Slevel(ei ) , 0 \u2264 i \u2264 q. For the base case, we define Se0 to be an isomorphic copy of S0 , and \u03d5e0 to be the isomorphism that maps Se0 back to S0 . Assume inductively that Se0 , . . . , Sei\u22121 and \u03d5e0 , . . . , \u03d5ei\u22121 are already defined. 
Let S′_{e_i} be an isomorphic copy of S_{level(e_i)} with domain disjoint from S_{e_0} ∪ ··· ∪ S_{e_{i−1}}, and fix φ′_{e_i} to be the isomorphism that maps back S′_{e_i} to S_{level(e_i)}. We "glue" S′_{e_i} to S_{e_{i−1}} by renaming some elements of S′_{e_i} to elements of S_{e_{i−1}}. To facilitate understanding, we can think of the already constructed structures S_{e_0}, ..., S_{e_{i−1}} as labels of the edges e_0, ..., e_{i−1} of P, respectively, and we want to determine S_{e_i}, the label of the next edge. The connection between S_{e_{i−1}} and S_{e_i} will be defined such that S_{e_{i−1}} and S_{e_i} "mimic" the orientation of the edges e_{i−1} and e_i.

We resume our formal definition. Set ℓ = level(e_i), and let ℓ′ = ℓ − 1 if e_i is a forward edge, and ℓ′ = ℓ + 1 if e_i is a backward edge. If an element x ∈ S′_{e_i} and an element y ∈ S_{e_{i−1}} are both copies of the same element a ∈ S_ℓ ∩ S_{ℓ′}, then rename x to y in S′_{e_i}. After all such elements are renamed, S′_{e_i} becomes S_{e_i}. That is, for all a ∈ S_ℓ ∩ S_{ℓ′}, rename φ′^{−1}_{e_i}(a) in S′_{e_i} to φ^{−1}_{e_{i−1}}(a) to obtain S_{e_i}. We define the isomorphism φ_{e_i} from S_{e_i} to S_{level(e_i)} as:

\[ \varphi_{e_i}(x) = \begin{cases} \varphi'_{e_i}(x) & \text{if } x \in S_{e_i} \setminus S_{e_{i-1}} \\ \varphi_{e_{i-1}}(x) & \text{if } x \in S_{e_i} \cap S_{e_{i-1}}. \end{cases} \]

3.2 Two Dualities for Symmetric Datalog

The two main theorems (Theorems 9 and 15) of this section can be combined to obtain the equivalence of the statements (1), (3) and (4) in Theorem 6 below. The proof of the implication (1) → (2) is a direct adaptation of the proof of the result from [13] that if co-CSP(B) is defined by a (j, k)-Datalog program, then it is also defined by the canonical (j, k)-Datalog program (see also [9]). Note that (1) → (2) is also obvious from the proof of Theorem 9 below.

Theorem 6. For a finite structure B, TFAE:

1. There is a symmetric Datalog program that defines co-CSP(B);
2. The canonical symmetric (j, k)-Datalog program defines co-CSP(B);
3. B has symmetric bounded pathwidth duality (for some parameters);
4. B has piecewise symmetric bounded pathwidth duality (for some parameters).

[Figure 2 omitted: on the left, an abstract example of applying a zigzag operator to a (j, k)-path S with representation (S_0, S_1, S_2), modulated by a minimal oriented path P with edges e_0, ..., e_4; on the right, a specific example where S_0, S_1, S_2 are digraphs. Caption:] Figure 2: Left: Applying a zigzag operator to the (j, k)-path S with the (j, k)-representation S = (S_0, S_1, S_2). Suppose that S_0 ∩ S_1 = {a, b} and S_1 ∩ S_2 = {c, d}. We demonstrate how S_{e_0} and S_{e_2} are obtained. S_{e_0} is a disjoint copy of S_0 (and the copies of a and b in S_{e_0} are a′ and b′, respectively). To obtain S_{e_2}, first make a disjoint copy S′_{e_2} of S_{level(e_2)} = S_1. Set ℓ = level(e_2) = 1. Since e_1 is a forward edge and e_2 is a backward edge, ℓ′ = ℓ + 1 = 2. Therefore, to "glue" S′_{e_2} to S_{e_1}, we need to look at S_ℓ ∩ S_{ℓ′} = {c, d}. Assume that the copies of c and d in S_{e_1} are c′ and d′, respectively. Furthermore, assume that the copies of c and d in S′_{e_2} are c̃ and d̃, respectively. To obtain S_{e_2}, we rename c̃ to c′, and d̃ to d′ in S′_{e_2}. Right: A specific example when S_0, S_1, S_2 are the digraphs in the boxes. The dashed lines indicate identification of vertices. The level of (S_{e_2}, S_{e_3}), for example, is 0 since e_2 and e_3 share a vertex at vertex level 1.
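The level function of Section 3.1 can be computed by a single walk along the path. A sketch (ours), encoding an oriented path as a list of +1 (forward) and −1 (backward) steps; the example is consistent with a path shaped like the P on the left of Figure 2.

```python
def levels(steps):
    """Vertex levels of an oriented path, normalized to the smallest possible
    nonnegative values (level(b) = level(a) + 1 on each edge (a, b))."""
    lv = [0]
    for s in steps:
        lv.append(lv[-1] + s)
    base = min(lv)
    return [x - base for x in lv]

def height(steps):
    return max(levels(steps))

print(levels([+1, +1, -1, +1, +1]), height([+1, +1, -1, +1, +1]))
# [0, 1, 2, 1, 2, 3] 3
```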
3.2.1 Symmetric Bounded Pathwidth Duality

Definition 7 ((j, k)-symmetric). Assume that O is a set of (j, k)-paths. Suppose furthermore that a (j, k)-path representation can be fixed for each structure in O such that the following holds. For every S ∈ O with representation S of some length n, and every minimal oriented path P of height n, it holds that ξ(S, P) ∈ O. Then O is said to be (j, k)-symmetric.

Definition 8 (SBPD). A structure B has (j, k)-symmetric bounded pathwidth duality ((j, k)-SBPD) if there is an obstruction set O for B that consists of (j, k)-paths, and in addition, O is (j, k)-symmetric.

The following is our main duality theorem for symmetric Datalog:

Theorem 9. For a finite structure B, co-CSP(B) can be defined by a symmetric (j, k)-Datalog program if and only if B has (j, k)-SBPD.

We will use Lemma 10 in the proof of Theorem 9. Lemma 10 can be proved using the standard canonical Datalog program argument. Lemma 11 is also used in the proof of Theorem 9 and it is the main technical lemma of the section.

Lemma 10. If CanS(B) accepts a structure A, then A ↛ B.

Proof. Structure B is not accepted by CanS(B), because a derivation for B could be translated into a valid chain of implications over B, which is not possible by the definition of CanS(B). If CanS(B) accepted A and A → B held, then CanS(B) would accept B, a contradiction.

Lemma 11. For any τ-structures A and B, if there exists a structure S with a (j, k)-path representation S of some length n such that S → A, and for any minimal oriented path P of height n it holds that ξ(S, P) ↛ B, then (j, k)-CanS(B) accepts A.

To prove Lemma 11 we need to define an additional concept related to the zigzag operator. Once the (j, k)-path ξ(S, P) = (S_{e_0}, ..., S_{e_q}) is defined, where P is the path e_0, ..., e_q, each pair (S_{e_i}, S_{e_{i+1}}), ∀i ∈ {0, ..., q − 1}, is assigned a level: level(S_{e_i}, S_{e_{i+1}}) is the level of the vertex v minus 1, where v is the vertex that e_i and e_{i+1} share (see Figure 2).

Proof of Lemma 11. For the rest of this proof, let CS denote (j, k)-CanS(B), and CL denote (j, k)-CanL(B). If program CS accepts structure S then, because S → A, CS also accepts A. So it is sufficient to show that program CS accepts structure S.

First we specify how to associate a CL-derivation with ξ(S, P), where P is a minimal oriented path of height n. Assume that ξ(S, P) = S′_0 ∪ ··· ∪ S′_q, where S′_i denotes S_{e_i}. For each i ∈ {0, ..., q − 1}, fix an arbitrary order on the elements of S′_i ∩ S′_{i+1}. Assume that |S′_i ∩ S′_{i+1}| = j′ (≤ j), and define the j′-tuple s_i such that s_i[ℓ] is the ℓ-th element of S′_i ∩ S′_{i+1}. We define s_q to be the empty tuple. It is good to keep in mind that later, s_i will be associated with the IDB J_i.

The derivation will be D_CL(ξ(S, P)) = (ρ_0, λ_0), ..., (ρ_q, λ_q). We specify ρ_i as

  J_i(x_i) ← J_{i−1}(x_{i−1}) ∧ Ē(y_i)   if i ∈ [q],
  J_0(x_0) ← Ē(y_0)                      if i = 0.

We begin with describing the EDBs of a rule ρ_i together with their variables. Assume that S′_i = {d_1, ..., d_t}, and observe that t ≤ k. The variables of ρ_i are v_1, ..., v_t. For every R ∈ τ, and every tuple (d_{f(1)}, ..., d_{f(r)}) ∈ R^{S′_i}, where r = ar(R), R(v_{f(1)}, ..., v_{f(r)}) is an EDB of ρ_i. We describe the variables of the IDBs J_{i−1} and J_i. Assume that s_{i−1} = (d_{g(1)}, ..., d_{g(j_1)}) and s_i = (d_{h(1)}, ..., d_{h(j_2)}). Then the IDB in the body of ρ_i together with its variables is J_{i−1}(v_{g(1)}, ..., v_{g(j_1)}), and the head IDB together with its variables is J_i(v_{h(1)}, ..., v_{h(j_2)}).
The function λ_i simply assigns the value d_g to the variable v_g, for all g ∈ [t].

It remains to specify the IDBs, i.e., which IDBs of CL the J_i-s correspond to. For each i ∈ {0, ..., q}, J_i denotes I_{M_i^P}, where M_i^P is a subset of B^{j′} for some j′ ≤ j. We define the sequence M_0^P, M_1^P, ..., M_q^P inductively. To define M_0^P, consider the nonrecursive rule J_0(x_0) ← Ē(y_0). Assume that the arity of J_0 is j′, and that y_0 contains k′ variables. (Note that the variables in x_0 and y_0 are not necessarily disjoint.) For all possible functions α : {x_0[1], ..., x_0[j′], y_0[1], ..., y_0[k′]} → B such that the conjunction of EDBs Ē(α(y_0[1]), ..., α(y_0[k′])) is true, place the tuple (α(x_0[1]), ..., α(x_0[j′])) into M_0^P. Assume that M_{i−1}^P is already defined. Then, similarly to the base case, for each possible instantiation α of the variables of ρ_i over B with the restriction that α(x_{i−1}) ∈ M_{i−1}^P, if the conjunction of EDBs of ρ_i is true, then add the tuple α(x_i) to M_i^P. It is not difficult to see that if M_q^P ≠ ∅, then we could construct a homomorphism from ξ(S, P) to B, which would be a contradiction.

For each i ∈ {0, ..., q − 1}, assume that (S_i, S_{i+1}) has level ℓ_i. Then we say that the IDB J_i has level ℓ_i, and we write level(J_i) = ℓ_i.

We proceed to construct a CS-derivation D_CS(S) for S. Let Q be a directed path of height n. We construct D_CS(S) just like we would construct D_CL(ξ(S, Q)) above, except that we will define the subscripts of the IDBs, M_0^Q, ..., M_{n−1}^Q, differently, so that every rule of the resulting derivation belongs to CS. From now on we write M_0, ..., M_{n−1} instead of M_0^Q, ..., M_{n−1}^Q.

To define M_0, ..., M_{n−1}, let P_0, P_1, ... be an enumeration of all (finite) minimal oriented paths of height n. Intuitively, we collect in N_ℓ^m all subscripts (recall that a subscript is a relation) of all those IDBs which have level ℓ in D_CL(ξ(S, P_m)). Formally, for each ℓ ∈ {0, ..., n − 1}, define N_ℓ^m = {M_t^{P_m} | level(J_t) = ℓ}. Then we collect the subscripts at a fixed level ℓ in O_ℓ over all derivations corresponding to P_0, P_1, ...: formally, for each ℓ ∈ {0, ..., n − 1}, we define O_ℓ = N_ℓ^0 ∪ N_ℓ^1 ∪ ⋯. We are ready to define M_0, ..., M_{n−1}: for each s ∈ {0, ..., n − 1}, define M_s = ⋃_{W ∈ O_s} W.

It remains to show that every rule of the derivation we defined is in CS and that the last IDB is the goal IDB. If the last IDB is not the goal IDB of CS, then M_{n−1} ≠ ∅. By definition, it must then be that for some minimal oriented path P_m of height n and length q_m, M_{q_m−1}^{P_m} ≠ ∅ (note that the last IDB of D_CL(ξ(S, P_m)) has subscript M_{q_m−1}^{P_m}). As noted before, this would mean that ξ(S, P_m) → B, a contradiction.

We show that each rule of D_CS(S) as defined above belongs to CanS(B). Suppose D_CS(S) contains a rule ρ

    J_i(x_i) ← J_{i−1}(x_{i−1}) ∧ Ē(y_i)

that is not in CanS(B). By definition, there cannot be an instantiation α of the variables of ρ to elements of B such that α(x_{i−1}) ∈ M_{i−1}, the conjunction of EDBs holds, but α(x_i) ∉ M_i. Assume then that there is an α such that α(x_i) ∈ M_i, the conjunction of EDBs holds, but α(x_{i−1}) ∉ M_{i−1}.
It is also not difficult to see that this is not possible, because we used all minimal oriented paths in the construction of D_CS(S).

Proof of Theorem 9. If co-CSP(B) is defined by a symmetric (j, k)-Datalog program P, then, using the symmetric property of P, it is laborious but straightforward to show that

    O = {Ex(D) | D is a free derivation of P}

is a (j, k)-symmetric obstruction set for B. For the converse, assume that B has (j, k)-SBPD. Let O be a symmetric obstruction set of width (j, k) for B (i.e., the path decomposition of every structure in O has width (j, k)). We claim that (j, k)-CanS(B) defines co-CSP(B). Assume that A → B. Then by Lemma 10, (j, k)-CanS(B) does not accept A. Suppose now that A ↛ B. Then by assumption, there exists a (j, k)-path S ∈ O with a representation S of length n such that S → A. Furthermore, since O is symmetric, for any minimal oriented path P of height n, ξ(S, P) ↛ B. It follows from Lemma 11 that CanS(B) accepts A.

3.2.2 Piecewise Symmetric Bounded Pathwidth Duality

Piecewise symmetric bounded pathwidth duality (PSBPD) for symmetric Datalog is less stringent than SBPD; the price, however, is larger program width. Although the following definitions might seem technical, the general idea is simple: a piecewise symmetric obstruction set O does not need to contain all (j, k)-paths obtained by "zigzagging" (j, k)-paths in O in all possible ways. It is sufficient to zigzag a (j, k)-path S using only oriented paths which "avoid" certain segments of S: some constants c and d are fixed for O, and there are at most c fixed segments of S that are avoided by the zigzag operator, each of size at most d. We give the formal definitions.

Definition 12 ((c, d)-filter). Let S be a (j, k)-path with a representation S = S_0, ..., S_{n−1}. A (c, d)-filter F for S is a set of intervals {[s_1, t_1], [s_2, t_2], ..., [s_{c′}, t_{c′}]} such that
• c′ ≤ c; 0 ≤ s_1; t_{c′} ≤ n − 1; s_i ≤ t_i for all i ∈ [c′]; and t_ℓ + 2 ≤ s_{ℓ+1} for all ℓ ∈ [c′ − 1];
• |⋃_{i ∈ [s_ℓ, t_ℓ]} S_i| ≤ d for all ℓ ∈ [c′].
Elements of F are called delimiters. An oriented path P of height n obeys a (c, d)-filter F if for any delimiter [s_i, t_i] ∈ F, the set of edges e of P such that s_i ≤ level(e) ≤ t_i forms a (single) directed path. A demonstration is given in Figure 3.

Figure 3: S is a (j, k)-path representation of S. F_S is the (3, 2k)-filter {[0, 0], [3, 4], [7, 8]} for S. P is an oriented path that obeys the filter. For example, observe that the edges at levels 3 and 4 form a directed subpath, and that "zigzagging" happens only at those parts of P that do not fall into the intervals of the filter.

Definition 13 (Piecewise Symmetric). Assume that O is a set of (j, k)-paths, and that c and d are nonnegative integers. Suppose furthermore that for each S ∈ O there is a (j, k)-path representation S and a (c, d)-filter F_S such that the following holds: for every S ∈ O of some length n, and every minimal oriented path P of height n that obeys the filter F_S, it holds that ξ(S, P) ∈ O. Then O is (j, k, c, d)-piecewise symmetric.

Roughly speaking, an oriented path P is allowed to modulate only those segments of S which do not correspond to any delimiters in F_S. Compare Definition 13 with Definition 7, and observe that the only difference is that in the piecewise case, the oriented paths must be of a restricted form.
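Whether an oriented path obeys a filter is easy to test mechanically. The following Python sketch (our own encoding, with each edge of P given as a (level, direction) pair) implements one natural reading of the condition in Definition 12: the selected edges must occur consecutively in P and all point the same way.

```python
def obeys(path, filt):
    """Test whether an oriented path obeys a (c,d)-filter (Definition 12).

    path : list of (level, direction) pairs, direction +1 or -1.
    filt : list of delimiters [s, t] with s <= t.
    For each delimiter, the edges e with s <= level(e) <= t must form a
    single directed subpath of P."""
    for s, t in filt:
        hits = [i for i, (lvl, _) in enumerate(path) if s <= lvl <= t]
        if hits:
            consecutive = hits == list(range(hits[0], hits[-1] + 1))
            uniform = len({path[i][1] for i in hits}) == 1
            if not (consecutive and uniform):
                return False
    return True
```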
Therefore a set that is (j, k)-symmetric is also (j, k, c, d)-piecewise symmetric for any c and d: we simply associate the empty (c, d)-filter with each structure.

Definition 14 (PSBPD). A structure B has (j, k, c, d)-piecewise symmetric bounded pathwidth duality ((j, k, c, d)-PSBPD) if there is an obstruction set O for B that consists of (j, k)-paths and, in addition, O is (j, k, c, d)-piecewise symmetric.

Theorem 15. For a finite structure B, B has SBPD (for some parameters) if and only if B has PSBPD (for some parameters).

We need the corollary of the following lemma in the proof of the above theorem.

Lemma 16. Let P be a minimal oriented path e_0, ..., e_{n−1} with the (1, 2)-path representation P = (e_0, ..., e_{n−1}), where we think of e_i as a structure with two domain elements and a binary relation that contains the tuple e_i. Let Q be a minimal oriented path f_0, ..., f_m with n edge levels. Then the oriented path ξ(P, Q) is minimal and has the same height as P.

Proof. It is obvious that ξ(P, Q) is an oriented path. Furthermore, the map that assigns every vertex of ξ(P, Q) to its original in P is a homomorphism. It is easy to check that this homomorphism maps the edges of ξ(P, Q) back to their originals, and that the level of an edge in ξ(P, Q) is the same as the level of the original of that edge. Checking the minimality of ξ(P, Q) is also straightforward.

Corollary 17. Let O be a set of (j, k)-paths, where a (j, k)-representation is fixed for each path. Let O′ be the set that contains all (j, k)-paths that can be obtained from a (j, k)-path in O by applying some zigzag operator. Then O′ is (j, k)-symmetric. (Remark: a similar statement holds in the piecewise symmetric case.)

Proof. Let S′ be an element of O′. If we can show that applying an arbitrary zigzag operator to S′ yields a (j, k)-path in O′, then we are clearly done. So assume that S′ was obtained from S ∈ O by applying a zigzag operator. The (j, k)-path S′ inherits the (j, k)-representation of S in a natural way. We apply any zigzag operator to S′ to obtain S″, and we need to show that S″ is in O′. We get from S to S′ using a zigzag operator, and from S′ to S″ using another zigzag operator. Using Lemma 16, we can see that these two zigzag operators can be replaced by a single one that obtains S″ from S directly.

Proof of Theorem 15. Let O be a (j, k)-symmetric obstruction set for B. As observed above, for any c and d, O is also (j, k, c, d)-piecewise symmetric. For the converse, let O be a (j, k, c, d)-piecewise symmetric obstruction set. Our goal is to construct a (j′, k′)-symmetric obstruction set O_sym for B as follows. For each structure S ∈ O, let S = S_0 ∪ ⋯ ∪ S_{n−1} be the corresponding (j, k)-path representation. Using the filter for S, we "regroup" S_0, ..., S_{n−1} to obtain a (j′, k′)-path representation T = T_0 ∪ ⋯ ∪ T_m of S. We add each S together with its new representation to O_sym, and we also add every structure that is needed to ensure that O_sym is symmetric. Finally, we show that O_sym is a symmetric obstruction set for B.

We begin with the regrouping procedure. Let S ∈ O, let S = S_0 ∪ ⋯ ∪ S_{n−1} be the corresponding (j, k)-path representation, and let {[s_1, t_1], [s_2, t_2], ..., [s_{c′}, t_{c′}]} be the (c, d)-filter F_S. The regrouping procedure is quite pictorial, and it is demonstrated in Figure 4.
Figure 4: An example regrouping for the proof of Theorem 15. The filter is F_S = {[0, 1], [6], [12, 13], [17]}. The structures corresponding to the filter lie inside the rectangles; the complement of the filter is F̄_S = {[2, 3, 4], [7, 8, 9, 10, 11], [14, 15, 16]}, and the corresponding structures lie in the gray ovals. The new (j′, k′)-path representation T of S is on the right. Notice the following pattern: the segments of S determined by F_S are placed next to each other in T.

We define

    T_0 = ⋃_{ℓ ∈ [a,b] : [a,b] ∈ F_S} S_ℓ.

This places all substructures of S which correspond to delimiters of F_S into one big initial structure. Note, though, that |T_0| ≤ c·d. Define the complement of F_S as

    F̄_S = {[0, s_1 − 1], [t_1 + 1, s_2 − 1], [t_2 + 1, s_3 − 1], ..., [t_{c′} + 1, n − 1]},

and set m = max_{[a,b] ∈ F̄_S} (b − a). Intuitively, m is the length of the longest interval of S between any two delimiters. We define T_ℓ as follows: for each interval [a, b] ∈ F̄_S, take the (ℓ − 1)-th structure S_{a+ℓ−1} in that interval, and define T_ℓ to be the union of these structures. Formally, for every ℓ ∈ {1, ..., m}, set

    T_ℓ = ⋃_{a+ℓ−1 ≤ b : [a,b] ∈ F̄_S} S_{a+ℓ−1}.

Observe that |T_ℓ| ≤ k·(c + 1). We need to ensure property 2 of Definition 3, so we need to place some additional elements into the domains of the T_ℓ. Let [x, y] ∈ F_S and [z, w] ∈ F̄_S be such that z = y + 1. Then the set of elements S_x ∪ ⋯ ∪ S_w is called a column. (For the beginning and the end of S, a column is defined in the natural "truncated" way.) Because S is a (j, k)-path representation, it follows from the definition that the intersection of any pair of columns has size at most j. Let C_1, ..., C_r be an enumeration of all the columns. Set D = ⋃_{ℓ ≠ ℓ′} (C_ℓ ∩ C_{ℓ′}), and observe that |D| ≤ j·2^r. We add D to the domain of T_0 to obtain T′_0, and also to the domain of T_i to obtain T′_i, for all i ∈ {1, ..., m}. It is straightforward to see that the new representation T = (T′_0, ..., T′_m) satisfies property 2 of Definition 3. Using the remarks about the sizes of the sets, we observe that T is a (j′, k′)-path decomposition of S, where j′ and k′ are functions of j, k, c and d.
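The regrouping itself is simple bookkeeping; the following Python sketch illustrates it (names are ours, the filter is assumed sorted and not covering every index, and here m counts the number of bags in the longest complement interval; the set D must still be added to every bag afterwards):

```python
def regroup(bags, filt):
    """Regroup a (j,k)-path representation S_0, ..., S_{n-1} along a filter.

    bags : list of sets S_0, ..., S_{n-1}.
    filt : sorted, non-overlapping delimiters [[s_1,t_1], ..., [s_c',t_c']].
    Returns [T_0, T_1, ..., T_m] (before D is added to each bag)."""
    n = len(bags)
    covered = [i for s, t in filt for i in range(s, t + 1)]
    T0 = set().union(*(bags[i] for i in covered))   # all delimiter bags
    comp, a = [], 0                                 # complement intervals
    for s, t in filt + [[n, n]]:
        if a <= s - 1:
            comp.append((a, s - 1))
        a = t + 1
    m = max(b - a + 1 for a, b in comp)             # bags in the longest gap
    Ts = [set().union(*(bags[a + l] for a, b in comp if a + l <= b))
          for l in range(m)]
    return [T0] + Ts
```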
We place all structures S ∈ O into O_sym, but we associate the new representation with S. For each structure S ∈ O_sym, we also apply all valid zigzag operators to S (with respect to the new representation) and add all the resulting structures to O_sym. By Corollary 17, O_sym is a (j′, k′)-symmetric set. We need to establish that O_sym is an obstruction set. Because O ⊆ O_sym, it is sufficient to show that no structure in O_sym maps to B. To do that, we show that for any structure in O_sym, there is a structure in O that homomorphically maps to it.

Giving a formal proof would lead to unnecessary notational complications, and therefore we give an example that is easy to follow and straightforward to generalize. The example is represented in Figure 5.

Figure 5: Example in the proof of Theorem 15. See the proof for details.

Let S ∈ O_sym be such that S is also in O. Assume that the (j′, k′)-representation of S in O_sym is T. We consider ξ(T, P) for some minimal oriented path P and show how to find a minimal oriented path Q such that ξ(S, Q) → ξ(T, P). To construct Q, we make a copy of P aligned with S_0, S_1, S_2, S_3, S_4 in S; this is represented by the dashed lines in Figure 5. We also make a copy of P aligned with S_5, S_6, S_7, S_8, S_9; this is represented by the dash-dotted lines. Note that the resulting minimal oriented path respects the delimiters, i.e., the zigzag operator will not "zigzag" S_0 and S_5. (In general, we never need to "zigzag" structures that were placed into T_0, i.e., the structures that correspond to the delimiters, because P is minimal.) In ξ(T, P) we denote the copies of the S_i by S_i, S′_i, S″_i, etc. Using the definition of the zigzag operator, it follows that the function f that maps an element of S_0 ∪ S_1 ∪ S_2 ∪ S_3 ∪ S′_3 ∪ S′_2 ∪ S″_3 ∪ S_4 in ξ(S, Q) to the corresponding element of S_0 ∪ S_1 ∪ S_2 ∪ S_3 ∪ S′_3 ∪ S′_2 ∪ S″_3 ∪ S_4 in ξ(T, P) is a homomorphism. We similarly define a homomorphism h from S_5 ∪ S_6 ∪ S_7 ∪ S_8 ∪ S′_8 ∪ S′_7 ∪ S″_7 ∪ S″_8 ∪ S_9 in ξ(S, Q) to the corresponding copies in ξ(T, P). If we can make sure that whenever an element x is in the domain of both f and h, both homomorphisms map x to the same element, then we have the desired homomorphism.

Assume, for example, that the element x appears in S_2 and also in S_8 in ξ(S, Q), and suppose that f(x) = y and h(x) = y′. Let the originals of y and y′ be z and z′ in T, respectively. We also identify z and z′ in S_2 and S_8 in S. Observe that x in S_2 in ξ(S, Q) is a copy of z, and x in S_8 in ξ(S, Q) is a copy of z′. If z ≠ z′ (in S), then x could not appear both in S_2 and in S_8, by the definition of the zigzag operator. Therefore z = z′, z ∈ D, and, by definition, z is in every bag of T. The elements y and y′ are copies of z, and because z appears in every "bag" of T, all copies of z in ξ(T, P) are identified to be the same element. In particular, f(x) = y = y′ = h(x).

3.3 Applications

3.3.1 Datalog + Maltsev ⇒ Symmetric Datalog

Using SBPD, we give a short and simple re-proof of the main result of [9]:

Theorem 18 ([9]). Let B be a finite core structure. If B is invariant under a Maltsev operation and co-CSP(B) is definable in Datalog, then co-CSP(B) is definable in symmetric Datalog (and therefore CSP(B) is in L by [11]).

We only need to show that if co-CSP(B) is in linear Datalog and B is preserved by a Maltsev operation, then co-CSP(B) is in symmetric Datalog. The "jump" from Datalog to linear Datalog essentially follows from already established results, as observed in [9]. For the sake of completeness, we give an approximate outline of the argument without being too technical.⁴ If co-CSP(B) is definable in Datalog and B has a Maltsev polymorphism, then B also has a majority polymorphism. If B has a majority polymorphism, then co-CSP(B) is definable in linear Datalog [8]. Hence, to re-prove Theorem 18, it is sufficient to prove Lemma 19. Our proof relies on the notion of SBPD.

⁴ The interested reader can consult Lemma 6 (originally in [23]) and Lemma 9 in [9]. For Lemma 9, note that if B has a Maltsev polymorphism, then V(A(B)) is congruence permutable; see [5].
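For concreteness, recall what being Maltsev means; the affine example below is the standard one from the literature and is not taken from this paper.

```latex
% A ternary operation m is Maltsev if it satisfies the identities
%     m(x, y, y) = m(y, y, x) = x   for all x, y.
% The standard example is the affine operation of an Abelian group,
% e.g. over Z_p:
%     m(x, y, z) = x - y + z,
% for which m(x, y, y) = x - y + y = x and m(y, y, x) = y - y + x = x.
m(x,y,y) \;=\; m(y,y,x) \;=\; x, \qquad \text{e.g.}\quad m(x,y,z) = x - y + z.
```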
Lemma 19. If co-CSP(B) is definable by a linear Datalog program and B is invariant under a Maltsev operation m, then co-CSP(B) is definable by a symmetric Datalog program.

To get ready for the proof of Lemma 19, we define an N-digraph of size s as an oriented path that consists of s forward edges, followed by s backward edges, followed by another s forward edges. Proposition 20 is easy to prove, and the Maltsev properties are used in Lemma 21.

Proposition 20. A minimal oriented path is either a directed path, or it contains a subpath which is an N-digraph.

Lemma 21. Let B be a structure invariant under a Maltsev operation m, let S be a (j, k)-path with a (j, k)-representation S = (S_0, ..., S_{n−1}), and let P = e_0, ..., e_q be a minimal oriented path of height n. If ξ(S, P) → B, then S → B.

Proof. Using Proposition 20, there is an index t such that Q = e_t, e_{t+1}, ..., e_{t+(3s−1)} is an N-digraph of size s in P. Assume that the first and last vertices of Q are v and w, respectively. Let P′ be the oriented path obtained from P by removing Q and adding a directed path Q′ = f_t, f_{t+1}, ..., f_{t+(s−1)} of length s from v to w. We claim that there is a homomorphism γ from ξ(S, P′) to B. Once this is established, repeating the argument sufficiently many times clearly yields that S → B.

Let ξ(S, P) = (S_{e_0}, ..., S_{e_q}), and let ϕ_{e_0}, ..., ϕ_{e_q} be the corresponding isomorphisms (recall the definition of the zigzag operator in Section 3.1). Similarly, let ξ(S, P′) = (S_{f_0}, ..., S_{f_{q−2s}}), and let ψ_{f_0}, ..., ψ_{f_{q−2s}} be the corresponding isomorphisms. Because S_{[e_0, e_{t−1}]} and S_{[e_{t+3s}, e_q]} are isomorphic to S_{[f_0, f_{t−1}]} and S_{[f_{t+s}, f_{q−2s}]}, respectively, γ is defined in the natural way for elements of S_{[f_0, f_{t−1}]} ∪ S_{[f_{t+s}, f_{q−2s}]}. It remains to define γ for every d ∈ S_{[f_t, f_{t+(s−1)}]}. Assume that d ∈ S_{f_{t+ℓ}} for some ℓ ∈ {0, ..., s − 1}. Find the original of d in S and let it be d_o, i.e., d_o = ψ_{f_{t+ℓ}}(d). Then we find the three copies d_1, d_2, d_3 of d_o in S_{[e_t, e_{t+(3s−1)}]}. That is, first we find the three edges e_{ℓ_1}, e_{ℓ_2}, e_{ℓ_3} of Q which have the same level as f_{t+ℓ} (all levels are with respect to P and P′). Then d_i = ϕ_{e_{ℓ_i}}^{−1}(d_o), i ∈ [3]. We define γ(d) = m(d_1, d_2, d_3). By the Maltsev properties of m, γ is well-defined. As B is invariant under m, γ is a homomorphism from ξ(S, P′) to B.

Proof of Lemma 19. If co-CSP(B) can be defined by a linear (j, k)-Datalog program, then by [7] there is an obstruction set O for B in which every structure is a (j, k)-path. We construct a symmetric obstruction set O_sym for B as follows: for every (j, k)-path S with a (j, k)-representation S = S_0, ..., S_{n−1} in O, and for every minimal oriented path P of height n, place ξ(S, P) into O_sym. By Corollary 17, O_sym is (j, k)-symmetric. Observe that O ⊆ O_sym, so it remains to show that no element of O_sym maps to B. But if T ∈ O_sym, then T = ξ(S, P) for some S ∈ O and P. By Lemma 21, if ξ(S, P) → B, then S → B, and this contradicts the assumption that O is an obstruction set for B.
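Encoding oriented paths as ±1 sequences makes the objects used above and in the next subsection easy to experiment with. The following Python sketch (our own encoding) builds an N-digraph and tests the dichotomy of Proposition 20; a maximal backward run flanked by forward runs at least as long can always be trimmed to an N-digraph subpath of equal block sizes.

```python
def n_digraph(s):
    """N-digraph of size s: s forward edges, s backward, s forward."""
    return [+1] * s + [-1] * s + [+1] * s

def height(path):
    """Height of an oriented path, i.e., the maximum vertex level."""
    lvl = top = 0
    for d in path:
        lvl += d
        top = max(top, lvl)
    return top

def has_n_subpath(path):
    """Check for an N-digraph subpath under the +1/-1 encoding."""
    blocks = []                      # maximal runs as [direction, length]
    for d in path:
        if blocks and blocks[-1][0] == d:
            blocks[-1][1] += 1
        else:
            blocks.append([d, 1])
    return any(blocks[i][0] == +1 and
               blocks[i][1] >= blocks[i + 1][1] <= blocks[i + 2][1]
               for i in range(len(blocks) - 2))
```

An oriented path for which has_n_subpath returns False consists of a single run, i.e., it is a directed path, matching Proposition 20 for minimal paths.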
3.3.2 A class of oriented paths for which the CSP is in L, and a class for which the CSP is NL-complete

In this section we define a class C of oriented paths such that if B ∈ C, then co-CSP(B) is in symmetric Datalog. Our strategy is to find an obstruction set O for B ∈ C, and then to show that our obstruction set is piecewise symmetric. We need some notation. We say that a directed path is forward to mean that its first and last vertices are the vertices with indegree zero and outdegree zero, respectively. Let P be an oriented path with first vertex v and last vertex w. Then the reverse of P, denoted by P̄, is a copy of the oriented path P in the reverse direction, i.e., the first vertex of P̄ is a copy of w and its last vertex is a copy of v. Let Q be another oriented path. The concatenation of P and Q is the oriented path PQ in which the last vertex of P is identified with the first vertex of Q. For a nonnegative integer r, P^r denotes P_1 P_2 ⋯ P_r, where the P_ℓ are disjoint copies of P. Given two vertices v and w, we denote the presence of an edge from v to w by v → w.

Definition 22 (Wave). If an oriented path Q can be expressed as E_1 (P P̄)^r P E_2, where E_i (i ∈ [2]) denotes the forward directed path that is a single edge, P is a forward directed path of length ℓ, and r ≥ 0, then Q is called an r-wave. A 2-wave is shown in Figure 8, 1.
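Under the ±1 edge encoding used above, an r-wave is one line of Python (a sketch; the name is ours):

```python
def wave(r, h):
    """The r-wave E1 (P Pbar)^r P E2 of Definition 22, where P is a
    forward directed path of length h; e.g. wave(2, h) is the 2-wave
    E1 P1 P2 P3 P4 P5 E2 used in the proof of Theorem 23 below."""
    P, Pbar = [+1] * h, [-1] * h
    return [+1] + (P + Pbar) * r + P + [+1]
```

Its height is h + 2: one initial rise, oscillation between levels 1 and h + 1, and a final rise from h + 1 to h + 2.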
Theorem 23. Let Q be a wave. Then Q has PSBPD, co-CSP(Q) is definable in symmetric Datalog, and CSP(Q) is in L.

Proof. We prove the case when Q is an r-wave for r = 2; for larger values of r, the proof generalizes in a straightforward manner. Let P be a directed path of length h, let P_1, P_3, P_5 be disjoint copies of P, and let P_2, P_4 be copies of the reverse of P. Let E_1 and E_2 be forward edges. Assume the 2-wave Q is E_1 P_1 P_2 P_3 P_4 P_5 E_2 (Figure 6).

Figure 6: The 2-wave in the proof of Theorem 23.

We will provide a piecewise symmetric obstruction set O_ps for Q such that every element of O_ps is an oriented path. To do this, first we observe that, by [17], Q has path duality, i.e., we can assume that the set O of all oriented paths that do not homomorphically map to Q forms an obstruction set for Q. To construct O_ps from O, we will place certain elements of O into O_ps such that O_ps is still an obstruction set for Q.

We begin with some simple observations. Any oriented path that has height at most h + 1 maps to Q, so such oriented paths can be neither in O nor in O_ps. Any oriented path that has height strictly larger than h + 2 obviously does not map to Q, so all such paths are in O, and we also place them into O_ps. Assume that P ∈ O has height exactly h + 2. It is easy to see that if P is not minimal, then it contains a minimal subpath that does not map to Q. Therefore it is sufficient to place into O_ps only those oriented paths from O of height h + 2 which are minimal.

Let P ∈ O_ps be of height h + 2 (then P is minimal). Intuitively, any attempt to homomorphically map the vertices of P to Q, starting by first mapping the first vertex of P to the first vertex of Q and then progressively finding the images of the vertices of P from left to right, would get stuck at a or c. Formally, assume that the vertices of P are v_1, ..., v_n. Let P_{[i]} denote the subpath of P on the first i vertices. Choose i to be the largest index such that there is a homomorphism ϕ : P_{[i]} → Q with ϕ(v_1) = s. Then ϕ cannot be extended to v_{i+1} for one of the following reasons. Clearly, ϕ must map v_i to a source or a sink other than s or t, i.e., to a, b, c or d. Furthermore, we can assume that v_i is not mapped to b or d: if v_i were mapped to b or d, then level(v_i) = 1, so the edge between v_i and v_{i+1} would go from v_i to v_{i+1}, and therefore ϕ could be extended. So we can assume that v_i is mapped to a or c. Because we cannot extend ϕ, v_{i+1} must be at level h + 2, so v_{i+1} must be the last vertex v_n of P. Because P ↛ Q, P_{[n−1]} must be an oriented path such that any homomorphism ϕ from P_{[n−1]} to Q with ϕ(v_1) = s maps v_{n−1} to a or c, but not to e.

We assume first that any homomorphism ϕ from P_{[n−1]} to Q maps v_{n−1} to a. We follow the vertices of P_{[n−1]} from left to right. Let w_a be the first vertex that is at level h + 1. If there were a vertex to the right of w_a at level 1, then, because P_{[n−1]} has to reach level h + 1 again, we would be able to map v_{n−1} to c, which is not possible by assumption. So P must have the following form (Form 1): (w_1 → w_2) X (w_3 → w_4) Y (w_5 → w_6), where X is any oriented path of height h − 1 with its first vertex at the bottom and its last vertex at the top level of X, and Y is any oriented path of height h − 1 with both its first and last vertices at the top level of Y. See Figure 7, left.

For the second case, we assume that P_{[n−1]} is such that v_{n−1} can be mapped to c. Again, we follow the vertices of P_{[n−1]} from left to right. Let w_a be the first vertex that is at level h + 1. We must have a vertex going back to level 1 (otherwise we could not "pass" b and could not map v_{n−1} to c); let w_b be the first such vertex. We will have to go back to level h + 1 again, so let w_c be the first vertex at that level. Finally, we cannot go back to level 1 again, since then the last vertex of P_{[n−1]} could be mapped to e; we can "go down" only to level 2 of P_{[n−1]}. So P must have the following form (Form 2): (w_1 → w_2) X (w_3 → w_4) Y (w_5 ← w_6) Z (w_7 → w_8) W (w_9 → w_{10}), where X (respectively Z) is any oriented path of height h − 1 with its first vertex at the bottom and its last vertex at the top level of X (Z), Y is any oriented path of height h − 1 with its first vertex at the top and its last vertex at the bottom level of Y, and W is any oriented path of height h − 1 with both its first and last vertices at the top level of W. See Figure 7, right.

Figure 7: Obstructions of height h + 2 for a 2-wave.

Because O_ps ⊆ O, and for any structure S ∈ O there is a structure S′ ∈ O_ps such that S′ → S, O_ps is an obstruction set for Q. It remains to show that O_ps is piecewise symmetric.

Let S be an oriented path of height more than h + 2, and assume the vertex set of S is v_1, ..., v_n. We need to define a representation S and a filter F_S for S. The representation (S_0, S_1, ..., S_{n−2}) is (v_1, v_2), (v_2, v_3), ..., (v_{n−1}, v_n) (width (1, 2)). The filter F_S is the empty filter. Note that if we apply a zigzag operation to S, we get an oriented path of the same height as S, so O_ps is closed under zigzagging of obstructions of height greater than h + 2.

Let S be an oriented path of height h + 2 of Form 1, and assume the vertex set of S is v_1, ..., v_n. The representation S = (S_0, S_1, ..., S_{n−2}) is constructed as in the previous paragraph.
We specify F_S to be the following (3, 6)-filter: assume that the edge (w_3, w_4) is the structure S_i; then F_S = {[0, 0], [i, i], [n − 2, n − 2]}. Using the definitions, it is easy to see that if P obeys the filter F_S, then ξ(S, P) is also an oriented path of Form 1. Therefore O_ps is closed under zigzagging of obstructions of Form 1. Obstructions of Form 2 can be handled similarly.

We state the following generalization of waves.

Definition 24 (Staircase). A monotone wave is an oriented path of the form (P P̄)^r P̄, where P is a forward directed path and r ≥ 0. We call the vertices of a monotone wave in the topmost level peaks, and the vertices in the bottommost level troughs. If a minimal oriented path Q can be expressed as P_1 W_1 P_2 W_2 ⋯ P_{n−1} W_{n−1} P_n, where P_1, ..., P_n are forward directed paths, W_1, ..., W_{n−1} are monotone waves, and for any i ∈ [n − 2] the troughs of W_i are in a level strictly below the level of the troughs of W_{i+1}, and also the peaks of W_i are in a level strictly below the level of the peaks of W_{i+1}, then Q is called a staircase. An example is given in Figure 8, 2.

Theorem 25. Let Q be a staircase. Then Q has PSBPD, co-CSP(Q) is definable in symmetric Datalog, and CSP(Q) is in L.

Proof. Assume that the height of Q is h. As for waves, we use [17] to conclude that Q has path duality. We will construct a piecewise symmetric obstruction set O_ps for Q by placing three classes of oriented paths into O_ps.

First, O_ps contains all oriented paths which have height strictly greater than h. These oriented paths obviously do not map to Q.

The next class of oriented paths we place into O_ps are those which have height precisely h. Recall that Q consists of waves patched together with directed paths in between. Let the wave subpaths of Q be W_1, ..., W_n, from left to right. For each W_i, we construct a class of oriented paths. Assume that W_i has height h_i, and let O_i be the set of minimal oriented paths of height h_i which do not map to W_i. For each R ∈ O_i, we construct C = B_1 R B_2, where B_1 and B_2 are oriented paths (possibly empty) such that C has height h and the level of R in C matches the level of W_i. Observe that there cannot be a homomorphism from C to Q. We place all such constructed C into O_ps.

Let ℓ be the length of the longest directed subpath of Q. The third class of oriented paths are those that have height h′, where ℓ < h′ < h. For every such h′, we produce a set of obstructions. (Remark: we require ℓ < h′ because any oriented path of height ℓ or less maps to Q.) Assume inductively (the base case is trivial) that we already have a piecewise symmetric obstruction set for every staircase of height strictly less than h. Consider the subpaths Q_1, ..., Q_m of Q of height h′. Notice that core(Q_i) is a staircase which is not a directed path. By the inductive hypothesis, we have a piecewise symmetric obstruction set U_i for Q_i. We keep only those oriented paths in U_i which have height at most h′; observe that U_i ≠ ∅. Construct D = B_1 T_1 ⋯ B_m T_m B_{m+1}, where (T_1, ..., T_m) ∈ U_1 × ⋯ × U_m and the B_j are arbitrary oriented paths such that the height of D is h′. Place all these D-s into O_ps. Notice that D does not map to Q, for the following reason: assume for contradiction that D maps to a subpath S of Q. Then D also maps to the core of S, which is a staircase. But by construction, D contains a subpath that does not map to S.
We show that O_ps is an obstruction set for Q. If a structure Z ∈ O_ps homomorphically maps to an input structure A, then obviously there cannot be a homomorphism from A to Q. Assume for contradiction that no structure in O_ps maps to A, but A does not map to Q. Then O contains an oriented path P that maps to A. So if we show the following claim, then we are done.

Claim. For any oriented path P that does not homomorphically map to Q, there is an oriented path Z ∈ O_ps that homomorphically maps to P.

Proof of Claim. Assume that P has height precisely h. We show that there exists Z ∈ O_ps of height h such that Z → P. Assume for contradiction that none of the oriented paths of height h in O_ps maps to P. As before, let W_1, ..., W_n be the wave segments of Q, from left to right, and assume without loss of generality that none of the W_i is a directed path. Let the initial and final vertices of W_i be a_i and b_i, respectively, i ∈ [n]. For each i ∈ [n], find the minimal oriented subpaths of P whose initial vertices have the same level as a_i and whose final vertices have the same level as b_i, or vice versa (note that, because of the structure of Q, no such oriented path can contain another as a subpath; however, these oriented paths can overlap). For any such subpath R of P associated with W_i, map the lowest vertex of R to a_i, and the highest vertex of R to b_i. (Remark 1: in fact, there is no other choice.) The rest of the vertices of R can be mapped to Q as follows. If R does not map to W_i with first and last vertices matched, then by definition P is in O_ps, and we have a contradiction. Therefore, let the homomorphism for R be ϕ_R. (Remark 2: observe also that ϕ_R maps the inner vertices of R to vertices of the staircase which are between a_i and b_i.)

We show that the partial homomorphisms ϕ_R map the same vertex of P to the same vertex of Q, and, furthermore, that we can also map those vertices of P that are not mapped anywhere by the ϕ_R to an element of Q. This way we obtain a homomorphism from P to Q, and this is a contradiction.

First, any vertex v is assigned to a vertex of Q by at most two of the homomorphisms, and these correspond to consecutive wave segments of Q; this is because, in Q, W_i and W_j are disjoint unless j = i + 1. Using Remarks 1 and 2, we can see that if a vertex v of P were in the domain of two "non-consecutive" homomorphisms, then, because those homomorphisms could not agree on where to map v, we would reach a contradiction.

Let ϕ_{R_1} and ϕ_{R_2} (assume without loss of generality that R_1 and R_2 correspond to W_1 and W_2, respectively) be two partial homomorphisms such that their domains overlap. Then the markers a_1, b_1, a_2, b_2 appear in the order a_1, a_2, b_1, b_2 when traversing P from left to right. The vertices that are in the domain of both homomorphisms are the ones from a_2 to b_1. By the choice of a_1, b_1, a_2, b_2, the segment of P from a_2 to b_1 is a minimal oriented path. Checking the images of the vertices going back from b_1 to a_2 under the map ϕ_{R_1}, we see that these vertices are mapped to the rightmost directed path segment of W_1. Similarly, the images of these vertices under ϕ_{R_2} lie on the leftmost directed path segment of W_2. That is, the two homomorphisms coincide on the vertices from a_2 to b_1. Furthermore, some vertices of P are not in the domain of any partial homomorphism.
Consider the two minimal oriented paths S and S′ on the two sides of such a maximal contiguous sequence of vertices in P. There are two cases. First, assume that S and S′ both correspond to the same W_i. Let the markers for S be a and b, and the markers for S′ be a′ and b′. Then, following P from left to right, the markers appear in the order a, b, b′, a′. The images of the vertices from b to b′ are not defined. (Observe that b and b′ are mapped to the same vertex.) Consider the last directed path segment of W_i together with the first directed path segment of W_{i+1} (or just the last edges of Q if i = n). Observe that the vertices from b to b′ can be mapped to this directed path. The case when S and S′ correspond to different waves of Q is handled similarly.

Suppose lastly that P has height h′ < h. Because P does not map to any of the subpaths of Q of height h′, for each subpath Q_i of Q of height h′, P contains a subpath S_i such that S_i ↛ Q_i, i ∈ [m]. If S_i ↛ Q_i, then S_i ↛ core(Q_i). Recall that core(Q_i) is a staircase and, by definition, U_i contains an oriented path S′_i such that S′_i → S_i. It is clear that we can choose oriented paths B_1, ..., B_{m+1} such that B_1 S′_1 B_2 ⋯ B_m S′_m B_{m+1} → P.

Finally, it is not hard to see from the construction how to associate filters with the elements of O_ps to establish that O_ps is piecewise symmetric.

We also give a large class of oriented paths for which the CSP is NL-complete. We need the following propositions to prove Theorem 28.

Proposition 26. Let P_1 and P_2 be two minimal oriented paths of the same height h. Then there is a minimal oriented path Q of height h such that Q → P_1 and Q → P_2.

Proof. Not hard; see, e.g., [16].

Proposition 27. A core oriented path has a single automorphism, i.e., it is rigid.

Proof. Let P be a core oriented path and let P′ be an isomorphic copy of P. There are at most two isomorphisms from P′ to P (because a vertex with indegree 0 must be mapped to a vertex with indegree 0, and similarly for a vertex with outdegree 0). One possibility is to map the first vertex of P′ to the first vertex of P and the last vertex of P′ to the last vertex of P. For contradiction, assume that the second possibility happens, i.e., there is an isomorphism ϕ that maps the first vertex of P′ to the last vertex of P and the last vertex of P′ to the first vertex of P. Assume that both the first vertex v and the last vertex w of P have indegree zero (the other case is similar). Then level(v) = level(w). This implies that the number of forward and backward edges in P is the same, so P has 2q edges for some q. By the existence of ϕ, P must have the form Q Q̄, and such an oriented path is clearly not a core.

Theorem 28. Let B be a core oriented path that contains a subpath P_1 P_2 P_3 of some height h with the following properties: P_1, P_2 and P_3 are minimal oriented paths, they all have height h, and there is a minimal oriented path Q of height h such that Q → P_1 and Q → P_3, but Q ↛ P_2. Then CSP(B) is NL-complete. An example is given in Figure 8, 3 and 4.

Proof of Theorem 28. We show that the less-than-or-equal-to relation on two elements, R_≤ = {(0, 0), (0, 1), (1, 1)}, and the relations {0} and {1} can be expressed from B using a primitive positive (pp) formula (i.e., a first-order formula with only existential quantification, conjunction and equality). It is easy to see, and well known, that CSP({R_≤, {0}, {1}}) is equivalent to the NL-complete directed st-Conn problem.
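The equivalence in the last sentence is worth making explicit: each arc (u, v) becomes the constraint h(u) ≤ h(v) over {0, 1}, and the constant relations pin down the endpoints. A minimal Python sketch (our own encoding):

```python
from collections import deque

def st_conn_as_csp(n, arcs, s, t):
    """Directed st-Conn phrased over {R_<=, {0}, {1}}: every arc (u, v)
    imposes h(u) <= h(v), together with h(s) = 1 and h(t) = 0.  A directed
    path from s to t forces 1 = h(s) <= h(t) = 0, so the CSP instance is
    unsatisfiable exactly when t is reachable from s.  We decide this by
    propagating the value 1 forward from s (a BFS)."""
    adj = {v: [] for v in range(n)}
    for u, v in arcs:
        adj[u].append(v)
    seen, queue = {s}, deque([s])
    while queue:
        for v in adj[queue.popleft()]:
            if v not in seen:
                seen.add(v)
                queue.append(v)
    return t in seen        # True: reachable, i.e., the CSP has no solution
```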
Since B is a core, it is rigid by Proposition 27. Assume that the first vertex of P_1 is in a level lower than the level of the last vertex of P_1 (the other case can be handled similarly); see the illustration in Figure 9. Assume that the first vertex of P_1 is 0 and the first vertex of P_3 is 1. We construct a structure G with two special vertices x and y such that {(h(x), h(y)) | h : G → B} = R_≤. It is well known, and easy to show, that then R_≤ can also be expressed from B using a pp-formula.

Figure 8: 1: A 2-wave. 2: A staircase. 3: An example oriented path for which the CSP is NL-complete. 4: The oriented path Q in Theorem 28 corresponding to the oriented path in 3.

Figure 9: Construction of the gadget G.

Let P′ be an isomorphic copy of B, and refer to the copies of P_1, P_2, P_3 in P′ as P′_1, P′_2, P′_3, respectively. Using Proposition 26, we find a minimal oriented path P_23 of height h that maps to both P_2 and P_3. Similarly, we find a minimal oriented path P_123 that maps to each of P_1, P_2, P_3. We rename the first vertex of Q to x, and the first vertex of P_123 to y. To construct G, we identify the topmost vertices of the oriented paths P_23, Q and P_123. Then we identify the first vertex of P_23 with the vertex c of P′ that is shared by P′_2 and P′_3. Observe that any homomorphism from G to B must map c to 1. It is straightforward to verify that {(h(x), h(y)) | h : G → B} = R_≤. Because B is rigid, any relation of the form {v}, where v ∈ B, can be expressed by a pp-formula.

4 On CSPs in NL

4.1 Definitions

Let τ be a vocabulary. A successor τ-structure S is a relational structure with vocabulary τ ∪ {first, last, suc}, where first and last are unary symbols and suc is a binary symbol. Without loss of generality, the domain S is defined as {1, ..., n}, first^S = {1}, last^S = {n}, and suc^S contains all pairs (i, i + 1), i ∈ [n − 1]. Because first^S, last^S and suc^S depend only on n, they are called built-in relations. When we say that a class of successor structures is homomorphism- or isomorphism-closed, all structures under consideration are successor structures, and we understand that homomorphism or isomorphism closure, respectively, is required only for the non-built-in relations.

Definition 29 (Split Operation). A split operation produces a τ-structure A′ from a τ-structure A as follows. For an element a ∈ A, let T_a be defined as T_a = {(t, R, i) | t = (t_1, ..., t_r) ∈ R^A, where R ∈ τ, and t_i = a}. If |T_a| ≤ 1 for every a ∈ A, then no split operation can be applied. Otherwise, we choose a strict nonempty subset T of T_a, and for each triple (t, R, i) ∈ T, we replace t = (t_1, ..., t_r) in R^A with (t_1, ..., t_{i−1}, a′, t_{i+1}, ..., t_r) to obtain A′ (with domain A′ = A ∪ {a′}).
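A Python sketch of a single split (illustrative only; unlike Definition 29, we index occurrences by (relation, tuple position, coordinate) so that duplicate tuples are handled):

```python
def split(A, a, chosen):
    """One split operation in the sense of Definition 29.

    A      : dict mapping each relation symbol R to a list of tuples.
    a      : the element to split.
    chosen : strict nonempty subset of the occurrences of a, given as a
             set of triples (R, tuple_index, coordinate).
    Returns the structure A' in which the chosen occurrences of a are
    replaced by a fresh element, here the pair (a, 'new')."""
    occ = {(R, i, c) for R, ts in A.items()
           for i, t in enumerate(ts) for c, x in enumerate(t) if x == a}
    assert len(occ) > 1 and chosen and chosen < occ
    fresh = (a, 'new')
    B = {R: [list(t) for t in ts] for R, ts in A.items()}
    for R, i, c in chosen:
        B[R][i][c] = fresh
    return {R: [tuple(t) for t in ts] for R, ts in B.items()}
```

Proposition 37 (later in Section 4.3) is visible here: sending the fresh element back to a is a homomorphism from A′ to A.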
If for every critical and split-minimal element of C there is a P-derivation that is read-once, then we say that P is read-once. Definition 32 (Read-Once mnBP1). A monotone nondeterministic branching program (mnBP) H with variables X = {x1 , . . . , xn } computes a Boolean function fH : {0, 1}n \u2192 {0, 1}. H is a directed graph with distinguished nodes s and t and some arcs are labeled with variables from X (not all arcs must be labeled). An assignment \u03c3 to the variables in X defines a subgraph H\u03c3 of H as follows: an arc a belongs to H\u03c3 if \u03c3(x) = 1, where x is the label of a, or if a has no label. The function fH is defined as fH (\u03c3) = 1 if and only if there is a directed path in H\u03c3 from s to t (an accepting path). The size of an mnBP is |VH |. Let \u03c4 be a vocabulary and n \u2265 1. We assume without loss of generality that any relational structure whose domain has size n has domain {1, 2, . . . , n}. Let (R1 , t1 ), (R2 , t2 ), . . . , (Rq , tq ) be an enumeration of all pairs such that Ri \u2208 \u03c4 and ti \u2208 {1, 2, . . . , n}ar(Ri ) . We associate a variable xi with (Ri , ti ), for each i = 1, 2, . . . , q. Then if all labels of a branching program Hn are among x1 , x2 , . . . , xq , we say that Hn is over the vocabulary \u03c4 for input size n. We say that a family of branching programs F defines a class of \u03c4 -structures C, if for each n \u2265 1, F contains precisely one branching program Hn over \u03c4 for input size n such that fHn (x1 , x2 , . . . , xq ) = 1 if and only if the tuple structure with domain {1, 2, . . . , n} and containing precisely those pairs (Ri , ti ) for which xi = 1 is in C. Let F be a family of mnBP1s that contains precisely one branching program for each n \u2265 1. We say that F is a poly-size family if there is a polynomial p such that for each n \u2265 1, |V (Hn )| \u2264 p(n). Such a family is denoted by mnBP1(poly). If for every n and every 25 structure of domain size n in C, Hn contains an accepting path P such that any label on P is associated with at most one arc of P , then we say that F is read-once. (This read-once condition can be made a bit weaker.) 4.2 Examples We give some examples of problems definable by a 1-linDat(suc) program or by an mnBP1(poly). The program in Section 2.2, Figure 1 without rule 3 is a read-once linear Datalog(suc) program that defines the problem directed st-Conn. To see that this program Pst\u2212Conn is read-once, let G be any input that is accepted (we do not even need G to be critical and split-minimal). Then we find a directed path in E G connecting an element of S G to an element of T G without repeated edges. We build a Pst\u2212Conn -derivation for this path in the obvious way. For this section, by a clique we mean an ordinary undirected clique but each vertex may or may not have a self-loop. Let EvenCliques be the class of cliques of even size. The read-once linear Datalog(suc) program PEC below defines EvenCliques. The goal predicate of PEC is G2 , and E is the symbol for the edge relation of the input. The first part of PEC checks if the domain size n of the input is even. The second part goes through all pairs (x, y) \u2208 [n]2 , and at the same time, checks if (x, y) is an edge in E. This is achieved by accessing the order on the domain. Program PEC goes through every pair of vertices precisely once, so every PEC -derivation is read-once, and therefore PEC is read-once. 
4.2 Examples

We give some examples of problems definable by a 1-linDat(suc) program or by an mnBP1(poly). The program in Section 2.2, Figure 1, without rule 3, is a read-once linear Datalog(suc) program that defines the problem directed st-Conn. To see that this program P_{st-Conn} is read-once, let G be any input that is accepted (we do not even need G to be critical and split-minimal). We find a directed path in E^G connecting an element of S^G to an element of T^G without repeated edges, and we build a P_{st-Conn}-derivation for this path in the obvious way.

For this section, by a clique we mean an ordinary undirected clique, except that each vertex may or may not have a self-loop. Let EvenCliques be the class of cliques of even size. The read-once linear Datalog(suc) program P_EC below defines EvenCliques. The goal predicate of P_EC is G_2, and E is the symbol for the edge relation of the input. The first part of P_EC checks whether the domain size n of the input is even. The second part goes through all pairs (x, y) ∈ [n]^2 and, at the same time, checks whether (x, y) is an edge in E; this is achieved by accessing the order on the domain. Program P_EC goes through every pair of vertices precisely once, so every P_EC-derivation is read-once, and therefore P_EC is read-once.

    I(y)     ← first(x) ∧ suc(x, y)
    I(z)     ← I(x) ∧ suc(x, y) ∧ suc(y, z)
    G_1      ← I(x) ∧ last(x)
    J(x, y)  ← G_1 ∧ first(x) ∧ first(y)
    J(x, z)  ← J(x, y) ∧ suc(y, z) ∧ E(x, z) ∧ E(z, x)
    J(z, w)  ← J(x, y) ∧ last(y) ∧ suc(x, z) ∧ suc(z, w) ∧ E(z, w) ∧ E(w, z)
    G_2      ← J(x, y) ∧ suc(x, y) ∧ last(y)

Figure 10: The read-once linear Datalog(suc) program P_EC for EvenCliques.

In fact, we can easily test much more complicated arithmetic properties than evenness (e.g., being a power of k) with a 1-linDat(suc) program. However, linear Datalog cannot define any set of cliques with a non-trivial domain-size property, in the following sense. Let K be a clique of size n, and let K′ be the clique obtained by identifying any two vertices of K. Then K homomorphically maps to K′, and therefore if a linear Datalog program accepts K, then it also accepts K′. Therefore EvenCliques, or in fact any set of cliques that contains a clique of size n but no clique of size n − 1, cannot be defined by a linear Datalog program. Since it is not difficult to convert a 1-linDat(suc) program into an mnBP1(poly), the aforementioned problems can also be defined with an mnBP1(poly).

The additional power that the successor relation gives to 1-linDat is at least twofold. First, read-once linear Datalog(suc) can do some arithmetic, as demonstrated above. Second, define the density of a graph to be the number of edges divided by the number of vertices. The density of an n-clique is (n choose 2)/n = Θ(n). As demonstrated above, access to an order allows read-once linear Datalog(suc) to accept only structures of linear density. On the other hand, any linear Datalog program P accepts structures of arbitrarily low density: if S is a structure accepted by P, then adding sufficiently many new elements to the domain of S yields a structure S′ whose density is arbitrarily close to 0, and S′ is still accepted by P. One consequence of Corollary 34 is that if a read-once linear Datalog(suc) program defines co-CSP(B), then both of these additional abilities are of no use.

4.3 Main Results

We begin by stating the results for 1-linDat(suc) and poly-size families of mnBP1s discussed in the Introduction.

Theorem 33. Let C be a homomorphism-closed class of successor τ-structures. If C can be defined by a 1-linDat(suc) program of width (j, k), then every critical and split-minimal element of C has a (j, k + j)-path decomposition.

Corollary 34. If co-CSP(B) can be defined by a 1-linDat(suc) program of width (j, k), then co-CSP(B) can also be defined by a linear Datalog program of width (j, k + j).

Theorem 35. Let C be a homomorphism-closed class of successor τ-structures. If C can be defined by a family of mnBP1s of size O(n^j), then every critical and split-minimal element of C has a (j, r + j)-path decomposition, where r is the maximum arity of the symbols in τ.

Corollary 36. If co-CSP(B) can be defined by a family of mnBP1s of size O(n^j), then co-CSP(B) can also be defined by a linear Datalog program of width (j, r + j), where r is the maximum arity of the relation symbols in the vocabulary of B.

As discussed before, a wide class of CSPs (those whose associated variety admits the unary, affine or semilattice types) does not have bounded pathwidth duality [20]. It follows that all these CSPs are definable neither by a 1-linDat(suc) program nor by a poly-size family of mnBP1s. An example of such a CSP is the P-complete CSP Horn-3Sat.
After some definitions, we give a high-level description of the proof of Theorem 33. Any τ-structure M with domain of size n can be naturally converted into an isomorphic successor structure M(π), where π is a bijection π : M → {1, ..., n}. We define the domain of M(π) as {1, ..., n} (note that this automatically defines first^{M(π)}, last^{M(π)} and suc^{M(π)}), and for any R ∈ τ and (t_1, ..., t_{ar(R)}) ∈ R^M, we place the tuple (π(t_1), ..., π(t_{ar(R)})) into R^{M(π)}. When we want to emphasize that a structure under consideration is a successor τ-structure, we use the subscript s, for example M_s. Given a successor τ-structure M_s, M denotes the structure M_s with the relations first^{M_s}, last^{M_s} and suc^{M_s} removed.

We make the simple but important observation that we are interested only in isomorphism-closed classes; for example, co-CSP(B) is obviously isomorphism-closed. We will crucially use the fact that if M_s is accepted by a 1-linDat(suc) program P, then P must also accept M(π) for any bijection π.

We are ready to describe the intuition behind the proof of Theorem 33. A 1-linDat(suc) program that ensures that the class of successor structures C it defines is homomorphism-closed (and therefore isomorphism-closed) does not have enough "memory", due to its restricted width, to also ensure that some key structures in C are "well-connected". If these key structures are not too connected, then we can define co-CSP(B) in linear Datalog.

The more detailed proof plan is the following. Assume that co-CSP(B), where the input is a successor structure, is defined by a 1-linDat(suc) program P of width (j, k). We choose a "minimal" structure M in C that is accepted, and assume for contradiction that M does not have the required pathwidth. Then, roughly speaking, M must be accepted for all possible "permutations of the domain elements of M", and therefore, for each of these isomorphic structures, P must be able to provide a derivation. Because this procedure provides sufficiently many derivations, we will be able to find some derivations of a desired form; the identification of these "good" derivations crucially uses the generalized Erdős-Ko-Rado theorem. Once these derivations are detected, they can be combined to produce a derivation that "encodes" a structure of bounded pathwidth. The structures of bounded pathwidth produced this way can be used to define co-CSP(B) in linear Datalog.

We give the formal proofs. We need the following additional definitions related to linear Datalog. In addition to extracting Ex(D) from a derivation D, we can also extract a decomposition of Ex(D) reminiscent of a path decomposition. For each ℓ ∈ [q], we define a tuple structure B̃_ℓ by adding (R, t) to B̃_ℓ if R(t) appears in ρ_ℓ. In such a representation of Ex(D), we call B̃_ℓ the ℓ-th bag, and (B̃_1, ..., B̃_q) the tuple distribution of Ex(D). It will be useful to remove the empty bags from the list (B̃_1, ..., B̃_q) to obtain the sequence (B̃_{i_1}, ..., B̃_{i_t}), where i_ℓ < i_{ℓ′} if ℓ < ℓ′. For simpler notation, we renumber (B̃_{i_1}, B̃_{i_2}, ..., B̃_{i_t}) as (B̃_1, B̃_2, ..., B̃_t), and we call this sequence the pruned tuple distribution of D.
The following proposition is easy to prove.

Proposition 37. Let A′ be a τ-structure obtained from a τ-structure A by applying a sequence of split operations. Then A′ → A.

We recall the following theorem, tailored a bit to our needs.

Theorem 38 (Erdős-Ko-Rado, general case; see, e.g., [14]). Suppose that F is a family of s-subsets of {1, ..., n}, where n ≥ n_0(s, j + 1). Suppose that for any two sets S_1, S_2 ∈ F, |S_1 ∩ S_2| ≥ j + 1. Then |F| ≤ C(n − (j + 1), s − (j + 1)) = O(n^{s−(j+1)}), where C(·, ·) denotes the binomial coefficient.

Proof of Theorem 33. Let P be the read-once linear Datalog(suc) program that defines C. Let M be a structure in C such that M is critical and split-minimal, but assume for contradiction that M has no (j, k + j)-path decomposition. Suppose that M = {m_1, ..., m_s}. We choose a large enough n divisible by s (for convenience); how large n should be will become clear later.

We begin by constructing a class of successor structures from M. Let ϕ : M → {1, ..., n} be a function that, for all i ∈ [s], maps m_i to one of the numbers in {(i − 1)·(n/s) + 1, ..., i·(n/s)}. We call such a function an embedder. Observe that there are (n/s)^s possible embedder functions. For each embedder ϕ, we define a successor structure M_ϕ as follows: M_ϕ is obtained from M by renaming m_i to ϕ(m_i) for each i ∈ [s], and adding all numbers in {1, ..., n} that are not in the range of ϕ to the domain of the structure.
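The embedders are easy to enumerate; a Python sketch (the helper is ours, and s = |M| must divide n):

```python
from itertools import product

def embedders(M, n):
    """Yield all (n/s)^s embedders for M = [m_1, ..., m_s]: element m_i is
    mapped into the i-th block {(i-1)n/s + 1, ..., i*n/s} of {1, ..., n}."""
    s = len(M)
    assert n % s == 0
    b = n // s
    for choice in product(range(1, b + 1), repeat=s):
        yield {m: i * b + c for i, (m, c) in enumerate(zip(M, choice))}
```

For instance, embedders(['m1', 'm2'], 6) yields the 9 maps sending m1 into {1, 2, 3} and m2 into {4, 5, 6}.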
Obviously, for any embedder ϕ, M_ϕ contains an isomorphic copy of M, and therefore M → M_ϕ. Since C is closed under homomorphisms (and successor-invariant), it follows that for any embedder ϕ, M_ϕ is accepted by P. Our goal now is to show that P accepts a structure that can be obtained from M by applying a nonempty sequence of split operations; this will contradict the split-minimality of M with respect to C.

Let ϕ_1, ..., ϕ_t be an enumeration of all t = (n/s)^s embedders, and let M_{ϕ_1}, ..., M_{ϕ_t} be the corresponding successor structures. Since P is read-once, we can assume that for each i ∈ [t], there is a read-once P-derivation for M_{ϕ_i}: D(M_{ϕ_i}) = (ρ^i_1, λ^i_1), ..., (ρ^i_{q_i}, λ^i_{q_i}). For each D(M_{ϕ_i}), we denote its pruned tuple distribution by (B̃^i_1, ..., B̃^i_{w_i}). Let ψ_i(B̃^i_1, ..., B̃^i_{w_i}) denote (M̃^i_1, ..., M̃^i_{w_i}), where M̃^i_ℓ, for each ℓ ∈ [w_i], is obtained as follows: for every (R, t) ∈ B̃^i_ℓ, place (R, ϕ_i^{−1}(t)) into M̃^i_ℓ. We call ψ_i(B̃^i_1, ..., B̃^i_{w_i}) the prototype of (B̃^i_1, ..., B̃^i_{w_i}). We say that two pruned tuple distributions (B̃^i_1, ..., B̃^i_{w_i}) and (B̃^{i′}_1, ..., B̃^{i′}_{w_{i′}}) are similar if they have the same prototypes, i.e., ψ_i(B̃^i_1, ..., B̃^i_{w_i}) = ψ_{i′}(B̃^{i′}_1, ..., B̃^{i′}_{w_{i′}}).

Note that the codomain of ψ_i, for any i, is a sequence S of bags such that each bag contains elements of M̃. Because, by definition, every bag in S is nonempty, and D(M_{ϕ_i}) is read-once, we have that |S| ≤ |M̃|. Therefore the number of possible bag sequences can be upper-bounded by a function of s; let this upper bound be c_s. It follows that there must be at least t′ = t/c_s embedders ϕ_{i_1}, ..., ϕ_{i_{t′}} such that for any ℓ, ℓ′ ∈ {i_1, i_2, ..., i_{t′}}, the distributions (B̃^ℓ_1, ..., B̃^ℓ_{w_ℓ}) and (B̃^{ℓ′}_1, ..., B̃^{ℓ′}_{w_{ℓ′}}) are similar. Let the common prototype of all these similar pruned tuple distributions be (M̃_1, ..., M̃_w) (i.e., ψ_{i_1}(B̃^{i_1}_1, ..., B̃^{i_1}_{w_{i_1}})). Because M̃ is critical, it follows that M̃ = M̃_1 ∪ ⋯ ∪ M̃_w.⁵

⁵ Note that, because M̃ is critical and C is homomorphism-closed, M̃ cannot contain isolated elements, except when M̃ is a structure with a single element and no tuples. In this case the only critical and split-minimal element is M̃, and the empty set is a (0, 0)-path decomposition for M̃.

To give the reader a heads-up: our goal now is to construct a derivation D′, using the derivations D(M_{ϕ_{i_1}}), D(M_{ϕ_{i_2}}), ..., D(M_{ϕ_{i_{t′}}}), such that Ex(D′) is isomorphic to a structure that can be obtained from M̃ by a nonempty sequence of split operations. Because M̃ is split-minimal, this contradiction will complete the proof.

Define X_g = M̃_1 ∪ ⋯ ∪ M̃_g and Y_g = M̃_g ∪ ⋯ ∪ M̃_w for g ∈ [w]. If there is no g ∈ [w − 1] such that |X_g ∩ Y_{g+1}| > j, then we construct a (j, k + j)-path decomposition S_1, ..., S_w for M̃ as follows. Define S_1 = M̃_1, S_w = M̃_w, and S_ℓ = M̃_ℓ ∪ (X_{ℓ−1} ∩ Y_{ℓ+1}) for 2 ≤ ℓ ≤ w − 1. The first condition of Definition 3 is obviously satisfied. For the second condition, take S_i and S_{i′} with i < ℓ < i′. If a ∈ S_i ∩ S_{i′}, then a ∈ M̃_{i_0} for some i_0 ≤ i and a ∈ M̃_{i_1} for some i_1 ≥ i′, so a ∈ X_{ℓ−1} ∩ Y_{ℓ+1} and hence a ∈ S_ℓ. For the first part of the third condition, observe that, because P has width (j, k), |M̃_ℓ| ≤ k; because we added at most j new elements to M̃_ℓ to obtain S_ℓ, |S_ℓ| ≤ k + j for any ℓ. For the second part of the third condition, observe that S_ℓ ⊆ X_ℓ and S_{ℓ+1} ⊆ Y_{ℓ+1}, so |S_ℓ ∩ S_{ℓ+1}| ≤ j for any ℓ.

For the other case, suppose that for some g, |X_g ∩ Y_{g+1}| > j. Recall that for each ℓ ∈ {i_1, i_2, ..., i_{t′}}, M̃_g was constructed from the bag B̃^ℓ_g, and B̃^ℓ_g was constructed from a rule ρ^ℓ_{g_ℓ} for some g_ℓ, i.e., the g_ℓ-th rule in the derivation D(M_{ϕ_ℓ}) = (ρ^ℓ_1, λ^ℓ_1), ..., (ρ^ℓ_{q_ℓ}, λ^ℓ_{q_ℓ}). Let ι be the number of IDBs of P and κ the maximum arity of any IDB of P. Recall that, since P has width (j, k), any IDB contains at most j variables. Assume that the head IDB of ρ^ℓ_{g_ℓ} is I^ℓ_g(x^ℓ_g). Then there are at most ι·j^κ·n^j possibilities for the head IDB I^ℓ_g together with its variables instantiated to numbers in [n]. This means that there is an IDB I and a tuple t such that for at least t″ = t′/(ι·j^κ·n^j) values of ℓ ∈ {i_1, i_2, ..., i_{t′}}, it holds that I^ℓ_g = I and λ^ℓ_{g_ℓ}(x^ℓ_g) = t. Let these t″ values be {ℓ_1, ..., ℓ_{t″}}. We establish later that we can choose values ℓ_a, ℓ_b ∈ {ℓ_1, ..., ℓ_{t″}} such that the following inequality holds:

    |(B̃^{ℓ_a}_1 ∪ ⋯ ∪ B̃^{ℓ_a}_w) ∩ (B̃^{ℓ_b}_1 ∪ ⋯ ∪ B̃^{ℓ_b}_w)| ≤ j.    (5)

Assuming that we have such ℓ_a and ℓ_b, we define D′ as

    (ρ^{ℓ_a}_1, λ^{ℓ_a}_1), ..., (ρ^{ℓ_a}_{g_{ℓ_a}}, λ^{ℓ_a}_{g_{ℓ_a}}), (ρ^{ℓ_b}_{g_{ℓ_b}+1}, λ^{ℓ_b}_{g_{ℓ_b}+1}), ..., (ρ^{ℓ_b}_{q_{ℓ_b}}, λ^{ℓ_b}_{q_{ℓ_b}}).
We establish later that we can choose values $\ell_a, \ell_b \in \{\ell_1, \dots, \ell_{t''}\}$ such that the following inequality holds:
\[
\bigl| (\widetilde{B}^{\ell_a}_1 \cup \dots \cup \widetilde{B}^{\ell_a}_w) \cap (\widetilde{B}^{\ell_b}_1 \cup \dots \cup \widetilde{B}^{\ell_b}_w) \bigr| \;\le\; j. \tag{5}
\]
Assuming that we have such $\ell_a$ and $\ell_b$, we define $D'$ as
\[
(\rho^1_{\ell_a}, \lambda^1_{\ell_a}), \dots, (\rho^{g_{\ell_a}}_{\ell_a}, \lambda^{g_{\ell_a}}_{\ell_a}), (\rho^{g_{\ell_b}+1}_{\ell_b}, \lambda^{g_{\ell_b}+1}_{\ell_b}), \dots, (\rho^{q_{\ell_b}}_{\ell_b}, \lambda^{q_{\ell_b}}_{\ell_b}).
\]
That is, we “cut” the derivation $D(M_{\phi_{\ell_a}})$ at the $g_{\ell_a}$-th rule, cut the derivation $D(M_{\phi_{\ell_b}})$ at the $g_{\ell_b}$-th rule, and concatenate the first part of $D(M_{\phi_{\ell_a}})$ with the second part of $D(M_{\phi_{\ell_b}})$. $D'$ is a valid derivation because at the point of concatenation, the head IDB of $\rho^{g_{\ell_a}}_{\ell_a}$ is the same as the IDB in the body of $\rho^{g_{\ell_b}+1}_{\ell_b}$, and the variables of this IDB are instantiated to the same values in both rules. Observe that the pruned tuple distribution of $D'$ is $(\widetilde{B}^{\ell_a}_1, \dots, \widetilde{B}^{\ell_a}_g, \widetilde{B}^{\ell_b}_{g+1}, \dots, \widetilde{B}^{\ell_b}_w)$. Set $\widetilde{B} = \widetilde{B}^{\ell_a}_1 \cup \dots \cup \widetilde{B}^{\ell_a}_g \cup \widetilde{B}^{\ell_b}_{g+1} \cup \dots \cup \widetilde{B}^{\ell_b}_w$.

Claim. $\widetilde{B}$ is isomorphic to a structure that can be obtained from $\widetilde{M}$ by a nonempty sequence of split operations.

Proof of Claim. Observe that the substructure $\widetilde{M}_1 \cup \dots \cup \widetilde{M}_g$ of $\widetilde{M}$ is isomorphic to $\widetilde{B}^{\ell_a}_1 \cup \dots \cup \widetilde{B}^{\ell_a}_g$ through $\phi_{\ell_a}$. Similarly, $\widetilde{M}_{g+1} \cup \dots \cup \widetilde{M}_w$ is isomorphic to $\widetilde{B}^{\ell_b}_{g+1} \cup \dots \cup \widetilde{B}^{\ell_b}_w$ through $\phi_{\ell_b}$. Our goal is to understand the difference between $\widetilde{M}$ and $\widetilde{B}$. Notice that any embedder maps $m_i \in M$ into the interval $(i-1)\cdot\frac{n}{s}+1, \dots, i\cdot\frac{n}{s}$, and for any $i \ne i'$,
\[
\bigl\{(i-1)\cdot\tfrac{n}{s}+1, \dots, i\cdot\tfrac{n}{s}\bigr\} \cap \bigl\{(i'-1)\cdot\tfrac{n}{s}+1, \dots, i'\cdot\tfrac{n}{s}\bigr\} = \emptyset,
\]
so if $i \ne i'$ then $\phi_{\ell_a}(m_i) \ne \phi_{\ell_b}(m_{i'})$. Therefore $\phi_{\ell_a}$ and $\phi_{\ell_b}$ can return the same value only if they both get the same input. The set $X_g \cap Y_{g+1}$ can be thought of as those elements of $\widetilde{M}$ where $\widetilde{M}_1 \cup \dots \cup \widetilde{M}_g$ and $\widetilde{M}_{g+1} \cup \dots \cup \widetilde{M}_w$ are “glued together” to obtain $\widetilde{M}$. Let $U = \widetilde{B}^{\ell_a}_1 \cup \dots \cup \widetilde{B}^{\ell_a}_g$ and $V = \widetilde{B}^{\ell_b}_{g+1} \cup \dots \cup \widetilde{B}^{\ell_b}_w$. The set $U \cap V$ can be thought of as those elements of $\widetilde{B}$ where $U$ and $V$ are “glued together” to obtain $\widetilde{B}$. If for all elements $m \in X_g \cap Y_{g+1}$ we had $\phi_{\ell_a}(m) = \phi_{\ell_b}(m)$, then $\widetilde{B}$ would be isomorphic to $\widetilde{M}$, i.e., $U$ would be glued to $V$ to obtain $\widetilde{B}$ the same way as $\widetilde{M}_1 \cup \dots \cup \widetilde{M}_g$ is glued to $\widetilde{M}_{g+1} \cup \dots \cup \widetilde{M}_w$ to obtain $\widetilde{M}$. But $U \cap V$ is contained in the intersection bounded by Inequality (5), so $|U \cap V| \le j < |X_g \cap Y_{g+1}|$. In other words, there are some elements $m \in X_g \cap Y_{g+1}$ which have one copy for $\phi_{\ell_a}$ and another copy for $\phi_{\ell_b}$ in $\widetilde{B}$. Identifying $\phi_{\ell_a}(m)$ and $\phi_{\ell_b}(m)$ for all such $m$ would convert $\widetilde{B}$ to a structure isomorphic to $\widetilde{M}$. Now it is easy to see that, going backwards, splitting these elements of $\widetilde{M}$ yields a structure isomorphic to $\widetilde{B}$.

It remains to show why we can choose $\ell_a$ and $\ell_b$ to satisfy Inequality (5). Note that
\[
t'' = \frac{(n/s)^{s}}{c_s\, \iota\, j^{\kappa}\, n^{j}} = \Omega\bigl(n^{s-j}\bigr).
\]
Also note that for any $\ell \in \{\ell_1, \dots, \ell_{t''}\}$, $\widetilde{B}^\ell_1 \cup \dots \cup \widetilde{B}^\ell_w$ is an $s$-subset of $[n]$. So by Theorem 38, if for every pair $\ell_a, \ell_b \in \{\ell_1, \dots, \ell_{t''}\}$ we had $|(\widetilde{B}^{\ell_a}_1 \cup \dots \cup \widetilde{B}^{\ell_a}_w) \cap (\widetilde{B}^{\ell_b}_1 \cup \dots \cup \widetilde{B}^{\ell_b}_w)| \ge j+1$, then $t'' \le O(n^{s-j-1})$. But as observed, $t'' \ge \Omega(n^{s-j})$, so for a large enough $n$ (as a function of $s$, $j$, $\iota$ and $\kappa$, so $n$ can be chosen in advance) Inequality (5) must hold for some $\ell_a, \ell_b \in \{\ell_1, \dots, \ell_{t''}\}$.
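To make the final comparison explicit (a routine expansion of the bound just stated, treating $s$, $j$, $c_s$, $\iota$ and $\kappa$ as constants while $n$ grows):
\[
t'' = \frac{(n/s)^{s}}{c_s\, \iota\, j^{\kappa}\, n^{j}} = \frac{n^{s-j}}{s^{s}\, c_s\, \iota\, j^{\kappa}},
\]
which grows like $n^{s-j}$, while Theorem 38 caps any family of pairwise $(j+1)$-intersecting $s$-subsets of $[n]$ at $O(n^{s-(j+1)})$. Since $n^{s-j}/n^{s-j-1} = n \to \infty$, the two bounds are incompatible for all sufficiently large $n$.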
Proof of Corollary 34. Let $O = \text{co-CSP}(B)$, i.e., the set of all those successor structures that do not homomorphically map to $B$. We construct an obstruction set $O'$ for $B$ such that every structure in $O'$ has pathwidth $(j, k+j)$: let $O'$ be the set of all critical and split-minimal structures of $O$. Theorem 33 tells us that every structure in $O'$ has a $(j, k+j)$-path decomposition.

To see that $O'$ is an obstruction set for $B$, take any structure $S \in \text{co-CSP}(B) = O$. Keep applying split operations to $S$ and taking substructures of $S$ (again, these operations are with respect to non-built-in relations only) as long as the resulting structure is still in $O$. Call the resulting structure $S'$; that is, if we apply any split operation to $S'$, or if we take any proper substructure of it, then the resulting structure is not in $O$ any more. Then $S' \in O'$ because $S'$ is critical and split-minimal with respect to $O$. Using Proposition 37, we also see that $S' \to S$.

Because $O'$ is an obstruction set for $B$ such that every structure in $O'$ has width $(j, k+j)$, it follows from results of Dalmau in [7] that $\text{co-CSP}(B)$ is definable in linear $(j, k+j)$-Datalog.
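In summary, the obstruction-set property just verified can be stated in one line (the forward direction is the construction above; the backward direction holds because $S' \to S$ and $S \to B$ would compose to $S' \to B$, contradicting $S' \in O$): for every successor structure $S$,
\[
S \in \text{co-CSP}(B) \iff \text{there exists } S' \in O' \text{ with } S' \to S.
\]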
These proofs can be adapted for mnBP1s to obtain Theorem 35 and Corollary 36.

Acknowledgement

We thank Benoit Larose and Pascal Tesson for useful discussions and comments on an earlier draft. We also thank the anonymous referees for their helpful comments.

References

[1] F. Afrati and S. S. Cosmadakis. Expressiveness of restricted recursive queries. In Proceedings of the 21st ACM Symposium on Theory of Computing (STOC), pages 113–126, 1989.
[2] E. Allender, M. Bauland, N. Immerman, H. Schnoor, and H. Vollmer. The complexity of satisfiability problems: Refining Schaefer's theorem. Journal of Computer and System Sciences, 75(4):245–254, 2009.
[3] L. Barto and M. Kozik. Constraint satisfaction problems of bounded width. In Proceedings of the 50th Annual Symposium on Foundations of Computer Science (FOCS), 2009.
[4] A. A. Bulatov, A. A. Krokhin, and B. Larose. Dualities for constraint satisfaction problems. In N. Creignou, P. G. Kolaitis, and H. Vollmer, editors, Complexity of Constraints, volume 5250 of Lecture Notes in Computer Science, pages 93–124. Springer, 2008.
[5] S. Burris and H. P. Sankappanavar. A Course in Universal Algebra. Number 78 in Graduate Texts in Mathematics. Springer-Verlag, 1981.
[6] C. Carvalho, L. Egri, M. Jackson, and T. Niven. On Maltsev digraphs. In Proceedings of the 6th International Computer Science Symposium in Russia (CSR), pages 181–194, 2011.
[7] V. Dalmau. Constraint satisfaction problems in non-deterministic logarithmic space. In Proceedings of the 29th International Colloquium on Automata, Languages and Programming (ICALP), pages 414–425. Springer-Verlag, 2002.
[8] V. Dalmau and A. Krokhin. Majority constraints have bounded pathwidth duality. European Journal of Combinatorics, 29(4):821–837, 2008.
[9] V. Dalmau and B. Larose. Maltsev + Datalog → symmetric Datalog. In IEEE Symposium on Logic in Computer Science (LICS), pages 297–306, 2008.
[10] L. Egri, A. A. Krokhin, B. Larose, and P. Tesson. The complexity of the list homomorphism problem for graphs. Theory of Computing Systems, 51(2):143–178, 2012.
[11] L. Egri, B. Larose, and P. Tesson. Symmetric Datalog and constraint satisfaction problems in logspace. In IEEE Symposium on Logic in Computer Science (LICS), pages 193–202, 2007.
[12] T. Feder. Classification of homomorphisms to oriented cycles and of k-partite satisfiability. SIAM Journal on Discrete Mathematics, 14(4):471–480, 2001.
[13] T. Feder and M. Y. Vardi. The computational structure of monotone monadic SNP and constraint satisfaction: A study through Datalog and group theory. SIAM Journal on Computing, 28(1):57–104, 1999.
[14] P. Frankl and R. L. Graham. Old and new proofs of the Erdős–Ko–Rado theorem. Journal of Sichuan University Natural Science Edition, 26, 1989.
[15] E. Grädel. Capturing complexity classes by fragments of second-order logic. Theoretical Computer Science, 101(1):35–57, 1992.
[16] R. Häggkvist, P. Hell, D. J. Miller, and V. Neumann-Lara. On multiplicative graphs and the product conjecture. Combinatorica, 8:63–74, 1988.
[17] P. Hell and X. Zhu. Homomorphisms to oriented paths. Discrete Mathematics, 132:107–114, 1994.
[18] D. Hobby and R. McKenzie. The Structure of Finite Algebras, volume 76 of Contemporary Mathematics. American Mathematical Society, Providence, R.I., 1988.
[19] N. Immerman. Descriptive Complexity. Graduate Texts in Computer Science. Springer, 1999.
[20] B. Larose and P. Tesson. Universal algebra and hardness results for constraint satisfaction problems. Theoretical Computer Science, 410(18):1629–1647, 2009.
[21] B. Larose and L. Zádori. Bounded width problems and algebras. Algebra Universalis, 56(3-4):439–466, 2007.
[22] L. Libkin. Elements of Finite Model Theory. Springer, 2004.
[23] A. F. Pixley. Distributivity and permutability of congruence relations in equational classes of algebras. Proceedings of the American Mathematical Society, 14:105–109, 1963.
[24] T. Schaefer. The complexity of satisfiability problems. In Proceedings of the 10th ACM Symposium on Theory of Computing (STOC), pages 216–226, 1978.
[25] I. Wegener. Branching Programs and Binary Decision Diagrams: Theory and Applications. Society for Industrial and Applied Mathematics (SIAM), Philadelphia, PA, USA, 2000.
diff --git a/interactive-mining-backend/madoap/src/static/exampleClarinProfile.oamp b/interactive-mining-backend/madoap/src/static/exampleClarinProfile.oamp
new file mode 100644
index 0000000..ec3c5f3
Binary files /dev/null and b/interactive-mining-backend/madoap/src/static/exampleClarinProfile.oamp differ
diff --git a/interactive-mining-backend/madoap/src/static/exampleProfile.oamp b/interactive-mining-backend/madoap/src/static/exampleProfile.oamp
index 8d016c5..789f84a 100755
Binary files a/interactive-mining-backend/madoap/src/static/exampleProfile.oamp and b/interactive-mining-backend/madoap/src/static/exampleProfile.oamp differ