From e1251236ff3ec10c2450b030234fccc421d7d8bb Mon Sep 17 00:00:00 2001 From: sosguns2002 Date: Wed, 18 Apr 2018 18:00:21 +0300 Subject: [PATCH] Clarin full example --- .../configuration/configuration.component.ts | 4 ++-- .../resultspreview.component.ts | 8 +++++-- .../settings/settings.component.html | 8 +++---- .../madoap/src/madserverv3.py | 22 ++++++++---------- .../madoap/src/static/exampleClarinDocs.json | 4 +++- .../src/static/exampleClarinProfile.oamp | Bin 5120 -> 5120 bytes .../madoap/src/static/exampleProfile.oamp | Bin 5120 -> 5120 bytes 7 files changed, 25 insertions(+), 21 deletions(-) diff --git a/interactive-mining-angular-frontend/src/app/configuration/configuration.component.ts b/interactive-mining-angular-frontend/src/app/configuration/configuration.component.ts index c420656..4a333e5 100755 --- a/interactive-mining-angular-frontend/src/app/configuration/configuration.component.ts +++ b/interactive-mining-angular-frontend/src/app/configuration/configuration.component.ts @@ -29,10 +29,10 @@ export class ConfigurationComponent implements OnInit, AfterViewInit { localStorage.setItem('negwords', '{}'); } if (!localStorage.getItem('contextprev') || localStorage.getItem('contextprev') === 'undefined') { - localStorage.setItem('contextprev', '10'); + localStorage.setItem('contextprev', '20'); } if (!localStorage.getItem('contextnext') || localStorage.getItem('contextnext') === 'undefined') { - localStorage.setItem('contextnext', '20'); + localStorage.setItem('contextnext', '10'); } if (!localStorage.getItem('wordssplitnum') || localStorage.getItem('wordssplitnum') === 'undefined') { localStorage.setItem('wordssplitnum', '1'); diff --git a/interactive-mining-angular-frontend/src/app/configuration/resultspreview/resultspreview.component.ts b/interactive-mining-angular-frontend/src/app/configuration/resultspreview/resultspreview.component.ts index baf4906..2b10936 100755 --- 
a/interactive-mining-angular-frontend/src/app/configuration/resultspreview/resultspreview.component.ts +++ b/interactive-mining-angular-frontend/src/app/configuration/resultspreview/resultspreview.component.ts @@ -170,7 +170,9 @@ export class ResultspreviewComponent implements OnInit { // hightlight positive words for (let posword in JSON.parse(localStorage.getItem('poswords'))) { const search_regexp = new RegExp(posword, 'g'); - context = context.replace(search_regexp, '' + posword + ''); + context = context.replace(search_regexp, function (x) { + return '' + x + ''; + }); } // hightlight acknowledgment keywords for (let ackn of values.acknmatch) { @@ -180,7 +182,9 @@ export class ResultspreviewComponent implements OnInit { // hightlight negative words for (let negword in JSON.parse(localStorage.getItem('negwords'))) { const search_regexp = new RegExp(negword, 'g'); - context = context.replace(search_regexp, '' + negword + ''); + context = context.replace(search_regexp, function (x) { + return '' + x + ''; + }); } context = this.highlightInElement(context, values.match); match.context = context; diff --git a/interactive-mining-angular-frontend/src/app/configuration/settings/settings.component.html b/interactive-mining-angular-frontend/src/app/configuration/settings/settings.component.html index f94c30e..52fc0e2 100755 --- a/interactive-mining-angular-frontend/src/app/configuration/settings/settings.component.html +++ b/interactive-mining-angular-frontend/src/app/configuration/settings/settings.component.html @@ -138,8 +138,8 @@
  • Mining area size - before: {{settings.contextprev}} - after: {{settings.contextnext}} + before: {{settings.contextprev}} + after: {{settings.contextnext}}

    You may edit the length of the text area that the algorithm uses to decide if a match is a true positive.
    For advanced users only

    @@ -147,13 +147,13 @@
    - +
    - +
    diff --git a/interactive-mining-backend/madoap/src/madserverv3.py b/interactive-mining-backend/madoap/src/madserverv3.py index 0aeff72..8756940 100755 --- a/interactive-mining-backend/madoap/src/madserverv3.py +++ b/interactive-mining-backend/madoap/src/madserverv3.py @@ -592,7 +592,7 @@ class GetExampleProfilesHandler(BaseHandler): try: data = {} example_profiles = [] - example_profiles.append({'name': 'Clarin', 'contents': 11, 'documents': 7}) + example_profiles.append({'name': 'Clarin', 'contents': 4, 'documents': 9}) example_profiles.append({'name': 'Communities', 'contents': 25, 'documents': 104}) example_profiles.append({'name': 'AOF', 'contents': 66, 'documents': 1023}) example_profiles.append({'name': 'RCUK', 'contents': 263, 'documents': 140}) @@ -912,8 +912,8 @@ class GetDocSamplesHandler(BaseHandler): data = {} doc_samples = [] doc_samples.append({'name': 'Egi', 'documents': 104}) - doc_samples.append({'name': 'Clarin', 'documents': 1023}) - doc_samples.append({'name': 'SNSF', 'documents': 140}) + doc_samples.append({'name': 'Clarin', 'documents': 7}) + doc_samples.append({'name': 'Wellcome Trust', 'documents': 250}) doc_samples.append({'name': 'ARIADNE', 'documents': 502}) doc_samples.append({'name': 'RCUK', 'documents': 104}) doc_samples.append({'name': 'TARA', 'documents': 1023}) @@ -1043,11 +1043,9 @@ class ChooseDocSampleHandler(BaseHandler): if doc_sample == "Egi": sample_file_name = "static/egi_sample.tsv" elif doc_sample == "Clarin": - sample_file_name = "static/clarin_docs.json" - elif doc_sample == "Rcuk": - sample_file_name = "static/rcuk_sample.tsv" - elif doc_sample == "Arxiv": - sample_file_name = "static/arxiv_sample.tsv" + sample_file_name = "static/exampleClarinDocs.json" + elif doc_sample == "Wellcome Trust": + sample_file_name = "static/exampleWTDocs.json" else: self.set_status(400) self.write("No Doc sample with this name") @@ -1158,13 +1156,13 @@ class RunMiningHandler(BaseHandler): contextmiddle = [r for r in 
cursor.execute(querygrantsize)][0][0]+1 if 'contextprev' in mining_parameters and mining_parameters['contextprev'] != '': contextprev = int(mining_parameters['contextprev']) - if contextprev < 0 or contextprev > 20: + if contextprev < 0 or contextprev > 50: self.set_status(400) self.write("Context size must be in its limits...") return if 'contextnext' in mining_parameters and mining_parameters['contextnext'] != '': contextnext = int(mining_parameters['contextnext']) - if contextnext < 0 or contextnext > 20: + if contextnext < 0 or contextnext > 50: self.set_status(400) self.write("Context size must be in its limits...") return @@ -1231,11 +1229,11 @@ class RunMiningHandler(BaseHandler): data['negwords'].append(key) neg_set += "0" if pos_set != '' and neg_set != '': - conf = ", ({0} - {1})".format(pos_set, neg_set) + conf = ", ({0} - ({1}))".format(pos_set, neg_set) elif pos_set != '': conf = ", {0}".format(pos_set) elif neg_set != '': - conf = ", -{0}".format(neg_set) + conf = ", -({0})".format(neg_set) if conf != '': conf += ' as conf' whr_conf = 'and conf>=0' diff --git a/interactive-mining-backend/madoap/src/static/exampleClarinDocs.json b/interactive-mining-backend/madoap/src/static/exampleClarinDocs.json index 3e6e3d7..544e5d3 100644 --- a/interactive-mining-backend/madoap/src/static/exampleClarinDocs.json +++ b/interactive-mining-backend/madoap/src/static/exampleClarinDocs.json @@ -4,4 +4,6 @@ {"id":"oai:arXiv.org:1309.2788","text":"Training in Data Curation as Service in a Federated Data Infrastructure - the FrontO\ufb03ce\u2013BackO\ufb03ce Model Ingrid Dillo, Rene van Horik, and Andrea Scharnhorst arXiv:1309.2788v1 [cs.DL] 11 Sep 2013 Data Archiving and Networked Services, Anna van Saksenlaan 10, 2593 HT The Hague , The Netherlands {ingrid.dillo,rene.van.horik,andrea.scharnhorst}@dans.knaw.nl http://www.dans.knaw.nl Abstract. 
The increasing volume and importance of research data leads to the emergence of research data infrastructures in which data management plays an important role. As a consequence, practices at digital archives and libraries change. In this paper, we focus on a possible alliance between archives and libraries around training activities in data curation. We introduce a so-called FrontO\ufb03ce\u2013BackO\ufb03ce model and discuss experiences of its implementation in the Netherlands. In this model, an e\ufb03cient division of tasks relies on a distributed infrastructure in which research institutions (i.e., universities) use centralized storage and data curation services provided by national research data archives. The training activities are aimed at information professionals working at those research institutions, for instance as digital librarians. We describe our experiences with the course DataIntelligence4Librarians. Eventually, we re\ufb02ect about the international dimension of education and training around data curation and stewardship. Keywords: data curation, data management, training, data sharing, data archive, digital libraries, education, science policy, documentation 1 Introduction A research archive can be depicted as a safe haven for research data, carefully selected, documented and stored for future consultation. Accordingly, the core tasks of a data archivist could be imagined to be con\ufb01ned to proper documentation, and the care for material preservation. In short: \u201dOur service starts where others drop the data\u201d1 . The current practices of archivists seem to deviate from such an archetype to a large extent. This turn of tables can best be understood by a recall to the history of archival sciences. In general, for archives of research data the same principles hold as for any other archive. 
In 1898, in the handbook, one of the foundational texts in archival sciences [1], Muller, Feith, and Fruin describe the archive as an organic entirety whose function cannot be determined 1 Personal communication Henk Koning, former Technical Archivist at DANS 2 Dillo Fig. 1. The federated data infrastructure - a collaborative framework. Scheme designed by Peter Doorn based on the Collaborative Data Infrastructure as envisioned in [6, p. 31] . a priori. On the contrary, its function needs to be de\ufb01ned and rede\ufb01ned depending on the development of the institution (i.e., a board or government) whose selected traces it is obliged to archive. In other words, Muller et al. describe a co-evolution of the institution and its archive. This view applied to a research data archive, the corresponding institution is none other than the science system. From out this viewpoint, it is not surprising that the profound changes in scienti\ufb01c practice [2] and scholarly communication [3] in\ufb02uence the expectations placed on a data archive or, more speci\ufb01cally, a sustainable digital archive (Trusted Digital Repository). The changing modes of scholarly communication and practice alter the form and content of what is seen worth to be preserved. [5] Changing research practices require new negotiations on the division of labor. Who is responsible for setting up digital research infrastructures including virtual research environments - the information service providers such as Trusted Digital Repositories (TDRs) or the research institutions? Who takes care of the preparation of (meta)\u2013data and formats prior to archiving? Who should preserve software tools - the labs which developed them or the archive together with \u2019data\u2019 for which they have been developed? The high volatility of the environment in which archives are currently operating Training in Data Curation 3 in\ufb02uences their function as reliable, stable reference point for important information. 
Open Access, Data Management Plan, Data Stewardship, Data Curation, Trusted Digital Repositories, BigData and SmartData are some of the \ufb02oating around buzzwords of the last decade. They stand for the struggle to identify and communicate most urgent trends and to coordinate actions across the di\ufb00erent stakeholders in the \ufb01eld of data curation. Important to note here is the reference model for Open Archival Information Systems (in short OAIS model, ISO 14721:2012), a model foundational for the discussion of structure and function of any archive. Its key elements are Ingest, Archival Storage, Data Management, Administration, Preservation Planning and Access. Allison emphasizes that the OAIS model is not an architectural model for implementation, but instead o\ufb00ers a shared terminology. [4]. Inside of our own organization, Data Archiving and Networked Services (DANS), the OAIS model is often used in discussions about internal work\ufb02ows and their improvement and further development. In this paper, we focus on institutional networks around an archive as DANS. Hereby we rely on schemata as depicted in Fig. 1 which sketch the complexity of the research data landscape, its stakeholders and infrastructure [6]. Coming back to it later, in a \ufb01rst step we can use this scheme in an exercise to locate a TDR such as DANS. Starting at the bottom of Fig. 1 the basic (technical) infrastructure entails storage. In the Netherlands this level of Basic Infrastructure is provided by SURFsara, the Dutch network of computing facilities whose services DANS is using itself. The following three levels could be seen as the heart of activities of an archive of digital research data. They form a kind of back-o\ufb03ce. The three boxes at the next level, labeled as front o\ufb03ce, contain the funding agencies, as NWO2 in the Netherlands, university libraries, and research infrastructures such as CLARIN3 , or DARIAH4 , which are in themselves complex organizations. 
They could be seen as \u2019clients\u2019 of an archive. But actually, DANS is also part of them. The same holds true for the top level of data providers and users. DANS as part of research infrastructures harvests information from other data providers. With its own research and development activities it is even part of the data production cycle. In short, DANS plays di\ufb00erent roles in di\ufb00erent contexts and, therefore, can be located at many places in this scheme. Correspondingly, at DANS a variety of di\ufb00erent activities take place. In the next section, we discuss how, together with this increase in complexity, the need emerges to build alliances and to coordinate actions among di\ufb00erent institutional players in the data landscape. At the core of the paper we propose a speci\ufb01c model to articulate possibilities of collaboration, coordination, and division of labour. We report about steps towards its concrete implementation at the Dutch national level. At the end of the paper we discuss links to international developments. 2 3 4 www.nwo.nl www.clarin.eu www.dariah.eu 4 Dillo 2 The archivist as a consultant DANS is one of the national research data archives in the Netherlands. With roots in the social sciences and humanities back to the 1960s, in its current form, it was founded in 2005 as an institute of NWO - the Netherlands Organization for Scienti\ufb01c Research and the KNAW - the Royal Netherlands Academy of Arts and Sciences. DANS is primarily an information service institute and, despite of a small in-house research group, not a research institute. This makes DANS much more comparable to a classical, stand-alone archive. The mission of DANS it to promote sustained access to digital research data. For this purpose, DANS encourages researchers to archive and reuse data in a sustained manner, e.g. through the online (self)archiving system EASY5 . 
DANS also provides access, via NARCIS.nl6 , to thousands of scienti\ufb01c datasets, epublications and other research information in the Netherlands. EASY and NARCIS are two services which form the core of DANS. In di\ufb00erence to many other knowledge-domain speci\ufb01c archives, DANS operates cross-disciplinary with a focus on social sciences and humanities. It is also an exclusively digital archive and it is placed - as an institution - outside the Dutch university system. All this together positions DANS as a gateway to the diverse Dutch research data landscape and as a hub in it. Activities and practices at DANS can be ordered along three dimensions: \u2013 Archive: selection, preservation, and description of data collections \u2013 Research and Development: maintenance and development of the ICT infrastructure for seamless access and exploitation and for long-term preservation \u2013 Science Policy: in\ufb02uence on research data policies and data curation strategies on the national and international levels The \ufb01rst dimension corresponds to a large extent to the image of a traditional research archive. But due to ongoing ICT innovations both in the area of research as well as of information services, a digital archive cannot operate without means to adopt its technological backbone to those innovations. The process of adopting and inventing services entails to a large extent what Andrew Prescott called \u201dtinkering\u201d, when he compared practices at digital libraries with the craftsmanship needed in labs and workshops in the high-time of industrialization [7]. ICT is usually depicted as an e\ufb03ciency engine. What is often forgotten is the existence of a transition period during which old and new forms of practices coexist. On the work \ufb02oor, this means that traditional services of acquisition, community support, and documentation are pursued in parallel to designing new work\ufb02ows, testing and implementing them. 
So, before ICT leads to more e\ufb03ciency, temporarily the actual workload often increases. Project-based work and external funding for projects can only partly bu\ufb00er this extension of activities at an archive. On top of archiving and related R&D, the changing environment 5 6 www.easy.dans.knaw.nl www.narcis.nl Training in Data Curation 5 in which the archive operates requires continuous attention. Hence, a third dimension - science policy - appears. Participation in national and international networks of research infrastructures require substantive investment of time. The point we make is that the current portfolio of activities at information service institutions is much more diverse than in the past. For DANS this changing role of an archive is re\ufb02ected in its name as Data Archiving and Networked Services. Among the increased portfolio of activities, consultancy plays a special role [8]. It appears in many forms: in the foundation of a Data Seal of Approval for TDRs7 , in the advisory role in research projects, in contributions to data policy documents, and in training activities. Consultancy contributes to knowledge diffusion around data curation practices and the coordination of data management at a national (partly also international) level. It also supports the emergence of a distributed network structure which we describe in the next section. 3 Strategic alliance between archives and libraries - the FrontO\ufb03ce \u2013 BackO\ufb03ce model Profound and timely data management together with a sustainable storage of data \u2013 during and after the research \u2013 are indispensable preconditions for sharing data. It is of great importance that universities and other research institutions develop a clear data policy themselves. An adequate infrastructure is needed to coordinate and implement those policies. 
In the Netherlands, with its rich institutional landscape of information service providers and research institutions, we encounter a discussion around a federated data infrastructure. It is quite clear that no single organization will be able to deliver individually tailored support for all possible data depositors. It it also clear that it is not possible for a single organization to provide services across all levels, from storage up to interactions with individual researchers. In order to create a sustainable national infrastructure for data management and curation, it is important to support a network of local data stewards close to the actual scienti\ufb01c practice combined with centralized services. Fig. 1 designs such a federated data infrastructure. It introduces at the same time a FrontO\ufb03ce\u2013BackO\ufb03ce model (FO\u2013BO model) as part of it. 3.1 Description of the model The FO\u2013BO model clari\ufb01es the interaction between researcher and information service provider concerning research data management. It also clari\ufb01es the relation among di\ufb00erent information service providers. Front o\ufb03ces should be placed at institutions where research takes place in order to support the research community at those institutions. An example could be a front o\ufb03ce as part of a university library. The front o\ufb03ce is responsible for raising awareness for data sharing and re-use, for taking care of the local data management, and for organizing training for researchers. Virtual Research Environments (VRE\u2019s) could be 7 www.datasealofapproval.org 6 Dillo also part of the service at a front o\ufb03ce. In particular, temporary data archiving on platforms as Sharepoint or Dataverse could be part of the VRE\u2019s. Once a research project is \ufb01nished the front o\ufb03ce - in consultation with the back o\ufb03ce - takes care of the transfer of data to a TDR. So, data acquisition is an inherent part of the front o\ufb03ce tasks. 
The core tasks of the back o\ufb03ce consist in the storage and documentation of research data which arrive via the front o\ufb03ces. The back o\ufb03ce provides access to data, and possibly enriches and links data. The back o\ufb03ce acquires expert knowledge around data management, and the long-term, sustainable and persistent archiving of research data. Part of the back o\ufb03ce portfolio is to disseminate this expertise by means of training of information professionals, such as data librarians/managers/stewards, working at front o\ufb03ces. The back o\ufb03ce acts as an expertise centrum for the front o\ufb03ce and as an innovation centrum concerning new trends in data curation. Fig. 2 summarizes the bene\ufb01ts of the model for researchers, front o\ufb03ces and back o\ufb03ce organizations. By means of the FO\u2013BO model we also try to reduce the complexity of interactions in the data infrastructure. With this model the role of DANS (and of comparable institutions) is restricted to the back o\ufb03ce function. In the next subsection we report about one key element of the model: training for front o\ufb03ce personal. Fig. 2. Poster about the FrontO\ufb03ce\u2013BackO\ufb03ce model. Designed by Carolien van Zuilekom, Fieke Schoots, Madeleine de Smaele and Ingrid Dillo Training in Data Curation 3.2 7 Implementation - the DataIntelligence4Librarians In the FO\u2013BO model training for information professionals is part of the back o\ufb03ce portfolio. The DataIntelligence4Librarians course is an example for such a training. Organized by the 3TU.Datacenter8 and DANS, it is based on an earlier course of the 3TU.Datacenter developed for data-librarians. The currently envisioned audience reaches from sta\ufb00 at libraries to everybody interested in the topic independently of the disciplinary background. Fig. 3. 
Snapshot of the website http://dataintelligence.3tu.nl/en/home/ - host of the course \u201dDataIntelligence4Librarians\u201d Description of the course The course design \ufb01ts into the professional education format. It combines distance learning with four face2face (f2f) sessions and maintains next to an eLearning environment also a public website (see Fig. 3, in 8 The 3TU.Datacenter \u2013 a network organization of the university libraries of Delft University of Technology, Eindhoven University of Technology, and the University of Twente \u2013 o\ufb00ers facilities for the preservation and the sustained availability of technical research data, similar to the services at DANS (see http://data.3tu.nl/ repository/) 8 Dillo Dutch) with background material. Google Plus was used as the platform for the eLearning part. Participants are supposed to study theoretical parts as homework. Between the f2f sessions more homework is assigned. The website contains a description for the \ufb01rst practical task. More of them are distributed in the eLearning environment. Both coach and participants give feedback at f2f sessions as well as on-line. Didactically, feedback and knowledge sharing is used as an important element next to knowledge transfer. During the \ufb01rst f2f session, an introduction into the course and the eLearning environment is given. An introduction into the module Data Management follows and homework is assigned labeled State-of-Art Map. This task starts with reading a report, and continues with a number of search tasks using the phrase research data management across bibliographic databases (Scopus, Web of Science), but also in Twitter and Google. Participants are advised to subscribe to speci\ufb01c mailing lists to get an impression of the actual discussion around the topic. At the second f2f session participants presents their resulting map. The module Technical Skills is introduced and tools (3TU.Databrowser, DANS/EASY) are demonstrated. 
The third f2f meeting starts with the same scheme of sharing homework and getting feedback. Content-wise the module for this session is Acquisition and Consultation Skills. Speci\ufb01c attention is given to the question how to overcome barriers for data sharing. The instrument of a Data Interview with possible data depositors is introduced. During the fourth and last session the acquisition assignment is discussed and the course is evaluated. At the end of the course a certi\ufb01cate is issued under the condition that all sessions have been attended and the tasks have been ful\ufb01lled. During the modules, di\ufb00erent experts from the organizing institutions give guest lectures. Examples of topics are legal aspects, issues of data selection, audit and certi\ufb01cation of TDRs, and the FO\u2013BO model itself. Experiences with the course One goal of the course is to sharpen the insight into the role of research data in scienti\ufb01c practices. Eventually, the participant should be able to advise and inform researchers how data curation can enhance data use and re-use. In summary, the goals are rather diverse and broad compared with the limited time of the course. So far, the course has been run three times: February 2012 to June 2012 (16 participants, organized by 3TU.Datacentre), September 2012 to December 2012 (16 participants), and February 2013 to May 2013 (13 participants). The last two events have been organized in collaboration between the 3TU.Datacentre and DANS. Most of the participants were information professionals, either working at a library or archive, or for one of the network organizations, such as SURF. In the evaluation, the participants named a couple of critical points not unusual for distance learning. Among them are problems with the eLearning environment, or the spreading out of the course over a rather long period. Another critical remark concerns the demonstrations. 
Obviously the participants did not seek hands-on experiences with a tool, platform, or interface. They seemed to be more interested in guidance and factual information in the area of data cura- Training in Data Curation 9 tion. This springs also out from the positive reactions. Information about actual developments from experts involved in the practice of data curation have been highly appreciated. Further, a need to get to know each other and to learn from each others practices is articulated. This holds true even for a small country as the Netherlands. One of the suggestions of the participants was to form a special interest group. 4 Conclusions In this paper we discussed changing portfolios of responsibilities for archives and libraries. Data infrastructures emerge in response to data science, open access, and data sharing policies. In the making of a data infrastructure, the division of tasks between di\ufb00erent information service providers needs to be re-negotiated. We present a federal data infrastructure with a layered architecture including a FrontO\ufb03ce\u2013BackO\ufb03ce model. This model allows to articulate di\ufb00erent roles in the interaction with research communities, the acquisition of expert knowledge, and the provision of data management services. The model is in line with the Data pyramid [6] which classi\ufb01es data according to permanence and function. Data management is tailored towards certain classes of data and specialization in data curation is allocated to di\ufb00erent organizations. Front o\ufb03ces, naturally to be placed at academic libraries, take care of data management for transient and cyclic data produced by individuals and research communities. Trusted Digital Repositories as DANS act as back o\ufb03ce and take care for patrimonial data. They also become expertise center and knowledge transfer hubs for data curation. Training plays a key role in the FO\u2013BO model. It is a way to disseminate the idea of the model. 
At the same time, it is an instantiation of the model. The experiences in the Netherlands are encouraging. Several Dutch universities signaled interest in this approach and the challenge is now to implement more front o\ufb03ces there. At the same time, a coordination among possible back o\ufb03ce organizations is needed. DANS recently signed an coalition agreement with the 3TU.Datacenter to cooperate more closely and to foster the FO\u2013BO model. This coalition, Research Data Netherlands, is open to any other Dutch TDR with at least a Data Seal of Approval. To shape the role of back o\ufb03ces as centers of expertise and innovation is another way to make the model attractive and reliable. To give an example, there is a growing need for auto-ingest of larger data collections. Another shared issue is the question of a sustainable cost model for data archiving. Exploration of these issues needs to be done locally and shared in collaboration. Returning to the issue of training, as we argue in this paper, in the short run, there is an urgent need for education among information professionals. In the mid term, these e\ufb00orts could be connected to comparable modules in curricula for future information professionals, e.g. at iSchools. The FO\u2013BO models contains training also as a part of front o\ufb03ce activities. This is in line with e\ufb00orts in the framework of digital librarianship to develop modules for information lit- 10 Dillo eracy and data stewardship at many universities. The APARSEN project that aims at establishing a virtual centre of excellence on digital preservation carried out a survey concerning the European training landscape in this area [10]. The DataIntelligence4Librarians course \ufb01ts very well to outcomes of this survey. A coordination between those di\ufb00erent training activities will support further professionalization. 
Shared textbooks, syllabi, best practices guidelines could also help to keep locally provided on-line material up to date. Our experiences show that a natural alliance between (digital) archives and libraries exist which is worth to be explored in daily practice. Current science policies emphasizes the role of data and their re-use. The envisioned coupling of funding with data-sharing and archiving, the Linked Open Data movement, and the rise of data science will put more pressure on information service institutions, but at the same time also o\ufb00ers new possibilities. To form alliances and to coordinate actions seems to be the only possible answer. Here, libraries and archives are natural partners because as Ross formulated \u201dwhen we re\ufb02ect on the core of digital libraries we easily observe that they may be libraries by name, but they are archives by nature\u201d [9]. References 1. Muller, S., Feith, J.A., Fruin, R.: Handleiding voor het Ordenen en Beschrijven van Archiven. Erven B. Van Der Kamp. Groningen. 1920. 2ed. Reprinted in: Horsman, P.J., Ketelaar, F.C.J., Thomassen, T.H.P.M.: Tekst en Context van de Handleiding voor het Ordenen en Beschrijven van Archiven van 1898. Verloren, Hilversum 1998. 2. Wouters, P., Beaulieu, A., Scharnhorst, A., Wyatt, S.: Virtual Knowledge: Experimenting in the Humanities and the Social Sciences. MIT, Cambridge, Mass. 2012. 3. Borgman, C.: Scholarship in the digital age: Information, infrastructure, and the Internet. MIT, Cambridge, Mass. 2007. 4. Allison, J.: OAIS as a reference model for repositories. An evaluation. Report UKOLN University of Bath, 2007 http://eprints.whiterose.ac.uk/id/eprint/ 3464. 5. Doorn, P., Tjalsma, H.: Introduction: archiving research data. Archival Science 7(1), 1\u201320 (2007). DOI 10.1007/s10502-007-9054-6 6. Anonymous: Riding the wave. How Europe can gain from the riding tide of scienti\ufb01c data. Final report of the High Level Expert Group on Scienti\ufb01c Data. 
A submission of the European Commission. October 2010 http://cordis.europa.eu/fp7/ict/ e-infrastructure/docs/hlg-sdi-report 7. Prescott, A.: Made In She\ufb03eld: Industrial Perspectives on the Digital Humanities. Keynote at the Digital Humanities Congress at the University of She\ufb03eld, 6 September 2012. (The text of this keynote lecture can be found at Andrew Prescott\u2019s blog http://digitalriffs.blogspot.co.uk/2012/09/ made-in-sheffield-industrial.html) 8. Anonymous. Duurzame toegang tot digitale onderzoeksgegevens. Strategienota DANS (in Dutch). DANS, The Hague 2010. http://www.dans.knaw.nl/sites/ default/files/file/Uitgaven/Strategie/DANSSTRATEGIENOTAcompleet_DEF. pdf. A summary in English with the title: Sustained access to digital research data can be found at http://www.dans.knaw.nl/sites/default/files/file/ jaarverslagenenstrategienota/Samenvattingstrategienota_UK_DEF.pdf Training in Data Curation 11 9. Ross, S.: Digital Preservation, Archival Science and Methodological Foundations for Digital Libraries. New Review of Information Networking 17(1), 43\u201368 (2012). DOI 10.1080/13614576.2012.679446 10. Anonymous. APARSEN report: D43.1 Survey for the assessment of training material. Assessment of digital curation requirements. Available at: http://www.alliancepermanentaccess.org/wp-content/uploads/downloads/ 2012/12/APARSEN-REP-D43_1-01-4_1.pdf 5 Acknowledgement The following colleagues have been involved in the DataIntelligence4Librarians course. 
Nicole Potters, Marina Noordegraaf, Madeleine de Smaele, Ellen Verbakel (from the 3TU.Datacenter) and Rene van Horik, Caroline van Zuilekom, Marion Wittenberg, Ingrid Dillo (from DANS)."} {"id":"oai:arXiv.org:1310.3370","text":"Talking With Scholars: Developing a Research Environment for Oral History Collections Max Kemman1 , Stef Scagliola1 , Franciska de Jong1,2 , and Roeland Ordelman2,3 arXiv:1310.3370v1 [cs.DL] 12 Oct 2013 1 3 Erasmus University Rotterdam, Rotterdam, The Netherlands {kemman,scagliola}@eshcc.eur.nl 2 University of Twente, Enschede, The Netherlands f.m.g.dejong@utwente.nl Netherlands Institute for Sound and Vision, Hilversum, The Netherlands rordelman@beeldengeluid.nl Abstract. Scholars are yet to make optimal use of Oral History collections. For the uptake of digital research tools in the daily working practice of researchers, practices and conventions commonly adhered to in the subfields in the humanities should be taken into account during development. To this end, in the Oral History Today project a research tool for exploring Oral History collections is developed in close collaboration with scholarly researchers. This paper describes four stages of scholarly research and the first steps undertaken to incorporate requirements of these stages in a digital research environment. Keywords: Oral History, scholarly research, user-centered design, exploration, result presentation, data curation, word cloud, visual facets 1 Introduction The digital turn has profoundly influenced historical culture and has led to a rise in the creation of audio-visual archives with personal narratives, commonly identified as Oral History. For the general public, searching these archives by making use of standard search tools may be sufficient. Yet for scholars, the full value of this type of data cannot be exploited optimally as available tools do not enable scholars to engage with the content for the purposes of research. 
When working with audio-visual content, the availability of annotations is key to the process of digging up interesting fragments. In the past years, a lot of effort has been put in tools for creating manual annotations and generating annotations (semi-)automatically. But to accelerate scholarly research, tools are required that can take available annotations layers as input and provide means for visualization, compression and aggregation of the data. Thus allowing the researcher to explore and process the data, both at fragment-, item- and collection-level. However, to develop such dedicated data exploration tools, technology specialists and researchers in the humanities have to engage in a process of mutual understanding and joint development. Taking carefully into account the specific set of practices and conventions commonly adhered to within the subfields in the humanities is a minimum requirement for the uptake of the technology in the daily working practice of scholars. In this paper we present a research tool developed in close collaboration with scholars that enables searching and exploration of aggregated, heterogeneous Oral History content. 2 Four stages of scholarly research The user interface development is based upon four stages of scholarly research that were defined on the basis of an investigation of use scenarios reported in [1]. Exploration and selection. In the first stage, the focus is on the exploration and selection of one or more content sets within an archive that may be suitable for addressing a certain scholarly issue. The first steps in content exploration by a researcher often come down to searching for material. Research starts with the search for new or additional data. 
This stage can get the form of plain browsing, but it can also be strongly purpose-driven, (e.g., checking details, searching for complementary sources), item-oriented (e.g., finding the first interview with a specific person), or directed towards patterns in a collection, in which case an entire data set is the focus of attention. Exploration and investigation. Once the relevant materials have been identified, the focus in the next stage is mostly on the further exploration of the collected materials, the ordering, comparison (by individual researchers or in joint efforts) and analysis, and the documentation of the interpretation. This exploration stage may generate new ideas and perspectives, requiring new searches and inquiries. Result presentation. After the analysis has been completed, the third stage is the presentation of research results. In the digital realm it has become feasible to link annotations that capture the results of an analytical step to the data on which they are based. Data and annotations can be shared with peers, both during collaboration as well as in publications. Instead of a printed book, one can produce a digital publication with links to audio-visual content. Data curation. The fourth and final stage of the process is the long-term preservation of the data and the results of the investigation that has been carried out. Especially audio-visual materials that have been processed with digital tools are not the kind of research result that can be stored in a cupboard; they should be deposited in a trusted digital repository [2]. Ideally the depositing of material should be in line with emerging standards for Open Data, as this would allow the data and annotations to be reused by scholars with similar interests. For example, links can then be created to other data sets to place the data in a broader context [3]. 
Although the actual curation process itself is out-of-scope in this specific research project, workspaces can provide a form of data curation through the individual collecting of interviews, cutting interesting fragments with a virtual cutter [5] and creating additional manual annotations that can be fed into the existing metadata and thereby enrich the collection even further. 3 Oral History Today research environment Visual search. The Oral History Today research interface is based upon the four stages described above. As the search process for the exploration and selection and exploration and investigation stages is reminiscent of Shneiderman\u2019s Visual Information-Seeking Mantra of overview first, zoom and filter, then detailson-demand [6], we developed a visual search interface to provide overview and zooming facilities, as well as support exploration strategies. Two visualizations were developed to complement the search interface and allow visual searching: word clouds and visual facets. Word clouds provide a textual insight in the material available, with the additional benefit that a better insight is gained in what terminology is used in the collections explored; an issue identified for keyword search interfaces [4]. Visual facets (Figure 1) provide a visual overview of the facets. Facets are shown as graphical bars, where the length of each value represents the number of related search results, as demonstrated previously in Relation Browser++ [7]. A difference with RB++ is that the facet values are stacked into a single bar representing the facet. On mouse-hovering a tooltip is shown with a textual description and the number of corresponding items. When the user selects a facet value, the facet bar is moved to the top to allow the user to keep a history of selected facets. Visual facets not only give a more visual overview of the search results, but also allow for faster interactions with the facets. Evaluation. 
To allow user feedback to be incorporated in the development process, evaluation is undertaken in multiple cycles. To elicit a broad range of responses with regard to usability as well as applicability to research practices, the first cycle was performed with semi-structured interviews. Five scholars were asked to try research subjects of their own interest. The results of this first evaluation are very positive. Concerning the visualisations described above, it was generally agreed that word clouds enable the searcher to acquire an idea of what material is available. However, they did not think word clouds would provide them with keywords to improve their queries. Visual facets were considered interesting and felt as a very fast way to both acquire an overview of the search results as well as refine search results. Fig. 1. Visual Facets Further adjustments. Scholars noted that being able to quickly assess the importance of search results is vital during the exploration and selection stage. To enable fast assessments, we added the ability to expand summary-descriptions in the search results, no longer requiring scholars to open each individual search result. After this assessment, scholars need to be able to save important items. Therefore, we developed workspaces, which allow researchers to save interviews in project-specific sets for later analysis, as well as for referencing in publications as needed in the result presentation stage described above. 4 Conclusion The results of the first evaluation are promising. The positive responses of the scholars indicated that the chosen approach for exploring Oral History data is in the right direction. In the near future, this evaluation will receive a larger followup in the final evaluation of the research interface. After this final evaluation, the tool will be released to the Oral History research community, allowing us to investigate how it will eventually be used in daily research practices. Acknowledgments. 
The work reported in this paper was funded by the EU Project AXES - Access to Audiovisual Archives (FP7-269980) and the Dutch national program CLARIAH (http://www.clariah.nl/). We thank Dispectu (www.dispectu.com) and Spinque (www.spinque.nl) for their collaboration in the research project Oral History Today. References 1. F. de Jong, R. Ordelman, and S. Scagliola. Audio-visual collections and the user needs of scholars in the humanities: a case for co-development. In Proceedings of the 2nd Conference on Supporting Digital Humanities (SDH 2011), page 7, Copenhagen, Denmark, 2011. Centre for Language Technology, Copenhagen. 2. R. Jantz and M. J. Giarlo. Digital preservation: Architecture and technology for trusted digital repositories. Microform & imaging review, 34(3):135\u2013147, 2005. 3. M. Kemman and M. Kleppe. PoliMedia - Improving Analyses of Radio, TV & Newspaper Coverage of Political Debates. In T. Aalberg, M. Dobreva, C. Papatheodorou, G. Tsakonas, and C. Farrugia, editors, TPDL 2013, Valletta, Malta, September 22-26, 2013. Proceedings, Valletta, Malta, 2013. Springer. 4. T. Mann. Will Google\u2019s Keyword Searching Eliminate the Need for LC Cataloging and Classification? Journal of Library Metadata, 8(2):159\u2013168, June 2008. 5. A. Rosendaal and J. Oomen. The Davideon Project: Capitalizing the Possibilities of Streaming Video as Flexible Learning Objects for the Humanities. Innovate: Journal of Online Education, 2(1), 2005. 6. B. Shneiderman. The eyes have it: a task by data type taxonomy for information visualizations. In Proceedings 1996 IEEE Symposium on Visual Languages, pages 336\u2013343. IEEE Comput. Soc. Press, 1996. 7. J. Zhang and G. Marchionini. Evaluation and evolution of a browse and search interface: Relation browser. 
In In: Proceedings of the 2005 national conference on Digital government research, pages 179\u2013188, 2005."} {"id":"oai:arXiv.org:1312.3393","text":"arXiv:1312.3393v2 [cs.LG] 17 Dec 2013 Relative Upper Confidence Bound for the K-Armed Dueling Bandit Problem Masrour Zoghi ISLA, University of Amsterdam, The Netherlands m.zoghi@uva.nl Shimon Whiteson ISLA, University of Amsterdam, The Netherlands s.a.whiteson@uva.nl Remi Munos INRIA Lille - Nord Europe, Villeneuve d\u2019Ascq, France remi.munos@inria.fr Maarten de Rijke ISLA, University of Amsterdam, The Netherlands Abstract This paper proposes a new method for the K-armed dueling bandit problem, a variation on the regular K-armed bandit problem that offers only relative feedback about pairs of arms. Our approach extends the Upper Confidence Bound algorithm to the relative setting by using estimates of the pairwise probabilities to select a promising arm and applying Upper Confidence Bound with the winner as a benchmark. We prove a finite-time regret bound of order O(log t). In addition, our empirical results using real data from an information retrieval application show that it greatly outperforms the state of the art. 1. Introduction In this paper, we propose and analyze a new algorithm, called Relative Upper Confidence Bound (RUCB), for the K-armed dueling bandit problem (Yue et al., 2012), a variation on the K-armed bandit problem, where the feedback comes in the form of pairwise preferences. We assess the performance of this algorithm using one of the main current applications of the K-armed dueling bandit problem, ranker evaluation (Hofmann et al., 2013; Joachims, 2002; Yue & Joachims, 2011), which is used in information retrieval, ad placement and recommender systems, among others. 
derijke@uva.nl The K-armed dueling bandit problem is part of the general framework of preference learning (F\u00fcrnkranz & H\u00fcllermeier, 2010; F\u00fcrnkranz et al., 2012), where the goal is to learn, not from real-valued feedback, but from relative feedback, which specifies only which of two alternatives is preferred. Developing effective preference learning methods is important for dealing with domains in which feedback is naturally qualitative (e.g., because it is provided by a human) and specifying real-valued feedback instead would be arbitrary or inefficient (F\u00fcrnkranz et al., 2012). Other algorithms proposed for this problem are Interleaved Filter (IF) (Yue et al., 2012), Beat the Mean (BTM) (Yue & Joachims, 2011), and SAVAGE (Urvoy et al., 2013). All of these methods were designed for the finite-horizon setting, in which the algorithm requires as input the exploration horizon, T , the time by which the algorithm needs to produce the best arm. The algorithm is then judged based upon either the accuracy of the returned best arm or the regret accumulated in the exploration phase (these terms are formalized in Section 2). All three of these algorithms use the exploration horizon to set their internal parameters, so for each T , there is a separate algorithm IF_T, BTM_T and SAVAGE_T. By contrast, RUCB does not require this input, making it more useful in practice, since a good exploration horizon is often difficult to guess. Nonetheless, RUCB outperforms these algorithms in terms of the accuracy and regret metrics used in the finite-horizon setting. The main idea of RUCB is to maintain optimistic estimates of the probabilities of all possible pairwise
outcomes, and (1) use these estimates to select a potential champion, which is an arm that has a chance of being the best arm, and (2) select an arm to compare to this potential champion by performing regular Upper Confidence Bound (Auer et al., 2002) relative to it. We prove a finite-time high-probability bound of O(log t) on the cumulative regret of RUCB, from which we deduce a bound on the expected cumulative regret. These bounds rely on substantially less restrictive assumptions on the K-armed dueling bandit problem than IF and BTM and have better multiplicative constants than those of SAVAGE. Furthermore, our bounds are the first explicitly non-asymptotic results for the K-armed dueling bandit problem. More importantly, the main distinction of our result is that it holds for all time steps. By contrast, given an exploration horizon T , the results for IF, BTM and SAVAGE bound only the regret accumulated by IF_T, BTM_T and SAVAGE_T in the first T time steps. Finally, we evaluate our method empirically using real data from an information retrieval application. The results show that RUCB can learn quickly and effectively and greatly outperforms BTM and SAVAGE. The main contributions of this paper are as follows: \u2022 A novel algorithm for the K-armed dueling bandit problem that is more broadly applicable than existing algorithms, \u2022 More comprehensive theoretical results that make less restrictive assumptions than those of IF and BTM, have better multiplicative constants than the results of SAVAGE, and apply to all time steps, and \u2022 Experimental results, based on a real-world application, demonstrating the superior performance of our algorithm compared to existing methods. 2. Problem Setting The K-armed dueling bandit problem (Yue et al., 2012) is a modification of the K-armed bandit problem (Auer et al., 2002): the latter considers K arms {a1 , . . . 
, aK } and at each time-step, an arm ai can be pulled, generating a reward drawn from an unknown stationary distribution with expected value \u00b5i . The K-armed dueling bandit problem is a variation, where instead of pulling a single arm, we choose a pair (ai , aj ) and receive one of the two as the better choice, with the probability of ai being picked equal to a constant pij and that of aj equal to pji = 1 \u2212 pij . We define the preference matrix P = [pij ], whose ij entry is pij . In this paper, we assume that there exists a Condorcet winner (Urvoy et al., 2013): an arm, which without loss of generality we label a1 , such that $p_{1i} > \\frac{1}{2}$ for all i > 1. Given a Condorcet winner, we define regret for each time-step as follows (Yue et al., 2012): if arms ai and aj were chosen for comparison at time t, then regret at that time is set to be $r_t := \\frac{\\Delta_i + \\Delta_j}{2}$, with $\\Delta_k := p_{1k} - \\frac{1}{2}$ for all k \u2208 {1, . . . , K}. Thus, regret measures the average advantage that the Condorcet winner has over the two arms being compared against each other. Given our assumption on the probabilities p1k , this implies that $r_t = 0$ if and only if the best arm is compared against itself. We define cumulative regret up to time T to be $R_T = \\sum_{t=1}^{T} r_t$. The Condorcet winner is different in a subtle but important way from the Borda winner (Urvoy et al., 2013), which is an arm ab that satisfies $\\sum_j p_{bj} \\ge \\sum_j p_{ij}$, for all i = 1, . . . , K. In other words, when averaged across all other arms, the Borda winner is the arm with the highest probability of winning a given comparison. In the K-armed dueling bandit problem, the Condorcet winner is sought rather than the Borda winner, for two reasons. First, in many applications, including the ranker evaluation problem addressed in our experiments, the eventual goal is to adapt to the preferences of the users of the system. 
Given a choice between the Borda and Condorcet winners, those users prefer the latter in a direct comparison, so it is immaterial how these two arms fare against the others. Second, in settings where the Borda winner is more appropriate, no special methods are required: one can simply solve the K-armed bandit algorithm with arms {a1 , . . . , aK }, where pulling ai means choosing an index j \u2208 {1, . . . , K} randomly and comparing ai against aj . Thus, research on the K-armed dueling bandit problem focuses on finding the Condorcet winner, for which special methods are required to avoid mistakenly choosing the Borda winner. The goal of a bandit algorithm can be formalized in several ways. In this paper, we consider two standard settings: 1. The finite-horizon setting: In this setting, the algorithm is told in advance the exploration horizon, T , i.e., the number of time-steps that the evaluation process is given to explore before it has to produce a single arm as the best, which will be exploited thenceforth. In this setting, the algorithm can be assessed on its accuracy, the probability that a given run of the algorithm reports the Condorcet winner as the best arm (Urvoy et al., 2013), which is related to expected simple regret: the regret associated with the algorithm\u2019s choice of the best arm, i.e., rT +1 (Bubeck et al., 2009). Another measure of success in this setting is the amount of regret accumulated during the exploration phase, as for- Relative Upper Confidence Bound mulated by the explore-then-exploit problem formulation (Yue et al., 2012). 2. The horizonless setting: In this setting, no horizon is specified and the evaluation process continues indefinitely. Thus, it is no longer sufficient for the algorithm to maximize accuracy or minimize regret after a single horizon is reached. 
Instead, it must minimize regret across all horizons by rapidly decreasing the frequency of comparisons involving suboptimal arms, particularly those that fare worse in comparison to the best arm. This goal can be formulated as minimizing the cumulative regret over time, rather than with respect to a fixed horizon (Lai & Robbins, 1985). As we describe in Section 3, all existing K-armed dueling bandit methods target the finite-horizon setting. However, we argue that the horizonless setting is more relevant in practice for the following reason: finitehorizon methods require a horizon as input and often behave differently for different horizons. This poses a practical problem because it is typically difficult to know in advance how many comparisons are required to determine the best arm with confidence and thus how to set the horizon. If the horizon is set too long, the algorithm is too exploratory, increasing the number of evaluations needed to find the best arm. If it is set too short, the best arm remains unknown when the horizon is reached and the algorithm must be restarted with a longer horizon. Moreover, any algorithm that can deal with the horizonless setting can easily be modified to address the finite-horizon setting by simply stopping the algorithm when it reaches the horizon and returning the best arm. By contrast, for the reverse direction, one would have to resort to the \u201cdoubling trick\u201d (Cesa-Bianchi & Lugosi, 2006, Section 2.3), which leads to substantially worse regret results: this is because all of the upper bounds proven for methods addressing the finitehorizon setting so far are in O(log T ) and applying the doubling trick to such results would lead to regret bounds of order (log T )2 , with the extra log factor coming from the number of partitions. To the best of our knowledge, RUCB is the first Karmed dueling bandit algorithm that can function in the horizonless setting without resorting to the doubling trick. 
We show in Section 4 how it can be adapted to the finite-horizon setting. 3. Related Work In this section, we briefly survey existing methods for the K-armed dueling bandit problem. The first method for the K-armed dueling bandit problem is interleaved filter (IF) (Yue et al., 2012), which was designed for a finite-horizon scenario and which proceeds by picking a reference arm to compare against the rest and using it to eliminate other arms, until the reference arm is eliminated by a better arm, in which case the latter becomes the reference arm and the algorithm continues as before. The algorithm terminates either when all other arms are eliminated or if the exploration horizon T is reached. More recently, the beat the mean (BTM) algorithm has been shown to outperform IF (Yue & Joachims, 2011), while imposing less restrictive assumptions on the K-armed dueling bandit problem. BTM focuses exploration on the arms that have been involved in the fewest comparisons. When it determines that an arm fares on average too poorly in comparison to the remaining arms, it removes it from consideration. More precisely, BTM considers the performance of each arm against the mean arm by averaging the arm\u2019s scores against all other arms and uses these estimates to decide which arm should be eliminated. Both IF and BTM require the comparison probabilities pij to satisfy certain conditions that are difficult to verify without specific knowledge about the dueling bandit problem at hand and, moreover, are often violated in practice (see the supplementary material for a more thorough discussion and analysis of these assumptions). Under these conditions, theoretical results have been proven for IF and BTM in (Yue et al., 2012) and (Yue & Joachims, 2011). 
More precisely, both algorithms take the exploration horizon T as an input and so for each T , there are algorithms IF_T and BTM_T; the results then state the following: for large T , in the case of IF_T, we have the expected regret bound $\\mathbb{E}\\left[R_T^{\\mathrm{IF}_T}\\right] \\le C \\frac{K \\log T}{\\min_{j=2}^{K} \\Delta_j}$, and, in the case of BTM_T, the high probability regret bound $R_T^{\\mathrm{BTM}_T} \\le C' \\frac{\\gamma^7 K \\log T}{\\min_{j=2}^{K} \\Delta_j}$ with high probability, where arm a1 is assumed to be the best arm, and we define $\\Delta_j := p_{1j} - \\frac{1}{2}$, and C and C' are constants independent of the specific dueling bandit problem. The first bound matches a lower bound proven in (Yue et al., 2012, Theorem 4). However, as pointed out in (Yue & Joachims, 2011), this result holds for a very restrictive class of K-armed dueling bandit problems. In an attempt to remedy this issue, the second bound was proven for BTM, which includes a relaxation parameter \u03b3 that allows for a broader class of problems, as discussed in the supplementary material. The difficulty with this result is that the parameter \u03b3, which depends on the probabilities pij and must be passed to the algorithm, can be very large. Since it is raised to the power of 7, this makes the bound very loose. For instance, in the three-ranker evaluation experiments discussed in Section 6, the values for \u03b3 are 4.85, 11.6 and 47.3 for the 16-, 32- and 64-armed examples. In contrast to the above limitations and loosenesses, in Section 5 we provide explicit bounds on the regret accumulated by RUCB that do not depend on \u03b3 and require only the existence of a Condorcet winner for their validity, which makes them much more broadly applicable. Sensitivity Analysis of VAriables for Generic Exploration (SAVAGE) (Urvoy et al., 2013) is a recently proposed algorithm that outperforms both IF and BTM by a wide margin when the number of arms is of moderate size. 
Moreover, one version of SAVAGE, called Condorcet SAVAGE, makes the Condorcet assumption and performed the best experimentally (Urvoy et al., 2013). Condorcet SAVAGE compares pairs of arms uniformly randomly until there exists a pair for which one of the arms beats another by a wide margin, in which case the loser is removed from the pool of arms under consideration. We show in this paper that our proposed algorithm for ranker evaluation substantially outperforms Condorcet SAVAGE. The theoretical result proven for Condorcet SAVAGE has the following form (Urvoy et al., 2013, Theorem 3). First, let us assume that a1 is the Condorcet winner and let $T_{\\mathrm{CSAVAGE}_T}$ denote the number of iterations the Condorcet SAVAGE algorithm with exploration horizon T requires before terminating and returning the best arm; then, given \u03b4 > 0, with probability 1 \u2212 \u03b4, we have for large T $T_{\\mathrm{CSAVAGE}_T} \\le C \\sum_{j=1}^{K-1} \\frac{j \\cdot \\log\\left(\\frac{KT}{\\delta}\\right)}{\\Delta_{j+1}^2}$, with the indices j arranged such that $\\Delta_2 \\le \\cdots \\le \\Delta_K$ and $\\Delta_j = p_{1j} - \\frac{1}{2}$ as before, and C a problem independent constant. This bound is very similar in spirit to our high probability result, with the important distinction that, unlike the above bound, the multiplicative factors in our result (i.e., the Dij in Theorem 2 below) do not depend on \u03b4. Moreover, in (Urvoy et al., 2013, Appendix B.1), the authors show that for large T we have the following expected regret bound: $\\mathbb{E}\\left[R_T^{\\mathrm{CSAVAGE}_T}\\right] \\le C \\sum_{j=2}^{K} \\frac{j \\cdot \\log(KT^2)}{\\Delta_j^2} + 1$. This is similar to our expected regret bound in Theorem 3, although for difficult problems where the \u2206j are small, Theorem 3 yields a tighter bound due to the presence of the \u2206j in the numerator of the second summand. An important advantage that our result has over the results reviewed here is an explicit expression for the additive constant, which was left out of the analyses of IF, BTM and SAVAGE. 
Finally, note that all of the above results bound only RT , where T is the predetermined exploration horizon, since IF, BTM and SAVAGE were designed for the finite-horizon setting. By contrast, in Section 5, we bound the cumulative regret of each version of our algorithm for all time steps. 4. Method We now introduce Relative Upper Confidence Bound (RUCB), which is applicable to any K-armed dueling bandit problem with a Condorcet winner. Algorithm 1 Relative Upper Confidence Bound Input: \u03b1 > 1/2, T \u2208 {1, 2, . . .} \u222a {\u221e} 1: $W = [w_{ij}] \\leftarrow 0_{K \\times K}$ // 2D array of wins: wij is the number of times ai beat aj 2: for t = 1, . . . , T do 3: $U := [u_{ij}] = \\frac{W}{W + W^{T}} + \\sqrt{\\frac{\\alpha \\ln t}{W + W^{T}}}$ // All operations are element-wise; $\\frac{x}{0} := 1$ for any x. 4: uii \u2190 1/2 for each i = 1, . . . , K. 5: Pick any c satisfying ucj \u2265 1/2 for all j. If no such c, pick c randomly from {1, . . . , K}. 6: d \u2190 arg maxj ujc 7: Compare arms ac and ad and increment wcd or wdc depending on which arm wins. 8: end for Return: An arm ac that beats the most arms, i.e., c with the largest count $\\#\\left\\{ j \\mid \\frac{w_{cj}}{w_{cj} + w_{jc}} > \\frac{1}{2} \\right\\}$. In each time-step, RUCB, shown in Algorithm 1, goes through the following three stages: (1) RUCB puts all arms in a pool of potential champions. Then, it compares each arm ai against all other arms optimistically: for all i \u2260 j, we compute the upper bound uij (t) = \u00b5ij (t) + cij (t), where \u00b5ij (t) is the frequentist estimate of pij at time t and cij (t) is an optimism bonus that increases with t and decreases with the number of comparisons between i and j (Line 3). If we have uij < 1/2 for any j, then ai is removed from the pool. Next, a champion arm ac is chosen randomly from the remaining potential champions (Line 5). (2) Regular UCB is performed using ac as a benchmark (Line 6), i.e., UCB is performed on the set of arms a1c . . . aKc . Specifically, we select the arm d = arg maxj ujc . 
When c = j, ujc is defined as above. When c = j, since pcc = 21 , we set ucc = 12 (Line 4). pij \u00b5ij n+2 \u00b7\u00b7\u00b7 \u00b5ij n+1 Eventually, as more comparisons are conducted, the estimates \u00b51j tend to concentrate above 12 and the optimism bonuses c1j (t) will become small. Thus, both stages of the algorithm will increasingly select a1 , i.e., ac = ad = a1 . Since comparing a1 to itself is optimal, rt declines over time. Note that Algorithm 1 is a finite-horizon algorithm if T < \u221e and a horizonless one if T = \u221e, in which case the for loop never terminates. 5. Theoretical Results In this section, we prove finite-time high-probability and expected regret bounds for RUCB. We first state Lemma 1 and use it to prove a high-probability bound in Theorem 2, from which we deduce an expected regret bound in Theorem 3. To simplify notation, we assume without loss of generality that a1 is the optimal arm in the following. Moreover, given any K-armed dueling bandit algorithm, we define wij (t) to be the number of times arm ai has beaten aj in the first t iterations of the algorithm. We also define uij (t) := wij (t) wij (t)+wji (t) + \u03b1 ln t wij (t)+wji (t) , for any given \u03b1 > 0, and set lij (t) := 1\u2212 uji (t). Moreover, for any \u03b4 > 0, define C(\u03b4) := (4\u03b1\u22121)K 2 (2\u03b1\u22121)\u03b4 1 2\u03b1\u22121 . Lemma 1. Let P := [pij ] be the preference matrix of a K-armed dueling bandit problem with arms {a1 , . . . , aK }, satisfying p1j > 12 for all j > 1 (i.e. a1 is the Condorcet winner). Then, for any dueling \u00b7\u00b7\u00b7 \u00b5ij n pij \u00b5ij (t) (3) The pair (ac , ad ) are compared and the score sheet is updated as appropriate (Line 7). Note that in stage (1) the comparisons are based on ucj , i.e., ac is compared optimistically to the other arms, making it easier for it to become the champion. 
By contrast, in stage (2) the comparisons are based on ujc , i.e., ac is compared to the other arms pessimistically, making it more difficult for ac to be compared against itself. This is important because comparing an arm against itself yields no information. Thus, RUCB strives to avoid auto-comparisons until there is great certainty that ac is indeed the Condorcet winner. \u00b7\u00b7\u00b7 Confidence intervals [lij (t), uij (t)] T \u03c4nij ij \u03c4n+1 Figure 1. An illustration of the idea behind Lemma 1 using an example of how the confidence intervals of a single pair of arms (ai , aj ), and their relation to the comparison ij probability pij , might evolve over time. The time-step \u03c4m th denotes the m time when the arms ai and aj were chosen by RUCB to be compared against each other. We also ij define \u00b5ij m := \u00b5ij (\u03c4m ). The time T is when the confidence intervals begin to include pij . The lemma then states that with probability 1 \u2212 \u03b4, we have T \u2264 C(\u03b4). bandit algorithm and any \u03b1 > 1 2 and \u03b4 > 0, we have P \u2200 t > C(\u03b4), i, j, pij \u2208 [lij (t), uij (t)] > 1 \u2212 \u03b4. Proof. See the supplementary material. The idea behind this lemma is depicted in Figure 1, which illustrates the two phenomena that make it possible: first, as long as arms ai and aj are not compared against each other, \u221a the interval [lij (t), uij (t)] will grow in length as log t, hence approaching pij ; second, as the number of comparisons between ai and aj increases, the estimated means \u00b5ij approach pij , hence increasing the probability that the interval [lij (t), uij (t)] will contain pij . Let us now turn to our high probability bound: Theorem 2. Given a preference matrix P = [pij ] and \u03b4 > 0 and \u03b1 > 21 , define C(\u03b4) := 4\u03b1 Dij := min{\u2206 2 ,\u22062 } for each i, j = i j where \u2206i := 12 \u2212 pi1 , and set Dii (4\u03b1\u22121)K 2 (2\u03b1\u22121)\u03b4 1 2\u03b1\u22121 and 1, . . . 
, K with i = j, = 0 for all i. Then, if we apply Algorithm 1 to the K-armed dueling bandit problem defined by P, given any pair (i, j) = (1, 1), the number of comparisons between arms ai and aj performed up to time t, denoted by Nij (t), satisfies P \u2200 t, Nij (t) \u2264 max C(\u03b4), Dij ln t > 1 \u2212 \u03b4. (1) Moreover, we have the following high probability bound for the regret accrued by the algorithm: P \u2200 t, Rt \u2264 C(\u03b4)\u2206\u2217 + Dij \u2206ij ln t i>j > 1 \u2212 \u03b4, (2) Relative Upper Confidence Bound \u2206 +\u2206 a1 where \u2206\u2217 := maxi \u2206i and \u2206ij := i 2 j , while Rt is the cumulative regret as defined in Section 2. Proof. Given Lemma 1, we know with probability 1\u2212\u03b4 that pij \u2208 [lij (t), uij (t)] for all t > C(\u03b4). Let us first deal with the easy case when i = j = 1: when t > C(\u03b4) holds, ai cannot be played against itself, since if we get c = i in Algorithm 1, then by Lemma 1 and the fact that a1 is the Condorcet winner we have a1 1 2 ai 1 2 pi1 1 uii (t) = < p1i \u2264 u1i (t), 2 ai aj \u2206i and so d = i. Now, let us assume that distinct arms ai and aj have been compared against each other more than Dij ln t times and that t > C(\u03b4). If s is the last time ai and aj were compared against each other, we must have uij (s) \u2212 lij (s) = 2 \u22642 \u03b1 ln s Nij (t) \u03b1 ln t <2 Nij (t) (3) \u03b1 ln t 4\u03b1 ln t min{\u22062i ,\u22062j } = min{\u2206i , \u2206j }. On the other hand, for ai to have been compared against aj at time s, one of the following two scenarios must have happened: I. In Algorithm 1, we had c = i and d = j, in which case both of the following inequalities must hold: a. uij (s) \u2265 12 , since otherwise c could not have been set to i by Line 5 of Algorithm 1, and b. 
lij (s) = 1 \u2212 uji (s) \u2264 1 \u2212 p1i = pi1 , since we know that p1j \u2264 u1i (t), by Lemma 1 and the fact that t > C(\u03b4), and for d = j to be satisfied, we must have u1i (t) \u2264 uji (t) by Line 6 of Algorithm 1. From these two inequalities, we can conclude uij (s) \u2212 lij (s) \u2265 1 \u2212 pi1 = \u2206i . 2 (4) This inequality is illustrated using the lower right confidence interval in the (ai , aj ) block of Figure 2, where the interval shows [lij (s), uij (s)] and the distance between the dotted lines is 12 \u2212 pi1 . II. In Algorithm 1, we had c = j and d = i, in which case swapping i and j in the above argument gives uji (s) \u2212 lji (s) \u2265 1 \u2212 pj1 = \u2206j . 2 (5) Similarly, this is illustrated using the lower left confidence interval in the (aj , ai ) block of Figure 2, where the interval shows [lji (s), uji (s)] and the distance between the dotted lines is 12 \u2212 pj1 . aj 1 2 pj1 \u2206j Figure 2. An illustration of the proof of Theorem 2. The figure shows an example of the internal state of RUCB at time s. The height of the dot in the block in row am and column an represents the comparisons probability pmn , while the interval, where present, represents the confidence interval [lmn , umn ]: we have only included them in the (ai , aj ) and the (aj , ai ) blocks of the figure because those are the ones that are discussed in the proof. Moreover, in those blocks, we have included the outcomes of two different runs: one drawn to the left of the dots representing pij and pji , and the other to the right (the horizontal axis in these plots has no other significance). These two outcomes are included to address the dichotomy present in the proof. Note that for a given run, we must have [lji (s), uji (s)] = [1 \u2212 uij (s), 1 \u2212 lij (s)] for any time s, hence the symmetry present in this figure. 
Putting (4) and (5) together with (3) yields a contradiction, so with probability 1 \u2212 \u03b4 we cannot have Nij be larger than both C(\u03b4) and Dij ln t. This gives us (1), from which (2) follows by allowing for the largest regret, \u2206\u2217 , to occur in each of the first C(\u03b4) steps of the algorithm and adding the regret accrued by Dij ln t comparisons between ai and aj . Next, we prove our expected regret bound: Theorem 3. Given \u03b1 > 1, the expected regret accumulated by RUCB after t iterations is bounded by 1 (4\u03b1 \u2212 1)K 2 2\u03b1\u22121 2\u03b1 \u2212 1 E[Rt ] \u2264 \u2206 2\u03b1 \u2212 1 2\u03b1 \u2212 2 \u2206i + \u2206 j + 2\u03b1 ln t. min{\u22062i , \u22062j } i>j \u2217 (6) Proof. We can obtain the bound in (6) from (2) by integrating with respect to \u03b4 from 0 to 1. This is because given any one-dimensional random variable X with Relative Upper Confidence Bound 1 \u22121 CDF FX , we can use the identity E[X] = 0 FX (q)dq. In our case, X = Rt for a fixed time t and, as illustrated in Figure 3, we can deduce from (2) that FRt (r) > Ht\u22121 (r), which gives the bound FR\u22121 (q) < Ht (q) = C(1 \u2212 q)\u2206\u2217 + t Dij \u2206ij ln t. i>j 6. Experiments Now, assume that \u03b1 > 1. To derive (6) from the above inequality, we need to integrate the righthand side, and since it is only the first term in the summand that depends on q, that is all we need to integrate. To do (4\u03b1\u22121)K 2 (2\u03b1\u22121)\u03b4 so, recall that C(\u03b4) := 1 2\u03b1\u22121 , so to simplify 1 2\u03b1\u22121 2 notation, we define L := (4\u03b1\u22121)K . 
Now, we 2\u03b1\u22121 can carry out the integration as follows, beginning by using the substitution 1 \u2212 q = \u03b4, dq = \u2212d\u03b4: 1 0 q=0 C(1 \u2212 q)dq = 1 = 0 \u03b4=1 (4\u03b1 \u2212 1)K 2 (2\u03b1 \u2212 1)\u03b4 1 1\u2212 2\u03b1\u22121 \u03b4 =L 1\u2212 1 2\u03b1\u22121 1 = 0 \u2212C(\u03b4)d\u03b4 1 2\u03b1\u22121 1 1 \u03b4 \u2212 2\u03b1\u22121 d\u03b4 d\u03b4 = L 0 (4\u03b1 \u2212 1)K 2\u03b1 \u2212 1 2 1 2\u03b1\u22121 2\u03b1 \u2212 1 . 2\u03b1 \u2212 2 q0 FRt (r) 0 q FRt (r0) 1 Remark 4. Note that RUCB uses the upperconfidence bounds (Line 3 of Algorithm 1) introduced in the original version of UCB (Auer et al., 2002) (up to the \u03b1 factor). Recently refined upper-confidence bounds (such as UCB-V (Audibert et al., 2009) or KL-UCB (Capp\u00b4e et al., 2013)) have improved performance for the regular K-armed bandit problem. However, in our setting the arm distributions are Bernoulli and the comparison value is 1/2. Thus, since we have 2\u22062i \u2264 kl(p1,i , 1/2) \u2264 4\u22062i (where kl(a, b) = Ht\u22121(r), the inverse function of Ht(q) := C(1 \u2212 q)\u2206\u2217 + i>j Dij \u2206ij ln t 0 r0 = Ht(q0) a log ab + (1 \u2212 a) log 1\u2212a 1\u2212b is the KL divergence between Bernoulli distributions with parameters a and b), we deduce that using KL-UCB instead of UCB does not improve the leading constant in the logarithmic term of the regret by a numerical factor of more than 2. t r Figure 3. A schematic graph illustrating the proof of Theorem 3. Note that the expression for Ht (q) is extracted from (2), which also implies that Ht\u22121 is necessarily below FRt : formulated in terms of CDFs, (2) states that FRt (Ht (q0 )) > q0 = Ht\u22121 (Ht (q0 )), where q0 = 1 \u2212 \u03b40 is a quantile. From this, we can conclude that FRt (r) > Ht\u22121 (r) for all r. To evaluate RUCB, we apply it to the problem of ranker evaluation from the field of information retrieval (IR) (Manning et al., 2008). 
A ranker is a function that takes as input a user\u2019s search query and ranks the documents in a collection according to their relevance to that query. Ranker evaluation aims to determine which among a set of rankers performs best. One effective way to achieve this is to use interleaved comparisons (Radlinski et al., 2008), which interleave the documents proposed by two different rankers and presents the resulting list to the user, whose resulting click feedback is used to infer a noisy preference for one of the rankers. Given a set of K rankers, the problem of finding the best ranker can then be modeled as a K-armed dueling bandit problem, with each arm corresponding to a ranker. Our experimental setup is built on real IR data, namely the LETOR NP2004 dataset (Liu et al., 2007). Using this data set, we create a set of 64 rankers, each corresponding to a ranking feature provided in the data set, e.g., PageRank. The ranker evaluation task thus corresponds to determining which single feature constitutes the best ranker (Hofmann et al., 2013). To compare a pair of rankers, we use probabilistic interleave (PI) (Hofmann et al., 2011), a recently developed method for interleaved comparisons. To model the user\u2019s click behavior on the resulting interleaved lists, we employ a probabilistic user model (Craswell et al., 2008; Hofmann et al., 2011) that uses as input the manual labels (classifying documents as relevant or not for given queries) provided with the LETOR NP2004 dataset. Queries are sampled randomly and clicks are generated probabilistically by conditioning on these assessments in a way that resembles the behavior of an actual user (Guo et al., 2009a;b). Following (Yue & Joachims, 2011), we first used the above approach to estimate the comparison probabilities pij for each pair of rankers and then used these probabilities to simulate comparisons between rankers. 
More specifically, we estimated the full preference matrix by performing 4000 interleaved comparisons on each pair of the 64 feature rankers included in the LETOR dataset. We evaluated RUCB, Condorcet SAVAGE and BTM Relative Upper Confidence Bound LETOR NP2004 Dataset with 16 rankers RUCB \u03b1 = 0.51 Condorcet SAVAGE BTM 103 105 time 102 3 10 106 LETOR NP2004 Dataset with 16 rankers 1.0 0.6 0.4 0.2 104 105 time 106 104 105 time 102 3 10 106 LETOR NP2004 Dataset with 32 rankers 0.8 0.6 0.4 0.2 0.0 3 10 104 103 1.0 best ranker rate (accuracy) 104 0.8 0.0 3 10 104 103 best ranker rate (accuracy) best ranker rate (accuracy) 1.0 105 cumulative regret 104 102 3 10 LETOR NP2004 Dataset with 64 rankers 105 cumulative regret cumulative regret 105 LETOR NP2004 Dataset with 32 rankers 104 105 time 106 104 105 time 106 LETOR NP2004 Dataset with 64 rankers 0.8 0.6 0.4 0.2 0.0 3 10 104 105 time 106 Figure 4. Average cumulative regret and accuracy for 100 runs of BTM, Condorcet SAVAGE and RUCB with \u03b1 = 0.51 applied to three K-armed dueling bandit problems with K = 16, 32, 64. In the top row of plots, both axes use log scales, and the dotted curves signify best and worst regret performances; in the bottom plots, only the x-axis uses a log scale. using randomly chosen subsets from the pool of 64 rankers, yielding K-armed dueling bandit problems with K \u2208 {16, 32, 64}. For each set of rankers, we performed 100 independent runs of each algorithm for a maximum of 4.5 million iterations. For RUCB we set \u03b1 = 0.51, which approaches the limit of our high-probability theoretical results, i.e., \u03b1 > 0.5 as in Theorem 2. We did not include an evaluation of IF, since both BTM and Condorcet SAVAGE were shown to outperform it (Urvoy et al., 2013; Yue & Joachims, 2011). Since BTM and SAVAGE require the exploration horizon as input, we ran BTM$_T$ and CSAVAGE$_T$ for various horizons T ranging from 1000 to 4.5 million.
In the top row of plots in Figure 4, the markers on the green and the blue curves show the regret accumulated by BTM$_T$ and CSAVAGE$_T$ in the first T iterations of the algorithm for each of these horizons. Thus, each marker corresponds, not to the continuation of the runs that produced the previous marker, but to new runs conducted with a larger T. Since RUCB is horizonless, we ran it for 4.5 million iterations and plotted the cumulative regret, as shown using the red curves in the same plots. In the case of all three algorithms, the solid line shows the expected cumulative regret averaged across all 100 runs and the dotted lines show the minimum and the maximum cumulative regret that was observed across runs. Note that these plots are in log-log scale. The bottom plots in Figure 4 show the accuracy of all three algorithms across 100 runs, computed at the same times as the exploration horizons used for BTM and SAVAGE in the regret plots. Note that these plots are in lin-log scale. These results clearly demonstrate that RUCB identifies the best arm more quickly, since it asymptotically accumulates 5 to 10 times less regret than Condorcet SAVAGE, while reaching higher levels of accuracy in roughly 20% of the time taken by Condorcet SAVAGE, all without knowing the horizon T. The contrast is even more stark when comparing to BTM. 7. Conclusions This paper proposed a new method called Relative Upper Confidence Bound (RUCB) for the K-armed dueling bandit problem that extends the Upper Confidence Bound (UCB) algorithm to the relative setting by using optimistic estimates of the pairwise probabilities to choose a potential champion and conducting regular UCB with the champion as the benchmark. We proved finite-time high-probability and expected regret bounds of order O(log t) for our algorithm and evaluated it empirically in an information retrieval application.
Unlike existing results, our regret bounds hold for all time steps, rather than just a specific horizon T input to the algorithm. Furthermore, they rely on less restrictive assumptions or have better multiplicative constants than existing methods. Finally, the empirical results showed that RUCB greatly outperforms state-of-the-art methods. In future work, we will consider two extensions to this research. First, building off extensions of UCB to the continuous bandit setting (Bubeck et al., 2011; de Freitas et al., 2012; Munos, 2011; Srinivas et al., 2010; Valko et al., 2013), we aim to extend RUCB to the continuous dueling bandit setting, without a convexity assumption as in (Yue & Joachims, 2009). Second, building off Thompson Sampling (Agrawal & Goyal, 2012; Kauffmann et al., 2012; Thompson, 1933), an elegant and effective sampling-based alternative to UCB, we will investigate whether a sampling-based extension to RUCB would be amenable to theoretical analysis. Both these extensions involve overcoming not only the technical difficulties present in the regular bandit setting, but also those that arise from the two-stage nature of RUCB. Acknowledgments This research was partially supported by the European Community\u2019s Seventh Framework Programme (FP7/2007-2013) under grant agreement nr 288024 (LiMoSINe project), the Netherlands Organisation for Scientific Research (NWO) under project nrs 640.004.802, 727.011.005, 612.001.116, HOR-11-10, the Center for Creation, Content and Technology (CCCT), the QuaMerdes project funded by the CLARIN-nl program, the TROVe project funded by the CLARIAH program, the Dutch national program COMMIT, the ESF Research Network Program ELIAS, the Elite Network Shifts project funded by the Royal Dutch Academy of Sciences (KNAW), the Netherlands eScience Center under project number 027.012.105 and the Yahoo! Faculty Research and Engagement Program. 
Relative Upper Confidence Bound References pairwise online learning to rank for information retrieval. Information Retrieval, 16(1):63\u201390, 2013. Agrawal, S. and Goyal, N. Analysis of Thompson sampling for the multi-armed bandit problem. In Conference on Learning Theory, pp. 1\u201326, 2012. Joachims, T. Optimizing search engines using clickthrough data. In KDD \u201902, pp. 133\u2013142, 2002. Audibert, J.-Y., Munos, R., and Szepesv\u00e1ri, C. Exploration-exploitation tradeoff using variance estimates in multi-armed bandits. Theor. Comput. Sci., 410(19):1876\u20131902, 2009. Kauffmann, E., Korda, N., and Munos, R. Thompson sampling: an asymptotically optimal finite time analysis. In International Conference on Algorithmic Learning Theory, 2012. Auer, P., Cesa-Bianchi, N., and Fischer, P. Finite-time analysis of the multiarmed bandit problem. Machine Learning, 47(2-3):235\u2013256, 2002. Lai, T. L. and Robbins, H. Asymptotically efficient adaptive allocation rules. Advances in Applied Mathematics, 6(1):4\u201322, 1985. Bubeck, S., Munos, R., and Stoltz, G. Pure exploration in multi-armed bandits problems. In Algorithmic Learning Theory, 2009. Liu, T.-Y., Xu, J., Qin, T., Xiong, W., and Li, H. Letor: Benchmark dataset for research on learning to rank for information retrieval. In LR4IR \u201907, in conjunction with SIGIR \u201907, 2007. Bubeck, S., Munos, R., Stoltz, G., and Szepesv\u00e1ri, C. X-armed bandits. Journal of Machine Learning Research, 12:1655\u20131695, 2011. Manning, C., Raghavan, P., and Sch\u00fctze, H. Introduction to Information Retrieval. Cambridge University Press, 2008. Capp\u00e9, O., Garivier, A., Maillard, O.-A., Munos, R., and Stoltz, G. Kullback-Leibler upper confidence bounds for optimal sequential allocation. Annals of Statistics, 41(3):1516\u20131541, 2013. Munos, R. Optimistic optimization of a deterministic function without the knowledge of its smoothness. In NIPS, 2011. Cesa-Bianchi, N. and Lugosi, G.
Prediction, Learning, and Games. Cambridge University Press, New York, 2006. Radlinski, F., Kurup, M., and Joachims, T. How does clickthrough data reflect retrieval quality? In CIKM \u201908, pp. 43\u201352, 2008. Craswell, N., Zoeter, O., Taylor, M., and Ramsey, B. An experimental comparison of click position-bias models. In WSDM \u201908, pp. 87\u201394, 2008. Srinivas, N., Krause, A., Kakade, S. M., and Seeger, M. Gaussian process optimization in the bandit setting: No regret and experimental design. In ICML, 2010. de Freitas, N., Smola, A., and Zoghi, M. Exponential regret bounds for Gaussian process bandits with deterministic observations. In ICML, 2012. F\u00fcrnkranz, J. and H\u00fcllermeier, E. (eds.). Preference Learning. Springer-Verlag, 2010. F\u00fcrnkranz, J., H\u00fcllermeier, E., Cheng, W., and Park, S.H. Towards preference-based reinforcement learning. Machine Learning, 2012. Guo, F., Li, L., and Faloutsos, C. Tailoring click models to user goals. In WSCD \u201909, pp. 88\u201392, 2009a. Guo, F., Liu, C., and Wang, Y. Efficient multiple-click models in web search. In WSDM \u201909, pp. 124\u2013131, New York, NY, USA, 2009b. ACM. Hofmann, K., Whiteson, S., and de Rijke, M. A probabilistic method for inferring preferences from clicks. In CIKM \u201911, pp. 249\u2013258, USA, 2011. ACM. Hofmann, K., Whiteson, S., and de Rijke, M. Balancing exploration and exploitation in listwise and Thompson, W.R. On the likelihood that one unknown probability exceeds another in view of the evidence of two samples. Biometrika, pp. 285\u2013294, 1933. Urvoy, T., Clerot, F., F\u00e9raud, R., and Naamane, S. Generic exploration and k-armed voting bandits. In ICML, 2013. Valko, M., Carpentier, A., and Munos, R. Stochastic simultaneous optimistic optimization. In ICML, 2013. Yue, Y. and Joachims, T. Interactively optimizing information retrieval systems as a dueling bandits problem. In ICML, 2009. Yue, Y. and Joachims, T. Beat the mean bandit. In ICML, 2011.
Yue, Y., Broder, J., Kleinberg, R., and Joachims, T. The K-armed dueling bandits problem. Journal of Computer and System Sciences, 78(5):1538\u20131556, September 2012. Relative Upper Confidence Bound 8. Appendix Here we provide some details that were alluded to in the main body of the paper. 8.1. The Condorcet Assumption As mentioned in Section 3, IF and BTM require the comparison probabilities pij to satisfy certain difficult-to-verify conditions. Specifically, IF and BTM require a total ordering {a1 , . . . , aK } of the arms to exist such that pij > 1/2 for all i < j. Here we provide evidence that this assumption is often violated in practice. By contrast, the algorithm we propose in Section 4 makes only the Condorcet assumption, which is implied by the total ordering assumption of IF and BTM. In order to test how stringent an assumption the existence of a Condorcet winner is compared to the total ordering assumption, we estimated the probability of each assumption holding in our ranker evaluation application. Using the same preference matrix as in our experiments in Section 6, we computed for each K = 1, . . . , 64 the probability PK that a given K-armed dueling bandit problem obtained from considering K of our 64 feature rankers would have a Condorcet winner as follows: first, we calculated the number of K-armed dueling bandits that have a Condorcet winner by calculating for each feature ranker r how many K-armed dueling bandits it can be the Condorcet winner of: for each r, this is equal to $\\binom{N_r}{K}$, where $N_r$ is the number of rankers that r beats; next, we divided this total number of K-armed dueling bandits with a Condorcet winner by $\\binom{64}{K}$, which is the number of all K-armed dueling bandits that one could construct from these 64 rankers. The probabilities PK , plotted as a function of K in Figure 5 (the red curve), were all larger than 0.97.
The same plot also shows an estimate of the probability that the total ordering assumption holds for a given K (the blue curve), which was obtained by randomly selecting 100,000 K-armed bandits and searching for ones that satisfy the total ordering assumption. As can be seen from Figure 5, as K grows the probability that the total ordering assumption holds decreases rapidly. This is because there exist cyclical relationships between these feature rankers and as soon as the chosen subset of feature rankers contains one of these cycles, it fails to satisfy the total ordering condition. By contrast, the Condorcet assumption will still be satisfied as long as the cycle does not include the Condorcet winner. Moreover, because of the presence of these cycles, the probability that the Condorcet assumption holds decreases initially as K increases, but then increases again because the number of all possible K-armed dueling bandits decreases as K approaches 64. Furthermore, in addition to the total ordering assumption, IF and BTM each require a form of stochastic transitivity. In particular, IF requires strong stochastic transitivity: for any triple (i, j, k), with i < j < k, the following condition needs to be satisfied: pik \u2265 max{pij , pjk }. BTM requires the less restrictive relaxed stochastic transitivity, i.e., that there exists a number \u03b3 \u2265 1 such that for all pairs (j, k) with 1 < j < k, we have \u03b3p1k \u2265 max{p1j , pjk }. As pointed out in (Yue & Joachims, 2011), strong stochastic transitivity is often violated in practice, a phenomenon also observed in our experiments: for instance, all of the K-armed dueling bandits on which we experimented require \u03b3 > 1. 0.8 0.6 0.4 0.2 Condorcet Total Ordering 0.0 10 20 30 40 Size of the subset 50 60 Figure 5. The probability that the Condorcet and the total ordering assumptions hold for subsets of the feature rankers.
The probability is shown as a function of the size of the subset. Even though BTM permits a broader class of K-armed dueling bandit problems, it requires \u03b3 to be explicitly passed to it as a parameter, which poses substantial difficulties in practice. If \u03b3 is underestimated, the algorithm can in certain circumstances be misled with high probability into choosing the Borda winner instead of the Condorcet winner, e.g., when the Borda winner has a larger average advantage over the remaining arms than the Condorcet winner. On the other hand, though overestimating \u03b3 does not cause the algorithm to choose the wrong arm, it nonetheless results in a severe penalty, since it makes the algorithm Relative Upper Confidence Bound pij Chernoff-Hoeffding upper bound Confidence intervals [lij (t), uij (t)] \u00b5ij (t) on P pij \u2208 / [lij (t), uij (t)] pij \u00b7\u00b7\u00b7 \u00b7\u00b7\u00b7 \u00b7\u00b7\u00b7 \u00b5ij n+2 \u00b5ij n+1 \u00b5ij n T \u03c4nij ij \u03c4n+1 time Figure 6. An illustrations of the idea behind Lemma 1 using an example of how the confidence intervals of a single pair ij of arms (ai , aj ), and their relation to the comparison probability pij , might evolve over time. The time-step \u03c4m denotes th the m time when the arms ai and aj were chosen by RUCB to be compared against each other. We also define ij \u00b5ij m := \u00b5ij (\u03c4m ). The time T is when the confidence intervals [lij (t), uij (t)] begin to include pij . The lemma then states that with probability 1 \u2212 \u03b4, we have T \u2264 C(\u03b4). Moreover, for each time-step, the area of the shaded region under the vertical graphs is the bound given by the ChernoffHoeffding (CH) bound on the probability that the confidence interval will not contain pij . 
Note that the CH bound has the form $e^{-(x - \\mu^{ij}_n)^2}$ and so in order for this number to be the area under a graph (hence making it easier to illustrate in a figure), we have drawn the derivative of this function, $f^{ij}_n(x) := |x - \\mu^{ij}_n| e^{-(x - \\mu^{ij}_n)^2}$, which is why the graphs are equal to 0 in the middle. Note that this does not mean that $\\mu^{ij}_n$ has very low probability of being close to pij : the graphs drawn here are not the PDFs of the posteriors, but simply a manifestation of the bound given by the Chernoff-Hoeffding bound. More specifically, the property that they satisfy is that $P\\big( p_{ij} \\notin [l_{ij}(t), u_{ij}(t)] \\big) \\leq \\int_{-\\infty}^{l_{ij}(t)} f^{ij}_{N_{ij}(t)}(x)\\,dx + \\int_{u_{ij}(t)}^{\\infty} f^{ij}_{N_{ij}(t)}(x)\\,dx$. much more exploratory, yielding the $\\gamma^7$ term in the upper bound on the cumulative regret, as discussed in Section 3. 8.2. Proof of Lemma 1 In this section, we prove Lemma 1, whose statement is repeated here for convenience. Recall from Section 5 that we assume without loss of generality that a1 is the optimal arm. Moreover, given any K-armed dueling bandit algorithm, we define wij (t) to be the number of times arm ai has beaten aj in the first t iterations of the algorithm. We also define $u_{ij}(t) := \\frac{w_{ij}(t)}{w_{ij}(t) + w_{ji}(t)} + \\sqrt{\\frac{\\alpha \\ln t}{w_{ij}(t) + w_{ji}(t)}}$, where \u03b1 is any positive constant, and lij (t) := 1 \u2212 uji (t). Moreover, for any \u03b4 > 0, define $C(\\delta) := \\left( \\frac{(4\\alpha - 1) K^2}{(2\\alpha - 1) \\delta} \\right)^{\\frac{1}{2\\alpha - 1}}$. Lemma 1. Let P := [pij ] be the preference matrix of a K-armed dueling bandit problem with arms {a1 , . . . , aK }, satisfying p1j > 1/2 for all j > 1 (i.e., a1 is the Condorcet winner). Then, for any dueling bandit algorithm and any \u03b1 > 1/2 and \u03b4 > 0, we have $P\\big( \\forall\\, t > C(\\delta),\\ i, j,\\ p_{ij} \\in [l_{ij}(t), u_{ij}(t)] \\big) > 1 - \\delta$. (7) Proof.
To decompose the lefthand side of (7), we introduce the notation Gij (t) for the \u201cgood\u201d event that at time t we have pij \u2208 [lij (t), uij (t)], which satisfies the following: (i) Gij (t) = Gji (t) because of the triple of equalities pji , lji (t), uji (t) = 1 \u2212 pij , 1 \u2212 uij (t), 1 \u2212 lij (t) . (ii) Gii (t) always holds, since (pii , lii (t), uii (t)) = 1 1 1 2 , 2 , 2 . Together with (i), this means that we only need to consider Gij (t) for i < j. Relative Upper Confidence Bound (iii) Define \u03c4nij to be the iteration at which arms i and j were compared against each other for the nth time. If Gij \u03c4nij + 1 holds, then the events Gij (t) hold for ij ij because when t \u2208 \u03c4nij , \u03c4n+1 , wij all t \u2208 \u03c4nij , \u03c4n+1 and wji remain constant and so in the expressions for uij (t) and uji (t) only the ln t changes, which is a monotonically increasing function of t. So, we have lij (t) \u2264 lij (\u03c4nij + 1) \u2264 pij \u2264 uij (\u03c4nij + 1) \u2264 uij (t). Moreover, the same statement holds with \u03c4nij reij , i.e., if we know placed by any T \u2208 \u03c4nij , \u03c4n+1 that Gij (T ) holds, then Gij (t) also holds for all t\u2208 ij T, \u03c4n+1 Nij (T ) \u2264 T , we can replace Nij (T ) with a variable n that can take values between 0 and T . For the second term, we know that \u03c4nij > T , so we can replace \u03c4nij with T and remove the condition \u03c4nij > T and look at all n \u2264 T . For the third term, since we always have that n < \u03c4nij , we can replace \u03c4nij with n and get a higher probability event. Putting all of this together we get the looser bound P \u2203 t > T, i, j s.t. Bij (t) \u2264 . This is illustrated in Figure 6. P \u2203 n \u2208 {0, . . . , T } : pij \u2212 \u00b5ij n > \u03b1 ln T n +P \u2203 n \u2208 {0, . . . , T } : pij \u2212 \u00b5ij n > \u03b1 ln T n +P \u2203 n > T s.t. pij \u2212 \u00b5ij n > i j, Gij (T ) and \u2200 n s.t. \u03c4nij > T, Gij (\u03c4nij ) . 
Let us now flip things around and look at the complement of these events, i.e. the \u201cbad\u201d event Bij (t) that pij \u2208 / [lij (t), uij (t)] occurs. Then, subtracting both sides of Equation (8) from 1 and using the union bound gives P \u2203 t > T, i, j s.t. Bij (t) \u2264 i T and Bij (\u03c4nij ) . Further decomposing the righthand side using union bounds and making the condition explicit, we get P \u2203 t > T, i, j s.t. Bij (t) \u2264 P i>j pij \u2212 \u00b5ij Nij (T ) > \u03b1 ln T Nij (T ) + P \u2203 n > T s.t. pij \u2212 \u00b5ij n > \u03b1 ln \u03c4nij n \u03b1 ln \u03c4nij , n ij wij (\u03c4n ) ij ij ) wij (\u03c4n )+wji (\u03c4n is the since T < n < \u03c4nij . Here, \u00b5ij n := frequentist estimate of pij after n comparisons between arms ai and aj . Now, in the above sum, we can upper-bound the first term by looking at the higher probability event that Bij (T ) happens for any possible number of comparisons between ai and aj , and since we know that n=0 i P 2 \u221e \u03b1 ln n n pij \u2212 \u00b5ij n > P + n=T +1 . (9) To bound the expression on line (9), we apply the Chernoff-Hoeffding bound, which in its simplest form states that given i.i.d. random variables X1 , . . . , Xn , whose support is contained in [0, 1] and whose expectan tion satisfies E[Xk ] = p, and defining \u00b5n := X1 +\u00b7\u00b7\u00b7+X , n 2 \u22122na we have P (|\u00b5n \u2212 p| > a) \u2264 2e . This gives us P \u2203 t > T, i, j s.t. Bij (t) \u03b1 ln T T \u22122\u2701 n 2 n + \u2264 2e \u271a i T and pij \u2212 \u00b5ij n > T \u2264 \u03b1 ln n n n=1 K(K \u2212 1) = 2 T 4 T 2\u03b1 n=1 2K 2 \u2264 2\u03b1\u22121 + K 2 T 2K 2 \u2264 2\u03b1\u22121 + K 2 T \u221e T \u221e T (4\u03b1 \u2212 1)K 2 = . (2\u03b1 \u2212 1)T 2\u03b1\u22121 Now, since C(\u03b4) = \u221e 2e n=T +1 \u221e + n=T +1 2 n2\u03b1 \u22122\u2701 n \u03b1 ln n n \u271a dx 1 , since 2\u03b1 is decreasing. 
x2\u03b1 x dx 2K 2 K2 = + x2\u03b1 T 2\u03b1\u22121 (1 \u2212 2\u03b1)x2\u03b1\u22121 \u221e T (10) (4\u03b1\u22121)K 2 (2\u03b1\u22121)\u03b4 the bound in (10) gives us (7). 1 2\u03b1\u22121 for each \u03b4 > 0,"} -{"id":"oai:arXiv.org:1312.4428","text":"On Constraint Satisfaction Problems below P\u2217 arXiv:1312.4428v2 [cs.CC] 17 Dec 2013 L\u00b4aszl\u00b4o Egri\u2020 Abstract Symmetric Datalog, a fragment of the logic programming language Datalog, is conjectured to capture all constraint satisfaction problems (CSP) in L. Therefore developing tools that help us understand whether or not a CSP can be defined in symmetric Datalog is an important task. It is widely known that a CSP is definable in Datalog and linear Datalog if and only if that CSP has bounded treewidth and bounded pathwidth duality, respectively. In the case of symmetric Datalog, Bulatov, Krokhin and Larose ask for such a duality (2008). We provide two such dualities, and give applications. In particular, we give a short and simple new proof of the result of Dalmau and Larose that \u201cMaltsev + Datalog \u21d2 symmetric Datalog\u201d (2008). In the second part of the paper, we provide some evidence for the conjecture of Dalmau (2002) that every CSP in NL is definable in linear Datalog. Our results also show that a wide class of CSPs\u2013CSPs which do not have bounded pathwidth duality (e.g., the P-complete Horn-3Sat problem)\u2013cannot be defined by any polynomial size family of monotone read-once nondeterministic branching programs. 1 Introduction Constraint satisfaction problems (CSP) constitute a unifying framework to study various computational problems arising naturally in various branches of computer science, including artificial intelligence, graph homomorphisms, and database theory. Loosely speaking, an instance of a CSP consists of a list of variables and a set of constraints, each specified by an ordered tuple of variables and a constraint relation over some specified domain. 
The goal is then to determine whether variables can be assigned domain values such that all constraints are simultaneously satisfied. Recent efforts have been directed at classifying the complexity of the so-called nonuniform CSP. For a fixed finite set of finite relations \u0393, CSP(\u0393) denotes the nonuniform CSP corresponding to \u0393. The difference between an instance of CSP(\u0393) and an instance of the general CSP is that constraints in an instance of CSP(\u0393) take the form (xi1 , . . . , xik ) \u2208 R for some R \u2208 \u0393. Examples of nonuniform CSPs include k-Sat, Horn-3Sat, Graph H-Coloring, and many others. \u2217 Research supported by NSERC, FQRNT, and ERC Starting Grant PARAMTIGHT (No. 280152). Institute for Computer Science and Control, Hungarian Academy of Sciences (MTA SZTAKI), Budapest, Hungary. {laszlo.egri@mail.mcgill.ca} \u2020 1 For a relational structure B, the homomorphism problem HOM(B) takes a structure A as input, and the task is to determine if there is a homomorphism from A to B. For instance, consider structures that contain a single symmetric binary relation, i.e., graphs. A homomorphism from a graph G to a graph H is a mapping from VG to VH such that any edge of G is mapped to an edge of H. If H is a graph with a single edge then HOM(H) is the set of graphs which are two-colorable. There is a well-known and straightforward correspondence between the CSP and the homomorphism problem. For this reason, from now on we work only with the homomorphism problem instead of the CSP. Nevertheless, we call HOM(B) a CSP and we also write CSP(B) instead of HOM(B), as it is often done in the literature. The CSP is of course NP-complete, and therefore research has focused on identifying \u201cislands\u201d of tractable CSPs. The well-known CSP dichotomy conjecture of Feder and Vardi [13] states that every CSP is either tractable or NP-complete, and progress towards this conjecture has been steady during the last fifteen years. 
From a complexity-theoretic perspective, the classification of CSP(B) as in P or being NP-complete is rather coarse and therefore somewhat dissatisfactory. Consequently, understanding the fine-grained complexity of CSPs gained considerable attention during the last few years. Ultimately, one would like to know the precise complexity of a CSP lying in P, i.e., to identify a \u201cstandard\u201d complexity class for which a given CSP is complete. Towards this, it was established that Schaefer\u2019s P \u2212 NP dichotomy for Boolean CSPs [24] can indeed be refined: each CSP over the Boolean domain is either definable in first order logic, or complete for one of the classes L, NL, \u2295L, P or NP under AC0 -reductions [2]. The question whether some form of this fine-grained classification extends to non-Boolean domains is rather natural. The two most important tools to study CSPs whose complexity is below P are symmetric Datalog and linear Datalog, syntactic restrictions of the database-inspired logic programming language Datalog. We say that co-CSP(B)\u2013the complement of CSP(B)\u2013is definable in (linear, symmetric) Datalog if the set of structures that do not homomorphically map to B is accepted by a (linear, symmetric) Datalog program.1 Symmetric Datalog programs can be evaluated in logarithmic space (L), and in fact, it is conjectured that if co-CSP(B) is in L then it can also be defined in symmetric Datalog [11]. There is a considerable amount of evidence supporting this conjecture (see, for example, [11, 10, 9, 20, 6]), and therefore providing tools to show whether co-CSP(B) can be defined in symmetric Datalog is an important task. It is well known and easy to see that for any structure B, there is a set of structures O, called an obstruction set, such that a structure A homomorphically maps to B if and only if there is no structure in O that homomorphically maps to A. In fact, there are many possible obstruction sets for any structure B. 
We say that B has duality X, if B has an obstruction set which has the special property X. The following two well-known theorems relate definability of co-CSP(B) in Datalog and linear Datalog to B having bounded treewidth and bounded pathwidth duality, respectively: 1. co-CSP(B) is definable in Datalog if and only if B has bounded treewidth duality [13]; 2. co-CSP(B) is definable in linear Datalog if and only if B has bounded pathwidth duality [7]. 1 The reason we define co-CSP(B) instead of CSP(B) in (linear, symmetric) Datalog is a technicality explained in Section 2.5. 2 It was stated as an open problem in [4] to find a duality for symmetric Datalog in the spirit of the previous two theorems. We provide two such dualities: symmetric bounded pathwidth duality (SBPD) and piecewise symmetric bounded pathwidth duality (PSBPD). We note that SBPD is a special case of PSBPD. For both bounded treewidth and bounded pathwidth duality, the structures in the obstruction sets are restricted to have some special form. For SBPD and PSBPD the situation is a bit more subtle. In addition that we require the obstruction sets to contain structures only of a special form (they must have bounded pathwidth), the obstruction sets must also possess a certain \u201csymmetric closure\u201d property. To the best of our knowledge, this is the first instance of a duality where in addition to the local requirement that each structure must be of a certain form, the set must also satisfy an interesting global requirement. Using SBPD, we give a short and simple new proof of the main result of [9] that \u201cMaltsev + Datalog \u21d2 symmetric Datalog\u201d. Considering the simplicity of this proof, we suspect that SBPD (or PSBPD) could be a useful tool in an attempt to prove the algebraic symmetric Datalog conjecture [20], a conjecture that proposes an algebraic characterization of all CSPs lying in L. 
An equivalent form of this conjecture is that \u201cDatalog + n-permutability \u21d2 symmetric Datalog\u201d (by combining results from [18, 3, 21]), where n-permutability is a generalization of Maltsev. One way to gain more insight into the dividing line between CSPs in L and NL is through studying the complexity of CSPs corresponding to oriented paths. It is known that all these CSPs are in NL (by combining results from [12, 8, 7]), and it is natural to ask whether there are oriented paths for which the CSP is NL-complete and L-complete. We provide two classes of oriented paths, C1 and C2 , such that for any B1 \u2208 C1 , the corresponding CSP is NL-complete, and for any B2 \u2208 C2 , the corresponding CSP is in L. In fact, it can be seen with the help of [20] that for most B2 \u2208 C2 , CSP(B2 ) is L-complete. To prove the membership of CSP(B2 ) in L (for B2 \u2208 C2 ), we use PSBPD in an essential way. One can hope to build on this work to achieve an L-NL dichotomy for oriented paths. In the second part of the paper, we investigate CSPs in NL. Based on the observation that any CSP known to be in NL is also known to be definable by a linear Datalog program, Dalmau conjectured that every CSP in NL can be defined by a linear Datalog program [7]. Linear Datalog(suc,\u00ac) (linDat(suc,\u00ac)) denotes the extension of linear Datalog in which we allow negation and access to an order over the domain of the input. It is known that any problem in NL can be defined by a linDat(suc,\u00ac) program [7, 15, 19], and therefore one way to prove the above conjecture would be to show that any CSP that can be defined by a linDat(suc,\u00ac) program can also be defined by a linear Datalog program. We consider a restriction of the conjecture because proving it in its full generality would separate NL from P (using [1]). 
Read-once linear Datalog(suc) (1-linDat(suc)) is a subclass of linDat(suc,\u00ac), but a subclass that has interesting computational abilities, and for which we are able to find the chink in the armor. We can easily define some NL-complete problems in 1-linDat(suc), such as the CSP directed st-connectivity (st-Conn), and also problems that are not homomorphismclosed, such as determining if the input graph is a clique on 2n vertices, n \u2265 1. Because any problem that can be defined with a linear Datalog program must be homomorphism closed, it follows that 1-linDat(suc) can define nontrivial problems which are in NL but which are not definable by any linear Datalog program. However, our main result shows that if co-CSP(B) 3 can be defined by a 1-linDat(suc) program, then co-CSP(B) can also be defined by a linear Datalog program. The crux of our argument applies the general case of the Erd\u02ddos-Ko-Rado theorem to show that a 1-linDat(suc) program does not have enough \u201cmemory\u201d to handle structures of unbounded pathwidth. Our proof establishing the above result for 1-linDat(suc) programs can be adapted to show a parallel result for a subclass of nondeterministic branching programs, which constitute an important and well-studied class of computational models (see the book [25]). More precisely, we show that if co-CSP(B) can be defined by a poly-size family of read-once2 monotone nondeterministic branching programs (mnBP1(poly)) then co-CSP(B) can also be defined by a linear Datalog program.3 Finally, our results can be interpreted as lower-bounds on a wide class of CSPs: if B does not have bounded pathwidth duality, then co-CSP(B) cannot be defined with any 1linDat(suc) program or with any mnBP1(poly). 
A specific example of such a CSP would be the P-complete Horn-3Sat problem, and more generally, Larose and Tesson showed that any CSP whose associated variety admits the unary, affine or semilattice types does not have bounded pathwidth duality (see [20] for details). 2 Preliminaries 2.1 Basic Definitions A vocabulary (or signature) is a finite set of relation symbols with associated arities. The arity function is denoted with ar(\u00b7). If A is a relational structure over a vocabulary \u03c4 , then RA denotes the relation of A associated with the symbol R \u2208 \u03c4 . The lightface equivalent of the name of the structure denotes the universe of the structure, e.g., the universe of A is A. \u02dc over a vocabulary \u03c4 is a pair (A, \u02dc T ): T is a set of pairs of the form A tuple structure A \u02dc i.e., A\u02dc contains (R, t), where R \u2208 \u03c4 and t is an ar(R)-tuple, and A\u02dc is the domain of A, every element that appears in some tuple t, and possibly some other elements. Slightly \u02dc to mean (R, t) \u2208 T , where A \u02dc = (A, \u02dc T ). Clearly, abusing notation, we write (R, t) \u2208 A tuple structures are equivalent to relational structures. If A is a relational structure, we \u02dc and vice versa. For convenience, we use the denote the equivalent tuple structure with A, two notations interchangeably. We note that all structures in this paper are finite. Let B be a structure of the same signature \u03c4 as A. The union A \u222a B of A and B is the \u03c4 -structure whose universe is A \u222a B, and for each R \u2208 \u03c4 , RA\u222aB is defined as RA \u222a RB . (Note that it is possible that A \u2229 B = \u2205.) A homomorphism from A to B is a map f from A to B such that f (RA ) \u2286 RB for each R \u2208 \u03c4 . If there exists a homomorphism from A to B, we f often denote it with A \u2192 B. If that homomorphism is f , we write A \u2212\u2192 B. 
A structure is called a core if it has no homomorphism to any of its proper substructures. A retract of a structure B is an induced substructure B\u2032 of B such that there is a homomorphism g : B \u2192 B\u2032 with g(b) = b for every b \u2208 B\u2032 . A retract of B that has minimal size among 2 Our read-once restriction for nondeterministic branching programs is less stringent than the usual restriction because we require the programs to be read-once only on certain inputs. 3 A 1-linDat(suc) can be converted into an mnBP1(poly), so another way to present our results would be to do the proofs in the context of mnBP1s, and then to conclude the parallel result for 1-linDat(suc). 4 all retracts of B is called a core of B. It is well known that all cores of a structure are isomorphic, and so one speaks of the core of a structure B, core(B). We denote by CSP(B) the set {A | A is a \u03c4 -structure such that A \u2192 B}, and by co-CSP(B) the complement of CSP(B), i.e., the set {A | A is a \u03c4 -structure such that A \u219b B}. If we are given a class of \u03c4 -structures C such that for any A \u2208 C, and any B such that A \u2192 B it holds that B \u2208 C, then we say that C is homomorphism-closed. Isomorphism closure is defined in a similar way. An n-ary operation on a set A is a map f : A^n \u2192 A. Given an h-ary relation R and an n-ary operation f on the same set A, we say that f preserves R or that R is invariant under f if the following holds: given any matrix M of size h \u00d7 n whose columns are in R, applying f to the rows of M produces an h-tuple in R. A polymorphism of a structure B is an operation f that preserves each relation in B. Definition 1 (Maltsev Operation). A ternary operation f : A^3 \u2192 A on a finite set A is called a Maltsev operation if it satisfies the following identities: f (x, y, y) = f (y, y, x) = x, \u2200x, y \u2208 A.
2.2 Datalog We provide only an informal introduction to Datalog and its fragments, and the reader can find more details, for example, in [22, 7, 11]. Datalog is a database-inspired query language whose connection with CSP-complexity is now relatively well understood (see, e.g., [3]). Let \u03c4 be some finite vocabulary. A Datalog program over \u03c4 is specified by a finite set of rules of the form h \u2190 b1 \u2227 \u00b7 \u00b7 \u00b7 \u2227 bt , where h and the bi are atomic formulas R(x1 , . . . , xk ). When we specify the variables of an atomic formula, we always list the variables from left to right, or we simply provide a tuple x of variables whose i-th variable is x[i]. We distinguish two types of relational predicates occurring in a Datalog program: predicates that occur at least once in the head of a rule (i.e., its left-hand side) are called intensional database predicates (IDBs) and are not in \u03c4 . The predicates which occur only in the body of a rule (its right-hand side) are called extensional database predicates (EDBs) and must all lie in \u03c4 . A rule that contains no IDB in the body is called a nonrecursive rule, and a rule that contains at least one IDB in the body is called a recursive rule. A Datalog program contains a distinguished IDB of arity 0 which is called the goal predicate; a rule whose head IDB is a goal IDB is called a goal rule. Linear Datalog is a syntactic restriction of Datalog in which there is at most one IDB in the body of each rule. The class of linear Datalog programs that contains only rules with at most k variables and IDBs with at most j \u2264 k variables is denoted by linear (j, k)-Datalog. We say that the width of such a linear Datalog program is (j, k). Symmetric Datalog is a syntactic restriction of linear Datalog. 
A linear Datalog program \u00af P is symmetric if for any recursive rule I(x) \u2190 J(y) \u2227 E(z) of P (except for goal rules), \u00af where E(z) is a shorthand for the conjunction of the EDBs of the rule over variables in z, \u00af the symmetric pair J(y) \u2190 I(x) \u2227 E(z) of that rule is also in P. The width of a symmetric Datalog program is defined similarly to the width of a linear Datalog program. We explain the semantics of linear (symmetric) Datalog using derivations (it could also be explained with fixed point operators, but that would be inconvenient for the proofs). Let 5 P be a linear Datalog program with vocabulary \u03c4 . A P-derivation with codomain D is a sequence of pairs D = (\u03c11 , \u03bb1 ), . . . , (\u03c1q , \u03bbq ), where \u03c1 is a rule of P, and \u03bb is a function from the variables V of \u03c1 to D, \u2200 \u2208 [q]. The sequence D must satisfy the following properties. Rule \u03c11 is nonrecursive, and \u03c1q is a goal rule. For all \u2208 [q \u2212 1], the head IDB I of \u03c1 is the IDB in the body of \u03c1 +1 , and if the variables of I in the head of \u03c1 and the body of \u03c1 +1 are x and y, respectively, then \u03bb (x[i]) = \u03bb +1 (y[i]), \u2200i \u2208 [ar(I)]. Let D be a derivation. Let R(z) be an EDB (with variables z) appearing in some rule \u03c1 of D. We write R(t) to denote that \u03bb (z) = t, i.e., that \u03bb instantiates the variables of R(z) to t. If R(z) appears in some rule \u03c1 of D and \u03bb (z) = t, we say that R(t) appears in \u03c1 , or less specifically, that R(t) appears in D. Given a structure A and a derivation D with codomain A for a program P, we say that D \u02dc The notation is a derivation for A if for every R(t) that appears in a rule of D, (R, t) \u2208 A. for a P-derivation for a structure A will have the form DP (A). A linear (symmetric) Datalog program P accepts an input structure A if there exists a P-derivation for A. Definition 2 (Read-once Derivation). 
We say that a derivation D is read-once if every R(t) that appears in D appears exactly once in D, except when R is the special EDB suc, first, or last, defined in Section 4. An example is given in Figure 1. The vocabulary is \u03c4 = {E 2 , S 1 , T 1 }, where the superscripts denote the arity of the symbols. Notice that in the symmetric Datalog program P, rules (2) and (3) form a symmetric pair. It is not difficult to see that P accepts a \u03c4 -structure A if and only if there is an oriented path (see Section 3.1) in E A from an element in S A to an element in T A . 2.3 Path Decompositions and Derivations Definition 3. [Path Decomposition] Let S be a \u03c4 -structure. A (j, k)-path decomposition (or path decomposition of width (j, k)) of S is a sequence S0 , . . . , Sn\u22121 of subsets of S such that \u02dc \u2203 \u2208 {0, . . . , n \u2212 1} such that {a1 , . . . , aar(R) } \u2286 S ; 1. For every (R, (a1 , . . . , aar(R) )) \u2208 S, 2. If a \u2208 Si \u2229 Si (i < i ) then a \u2208 S for all i < < i ; 3. \u2200 \u2208 {0, . . . , n \u2212 1}, |S | \u2264 k, and \u2200 \u2208 {0, . . . , n \u2212 2}, |S \u2229 S +1 | \u2264 j. For ease of notation, it will be useful to introduce a concept closely related to path decompositions. Let \u03c4 be a vocabulary. Let S be a \u03c4 -structure that can be expressed as S = S0 \u222a \u00b7 \u00b7 \u00b7 \u222a Sn\u22121 , where the S0 , . . . , Sn\u22121 (the universes of the Si ) satisfy properties 2 and 3 above. We say that S is a (j, k)-path, and that (S0 , . . . , Sn\u22121 ) is a (j, k)-path representation of S. We denote (j, k)-path representations with script letters, e.g., S = (S0 , . . . , Sn\u22121 ). The substructure Si \u222a \u00b7 \u00b7 \u00b7 \u222a Si of S (assuming a (j, k)-representation is fixed) is denoted by S[i,i ] . We call n the length of the representation. Obviously, a structure is a (j, k)-path if and only if it admits a (j, k)-path decomposition. Let D = (\u03c11 , \u03bb1 ), . . . 
, (\u03c1q , \u03bbq ) be a derivation for some linear or symmetric program P with vocabulary \u03c4 . We can extract from D a \u03c4 -structure Ex(D) such that D is a derivation \u02dc for each R(t) that appears in D for Ex(D). We specify Ex(D) as a tuple structure A: 6 I(x) \u2190 S(x) I(y) \u2190 I(x) \u2227 E(x, y) I(x) \u2190 I(y) \u2227 E(x, y) G \u2190 I(x) \u2227 T (x) S G = {a} T G = {d} (1) (2) (3) (4) b f g e I(b) I(c) I(d) G E(a, b) E(c, b) E(c, d) T (d) \u03c12 \u03c13 \u03c14 \u03c15 \u03c11 \u03bb1 (x) = a c a I(a) S(a) d \u03bb2 (x) = a \u03bb3 (x) = c \u03bb2 (y) = b \u03bb3 (y) = b \u03bb4 (x) = c \u03bb4 (y) = d \u03bb5 (x) = d Figure 1: Top left: Symmetric Datalog program P. Top right: Input structure G where the binary relation E G is specified by the digraph. Bottom: Visualization of a P-derivation DP (G) = (\u03c11 , \u03bb1 ), . . . , (\u03c15 , \u03bb5 ) for G, where \u03c11 is the nonrecursive rule (1), \u03c12 , \u03c14 are rule (2), \u03c13 is rule (3), and \u03c15 is the goal rule (4). In the diagram, for example, the dashed box corresponds to rule \u03c12 , and it is the rule I(y) \u2190 I(x) \u2227 E(x, y) of P, where \u03bb2 assigns a to variable x and b to variable y. Observe that DP (G) is read-once. \u02dc and set A\u02dc to be the set of those elements that appear (R \u2208 \u03c4 ), we add the pair (R, t) to A, in a tuple. Let D = (\u03c11 , \u03bb1 ), . . . , (\u03c1q , \u03bbq ) be a derivation. For each x that is in a rule \u03c1 for some \u2208 [q], call x the indexed version of x. We define an equivalence relation Eq(D) on the set of indexed variables of D. First we define a graph G = (V, E) as: \u2022 V is the set of all indexed versions of variables in D; \u2022 (x , y ) \u2208 E if = + 1, x is the i-th variable of the head IDB I of \u03c1 , and y is the i-th variable of the body IDB I of \u03c1 +1 . Two indexed variables x and y are related in Eq(D) if they are connected in G. Observe that if C = {x11 , x22 , . . . 
, xcc } is a connected component of G, then it must be that \u03bb\u21131 (x1 ) = \u03bb\u21132 (x2 ) = \u00b7 \u00b7 \u00b7 = \u03bb\u2113c (xc ). Definition 4 (Free Derivation). Let P be a linear Datalog program and D = (\u03c10 , \u03bb0 ), . . . , (\u03c1q , \u03bbq ) be a derivation for P. Then D is said to be free if for any two (x^\u2113 , y^\u2113\u2032 ) \u2209 Eq(D), \u03bb\u2113 (x) \u2260 \u03bb\u2113\u2032 (y). Intuitively, this definition says that D is free if any two variables in D which are not \u201cforced\u201d to have the same value are assigned different values. 2.4 Canonical Programs Fix a \u03c4 -structure B and j \u2264 k. Let Q1 , . . . , Qn be all possible at most j-ary relations over B. The canonical linear (j, k)-Datalog program for B ((j, k)-CanL(B)) contains an IDB Im of 7 the same arity as Qm for each m \u2208 [n]. The rule Ic (x) \u2190 Id (y) \u2227 \u0112(z) belongs to the canonical program if it contains at most k variables, and the implication Qc (x) \u2190 Qd (y) \u2227 \u0112(z) is true for all possible instantiations of the variables to elements of B. The goal predicate of this program is the 0-ary IDB Ig , where Qg = \u2205. The canonical symmetric (j, k)-Datalog program for B ((j, k)-CanS(B)) has the same definition as (j, k)-CanL(B), except that it has fewer rules due to the following additional restriction. If Ic (x) \u2190 Id (y) \u2227 \u0112(z) is in the program, then both Qc (x) \u2190 Qd (y) \u2227 \u0112(z) and Qd (y) \u2190 Qc (x) \u2227 \u0112(z) must hold for all possible instantiations of the variables to elements of B. The program (j, k)-CanS(B) is obviously symmetric. When it is clear from the context, we write CanL(B) and CanS(B) instead of (j, k)-CanL(B) and (j, k)-CanS(B), respectively. 2.5 Defining CSPs The following discussion applies not just to Datalog but also to its symmetric and linear fragments.
It is easy to see that the class of structures accepted by a Datalog program is homomorphism-closed, and therefore it is not possible to define CSP(B) in Datalog. However, co-CSP(B) is closed under homomorphisms, and in fact, it is often possible to define co-CSP(B) in Datalog. The following definition is key. Definition 5 (Obstruction Set). A set O of \u03c4 -structures is called an obstruction set for B, if for any \u03c4 -structure A, A \u219b B if and only if there exists S \u2208 O such that S \u2192 A. In other words, an obstruction set defines co-CSP(B) implicitly as A \u2208 co-CSP(B) if and only if there exists S \u2208 O such that S \u2192 A. If O above can be chosen to have property X, then we say that B has X-duality. In the next section we show that co-CSP(B) is definable in symmetric Datalog if and only if B has symmetric bounded pathwidth duality. 3 On CSPs in symmetric Datalog 3.1 Definitions An oriented path is a digraph obtained by orienting the edges of an undirected path. In other words, an oriented path has vertices v0 , . . . , vq+1 and edges e0 , . . . , eq , where ei is either (vi , vi+1 ), or (vi+1 , vi ). The length of an oriented path is the number of edges it contains. We call (vi , vi+1 ) a forward edge and (vi+1 , vi ) a backward edge. Oriented paths can be thought of as relational structures over the vocabulary {E 2 }, so we denote them with boldface letters. For an oriented path P, we can find a mapping level : P \u2192 {0, 1, 2, . . . } such that level(b) = level(a)+1 whenever (a, b) is an edge of P. Clearly, there is a unique such mapping with the smallest possible values. The level of an edge (a, b) of P is level(a), i.e., the level of the starting vertex of (a, b). The height(P) of an oriented path P is max_{a\u2208P} level(a). Let P be an oriented path that has a vertex u with indegree 0 and outdegree 1, and a vertex v with indegree 1 and outdegree 0.
We say that P is minimal if u is in the bottommost level and v is in the topmost level, and there are no other vertices of P in the bottommost or the topmost levels. 8 A zigzag operator \u03be takes a (j, k)-path representation S = (S0 , . . . , Sn\u22121 ) of a (j, k)-path S and a minimal oriented path P = e0 , . . . , eq such that height(P) = n, and it returns another (j, k)-path \u03be(S , P). Intuitively, \u03be(S , P) is the (j, k)-path S \u201cmodulated\u201d by P such that the forward and backward edges ei of P are mimicked in \u03be(S , P) by \u201cforward and backward\u201d copies of Slevel(ei ) . Before the formal definition, it could help the reader to look at the right side of Figure 2, where the oriented path used to modulate the (j, k)-path over the vocabulary E 2 (i.e., digraphs) with representation (S0 , S1 , S2 ) is P on the left side. The left side is a more abstract example, and the reader might find it useful after reading the definition. We inductively define the (j, k)-path \u03be(S , P) as (Se0 , Se1 , . . . , Seq ) together with a sequence of isomorphisms \u03d5e0 , \u03d5e1 , . . . , \u03d5eq , where \u03d5ei is an isomorphism from Sei to Slevel(ei ) , 0 \u2264 i \u2264 q. For the base case, we define Se0 to be an isomorphic copy of S0 , and \u03d5e0 to be the isomorphism that maps Se0 back to S0 . Assume inductively that Se0 , . . . , Sei\u22121 and \u03d5e0 , . . . , \u03d5ei\u22121 are already defined. Let Sei be an isomorphic copy of Slevel(ei ) with domain disjoint from Se0 \u222a \u00b7 \u00b7 \u00b7 \u222a Sei\u22121 , and fix \u03d5ei to be the isomorphism that maps back Sei to Slevel(ei ) . We \u201cglue\u201d Sei to Sei\u22121 by renaming some elements of Sei to elements of Sei\u22121 . To facilitate understanding, we can think of the already constructed structures Se0 , . . . , Sei\u22121 as labels of the edges e0 , . . . , ei\u22121 of P, respectively, and we want to determine Sei , the label of the next edge. 
The connection between Sei\u22121 and Sei will be defined such that Sei\u22121 and Sei \u201cmimic\u201d the orientation of the edges ei\u22121 and ei . We resume our formal definition. Set = level(ei ), and let = \u2212 1 if ei is a forward edge, and = + 1 if ei is a backward edge. If an element x \u2208 Sei and an element y \u2208 Sei\u22121 are both copies of the same element a \u2208 S \u2229 S , then rename x to y in Sei . After all such (a) in Sei elements are renamed, Sei becomes Sei . That is, for all a \u2208 S \u2229 S , rename \u03d5e\u22121 i (a) to obtain S . to \u03d5\u22121 ei ei\u22121 We define the isomorphism \u03d5ei from Sei to Slevel(ei ) as: \u03d5ei (x) = 3.2 if x \u2208 Sei and x \u2208 Sei\u22121 \u03d5ei (x) \u03d5ei\u22121 (x) if x \u2208 Sei \u2229 Sei\u22121 . Two Dualities for Symmetric Datalog The two main theorems (Theorems 9 and 15) of this section can be combined to obtain the equivalence of the statements (1), (3) and (4) in Theorem 6 below. The proof of the implication (1) \u2192 (2) is a direct adaptation of the proof of the result from [13] that if co-CSP(B) is defined by a (j, k)-Datalog program, then it is also defined by the canonical (j, k)-Datalog program (see also [9]). Note that (1) \u2192 (2) is also obvious from the proof of Theorem 9 below. Theorem 6. For a finite structure B, TFAE: 1. There is a symmetric Datalog program that defines co-CSP(B); 2. The canonical symmetric (j, k)-Datalog program defines co-CSP(B); 3. B has symmetric bounded pathwidth duality (for some parameters); 9 P S \u03be(S , P) e4 S2 Se4 cd c d e1 S1 ab S0 e2 e3 Se1 ab e0 Se0 \u03be(S , P) S Se4 S2 \u03be cd Se2 Se3 S1 Se1 Se2 Se3 a b S0 Se0 Figure 2: Left: Applying a zigzag operator to the (j, k)-path S with the (j, k)-representation S = (S0 , S1 , S2 ). Suppose that S0 \u2229 S1 = {a, b} and S1 \u2229 S2 = {c, d}. We demonstrate how Se0 and Se2 are obtained. 
Se0 is a disjoint copy of S0 (and the copy of a and b in Se0 are a and b , respectively). To obtain Se2 , first make a disjoint copy Se2 of Slevel(e2 ) = S1 . Set = level(e2 ) = 1. Since e1 is a forward edge and e2 is a backward edge, = + 1 = 2. Therefore to \u201cglue\u201d Se2 to Se1 , we need to look at S \u2229 S = {c, d}. Assume that the copy of c and d in Se1 are c and d , respectively. Furthermore, assume that the copy of c and d \u02dc respectively. To obtain Se , we rename c\u02dc to c , and d\u02dc to d in S . Right: in Se2 are c\u02dc and d, 2 e2 A specific example when S0 , S1 , S2 are the digraphs in the boxes. The dashed lines indicate identification of vertices. The level of (Se2 , Se3 ), for example, is 0 since e2 and e3 share a vertex at vertex level 1. 4. B has piecewise symmetric bounded pathwidth duality (for some parameters). 3.2.1 Symmetric Bounded Pathwidth Duality Definition 7 ((j, k)-symmetric). Assume that O is a set of (j, k)-paths. Suppose furthermore that a (j, k)-path representation can be fixed for each structure in O such that the following holds. For every S \u2208 O with representation S of some length n, and every minimal oriented path P of height n, it holds that \u03be(S , P) \u2208 O. Then O is said to be (j, k)-symmetric. Definition 8 (SBPD). A structure B has (j, k)-symmetric bounded pathwidth duality ((j, k)SBPD) if there is an obstruction set O for B that consists of (j, k)-paths, and in addition, O is (j, k)-symmetric. The following is our main duality theorem for symmetric Datalog: Theorem 9. For a finite structure B, co-CSP(B) can be defined by a symmetric (j, k)Datalog program if and only if B has (j, k)-SBPD. We will use Lemma 10 in the proof of Theorem 9. Lemma 10 can be proved using the standard canonical Datalog program argument. Lemma 11 is also used in the proof of Theorem 9 and it is the main technical lemma of the section. 10 Lemma 10. If CanS(B) accepts a structure A, then A \u2192 B. Proof. 
Structure B is not accepted by CanS(B) because a derivation could be translated into a valid chain of implications, which is not possible by the definition of CanS(B). If CanS(B) accepts A and A \u2192 B, then CanS(B) accepts B, a contradiction. Lemma 11. For any \u03c4 -structures A and B, if there exists a structure S with a (j, k)-path representation S of some length n such that S \u2192 A, and for any minimal oriented path P of height n, it holds that \u03be(S , P) \u219b B, then (j, k)-CanS(B) accepts A. To prove Lemma 11 we need to define an additional concept related to the zigzag operator. Once the (j, k)-path \u03be(S , P) = (Se0 , . . . , Seq ) is defined, where P is the path e0 , . . . , eq , each pair (Sei , Sei+1 ), \u2200i \u2208 {0, . . . , q \u2212 1} is assigned a level: level(Sei , Sei+1 ) is the level of the vertex v minus 1, where v is the vertex that ei and ei+1 share (see Figure 2). Proof of Lemma 11. For the rest of this proof, let CS denote (j, k)-CanS(B), and CL denote (j, k)-CanL(B). If program CS accepts structure S then because S \u2192 A, CS also accepts A. So it is sufficient to show that program CS accepts structure S. First we specify how to associate a CL-derivation with \u03be(S , P), where P is a minimal oriented path of height n. Assume that \u03be(S , P) = S0 \u222a \u00b7 \u00b7 \u00b7 \u222a Sq . For each i \u2208 {0, . . . , q \u2212 1}, fix an arbitrary order on the elements of Si \u2229 Si+1 . Assume that |Si \u2229 Si+1 | = j' (\u2264 j), and define the j'-tuple si such that si [\u2113] is the \u2113-th element of Si \u2229 Si+1 . We define sq to be the empty tuple. It is good to keep in mind that later, si will be associated with the IDB Ji . The derivation will be DCL (\u03be(S , P)) = (\u03c10 , \u03bb0 ), . . . , (\u03c1q , \u03bbq ). We specify \u03c1i as Ji (xi ) \u2190 Ji\u22121 (xi\u22121 ) \u2227 E\u0304(yi ) if i \u2208 [q], and J0 (x0 ) \u2190 E\u0304(y0 ) if i = 0.
We begin with describing the EDBs of a rule \u03c1i together with their variables. Assume that Si = {d1 , . . . , dt }, and observe that t \u2264 k. The variables of \u03c1i are v1 , . . . , vt . For every R \u2208 \u03c4 , and every tuple (df (1) , . . . , df (r) ) \u2208 RSi , where r = ar(R), R(vf (1) , . . . , vf (r) ) is an EDB of \u03c1i . We describe the variables of the IDBs Ji\u22121 and Ji . Assume that si\u22121 = (dg(1) , . . . , dg(j1 ) ) and si = (dh(1) , . . . , dh(j2 ) ). Then the IDB in the body of \u03c1i together with its variables is Ji\u22121 (vg(1) , . . . , vg(j1 ) ), and the head IDB together with its variables is Ji (vh(1) , . . . , vh(j2 ) ). The function \u03bbi simply assigns the value dg to the variable vg , \u2200g \u2208 [t]. It remains to specify the IDBs, i.e., which IDBs of CL the Ji -s correspond to. For each i \u2208 {0, . . . , q}, Ii denotes IMiP , where MiP is a subset of B j for some j \u2264 j. We define the sequence M0P , M1P , . . . , MqP inductively. To define M0P , consider the nonrecursive \u00af 0 ). Assume that the arity of J0 is j , and that y0 contains k varirule J0 (x0 ) \u2190 E(y ables. (Note that the variables in x0 and y0 are not necessarily disjoint.) For all possible functions \u03b1 : x0 [1], . . . , x0 [j ], y0 [1], . . . , y0 [k ] \u2192 B such that the conjunction of EDBs P \u00af E(\u03b1(y 0 [1]), . . . , \u03b1(y0 [k ])) is true, place the tuple (\u03b1(x0 [1]), . . . , \u03b1(x0 [j ])) into M0 . P Assume that Mi\u22121 is already defined. Then similarly to the base case, for each possible P instantiation \u03b1 of the variables of \u03c1i over B with the restriction that \u03b1(xi\u22121 ) \u2208 Mi\u22121 , if the P conjunction of EDBs of \u03c1i is true, then add the tuple \u03b1(xi ) to Mi . It is not difficult to see that if MqP = \u2205, then we can construct a homomorphism from \u03be(P, P) to B which would be a contradiction. 11 For each i \u2208 {0, . . . 
, q \u2212 1}, assume that (Si , Si+1 ) has level i . Then we say that the IDB Ji has level i and we write level(Ji ) = i . We proceed to construct a CS-derivation DCS (S) for S. Let Q be a directed path of height n. We construct DCS (S) just like we would construct DCL (\u03be(S , Q)) above, except Q that we will define the subscripts of the IDBs, M0Q , . . . , Mn\u22121 , differently, so that every rule of the resulting derivation belongs to CS. From now on we write M0 , . . . , Mn\u22121 instead of Q . M0Q , . . . , Mn\u22121 To define M0 , . . . , Mn\u22121 , let P0 , P1 , . . . be an enumeration of all (finite) minimal oriented paths of height n. Intuitively, we will collect in Nm all subscripts (recall that a subscript is a relation) of all those IDBs which have the same level in DL (\u03be(S , Pm )). Formally, for each \u2208 {0, . . . , n \u2212 1} define Nm = {MtPm | level(Jt ) = }. Then we collect the subscripts at a fixed level in O over all derivations corresponding to P0 , P1 , . . . . Formally, for each \u2208 {0, . . . , n \u2212 1}, we define O = N0 \u222a N1 , . . . . We are ready to define M0 , . . . , Mn\u22121 . For each s \u2208 {0, . . . , n \u2212 1}, define Ms = W \u2208Os W . It remains to show that every rule of the derivation we defined is in S and that the last IDB is the goal IDB. If the last IDB is not the goal IDB of S, then Mn\u22121 = \u2205. By definition, m it must be that for some minimal oriented path Pm of height n and length qm , MP qm \u22121 = \u2205 m (note that the last IDB of DCL (\u03be(P, Pm )) has subscript MP qm \u22121 ). As noted before, this would mean that \u03be(P, Pm ) \u2192 B, a contradiction. We show that each rule of DCS (S) as defined above belongs to CanS(B). Suppose DCS (S) contains a rule \u03c1 \u00af i) Ji (xi ) \u2190 Ji\u22121 (xi\u22121 ) \u2227 E(y that is not in CanS(B). 
By definition, there cannot be an instantiation \u03b1 of variables of \u03c1 to elements of B such that \u03b1(xi\u22121 ) \u2208 Mi\u22121 , the conjunction of EDBs holds, but \u03b1(xi ) \u2209 Mi . Assume then that there is an \u03b1 such that \u03b1(xi ) \u2208 Mi , the conjunction of EDBs holds, but \u03b1(xi\u22121 ) \u2209 Mi\u22121 . It is also not difficult to see that this is not possible because we used all minimal oriented paths in the construction of DCS (S). Proof of Theorem 9. If co-CSP(B) is defined by a symmetric (j, k)-Datalog program P, then using the symmetric property of P, it is laborious but straightforward to show that O = \u22c3 {Ex(D) : D is a free derivation of P} is a (j, k)-symmetric obstruction set for B. For the converse, assume that B has (j, k)-SBPD. Let O be a symmetric obstruction set of width (j, k) (i.e., the path decomposition of every structure in O has width (j, k)) for B. We claim that (j, k)-CanS(B) defines co-CSP(B). Assume that A \u2192 B. Then by Lemma 10, (j, k)-CanS(B) does not accept A. Suppose now that A \u219b B. Then by assumption, there exists a (j, k)-path S \u2208 O with a representation S of length n such that S \u2192 A. Furthermore, since O is symmetric, for any minimal oriented path P of height n, \u03be(S , P) \u219b B. It follows from Lemma 11 that CanS(B) accepts A. 12 3.2.2 Piecewise Symmetric Bounded Pathwidth Duality Piecewise symmetric bounded pathwidth duality (PSBPD) for symmetric Datalog is less stringent than SBPD; however, the price is larger program width. Although the following definitions might seem technical, the general idea is simple: a piecewise symmetric obstruction set O does not need to contain all (j, k)-paths obtained by \u201czigzagging\u201d (j, k)-paths in O in all possible ways.
It is sufficient to zigzag a (j, k)-path S using only oriented paths which \u201cavoid\u201d certain segments of S: some constants c and d are fixed for O, and there are at most c fixed segments of S that are avoided by the zigzag operator, each of size at most d. We give the formal definitions. Definition 12 ((c, d)-filter). Let S be a (j, k)-path with a representation S = S0 , . . . , Sn\u22121 . A (c, d)-filter F for S is a set of intervals {[s1 , t1 ], [s2 , t2 ], . . . , [sc , tc ]} such that \u2022 c \u2264 c; 0 \u2264 s1 ; tc \u2264 n \u2212 1; si \u2264 ti , \u2200i \u2208 [c ]; and t + 2 \u2264 s +1 , \u2200 \u2208 [c \u2212 1]; \u2022 | i\u2208[s ,t ] Si | \u2264 d, \u2200 \u2208 [c ]. Elements of F are called delimiters. An oriented path P of height n obeys a (c, d)-filter F if for any delimiter [si , ti ] \u2208 F , the set of edges e of P such that si \u2264 level(e) \u2264 ti form a (single) directed path. A demonstration is given in Figure 3. S P FS S9 S8 S7 S6 S5 S4 S3 S2 S1 S0 Figure 3: S is a (j, k)-path representation of S. FS is the (3, 2k)-filter {[0, 0], [3, 4], [7, 8]} for S . P is an oriented path that obeys the filter. For example, observe that the edges at levels 3 and 4 form a directed subpath, and that \u201czigzagging\u201d happens only at those parts of P that do not fall into the intervals of the filter. Definition 13 (Piecewise Symmetric). Assume that O is a set of (j, k)-paths, and c and d are nonnegative integers. Suppose furthermore that for each S \u2208 O, there is a (j, k)-path representation S , and a (c, d)-filter FS such that the following holds. For every S \u2208 O of some length n, and every minimal oriented path P of height n that obeys the filter FS , it holds that \u03be(S , P) \u2208 O. Then O is (j, k, c, d)-piecewise symmetric. Roughly speaking, an oriented path P is allowed to modulate only those segments of S which do not correspond to any delimiters in FS . 
Compare Definition 13 with Definition 7, and observe that the only difference is that in the piecewise case, the oriented paths must be of a restricted form. Therefore a set that is (j, k)-symmetric is also (j, k, c, d)-piecewise symmetric for any c and d. We simply associate the empty (c, d)-filter with each structure. 13 Definition 14 (PSBPD). A structure B has (j, k, c, d)-piecewise symmetric bounded pathwidth duality ((j, k, c, d)-PSBPD) if there is an obstruction set O for B that consists of (j, k)-paths, and in addition, O is (j, k, c, d)-piecewise symmetric. Theorem 15. For a finite structure B, B has SBPD (for some parameters) if and only if B has PSBPD (for some parameters). We need the corollary of the following lemma in the proof of the above theorem. Lemma 16. Let P be a minimal oriented path e0 , . . . , en\u22121 with the (1, 2)-path representation P = (e0 , . . . , en\u22121 ), where we think of ei as a structure with two domain elements and a binary relation that contains the tuple ei . Let Q be a minimal oriented path f0 , . . . , fm with n edge levels. Then the oriented path \u03be(P, Q) is minimal and has the same height as P. Proof. It is obvious that \u03be(P, Q) is an oriented path. Furthermore the map that assigns every vertex of \u03be(P, Q) to its original in P is a homomorphism. It is easy to check that this homomorphism maps the edges of \u03be(P, Q) back to their originals and the level of an edge in \u03be(P, Q) is the same as the level of the original of that edge. Checking the minimality of \u03be(P, Q) is also straightforward. Corollary 17. Let O be a set of (j, k)-paths, where a (j, k)-representation is fixed for each path. Let O be the set that contains all (j, k)-paths that can be obtained from a (j, k)-path in O by applying some zigzag operator. Then O is (j, k)-symmetric. Remark: A similar statement holds in the piecewise symmetric case. Proof. Let S be an element of O . 
If we can show that applying an arbitrary zigzag operator to S' yields a (j, k)-path in O' , then we are clearly done. So assume that S' was obtained from S \u2208 O by applying a zigzag operator. The (j, k)-path S' inherits the (j, k)-representation of S in a natural way. Then we apply any zigzag operator to S' to obtain S'' , and we need to show that S'' is in O' . We get from S to S' using a zigzag operator and from S' to S'' using another zigzag operator. Using Lemma 16, we can see that we can replace these two zigzag operators by a single one to obtain S'' from S directly. Proof of Theorem 15. Let O be a (j, k)-symmetric obstruction set for B. As observed above, for any c and d, O is also (j, k, c, d)-piecewise symmetric. For the converse, let O be a (j, k, c, d)-piecewise symmetric obstruction set. Our goal is to construct a (j' , k' )-symmetric obstruction set Osym for B as follows. For each structure S \u2208 O, let S = S0 \u222a \u00b7 \u00b7 \u00b7 \u222a Sn\u22121 be the corresponding (j, k)-path representation. Using the filter for S, we \u201cregroup\u201d S0 , . . . , Sn\u22121 to obtain a (j' , k' )-path representation S = T0 \u222a \u00b7 \u00b7 \u00b7 \u222a Tm of S. We add each S together with its new representation to Osym , and also add every structure that is needed to ensure that Osym is symmetric. Finally, we show that Osym is a symmetric obstruction set for B. We begin with the regrouping procedure. Let S \u2208 O, S = S0 \u222a \u00b7 \u00b7 \u00b7 \u222a Sn\u22121 be the corresponding (j, k)-path representation, and {[s1 , t1 ], [s2 , t2 ], . . . , [sc' , tc' ]} be the (c, d)-filter FS . The regrouping procedure is quite pictorial and it is demonstrated in Figure 4. We define 14 S S S17 S16 S15 S14 S13 S12 S11 S10 S9 S8 S7 S6 S5 S4 S3 S2 S1 S0 S11 T5 S5 S10 T4 S4 S9 S16 T3 S3 S8 S15 T2 S2 S7 S14 T1 S13 S1 S6 S17 T0 S12 S0 Figure 4: An example regrouping for the proof of Theorem 15. The filter FS = {[0, 1], [6], [12, 13], [17]}.
The structures corresponding to the filter are laying inside the rectangles with lines. The complement of the filter is F\u00afS = {[2, 3, 4], [7, 8, 9, 10, 11], [14, 15, 16]}. The structure corresponding to F\u00afS lay in the gray ovals. The new (j , k )-path representation S of S is on the right. Notice the following pattern: the segments of S determined by FS are placed next to each other in S . T0 = S. \u2208[a,b]: [a,b]\u2208FS This places all substructures in S which correspond to delimiters of FS into one big initial structure. Note though that |T0 | \u2264 c \u00b7 d. Define the complement of FS as F\u00afS = {[0, s1 \u2212 1], [t1 + 1, s2 \u2212 1], [t2 + 1, s3 \u2212 1], . . . , [tc , n \u2212 1]}, and set m = max (b \u2212 a). [a,b]\u2208F\u00afS Intuitively, m is the length of the longest interval in S between any two delimiters. We define T as follows. For each interval [a, b] \u2208 F\u00afS take the ( \u2212 1)-th structure Sa+ \u22121 in that interval and define T to be the union of these structures. Formally, for every \u2208 {1, . . . , m}, set T = Si . i=a+ \u22121\u2264b: [a,b]\u2208F\u00afS 15 S : the representation of S in O Q T : the representation of S in Ops \u03be(S , Q) \u03be(T , P) P S9 S9 S8 S8 S8 S7 S7 S6 S6 S5 S5 S4 S4 S3 , S8 , D S3 , S8 , D S3 S2 , S7 , D S2 , S7 , D S1 , S6 , D S1 , S6 , D S0 , S5 , D S0 , S5 , D S7 S8 S7 S4 , S9 , D S3 S3 S2 S2 S1 S1 S0 S0 S3 S2 S4 , S9 , D S3 , S8 , D S2 , S7 , D S3 , S8 , D S2 , S7 , D S2 Figure 5: Example in the proof of Theorem 15. See the proof for details. Observe that |T | \u2264 k \u00b7 (c + 1). We need to ensure property 2 in Definition 3, so we need to place some additional elements into the domains of the T . Let [x, y] \u2208 FS and [z, w] \u2208 F\u00afS be such that z = y + 1. Then the set of elements Sx \u222a \u00b7 \u00b7 \u00b7 \u222a Sw is called a column. (For the beginning and end of S a column is defined in the natural \u201ctruncated\u201d way.) 
Because S is a (j, k)-path representation, it follows from the definition that the intersection of any pair of columns has size at most j. Let C1 , . . . , Cr be an enumeration of all the columns. Set D = = C \u2229 C and observe that |D| \u2264 j \u00b7 2r . We add D to the domain of T0 , and also to the domain of Ti to obtain Ti , \u2200i \u2208 {1, . . . , m}. It is straightforward to see that the new representation T = (T0 , . . . , Tm ) satisfies property 2 of Definition 3. Using the remarks about the sizes of the sets, we observe that T is a (j , k )-path decomposition of S, where j and k are functions of j, k, c and d. We place all structures S \u2208 O into Osym but we associate the new representation with S. For a structure S \u2208 Osym , we also apply all valid zigzag operators to S (with respect to the new representation) and add all these structure to Osym . By Lemma 17, Ops is a (j , k )symmetric set. We need to establish that Ops is an obstruction set. Because O \u2286 Osym , it is sufficient to show that no structure in Osym maps to B. To do that we show that for any structure in Ops , there is a structure in O that homomorphically maps to it. Giving a formal proof would lead to unnecessary notational complications and therefore we give an example that is easier to follow and straightforward to generalize. The example is represented in Figure 5. Let S \u2208 Ops such that S is also in O. Assume that the (j , k )representation of S in Ops is T . We consider \u03be(T , P) for some minimal oriented path and show how to find a minimal oriented path Q such that \u03be(S , Q) \u2192 \u03be(T , P). To construct Q, we make a copy of P aligned with S0 , S1 , S2 , S3 , S4 in S . This is represented by the dashed lines in Figure 5. We also make a copy of P aligned with S5 , S6 , S7 , S8 , S9 . This is represented with the dash dotted lines. 
Note that the resulting minimal oriented path 16 respects the delimiters, i.e., the zigzag operator will not \u201czigzag\u201d S0 and S5 . (In general, we never need to \u201czigzag\u201d structures that were placed into T0 , i.e., the structures that correspond to the delimiters, because P is minimal.) In \u03be(T , P) we denote the copies of the Si with Si and primed Si . Using the definition of the zigzag operator, it follows that the function f that maps an element of S0 \u222a S1 \u222a S2 \u222a S3 \u222a S3 \u222aS2 \u222aS3 \u222aS4 in \u03be(S , Q) to the corresponding element in S0 \u222a S1 \u222a S2 \u222a S3 \u222a S3 \u222a S2 \u222a S3 \u222a S4 is a homomorphism. We similarly define a homomorphism h from S5 \u222a S6 \u222a S7 \u222a S8 \u222a S8 \u222a S7 \u222a S7 \u222a S8 \u222a S9 in \u03be(S , Q) to S5 \u222a S6 \u222a S7 \u222a S8 \u222a S8 \u222a S7 \u222a S7 \u222a S8 \u222a S9 in \u03be(T , P). If we can make sure that if an element x is in the domain of both f and h, and both homomorphisms map x to the same element then we have the desired homomorphism. Assume for example that the element x appears in S2 and also in S8 in \u03be(S , Q), and suppose that f (x) = y and h(x) = y . Let the originals of y and y be z and z in T , respectively. We also identify z and z in S2 and S8 in S . Observe that x in S2 in \u03be(S , Q) is a copy of z and x in S8 in \u03be(S , Q) is a copy of z . If z = z (in S ) then x could not appear both in S2 and S8 by the definition of the zigzag operator. Therefore z = z , z \u2208 D, and by definition, z is in every bag of T . The elements y and y are copies of z, and because z appears in every \u201cbag\u201d of T , all copies of z in \u03be(T , P) are identified to be the same element. In particular, f (x) = y = y = h(x). 3.3 3.3.1 Applications Datalog + Maltsev \u21d2 Symmetric Datalog Using SBPD, we give a short and simple re-proof of the main result of [9]: Theorem 18 ([9]). Let B be a finite core structure. 
If B is invariant under a Maltsev operation and co-CSP(B) is definable in Datalog, then co-CSP(B) is definable in symmetric Datalog (and therefore CSP(B) is in L by [11]). We only need to show that if co-CSP(B) is in linear Datalog and B is preserved by a Maltsev operation, then co-CSP(B) is in symmetric Datalog. The \u201cjump\u201d from Datalog to linear Datalog essentially follows from already established results, as observed in [9]. For the sake of completeness, we give an approximate outline of the argument without being too technical.4 If co-CSP(B) is definable in Datalog and B has a Maltsev polymorphism, then B also has a majority polymorphism. If B has a majority polymorphism, then co-CSP(B) is definable in linear Datalog [8]. Hence, to re-prove Theorem 18, it is sufficient to prove Lemma 19. Our proof relies on the notion of SBPD. Lemma 19. If co-CSP(B) is definable by a linear Datalog program and B is invariant under a Maltsev operation m, then co-CSP(B) is definable by a symmetric Datalog program. To get ready for the proof of Lemma 19, we define an N -digraph of size s as an oriented path that consists of s forward edges, followed by s backward edges, followed by another s forward edges. Proposition 20 is easy to prove, and the Maltsev properties are used in Lemma 21. 4 The interested reader can consult Lemma 6 (originally in [23]) and Lemma 9 in [9]. For Lemma 9, note that if B has a Maltsev polymorphism, then V(A(B)) is congruence permutable, see [5]. 17 Proposition 20. A minimal oriented path is either a directed path, or it contains a subpath which is an N -digraph. Lemma 21. Let B be a structure invariant under a Maltsev operation m, S be a (j, k)-path with a (j, k)-representation S = (S0 , . . . , Sn\u22121 ), and P = e0 , . . . , eq be a minimal oriented path of height n. If \u03be(S , P) \u2192 B, then S \u2192 B. Proof. Using Proposition 20, there is an index t such that Q = et , et+1 , . . . , et+(3s\u22121) is an N digraph of size s in P. 
Assume that the first and last vertices of Q are v and w, respectively. Let P be the oriented path obtained from P by removing Q, and adding a directed path Q = ft , ft+1 , . . . , ft+(s\u22121) of length s from v to w. We claim that there is a homomorphism \u03b3 from \u03be(S , P ) to B. Once this is established, repeating the argument sufficiently many times clearly yields that S \u2192 B. Let \u03be(S , P) = (Se0 , . . . , Seq ), and \u03d5e0 , . . . , \u03d5eq be the corresponding isomorphisms (recall the zigzag operator definition in Section 3.1). Similarly, let \u03be(S , P ) = (Sf0 , . . . , Sfq\u22122s ), and \u03c8f0 , . . . , \u03c8fq\u22122s be the corresponding isomorphisms. Because S[e0 ,et\u22121 ] and S[et+3s ,eq ] are isomorphic to S[f0 ,ft\u22121 ] and S[ft+s ,fq\u22122s ] , respectively, \u03b3 for elements in S[f0 ,ft\u22121 ] \u222a S[ft+s ,eq\u22122s ] is defined in the natural way. It remains to define \u03b3 for every d \u2208 S[ft ,ft+(s\u22121) ] . Assume that d \u2208 Sft+ for some \u2208 {0, . . . , s \u2212 1}. Find the original of d in S and let it be do , i.e., do = \u03c8ft+ (d). Then we find the three copies d1 , d2 , d3 of do in S[ft ,ft+(3s\u22121) ] . That is, first we find the three edges e 1 , e 2 , e 3 of Q which have the same level as ft+ (all levels are with respect to P and P ). Then di = \u03d5\u22121 e (do ), i \u2208 [3]. We define \u03b3(d) = m(d1 , d2 , d3 ). By the i \u03b3 Maltsev properties of m, \u03b3 is well-defined. As B is invariant under m, \u03be(S , P ) \u2212\u2192 B. Proof of Lemma 19. If co-CSP(B) can be defined by a linear (j, k)-Datalog program, then there is an obstruction set O for B in which every structure is a (j, k)-path by [7]. We construct a symmetric obstruction set Osym for B as follows. For every (j, k)-path S with a (j, k)-representation S = S0 , . . . , Sn\u22121 in O and for every minimal oriented path P of height n, place \u03be(S , P) into Osym . 
By Corollary 17, Osym is (j, k)-symmetric. Observe that O \u2286 Osym , so it remains to show that no element of Osym maps to B. But if T \u2208 Osym , then T = \u03be(S , P) for some S \u2208 O and P. By Lemma 21, if \u03be(S , P) \u2192 B, then S \u2192 B. This contradicts the assumption that O is an obstruction set for B. 3.3.2 A class of oriented paths for which the CSP is in L, and a class for which the CSP is NL-complete In this section we define a class C of oriented paths such that if B \u2208 C then co-CSP(B) is in symmetric Datalog. Our strategy is to find an obstruction set O for B \u2208 C, and then to show that our obstruction set is piecewise symmetric. We need some notation. We say that a directed path is forward to mean that its first and last vertices are the vertices with indegree zero and outdegree zero, respectively. Let P be an oriented path with first vertex v and last vertex w. Then the reverse of P, denoted by P\u0304, is a copy of the oriented path P in the reverse direction, i.e., the first vertex of P\u0304 is a copy of w and its last vertex is a copy of v. Let Q be another oriented path. The concatenation of P and Q is the oriented path PQ in which the last vertex of P is identified with the first vertex of Q. For a 18 t a c P2 P1 E1 s e E2 P4 P3 b P5 d Figure 6: 2-wave in the proof of Theorem 23. nonnegative integer r, P^r denotes P1 P2 \u00b7 \u00b7 \u00b7 Pr , where the Pi are disjoint copies of P. Given two vertices v and w, we denote the presence of an edge from v to w with v \u2192 w. Definition 22 (Wave). If an oriented path Q can be expressed as E1 (P P\u0304)^r P E2 , where Ei (i \u2208 [2]) denotes the forward directed path that is a single edge, P is a forward directed path of length \u2113, and r \u2265 0, then Q is called an r-wave. A 2-wave is shown in Figure 8, 1. Theorem 23. Let Q be a wave. Then Q has PSBPD, co-CSP(Q) is definable in symmetric Datalog, and CSP(Q) is in L. Proof. We prove the case when Q is an r-wave for r = 2.
For larger r-s, the proof generalizes in a straightforward manner. Let P be a directed path of length h, P1 , P3 , P5 be disjoint copies of P, and P2 , P4 be copies of the reverse of P. Let E1 and E2 be forward edges. Assume the 2-wave Q is E1 P1 P2 P3 P4 P5 E2 (Figure 6). We will provide a piecewise symmetric obstruction set Ops for Q, such that every element of Ops is an oriented path. To do this, first we observe that by [17], Q has path duality, i.e., we can assume that the set O of all oriented paths that do not homomorphically map to Q form an obstruction set for Q. To construct Ops from O, we will place certain elements of O into Ops such that Ops is still an obstruction set for Q. We begin with some simple observations. Any oriented path that has height at most h + 1 maps to Q, so these oriented paths can be neither in O nor in Ops . Any oriented path that has height strictly larger than h + 2 obviously does not map to Q, so all such paths are in O and we also place these paths into Ops . Assume that P \u2208 O has height exactly h + 2. It is easy to see that if P is not minimal, then it contains a minimal subpath that does not map to Q. Therefore, it is sufficient to place only those oriented paths from O of height h + 2 into Ops which are minimal. Let P \u2208 Ops of height h + 2 (then P is minimal). Intuitively, any attempt to homomorphically map the vertices of P to Q starting by first mapping the first vertex of P to the first vertex of Q and then progressively finding the image of the vertices of P from left to right would get stuck at a or c. Formally, assume that the vertices of P are v1 , . . . , vn . Let P[i] denote the subpath of P \u03d5 on the first i vertices. Choose i to be the largest index such that P[i] \u2212\u2192 Q and \u03d5(v1 ) = s. Then \u03d5 cannot be extended to vi+1 for one of the following reasons. Clearly, \u03d5 must map vi to a source or a sink other than s or t, i.e., to a,b,c or d. 
Furthermore, we can assume that vi is not mapped to b or d. This is because if vi is mapped to b or d, then level(vi ) = 1, so the edge between vi and vi+1 is from vi to vi+1 , and therefore \u03d5 can be extended. So we can assume that vi is mapped to a or c. Because we cannot extend \u03d5, vi+1 must be at level 19 + 2, so it must be that vi+1 is the last vertex vn of P. Because P \u2192 Q, P[n\u22121] must be an oriented path such that any homomorphism \u03d5 from P[n\u22121] to Q such that \u03d5(v1 ) = s maps vn\u22121 to a or c but not to e. We assume first that any homomorphism \u03d5 from P[n\u22121] to Q maps vn\u22121 to a. We follow the vertices of P[n\u22121] from left to right. Let wa be the first vertex that is at level h + 1. If there is a vertex to the right of wa at level 1, then because P[n\u22121] will have to reach level h + 1 again, we will be able to map vn\u22121 to c, and that is not possible by assumption. So P must have the following form (Form 1): (w1 \u2192 w2 )X(w3 \u2192 w4 )Y(w5 \u2192 w6 ), where X is any oriented path of height h \u2212 1 with first vertex at the bottom and last vertex at the top level of X, and Y is any oriented path of height h \u2212 1 with both its first and last vertices being in the top level of Y. See Figure 7, left. For the second case, we assume that P[n\u22121] is such that vn\u22121 can be mapped to c. Again, we follow the vertices of P[n\u22121] from left to right. Let wa be the first vertex that is at level h + 1. We must have a vertex going back to level 1 (otherwise we could not \u201cpass\u201d b and could not map vn\u22121 to c). Let wb be the first such vertex. We will have to go back to level h + 1 again, so let wc be the first vertex at that level. Finally, we cannot go back to level 1 again, since then the last vertex of P[n\u22121] can be mapped to e. We can \u201cgo down\u201d to at most level 2 of P[n\u22121] . 
So P must have the form (Form 2) (w1 \u2192 w2 )X(w3 \u2192 w4 )Y(w5 \u2190 w6 )Z(w7 \u2192 w8 )W(w9 \u2192 w10 ), where X (Z) is any oriented path of height h \u2212 1 with first vertex at the bottom and last vertex at the top level of X (Z), Y is any oriented path of height h \u2212 1 with first vertex at the top and last vertex at the bottom level of Y, and W is any oriented path of height h \u2212 1 with both its first and last vertices being in the top level of W. See Figure 7, right. w6 wa = w4 w3 w10 w5 wa = w4 wc = w8 w3 w7 Y X w2 w1 X w2 w1 Y w5 w9 W Z wb = w6 Figure 7: Obstructions of height h + 2 for a 2-wave. Because Ops \u2286 O and for any structure S \u2208 O, there is a structure S \u2208 Ops such that S \u2192 S, Ops is an obstruction set for Q. It remains to show that Ops is piecewise symmetric. Let S be an oriented path of height more than h + 2, and assume the vertex set of S is v1 , . . . , vn . We need to define a representation S , and a filter FS for S. The representation (S0 , S1 , . . . , Sn\u22122 ) is (v1 , v2 ), (v2 , v3 ), . . . , (vn\u22121 , vn ) (width (1, 2)). The filter FS is the empty filter. Note that if we apply a zigzag operation to S, we get an oriented path of the same height as S, so Ops is closed under zigzagging of obstructions of height greater than h + 2. Let S be an oriented path of height h + 2 of Form 1, and assume the vertex set of S is v1 , . . . , vn . The representation S = (S0 , S1 , . . . , Sn\u22122 ) is constructed as in the previous paragraph. We specify FS to be the following (3, 6)-filter. Assume that the edge (w3 , w4 ) is structure Si . Then FS = {[0, 0], [i, i], [n \u2212 2, n \u2212 2]}. Using the definitions it is easy to see 20 that if P obeys the filter FS , then \u03be(S , P) is also an oriented path of Form 1. Therefore Ops is closed under zigzagging of obstructions of Form 1. Obstructions of Form 2 can be handled similarly. We state the following generalization of waves. 
\u00af r P, \u00af where Definition 24 (Staircase). A monotone wave is an oriented path of the form (PP) P is a forward directed path and r \u2265 0. We call the vertices of a monotone wave in the topmost level peaks, and the vertices in the bottommost level troughs. If a minimal oriented path Q can be expressed as P1 W1 P2 W2 . . . Pn\u22121 Wn\u22121 Pn , where P1 , . . . , Pn are forward directed paths, W1 , . . . , Wn\u22121 are monotone waves, and for any i \u2208 [n \u2212 1], the troughs of Wi are in a level strictly below the level of the troughs of Wi+1 , and also, the peaks of Wi are in a level strictly below the level of the peaks of Wi+1 , then Q is called a staircase. An example is given in Figure 8, 2. Theorem 25. Let Q be a staircase. Then Q has PSBPD, co-CSP(Q) is definable in symmetric Datalog, and CSP(Q) is in L. Proof. Assume that the height Q is h. As for waves, we use [17] to conclude that Q has path duality. We will construct a piecewise symmetric obstruction set Ops for Q by placing three classes of oriented paths into Ops . First, Ops contains all oriented paths which have height strictly greater than h. These oriented paths obviously do not map to Q. The next class of oriented paths we place into Ops are those which have height precisely h. Recall that Q consists of waves patched together with directed paths in between. Let the wave subpaths of Q be W1 , . . . , Wn , from left to right. For each Wi , we construct a class of oriented paths. Assume that Wi has height hi and let Oi be the set of minimal oriented paths of height hi which do not map to Wi . For each R \u2208 Oi , we construct C = B1 RB2 , where B1 and B2 are oriented paths (possibly empty) such that C has height h, and the level of R in C matches the level of Wi . Observe that there cannot be a homomorphism from C to Q. We place all such constructed C into Ops . Let be the length of the longest directed subpath of Q. 
The third class of oriented paths are those that have height h , where < h < h. For every such h , we produce a set of obstructions. (Remark: we set < h because any oriented path of length or less maps to Q.) Assume inductively (the base case is trivial) that we already have a piecewise symmetric obstruction set for every staircase of height strictly less than h. Consider every subpath Q1 , . . . , Qm of Q of height h . Notice that core(Qi ) is a staircase which is not a directed path. By the inductive hypothesis we have a piecewise symmetric obstruction set Ui for Qi . We keep only those oriented paths in Ui which have height at most h ; observe that Ui = \u2205. Construct D = B1 T1 \u00b7 \u00b7 \u00b7 Bm Tm Bm+1 , where (T1 , . . . , Tm ) \u2208 U1 \u00d7 \u00b7 \u00b7 \u00b7 \u00d7 Um and the Bj are arbitrary oriented paths such that the height of D is h . Place all these D-s into Ops . Notice that D does not map to Q for the following. Assume for contradiction that D maps to a subpath S of Q. Then D also maps to the core of S which is a staircase. But by construction D contains a subpath that does not map to S. We show that Ops is an obstruction set for Q. If an structure Z \u2208 Ops homomorphically maps to an input structure A, then obviously, there cannot be a homomorphism from A to 21 Q. Assume for contradiction that no structure in Ops maps to A but A does not map to Q. Then O contains an oriented path P that maps to A. So if we show the following claim then we are done. Claim. For any oriented path P that does not homomorphically map to Q, there is an oriented path Z \u2208 Ops that homomorphically maps to P. Proof of Claim. Assume that P has height precisely h. We show that there exists Z \u2208 Ops of height h such that Z \u2192 P. Assume for contradiction that none of the oriented paths of height h in Ops map to P. As before, let W1 , . . . 
, Wn be the wave segments of Q, from left to right, and assume without loss of generality that none of the Wi is a directed path. Let the initial and final vertices of Wi be ai and bi respectively, i \u2208 [n]. For each i \u2208 [n], find the minimal oriented subpaths of P whose initial vertices have the same level as ai , and final vertices have the same level as bi , or vice versa (note that because of the structure of Q, no such oriented path could contain another as a subpath, however, these oriented paths could overlap). For any such subpath R of P associated with Wi , map the lowest vertex of R to ai , and the highest vertex of R to bi . Remark 1: In fact there is no other choice. The rest of the vertices of R can be mapped to Q as follows. If R does not map to Wi with first and last vertices matched then by definition, P is in Ops and we have a contradiction. Therefore let the homomorphism for R be \u03d5R . Remark 2: Also observe that \u03d5R maps the inner vertices of R to vertices of the staircase which are between ai and bi . We show that the partial homomorphisms \u03d5R map the same vertex of P to the same vertex in Q, and furthermore we can also map those vertices of P to an element of Q that are not mapped anywhere by the \u03d5R . This way we obtain a homomorphism from P to Q and this would be a contradiction. First, any vertex v is assigned to a vertex of Q by at most two homomorphisms which correspond to consecutive wave segments of Q. This is because in Q, Wi and Wj are disjoint unless j = i + 1. Using Remarks 1 and 2, we can see that if a vertex v of P is in the domain of two \u201cnon-consecutive\u201d homomorphisms, then because those homomorphisms could not agree on where to map v, it is not possible that P \u2192 Q. This is a contradiction. Let \u03d5R1 and \u03d5R2 (assume without loss of generality that R1 and R2 correspond to W1 and W2 , respectively) be two partial homomorphisms such that their domains overlap. 
Then the markers a1 , b1 , a2 , b2 appear in the order a1 , a2 , b1 , b2 when traversing P from left to right. The vertices that are in the domain of both homomorphisms are the ones from a2 to b1 . By the choice of a1 , b1 , a2 , b2 , the segment of P from a2 to b1 is a minimal oriented path. Checking the images of the vertices going back from b1 to a2 under the map \u03d5R1 , we see that these vertices are mapped to the rightmost directed path segment of W1 . Similarly, the image of these vertices under \u03d5R2 is the leftmost directed path segment of W2 . That is, the two homomorphisms coincide for the vertices from a2 to b1 . Furthermore, some vertices of P are not in the domain of any partial homomorphism. Consider the two minimal oriented paths S and S\u2032 on the two sides of such a maximal continuous sequence of vertices in P. There are two cases. First, assume that S and S\u2032 both correspond to the same Wi . Let the markers for S be a and b and the markers for S\u2032 be a\u2032 and b\u2032 . Then following P from left to right, the markers appear in the order a, b, b\u2032 , a\u2032 . The images of the vertices from b to b\u2032 are not defined. (Observe that b and b\u2032 are mapped to the same vertex.) Consider the last directed path segment of Wi together with the first directed path segment of Wi+1 (or just the last edges of Q if i = n). Observe that the vertices from b to b\u2032 can be mapped to this directed path. The case when S and S\u2032 correspond to different waves of Q is handled similarly. Suppose lastly that P has height h\u2032 < h. Because P does not map to any of the subpaths of Q of height h\u2032 , for each subpath Q1 , . . . , Qm of Q of height h\u2032 , P contains a subpath Si such that Si \u219b Qi , i \u2208 [m]. If Si \u219b Qi then Si \u219b core(Qi ). Recall that core(Qi ) is a staircase and by definition, Ui contains an oriented path Si\u2032 such that Si\u2032 \u2192 Si . It is clear that we can choose oriented paths B1 , . . . , Bm+1 such that B1 S1\u2032 B2 . . . Bm Sm\u2032 Bm+1 \u2192 P.
Finally, it is not hard to see from the construction how to associate filters with the elements of Ops to establish that Ops is piecewise symmetric. We also give a large class of oriented paths for which the CSP is NL-complete. We need the following propositions to prove Theorem 28. Proposition 26. Let P1 and P2 be two minimal oriented paths of the same height h. Then there is a minimal oriented path Q of height h such that Q \u2192 P1 , P2 . Proof. Not hard, see e.g. [16]. Proposition 27. A core oriented path has a single automorphism, i.e., it is rigid. Proof. Let P be a core oriented path and P\u2032 be an isomorphic copy of P. There are at most two isomorphisms from P to P\u2032 (because a vertex with indegree 0 must be mapped to a vertex with indegree 0, and similarly for a vertex with outdegree 0). One possibility is to map the first vertex of P to the first vertex of P\u2032 and the last vertex of P to the last vertex of P\u2032 . For contradiction, assume that the second possibility happens, i.e., there is an isomorphism \u03d5 that maps the first vertex of P to the last vertex of P\u2032 and the last vertex of P to the first vertex of P\u2032 . Assume that both the first vertex v and last vertex w of P have indegree zero (the other case is similar). Then level(v) = level(w). This implies that the number of forward and backward edges in P is the same, so P has 2q edges for some q. By the existence of \u03d5, P must have the form Q Q\u0304, and such an oriented path is clearly not a core. Theorem 28. Let B be a core oriented path that contains a subpath P1 P2 P3 of some height h with the following properties: P1 , P2 and P3 are minimal oriented paths, they all have height h, and there is a minimal oriented path Q of height h such that Q \u2192 P1 , Q \u2192 P3 but Q \u219b P2 . Then CSP(B) is NL-complete. An example is given in Figure 8, 3 and 4. Proof of Theorem 28.
We show that the less-than-or-equal-to relation on two elements, R\u2264 = {(0, 0), (0, 1), (1, 1)}, and the relations {0} and {1} can be expressed from P using a primitive positive (pp) formula (i.e., a first order formula with only existential quantification, conjunction and equality). It is easy to see and well known that CSP({R\u2264 , {0}, {1}}) is equivalent to the NL-complete directed st-Conn problem. Since P is a core, it is rigid by Proposition 27. Assume that the first vertex of P1 is in a level lower than the level of the last vertex of P1 (the other case can be handled 23 1 2 P2 E2 P1 P2 P3 P4 4 3 P1 P3 Q P5 E1 Figure 8: 1: A 2-wave. 2: A staircase. 3: An example oriented path for which the CSP is NL-complete. 4: The oriented path Q in Theorem 28 corresponding to the oriented path in 3. similarly). See the illustration in Figure 9. Assume that the first vertex of P1 is 0 and the first vertex of P3 is 1. We construct a structure G with two special vertices x and y such h that {(h(x), h(y)) | G \u2212\u2192 P} = R\u2264 . It is well known and easy to show that then R\u2264 can also be expressed from P using a pp-formula. Let P be an isomorphic copy of P. We refer P G P23 P1 P2 P3 0 P1 1 P2 P3 c Q x P123 y Figure 9: Construction of the gadget G. to copies of P1 , P2 , P3 as P1 , P2 , P3 , respectively. Using Proposition 26, we find a minimal oriented path P23 of height h that maps to both P2 and P3 . Similarly, we find a minimal oriented path P123 that maps to each of P1 , P2 , P3 . We rename the first vertex of Q to x, and the first vertex of P123 y to y. To construct G, we identify the topmost vertices of the oriented paths P23 , Q and P123 . Then we identify the first vertex of P23 with the vertex c of P that is shared by P2 and P3 . Observe that any homomorphism from G to P, must map h c to 1. It is straightforward to verify that {(h(x), h(y)) | G \u2212\u2192 P} = R\u2264 . 
Because P is rigid, any relation of the form {v} where v \u2208 P can be expressed by a pp-formula. 4 On CSPs in NL 4.1 Definitions Let \u03c4 be a vocabulary. A successor \u03c4 -structure S is a relational structure with vocabulary \u03c4 \u222a {first, last, suc}, where first and last are unary symbols and suc is a binary symbol. Without loss of generality, the domain of S is defined as {1, . . . , n}, first^S = {1}, last^S = {n}, and suc^S contains all pairs (i, i + 1), i \u2208 [n \u2212 1]. Because first^S , last^S and suc^S depend only on n, they are called built-in relations. When we say that a class of successor structures is homomorphism/isomorphism-closed, all structures under consideration are successor structures, and we understand that homomorphism/isomorphism closure, respectively, is required only for non-built-in relations. Definition 29 (Split Operation). A split operation produces a \u03c4 -structure A\u2032 from a \u03c4 -structure A as follows. For an element a \u2208 A let Ta be defined as Ta = {(t, R, i) | t = (t1 , . . . , tr ) \u2208 R^A where R \u2208 \u03c4 , and ti = a}. If |Ta | \u2264 1 for every a \u2208 A, then no split operation can be applied. Otherwise we choose an element a \u2208 A with |Ta | \u2265 2 and a strict nonempty subset T of Ta , and for each triple (t, R, i) \u2208 T , we replace t = (t1 , . . . , tr ) in R^A with (t1 , . . . , ti\u22121 , a\u2032 , ti+1 , . . . , tr ), where a\u2032 is a new element, to obtain A\u2032 (and A\u2032 = A \u222a {a\u2032 }). Definition 30 (Split-Minimal, Critical). Let C be a class of structures over the same vocabulary. We say that a structure A \u2208 C is split-minimal in C if for every possible nonempty sequence of split operations applied to A, the resulting structure is not in C. We say that a structure A \u2208 C is critical in C if no proper substructure of A is in C. For a class of successor \u03c4 -structures, criticality and split-minimality are meant only with respect to non-built-in relations. Definition 31 (Read-Once Datalog).
Let P be a (linear, symmetric) Datalog program that defines a class of structures C. If for every critical and split-minimal element of C there is a P-derivation that is read-once, then we say that P is read-once. Definition 32 (Read-Once mnBP1). A monotone nondeterministic branching program (mnBP) H with variables X = {x1 , . . . , xn } computes a Boolean function fH : {0, 1}n \u2192 {0, 1}. H is a directed graph with distinguished nodes s and t and some arcs are labeled with variables from X (not all arcs must be labeled). An assignment \u03c3 to the variables in X defines a subgraph H\u03c3 of H as follows: an arc a belongs to H\u03c3 if \u03c3(x) = 1, where x is the label of a, or if a has no label. The function fH is defined as fH (\u03c3) = 1 if and only if there is a directed path in H\u03c3 from s to t (an accepting path). The size of an mnBP is |VH |. Let \u03c4 be a vocabulary and n \u2265 1. We assume without loss of generality that any relational structure whose domain has size n has domain {1, 2, . . . , n}. Let (R1 , t1 ), (R2 , t2 ), . . . , (Rq , tq ) be an enumeration of all pairs such that Ri \u2208 \u03c4 and ti \u2208 {1, 2, . . . , n}ar(Ri ) . We associate a variable xi with (Ri , ti ), for each i = 1, 2, . . . , q. Then if all labels of a branching program Hn are among x1 , x2 , . . . , xq , we say that Hn is over the vocabulary \u03c4 for input size n. We say that a family of branching programs F defines a class of \u03c4 -structures C, if for each n \u2265 1, F contains precisely one branching program Hn over \u03c4 for input size n such that fHn (x1 , x2 , . . . , xq ) = 1 if and only if the tuple structure with domain {1, 2, . . . , n} and containing precisely those pairs (Ri , ti ) for which xi = 1 is in C. Let F be a family of mnBP1s that contains precisely one branching program for each n \u2265 1. We say that F is a poly-size family if there is a polynomial p such that for each n \u2265 1, |V (Hn )| \u2264 p(n). 
Such a family is denoted by mnBP1(poly). If for every n and every 25 structure of domain size n in C, Hn contains an accepting path P such that any label on P is associated with at most one arc of P , then we say that F is read-once. (This read-once condition can be made a bit weaker.) 4.2 Examples We give some examples of problems definable by a 1-linDat(suc) program or by an mnBP1(poly). The program in Section 2.2, Figure 1 without rule 3 is a read-once linear Datalog(suc) program that defines the problem directed st-Conn. To see that this program Pst\u2212Conn is read-once, let G be any input that is accepted (we do not even need G to be critical and split-minimal). Then we find a directed path in E G connecting an element of S G to an element of T G without repeated edges. We build a Pst\u2212Conn -derivation for this path in the obvious way. For this section, by a clique we mean an ordinary undirected clique but each vertex may or may not have a self-loop. Let EvenCliques be the class of cliques of even size. The read-once linear Datalog(suc) program PEC below defines EvenCliques. The goal predicate of PEC is G2 , and E is the symbol for the edge relation of the input. The first part of PEC checks if the domain size n of the input is even. The second part goes through all pairs (x, y) \u2208 [n]2 , and at the same time, checks if (x, y) is an edge in E. This is achieved by accessing the order on the domain. Program PEC goes through every pair of vertices precisely once, so every PEC -derivation is read-once, and therefore PEC is read-once. I(y) \u2190 first(x) \u2227 suc(x, y) I(z) \u2190 I(x) \u2227 suc(x, y) \u2227 suc(y, z) G1 \u2190 I(x) \u2227 last(x) J(x, y) \u2190 G1 \u2227 first(x) \u2227 first(y) J(x, z) \u2190 J(x, y) \u2227 suc(y, z) \u2227 E(x, z) \u2227 E(z, x) J(z, w) \u2190 J(x, y) \u2227 last(y) \u2227 suc(x, z) \u2227 suc(z, w)\u2227 E(z, w) \u2227 E(w, z) G2 \u2190 J(x, y) \u2227 suc(x, y) \u2227 last(y). 
Figure 10: The read-once linear Datalog(suc) program PEC for EvenCliques. In fact, we can easily test much more complicated arithmetic properties than the property of being even (e.g., being a power of k) with a 1-linDat(suc) program. However, linear Datalog cannot define any set of cliques with a non-trivial domain size property in the following sense. Let K be a clique of size n, and K\u2032 be the clique obtained by identifying any two vertices of K. Then K homomorphically maps to K\u2032 , and therefore if a linear Datalog program accepts K, then it also accepts K\u2032 . Therefore EvenCliques or, in fact, any set of cliques that contains a clique of size n but no clique of size n \u2212 1 cannot be defined by a linear Datalog program. Since it is not difficult to convert a 1-linDat(suc) program into an mnBP1(poly), the aforementioned problems can also be defined with an mnBP1(poly). The additional power the successor relation gives to 1-linDat is at least twofold. For example, read-once linear Datalog(suc) can do some arithmetic, as demonstrated above. In addition, let us define the density of a graph to be the number of edges divided by the number of vertices. The density of an n-clique is n^2/n = \u03b8(n). As demonstrated above, access to an order allows read-once linear Datalog(suc) to accept only structures of linear density. On the other hand, any linear Datalog program P accepts structures of arbitrarily low density. Indeed, let S be a structure accepted by P. Then adding sufficiently many new elements to the domain of S yields a structure S\u2032 whose density is arbitrarily close to 0, and S\u2032 is still accepted by P. One consequence of Corollary 34 is that if a read-once linear Datalog(suc) program defines co-CSP(B), then both aforementioned additional abilities are of no use. 4.3 Main Results We begin with stating the results for 1-linDat(suc) and poly-size families of mnBP1s discussed in the Introduction. Theorem 33.
Let C be a homomorphism-closed class of successor \u03c4 -structures. If C can be defined by a 1-linDat(suc) program of width (j, k), then every critical and split-minimal element of C has a (j, k + j)-path decomposition. Corollary 34. If co-CSP(B) can be defined by a 1-linDat(suc) program of width (j, k), then co-CSP(B) can also be defined by a linear Datalog program of width (j, k + j). Theorem 35. Let C be a homomorphism-closed class of successor \u03c4 -structures. If C can be defined by a family of mnBP1s of size O(nj ), then every critical and split-minimal element of C has a (j, r + j)-path decomposition, where r is the maximum arity of the symbols in \u03c4 . Corollary 36. If co-CSP(B) can be defined by a family of mnBP1s of size O(nj ), then co-CSP(B) can also be defined by a linear Datalog program of width (j, r + j), where r is the maximum arity of the relation symbols in the vocabulary of B. As discussed before, a wide class of CSPs\u2013CSPs whose associated variety admits the unary, affine or semilattice types\u2013does not have bounded pathwidth duality [20]. It follows that all these CSPs are not definable by any 1-linDat(suc) program, or with any mnBP1 of poly-size. An example of such a CSP is the P-complete CSP Horn-3Sat. After some definitions, we give a high-level description of the proof of Theorem 33. Any \u03c4 -structure M with domain size n can be naturally converted into an isomorphic successor structure M(\u03c0), where \u03c0 is a bijective function \u03c0 : M \u2192 {1, . . . , n}. We define the domain M (\u03c0) as {1, . . . , n} (note that this automatically defines firstM\u03c0 , lastM\u03c0 and sucM\u03c0 ) and for any R \u2208 \u03c4 , and (t1 , . . . , tar(R) ) \u2208 RM , we place the tuple (\u03c0(t1 ), . . . , \u03c0(tar(R) )) into RM\u03c0 ). When we want to emphasize that a structure under consideration is a successor \u03c4 -structure, we use the subscript s, for example Ms . 
Given a successor \u03c4 -structure Ms , M denotes the structure Ms but with the relations firstMs , lastMs and sucMs removed. We make the simple but important observation that we are interested only in isomorphism-closed classes. For example, co-CSP(B) is obviously isomorphism-closed. We will crucially use the fact that if Ms is accepted by a 1-linDat(suc) program P, then P must also accept M(\u03c0) for any bijective function \u03c0. We are ready to describe the intuition behind the proof of Theorem 33. A 1-linDat(suc) program that ensures that the class of successor-structures C it defines is homomorphism-closed (and therefore isomorphism-closed) does not have enough \u201cmemory\u201d \u2013 due to its restricted width \u2013 to also ensure that some key structures in C are \u201cwell-connected\u201d. If these key structures are not too connected, then we can define co-CSP(B) in linear Datalog. The more detailed proof plan is the following. Assume that co-CSP(B), where the input is a successor structure, is defined by a 1-linDat(suc) program P of width (j, k). We choose a \u201cminimal\u201d structure M in C that is accepted, and assume for contradiction that M does not have width (j, k). Then roughly speaking, for all possible \u201cpermutations of the domain elements of M\u201d, M must be accepted; therefore for each of these isomorphic structures, P must be able to provide a derivation. Because this procedure will provide sufficiently many derivations, we will be able to find some derivations which are of a desired form. The identification of these \u201cgood\u201d derivations also crucially uses the generalized Erd\u0151s-Ko-Rado theorem. Once these derivations are detected, they can be combined to produce a derivation that \u201cencodes\u201d a structure of bounded pathwidth. The structures of bounded pathwidth produced this way can be used to define co-CSP(B) in linear Datalog. We give the formal proofs.
We need the following additional definitions related to linear Datalog. In addition to extracting Ex(D) from D, we can also extract a decomposition of Ex(D) reminiscent of a \u02dc by adding (R, t) to path decomposition. For each \u2208 [q], we define a tuple structure B \u02dc \u02dc the -th bag, and B if R(t) appears in \u03c1 . In such a representation of Ex(D), we call B \u02dc 1, . . . , B \u02dc q ) the tuple distribution of Ex(D). It will be useful to remove empty bags from the (B \u02dc 1, . . . , B \u02dc q ) to obtain the sequence (B \u02dc i1 , . . . , B \u02dc it ), where i < i if < . For list of bags (B \u02dc i1 , B \u02dc i2 . . . . , B \u02dc it ) to (B \u02dc 1, B \u02dc 2, . . . , B \u02dc t ). We call the sequence simpler notation, we renumber (B \u02dc 1, . . . , B \u02dc t ) the pruned tuple distribution of D. The following is easy to prove. (B Proposition 37. Let A be a \u03c4 -structure obtained from a \u03c4 -structure A by applying a sequence of split operations. Then A \u2192 A. We recall the following theorem tailored a bit to our needs. Theorem 38 (Erd\u02ddos-Ko-Rado, general case; see, e.g., [14]). Suppose that F is a family of s-subsets of {1, . . . , n}, where n \u2265 n0 (s, j + 1). Suppose that for any two sets S1 , S2 \u2208 F, |S1 \u2229 S2 | \u2265 j + 1. Then |F| \u2264 n\u2212(j+1) = O(ns\u2212(j+1) ). s\u2212(j+1) Proof of Theorem 33. Let the read-once linear Datalog(suc) program that defines C be P. Let M be a structure in C such that M is critical and split-minimal, but assume for contradiction that M has no (j, k)-path decomposition. Suppose that M = {m1 , . . . , ms }. We choose a large enough n divisible by s (for convenience): how large n should be will become clear later. We begin with constructing a class of successor structures from M. Let \u03d5 : M \u2192 {1, . . . , n} be a function that for all i \u2208 [s], maps mi to one of the numbers in (i \u2212 1) \u00b7 ns + 1, . . . , i \u00b7 ns . 
We call such a function an embedder. Observe that there are ( ns )s possible embedder functions. For each embedder \u03d5, we define a successor structure M\u03d5 as follows. M\u03d5 is obtained from M by renaming mi to \u03d5(mi ) for each i \u2208 [s], and adding all numbers inside {1, . . . , n} but not in the range of \u03d5 to the domain of the structure. 28 Obviously for any embedder \u03d5, M\u03d5 contains an isomorphic copy of M, and therefore M \u2192 M\u03d5 . Since C is closed under homomorphisms (and successor-invariant), it follows that for any embedder \u03d5, M\u03d5 is accepted by P. Our goal now is to show that P accepts a structure that can be obtained from M by applying a nonempty sequence of split operations. This would contradict the split-minimality of M with respect to C. Let \u03d51 , . . . , \u03d5t be an enumeration of all t = ( ns )s embedders, and M\u03d51 , . . . , M\u03d5t the corresponding successor structures. Since P is read-once, we can assume that for each i \u2208 [t], there is a read-once P-derivation for M\u03d5i : D(M\u03d5i ) = (\u03c1i1 , \u03bbi1 ), . . . , (\u03c1iqi , \u03bbiqi ). \u02dci,...,B \u02dc i ). Let \u03c8i (B \u02dci,...,B \u02dci ) For each D(M\u03d5i ) we denote its pruned tuple distribution as (B 1 wi 1 wi \u02dc i ), where M \u02dc i for each \u2208 [wi ] is obtained as follows. For every (R, t) \u2208 \u02dc i,...,M denote (M wi 1 \u02dc i , place (R, \u03d5\u22121 (t)) into M \u02dc i . We call \u03c8i (B \u02dci,...,B \u02dc i ) the prototype of (B \u02dci,...,B \u02dc i ). We B 1 wi 1 wi i \u02dci,...,B \u02dc i ) and (B \u02dci ,...,B \u02dc i ) are similar if they say that two pruned tuple distributions (B 1 wi 1 wi i i i i \u02dc \u02dc \u02dc \u02dc have the same prototypes, i.e., \u03c8i (B1 , . . . , Bwi ) = \u03c8i (B1 , . . . , Bwi ). 
Note that the codomain of \u03c8i , for any i, is a sequence S of bags such that a bag contains \u02dc Because by definition, every bag in S is nonempty, and D(M\u03d5 ) is readelements of M. i \u02dc once, we have that |S| \u2264 |M|. Therefore the number of possible bag sequences can be upper-bounded by a function of s; let this upper bound be cs . It follows that there must be \u02dc ) \u02dc ,...,B at least t = cts embedders \u03d5i1 , . . . , \u03d5it such that for any , \u2208 {i1 , i2 , . . . , it }, (B w 1 \u02dc ,...,B \u02dc ) are similar. Let the common prototype of all these similar pruned tuple and (B 1 w \u02dc 1, . . . , M \u02dc w ) (i.e., \u03c8i1 (B \u02dc i1 , . . . , B \u02dc i1 )). Because M \u02dc is critical, it follows distributions be (M 1 wi1 \u02dc =M \u02dc 1 \u222a \u00b7\u00b7\u00b7 \u222a M \u02dc w 5. that M To give a heads-up to the reader, our goal now is to construct a derivation D using the derivations D(M\u03d5i1 ), D(M\u03d5i2 ), . . . , D(M\u03d5i ), such that Ex(D ) is isomorphic to a structure t \u02dc that can be obtained from M \u02dc by a nonempty sequence of split operations. Because M \u02dc M is split-minimal, this contradiction will complete the proof. \u02dc1 \u222a \u00b7 \u00b7 \u00b7 \u222a M \u02dc g , and Yg = M \u02dcg \u222a \u00b7 \u00b7 \u00b7 \u222a M \u02dc w for g \u2208 [w]. If there is no g \u2208 [w \u2212 1] Define Xg = M such that |Xg \u2229 Yg+1 | > j, then we construct a (j, k + j)-path decomposition S1 , . . . , Sw for \u02dc 1 , Sw = M \u02dc w , and S = M \u02dc \u222a (X \u22121 \u2229 Y +1 ), for 2 \u2264 \u2264 w \u2212 1. M as follows. Define S1 = M The first condition of Definition 3 is obviously satisfied. For the second condition, take Si \u02dc i and a \u2208 M \u02dc i for some i \u2264 i and i \u2264 i , and Si and i < < i . If a \u2208 Si \u2229 Si then a \u2208 M so a \u2208 S . For the first part of the third condition observe that because P has width (j, k), \u02dc | \u2264 k. 
Because we added at most j new elements to M \u02dc to obtain S , |S | \u2264 k + j for |M any . For the second part of the third condition, observe that S \u2286 X and S +1 \u2286 Y +1 , so |S \u2229 S +1 | \u2264 j for any . For the other case, suppose that for some g, |Xg \u2229 Yg+1 | > j. Recall that for each \u02dc g was constructed from the bag B \u02dc , and B \u02dc was constructed from a \u2208 {i1 , i2 , . . . , it }, M g g rule \u03c1g for some g , i.e., the g -th rule in the derivation D(M\u03d5 ) = (\u03c11 , \u03bb1 ), . . . , (\u03c1q , \u03bbq ). Let \u03b9 be the number of IDBs of P and \u03ba the maximum arity of any IDB of P. Recall that \u02dc is critical and C is homomorphism closed, M \u02dc cannot contain isolated elements Note that because M \u02dc except when M is a structure with a single element and no tuples. In this case the only critical and split\u02dc and the empty set is a (0, 0)-path decomposition for M. \u02dc minimal element is M 5 29 since P has width (j, k), any IDB contains at most j variables. Assume that the head IDB of \u03c1g is Ig (x g ). Then there are at most \u03b9j \u03ba nj possibilities for the head IDB Ig together with its variables instantiated to numbers in [n]. This means that there is an IDB I and a tuple t such that for at least t = \u03b9j \u03bat nj values of \u2208 {i1 , i2 , . . . , it }, it holds that Ig = I, and \u03bbg (x g ) = t. Let these t values be { 1 , . . . , t }. We establish later that we can choose values a , b \u2208 { 1 , . . . , t } such that the following inequality holds: \u02dc b \u2264 j. \u02dca \u2229 B \u02dc1b \u222a \u00b7 \u00b7 \u00b7 \u222a B \u02dc1a \u222a \u00b7 \u00b7 \u00b7 \u222a B B w w Assuming that we have such a and b, (5) we define D as: (\u03c11a , \u03bb1a ), . . . , (\u03c1gaa , \u03bbgaa ), (\u03c1gb +1 , \u03bbg b +1 ), . . . , (\u03c1q b b b b , \u03bbqb ). 
b That is, we \u201ccut\u201d the derivations D(M\u03d5 a ) at the g a -th rule, and cut the derivation D(M\u03d5 b ) at the g b -th rule, and concatenate the first part of D(M\u03d5 a ) with the second part of D(M\u03d5 b ). D is a valid derivation because at the point of concatenation, the head IDB of \u03c1gaa is the same as the IDB in the body of \u03c1gb +1 , and the variables of this IDB are instantib ated to the same values in both rules. Observe that the pruned tuple distribution of D is \u02dc b \u222a \u00b7\u00b7\u00b7 \u222a B \u02dc b. \u02dc =B \u02dc a \u222a \u00b7\u00b7\u00b7 \u222a B \u02dc a \u222aB \u02dc b ,...,B \u02dc b ). Set B \u02dc a, . . . , B \u02dc a, B (B g+1 1 g+1 1 w g w g \u02dc is isomorphic to a structure that can be obtained from M \u02dc by a nonempty sequence Claim. B of split operations. \u02dc 1 \u222a \u00b7\u00b7\u00b7 \u222a M \u02dc g of M \u02dc is isomorphic to B \u02dc a\u222a Proof of Claim. Observe that the substructure M 1 b \u02dc g+1 \u222a \u00b7 \u00b7 \u00b7 \u222a M \u02dc w is isomorphic to B \u02dc \u02dc b \u02dc a through \u03d5 a . Similarly, M \u00b7\u00b7\u00b7\u222aB g+1 \u222a \u00b7 \u00b7 \u00b7 \u222a Bw through g \u02dc and B. \u02dc \u03d5 b . Our goal is to understand the difference between M Notice that because any embedder maps mi \u2208 M into the interval (i \u2212 1) \u00b7 ns + 1, . . . , i \u00b7 ns , and for any i = i , (i \u2212 1) \u00b7 ns + 1, . . . , i \u00b7 ns \u2229 (i \u2212 1) \u00b7 ns + 1, . . . , i \u00b7 ns = \u2205, if i = i , then \u03d5 a (mi ) = \u03d5 b (mi ). Therefore \u03d5 a and \u03d5 b can return the same value only if they both \u02dc where get the same input. The set Xg \u2229 Yg+1 can be thought of as those elements of M \u02dc 1 \u222a\u00b7 \u00b7 \u00b7\u222a M \u02dc g and M \u02dc g+1 \u222a\u00b7 \u00b7 \u00b7\u222a M \u02dc w are \u201cglued together\u201d to obtain M. 
\u02dc Let U = B \u02dc1a \u222a\u00b7 \u00b7 \u00b7\u222a B \u02dc ga M b \u02dc where \u02dcg+1 \u02dcwb . The set U \u2229 V can be thought of as those elements of B and V = B \u222a \u00b7\u00b7\u00b7 \u222a B \u02dc a \u222a \u00b7\u00b7\u00b7 \u222a B \u02dc a and B \u02dc b \u222a \u00b7\u00b7\u00b7 \u222a B \u02dc b are \u201cglued together\u201d to obtain B. \u02dc B 1 g+1 g w \u02dc would be isomorphic to M, \u02dc If for all elements m \u2208 Xg \u2229 Yg+1 , \u03d5 a (m) = \u03d5 b (m), then B \u02dc a \u222a\u00b7 \u00b7 \u00b7\u222aB \u02dc a would be glued to B \u02dc b \u222a\u00b7 \u00b7 \u00b7\u222aB \u02dc b to obtain B \u02dc the same way as M \u02dc 1 \u222a\u00b7 \u00b7 \u00b7\u222aM \u02dcg i.e., B 1 g+1 g w \u02dc g+1 \u222a\u00b7 \u00b7 \u00b7\u222a M \u02dc w to obtain M. \u02dc But by Inequality 5, |Xg \u2229Yg+1 | > |U \u2229V |. In other is glued to M words, there are some elements m \u2208 Xg \u2229Yg+1 which have one copy for \u03d5 a , and another copy \u02dc Identifying \u03d5 a (m) and \u03d5 (m) for all such m would convert B \u02dc to a structure for \u03d5 b in B. b \u02dc Now it is easy to see that going backwards, splitting elements of M \u02dc would isomorphic to M. \u02dc yield a structure isomorphic to B. It remains to show why we can choose a and b to satisfy Inequality 5. Note that ( n )s \u02dc \u222a \u00b7\u00b7\u00b7 \u222a B \u02dc is an st = cs \u03b9js \u03ba nj \u2265 \u2126(ns\u2212j ). Also note that for any \u2208 { 1 , . . . , t }, B 1 w a \u02dc \u02dcwa \u2229 subset of [n]. So by Theorem 38, if for every pair a , b \u2208 { 1 , . . . , t }, B1 \u222a \u00b7 \u00b7 \u00b7 \u222a B 30 \u02dcwb \u2265 j + 1, then t \u2264 O(ns\u2212j\u22121) ). But as observed t \u2265 \u2126(ns\u2212j ), so for a \u02dc1b \u222a \u00b7 \u00b7 \u00b7 \u222a B B large enough n (as a function of s,j, \u03b9 and \u03ba, so n can be chosen in advance) Inequality 5 must hold for some a , b \u2208 { 1 , . . . , t }. 
Proof of Corollary 34. Let O = co-CSP(B), i.e., the set of all those successor structures that do not homomorphically map to B. We construct an obstruction set O\u2032 for B such that every structure in O\u2032 has pathwidth (j, k + j). O\u2032 is the set of all critical and split minimal structures of O. Theorem 33 tells us that every structure in O\u2032 has a (j, k + j)-path decomposition. To see that O\u2032 is an obstruction set for B, take any structure S \u2208 co-CSP(B) = O. Keep on applying split operations to S and taking substructures of S (again, these operations are with respect to non-built-in relations only), as long as the resulting structure is still in O, and call the final structure S\u2032. That is, if we apply any split operation to S\u2032, or if we take any substructure of it, then the resulting structure is not in O any more. Then S\u2032 \u2208 O\u2032 because S\u2032 is critical and split minimal with respect to O. Using Proposition 37, we also see that S\u2032 \u2192 S. Because O\u2032 is an obstruction set for B such that every structure in O\u2032 has width (j, k + j), it follows from results of Dalmau in [7] that co-CSP(B) is definable in linear (j, k + j)-Datalog. These proofs can be adapted for mnBP1s to obtain Theorem 35 and Corollary 36. Acknowledgement We thank Benoit Larose and Pascal Tesson for useful discussions and comments on an earlier draft. We also thank the anonymous referees for their helpful comments. References [1] F. Afrati and S. S. Cosmadakis. Expressiveness of restricted recursive queries. In Proceedings of the 21st ACM Symposium on Theory of Computing (STOC), pages 113\u2013126, 1989. [2] E. Allender, M. Bauland, N. Immerman, H. Schnoor, and H. Vollmer. The complexity of satisfiability problems: Refining Schaefer\u2019s theorem. Journal of Computer and System Sciences, 75(4):245\u2013254, 2009. [3] L. Barto and M. Kozik. Constraint satisfaction problems of bounded width. In Proceedings of The 50th Annual Symposium on Foundations of Computer Science (FOCS), 2009. [4] A. A. Bulatov, A. A. Krokhin, and B. Larose.
Dualities for constraint satisfaction problems. In N. Creignou, P. G. Kolaitis, and H. Vollmer, editors, Complexity of Constraints, volume 5250 of Lecture Notes in Computer Science, pages 93\u2013124. Springer, 2008. [5] S. Burris and H. P. Sankappanavar. A Course in Universal Algebra. Number 78 in Graduate Texts in Mathematics. Springer-Verlag, 1981. 31 [6] C. Carvalho, L. Egri, M. Jackson, and T. Niven. On Maltsev digraphs. In Proceedings of the 6th International Computer Science Symposium in Russia (CSR), pages 181\u2013194, 2011. [7] V. Dalmau. Constraint satisfaction problems in non-deterministic logarithmic space. In Proceedings of the 29th International Colloquium on Automata, Languages and Programming, ICALP, pages 414\u2013425. Springer-Verlag, 2002. [8] V. Dalmau and A. Krokhin. Majority constraints have bounded pathwidth duality. European Journal of Combinatorics, 29(4):821\u2013837, 2008. [9] V. Dalmau and B. Larose. Maltsev + Datalog \u2192 symmetric Datalog. In IEEE Symposium on Logic in Computer Science (LICS), pages 297\u2013306, 2008. [10] L. Egri, A. A. Krokhin, B. Larose, and P. Tesson. The complexity of the list homomorphism problem for graphs. Theory of Computing Systems, 51(2):143\u2013178, 2012. [11] L. Egri, B. Larose, and P. Tesson. Symmetric Datalog and constraint satisfaction problems in logspace. In IEEE Symposium on Logic in Computer Science (LICS), pages 193\u2013202, 2007. [12] T. Feder. Classification of homomorphisms to oriented cycles and of k-partite satisfiability. SIAM Journal on Discrete Mathematics, 14(4):471\u2013480, 2001. [13] T. Feder and M. Y. Vardi. The computational structure of monotone monadic SNP and constraint satisfaction: A study through Datalog and group theory. SIAM Journal on Computing, 28(1):57\u2013104, 1999. [14] P. Frankl and R. L. Graham. Old and new proofs of the Erd\u00a8os-Ko-Rado Theorem. Journal of Sichuan University Natural Science Edition, 26, 1989. [15] E. Gr\u00a8adel. 
Capturing complexity classes by fragments of second-order logic. Theoretical Computer Science, 101(1):35\u201357, 1992. [16] R. H\u00a8aggkvist, P. Hell, D. J. Miller, and V. Neumann-Lara. On multiplicative graphs and the product conjecture. Combinatorica, 8:63\u201374, 1988. [17] P. Hell and X. Zhu. Homomorphisms to oriented paths. Discrete Mathematics, 132:107\u2013 114, 1994. [18] D. Hobby and R. McKenzie. The Structure of Finite Algebras, volume 76 of Contemporary Mathematics. American Mathematical Society, Providence, R.I., 1988. [19] N. Immerman. Descriptive complexity. Graduate Texts in Computer Science. Springer, 1999. [20] B. Larose and P. Tesson. Universal algebra and hardness results for constraint satisfaction problems. Theoretical Computer Science, 410(18):1629\u20131647, 2009. 32 [21] B. Larose and L. Z\u00b4adori. Bounded width problems and algebras. Algebra Universalis, 56(3-4):439\u2013466, 2007. [22] L. Libkin. Elements of finite model theory. Springer, 2004. [23] A. F. Pixley. Distributivity and permutability of congruence relations in equational classes of algebras. Proceedings of the American Mathematical Society (AMC), 14:105\u2013 109, 1963. [24] T. Schaefer. The complexity of satisfiability problems. In Proceedings of the 10th ACM Symposium on Theory of Computing (STOC), pages 216\u2013226, 1978. [25] I. Wegener. Branching programs and binary decision diagrams: theory and applications. Society for Industrial and Applied Mathematics (SIAM), Philadelphia, PA, USA, 2000. 33"} \ No newline at end of file +{"id":"oai:arXiv.org:1312.4428","text":"On Constraint Satisfaction Problems below P\u2217 arXiv:1312.4428v2 [cs.CC] 17 Dec 2013 L\u00b4aszl\u00b4o Egri\u2020 Abstract Symmetric Datalog, a fragment of the logic programming language Datalog, is conjectured to capture all constraint satisfaction problems (CSP) in L. Therefore developing tools that help us understand whether or not a CSP can be defined in symmetric Datalog is an important task. 
It is widely known that a CSP is definable in Datalog and linear Datalog if and only if that CSP has bounded treewidth and bounded pathwidth duality, respectively. In the case of symmetric Datalog, Bulatov, Krokhin and Larose ask for such a duality (2008). We provide two such dualities, and give applications. In particular, we give a short and simple new proof of the result of Dalmau and Larose that \u201cMaltsev + Datalog \u21d2 symmetric Datalog\u201d (2008). In the second part of the paper, we provide some evidence for the conjecture of Dalmau (2002) that every CSP in NL is definable in linear Datalog. Our results also show that a wide class of CSPs\u2013CSPs which do not have bounded pathwidth duality (e.g., the P-complete Horn-3Sat problem)\u2013cannot be defined by any polynomial size family of monotone read-once nondeterministic branching programs. 1 Introduction Constraint satisfaction problems (CSP) constitute a unifying framework to study various computational problems arising naturally in various branches of computer science, including artificial intelligence, graph homomorphisms, and database theory. Loosely speaking, an instance of a CSP consists of a list of variables and a set of constraints, each specified by an ordered tuple of variables and a constraint relation over some specified domain. The goal is then to determine whether variables can be assigned domain values such that all constraints are simultaneously satisfied. Recent efforts have been directed at classifying the complexity of the so-called nonuniform CSP. For a fixed finite set of finite relations \u0393, CSP(\u0393) denotes the nonuniform CSP corresponding to \u0393. The difference between an instance of CSP(\u0393) and an instance of the general CSP is that constraints in an instance of CSP(\u0393) take the form (xi1 , . . . , xik ) \u2208 R for some R \u2208 \u0393. Examples of nonuniform CSPs include k-Sat, Horn-3Sat, Graph H-Coloring, and many others. 
\u2217 Research supported by NSERC, FQRNT, and ERC Starting Grant PARAMTIGHT (No. 280152). Institute for Computer Science and Control, Hungarian Academy of Sciences (MTA SZTAKI), Budapest, Hungary. {laszlo.egri@mail.mcgill.ca} \u2020 1 For a relational structure B, the homomorphism problem HOM(B) takes a structure A as input, and the task is to determine if there is a homomorphism from A to B. For instance, consider structures that contain a single symmetric binary relation, i.e., graphs. A homomorphism from a graph G to a graph H is a mapping from VG to VH such that any edge of G is mapped to an edge of H. If H is a graph with a single edge then HOM(H) is the set of graphs which are two-colorable. There is a well-known and straightforward correspondence between the CSP and the homomorphism problem. For this reason, from now on we work only with the homomorphism problem instead of the CSP. Nevertheless, we call HOM(B) a CSP and we also write CSP(B) instead of HOM(B), as it is often done in the literature. The CSP is of course NP-complete, and therefore research has focused on identifying \u201cislands\u201d of tractable CSPs. The well-known CSP dichotomy conjecture of Feder and Vardi [13] states that every CSP is either tractable or NP-complete, and progress towards this conjecture has been steady during the last fifteen years. From a complexity-theoretic perspective, the classification of CSP(B) as in P or being NP-complete is rather coarse and therefore somewhat dissatisfactory. Consequently, understanding the fine-grained complexity of CSPs gained considerable attention during the last few years. Ultimately, one would like to know the precise complexity of a CSP lying in P, i.e., to identify a \u201cstandard\u201d complexity class for which a given CSP is complete. 
Towards this, it was established that Schaefer\u2019s P \u2212 NP dichotomy for Boolean CSPs [24] can indeed be refined: each CSP over the Boolean domain is either definable in first order logic, or complete for one of the classes L, NL, \u2295L, P or NP under AC0 -reductions [2]. The question whether some form of this fine-grained classification extends to non-Boolean domains is rather natural. The two most important tools to study CSPs whose complexity is below P are symmetric Datalog and linear Datalog, syntactic restrictions of the database-inspired logic programming language Datalog. We say that co-CSP(B)\u2013the complement of CSP(B)\u2013is definable in (linear, symmetric) Datalog if the set of structures that do not homomorphically map to B is accepted by a (linear, symmetric) Datalog program.1 Symmetric Datalog programs can be evaluated in logarithmic space (L), and in fact, it is conjectured that if co-CSP(B) is in L then it can also be defined in symmetric Datalog [11]. There is a considerable amount of evidence supporting this conjecture (see, for example, [11, 10, 9, 20, 6]), and therefore providing tools to show whether co-CSP(B) can be defined in symmetric Datalog is an important task. It is well known and easy to see that for any structure B, there is a set of structures O, called an obstruction set, such that a structure A homomorphically maps to B if and only if there is no structure in O that homomorphically maps to A. In fact, there are many possible obstruction sets for any structure B. We say that B has duality X, if B has an obstruction set which has the special property X. The following two well-known theorems relate definability of co-CSP(B) in Datalog and linear Datalog to B having bounded treewidth and bounded pathwidth duality, respectively: 1. co-CSP(B) is definable in Datalog if and only if B has bounded treewidth duality [13]; 2. co-CSP(B) is definable in linear Datalog if and only if B has bounded pathwidth duality [7]. 
1 The reason we define co-CSP(B) instead of CSP(B) in (linear, symmetric) Datalog is a technicality explained in Section 2.5. 2 It was stated as an open problem in [4] to find a duality for symmetric Datalog in the spirit of the previous two theorems. We provide two such dualities: symmetric bounded pathwidth duality (SBPD) and piecewise symmetric bounded pathwidth duality (PSBPD). We note that SBPD is a special case of PSBPD. For both bounded treewidth and bounded pathwidth duality, the structures in the obstruction sets are restricted to have some special form. For SBPD and PSBPD the situation is a bit more subtle. In addition that we require the obstruction sets to contain structures only of a special form (they must have bounded pathwidth), the obstruction sets must also possess a certain \u201csymmetric closure\u201d property. To the best of our knowledge, this is the first instance of a duality where in addition to the local requirement that each structure must be of a certain form, the set must also satisfy an interesting global requirement. Using SBPD, we give a short and simple new proof of the main result of [9] that \u201cMaltsev + Datalog \u21d2 symmetric Datalog\u201d. Considering the simplicity of this proof, we suspect that SBPD (or PSBPD) could be a useful tool in an attempt to prove the algebraic symmetric Datalog conjecture [20], a conjecture that proposes an algebraic characterization of all CSPs lying in L. An equivalent form of this conjecture is that \u201cDatalog + n-permutability \u21d2 symmetric Datalog\u201d (by combining results from [18, 3, 21]), where n-permutability is a generalization of Maltsev. One way to gain more insight into the dividing line between CSPs in L and NL is through studying the complexity of CSPs corresponding to oriented paths. It is known that all these CSPs are in NL (by combining results from [12, 8, 7]), and it is natural to ask whether there are oriented paths for which the CSP is NL-complete and L-complete. 
We provide two classes of oriented paths, C1 and C2 , such that for any B1 \u2208 C1 , the corresponding CSP is NL-complete, and for any B2 \u2208 C2 , the corresponding CSP is in L. In fact, it can be seen with the help of [20] that for most B2 \u2208 C2 , CSP(B2 ) is L-complete. To prove the membership of CSP(B2 ) in L (for B2 \u2208 C2 ), we use PSBPD in an essential way. One can hope to build on this work to achieve an L-NL dichotomy for oriented paths. In the second part of the paper, we investigate CSPs in NL. Based on the observation that any CSP known to be in NL is also known to be definable by a linear Datalog program, Dalmau conjectured that every CSP in NL can be defined by a linear Datalog program [7]. Linear Datalog(suc,\u00ac) (linDat(suc,\u00ac)) denotes the extension of linear Datalog in which we allow negation and access to an order over the domain of the input. It is known that any problem in NL can be defined by a linDat(suc,\u00ac) program [7, 15, 19], and therefore one way to prove the above conjecture would be to show that any CSP that can be defined by a linDat(suc,\u00ac) program can also be defined by a linear Datalog program. We consider a restriction of the conjecture because proving it in its full generality would separate NL from P (using [1]). Read-once linear Datalog(suc) (1-linDat(suc)) is a subclass of linDat(suc,\u00ac), but a subclass that has interesting computational abilities, and for which we are able to find the chink in the armor. We can easily define some NL-complete problems in 1-linDat(suc), such as the CSP directed st-connectivity (st-Conn), and also problems that are not homomorphismclosed, such as determining if the input graph is a clique on 2n vertices, n \u2265 1. Because any problem that can be defined with a linear Datalog program must be homomorphism closed, it follows that 1-linDat(suc) can define nontrivial problems which are in NL but which are not definable by any linear Datalog program. 
However, our main result shows that if co-CSP(B) 3 can be defined by a 1-linDat(suc) program, then co-CSP(B) can also be defined by a linear Datalog program. The crux of our argument applies the general case of the Erd\u02ddos-Ko-Rado theorem to show that a 1-linDat(suc) program does not have enough \u201cmemory\u201d to handle structures of unbounded pathwidth. Our proof establishing the above result for 1-linDat(suc) programs can be adapted to show a parallel result for a subclass of nondeterministic branching programs, which constitute an important and well-studied class of computational models (see the book [25]). More precisely, we show that if co-CSP(B) can be defined by a poly-size family of read-once2 monotone nondeterministic branching programs (mnBP1(poly)) then co-CSP(B) can also be defined by a linear Datalog program.3 Finally, our results can be interpreted as lower-bounds on a wide class of CSPs: if B does not have bounded pathwidth duality, then co-CSP(B) cannot be defined with any 1linDat(suc) program or with any mnBP1(poly). A specific example of such a CSP would be the P-complete Horn-3Sat problem, and more generally, Larose and Tesson showed that any CSP whose associated variety admits the unary, affine or semilattice types does not have bounded pathwidth duality (see [20] for details). 2 Preliminaries 2.1 Basic Definitions A vocabulary (or signature) is a finite set of relation symbols with associated arities. The arity function is denoted with ar(\u00b7). If A is a relational structure over a vocabulary \u03c4 , then RA denotes the relation of A associated with the symbol R \u2208 \u03c4 . The lightface equivalent of the name of the structure denotes the universe of the structure, e.g., the universe of A is A. 
\u02dc over a vocabulary \u03c4 is a pair (A, \u02dc T ): T is a set of pairs of the form A tuple structure A \u02dc i.e., A\u02dc contains (R, t), where R \u2208 \u03c4 and t is an ar(R)-tuple, and A\u02dc is the domain of A, every element that appears in some tuple t, and possibly some other elements. Slightly \u02dc to mean (R, t) \u2208 T , where A \u02dc = (A, \u02dc T ). Clearly, abusing notation, we write (R, t) \u2208 A tuple structures are equivalent to relational structures. If A is a relational structure, we \u02dc and vice versa. For convenience, we use the denote the equivalent tuple structure with A, two notations interchangeably. We note that all structures in this paper are finite. Let B be a structure of the same signature \u03c4 as A. The union A \u222a B of A and B is the \u03c4 -structure whose universe is A \u222a B, and for each R \u2208 \u03c4 , RA\u222aB is defined as RA \u222a RB . (Note that it is possible that A \u2229 B = \u2205.) A homomorphism from A to B is a map f from A to B such that f (RA ) \u2286 RB for each R \u2208 \u03c4 . If there exists a homomorphism from A to B, we f often denote it with A \u2192 B. If that homomorphism is f , we write A \u2212\u2192 B. A structure is called a core if it has no homomorphism to any of its proper substructures. A retract of a structure B is an induced substructure B of B such that there is a homomorphism g : B \u2192 B with g(b) = b for every b \u2208 B . A retract of B that has minimal size among 2 Our read-once restriction for nondeterministic branching programs is less stringent than the usual restriction because we require the programs to be read-once only on certain inputs. 3 A 1-linDat(suc) can be converted into an mnBP1(poly), so another way to present our results would be to do the proofs in the context of mnBP1s, and then to conclude the parallel result for 1-linDat(suc). 4 all retracts of B is called a core of B. 
It is well known that all cores of a structure are isomorphic, and so one speaks of the core of a structure B, core(B). We denote by CSP(B) the set {A | A is a \u03c4-structure such that A \u2192 B}, and by co-CSP(B) the complement of CSP(B), i.e., the set {A | A is a \u03c4-structure such that A \u219b B}. If we are given a class of \u03c4-structures C such that for any A \u2208 C, and any B such that A \u2192 B it holds that B \u2208 C, then we say that C is homomorphism-closed. Isomorphism closure is defined in a similar way. An n-ary operation on a set A is a map f : A^n \u2192 A. Given an h-ary relation R and an n-ary operation f on the same set A, we say that f preserves R or that R is invariant under f if the following holds: given any matrix M of size h \u00d7 n whose columns are in R, applying f to the rows of M produces an h-tuple in R. A polymorphism of a structure B is an operation f that preserves each relation in B. Definition 1 (Maltsev Operation). A ternary operation f : A^3 \u2192 A on a finite set A is called a Maltsev operation if it satisfies the following identities: f(x, y, y) = f(y, y, x) = x, \u2200x, y \u2208 A. 2.2 Datalog We provide only an informal introduction to Datalog and its fragments, and the reader can find more details, for example, in [22, 7, 11]. Datalog is a database-inspired query language whose connection with CSP-complexity is now relatively well understood (see, e.g., [3]). Let \u03c4 be some finite vocabulary. A Datalog program over \u03c4 is specified by a finite set of rules of the form h \u2190 b_1 \u2227 \u00b7 \u00b7 \u00b7 \u2227 b_t, where h and the b_i are atomic formulas R(x_1, . . . , x_k). When we specify the variables of an atomic formula, we always list the variables from left to right, or we simply provide a tuple x of variables whose i-th variable is x[i].
We distinguish two types of relational predicates occurring in a Datalog program: predicates that occur at least once in the head of a rule (i.e., its left-hand side) are called intensional database predicates (IDBs) and are not in \u03c4 . The predicates which occur only in the body of a rule (its right-hand side) are called extensional database predicates (EDBs) and must all lie in \u03c4 . A rule that contains no IDB in the body is called a nonrecursive rule, and a rule that contains at least one IDB in the body is called a recursive rule. A Datalog program contains a distinguished IDB of arity 0 which is called the goal predicate; a rule whose head IDB is a goal IDB is called a goal rule. Linear Datalog is a syntactic restriction of Datalog in which there is at most one IDB in the body of each rule. The class of linear Datalog programs that contains only rules with at most k variables and IDBs with at most j \u2264 k variables is denoted by linear (j, k)-Datalog. We say that the width of such a linear Datalog program is (j, k). Symmetric Datalog is a syntactic restriction of linear Datalog. A linear Datalog program \u00af P is symmetric if for any recursive rule I(x) \u2190 J(y) \u2227 E(z) of P (except for goal rules), \u00af where E(z) is a shorthand for the conjunction of the EDBs of the rule over variables in z, \u00af the symmetric pair J(y) \u2190 I(x) \u2227 E(z) of that rule is also in P. The width of a symmetric Datalog program is defined similarly to the width of a linear Datalog program. We explain the semantics of linear (symmetric) Datalog using derivations (it could also be explained with fixed point operators, but that would be inconvenient for the proofs). Let 5 P be a linear Datalog program with vocabulary \u03c4 . A P-derivation with codomain D is a sequence of pairs D = (\u03c11 , \u03bb1 ), . . . , (\u03c1q , \u03bbq ), where \u03c1 is a rule of P, and \u03bb is a function from the variables V of \u03c1 to D, \u2200 \u2208 [q]. 
The sequence D must satisfy the following properties. Rule \u03c11 is nonrecursive, and \u03c1q is a goal rule. For all \u2208 [q \u2212 1], the head IDB I of \u03c1 is the IDB in the body of \u03c1 +1 , and if the variables of I in the head of \u03c1 and the body of \u03c1 +1 are x and y, respectively, then \u03bb (x[i]) = \u03bb +1 (y[i]), \u2200i \u2208 [ar(I)]. Let D be a derivation. Let R(z) be an EDB (with variables z) appearing in some rule \u03c1 of D. We write R(t) to denote that \u03bb (z) = t, i.e., that \u03bb instantiates the variables of R(z) to t. If R(z) appears in some rule \u03c1 of D and \u03bb (z) = t, we say that R(t) appears in \u03c1 , or less specifically, that R(t) appears in D. Given a structure A and a derivation D with codomain A for a program P, we say that D \u02dc The notation is a derivation for A if for every R(t) that appears in a rule of D, (R, t) \u2208 A. for a P-derivation for a structure A will have the form DP (A). A linear (symmetric) Datalog program P accepts an input structure A if there exists a P-derivation for A. Definition 2 (Read-once Derivation). We say that a derivation D is read-once if every R(t) that appears in D appears exactly once in D, except when R is the special EDB suc, first, or last, defined in Section 4. An example is given in Figure 1. The vocabulary is \u03c4 = {E 2 , S 1 , T 1 }, where the superscripts denote the arity of the symbols. Notice that in the symmetric Datalog program P, rules (2) and (3) form a symmetric pair. It is not difficult to see that P accepts a \u03c4 -structure A if and only if there is an oriented path (see Section 3.1) in E A from an element in S A to an element in T A . 2.3 Path Decompositions and Derivations Definition 3. [Path Decomposition] Let S be a \u03c4 -structure. A (j, k)-path decomposition (or path decomposition of width (j, k)) of S is a sequence S0 , . . . , Sn\u22121 of subsets of S such that \u02dc \u2203 \u2208 {0, . . . , n \u2212 1} such that {a1 , . . . 
, aar(R) } \u2286 S ; 1. For every (R, (a1 , . . . , aar(R) )) \u2208 S, 2. If a \u2208 Si \u2229 Si (i < i ) then a \u2208 S for all i < < i ; 3. \u2200 \u2208 {0, . . . , n \u2212 1}, |S | \u2264 k, and \u2200 \u2208 {0, . . . , n \u2212 2}, |S \u2229 S +1 | \u2264 j. For ease of notation, it will be useful to introduce a concept closely related to path decompositions. Let \u03c4 be a vocabulary. Let S be a \u03c4 -structure that can be expressed as S = S0 \u222a \u00b7 \u00b7 \u00b7 \u222a Sn\u22121 , where the S0 , . . . , Sn\u22121 (the universes of the Si ) satisfy properties 2 and 3 above. We say that S is a (j, k)-path, and that (S0 , . . . , Sn\u22121 ) is a (j, k)-path representation of S. We denote (j, k)-path representations with script letters, e.g., S = (S0 , . . . , Sn\u22121 ). The substructure Si \u222a \u00b7 \u00b7 \u00b7 \u222a Si of S (assuming a (j, k)-representation is fixed) is denoted by S[i,i ] . We call n the length of the representation. Obviously, a structure is a (j, k)-path if and only if it admits a (j, k)-path decomposition. Let D = (\u03c11 , \u03bb1 ), . . . , (\u03c1q , \u03bbq ) be a derivation for some linear or symmetric program P with vocabulary \u03c4 . We can extract from D a \u03c4 -structure Ex(D) such that D is a derivation \u02dc for each R(t) that appears in D for Ex(D). We specify Ex(D) as a tuple structure A: 6 I(x) \u2190 S(x) I(y) \u2190 I(x) \u2227 E(x, y) I(x) \u2190 I(y) \u2227 E(x, y) G \u2190 I(x) \u2227 T (x) S G = {a} T G = {d} (1) (2) (3) (4) b f g e I(b) I(c) I(d) G E(a, b) E(c, b) E(c, d) T (d) \u03c12 \u03c13 \u03c14 \u03c15 \u03c11 \u03bb1 (x) = a c a I(a) S(a) d \u03bb2 (x) = a \u03bb3 (x) = c \u03bb2 (y) = b \u03bb3 (y) = b \u03bb4 (x) = c \u03bb4 (y) = d \u03bb5 (x) = d Figure 1: Top left: Symmetric Datalog program P. Top right: Input structure G where the binary relation E G is specified by the digraph. Bottom: Visualization of a P-derivation DP (G) = (\u03c11 , \u03bb1 ), . . . 
, (\u03c15 , \u03bb5 ) for G, where \u03c11 is the nonrecursive rule (1), \u03c12 , \u03c14 are rule (2), \u03c13 is rule (3), and \u03c15 is the goal rule (4). In the diagram, for example, the dashed box corresponds to rule \u03c12 , and it is the rule I(y) \u2190 I(x) \u2227 E(x, y) of P, where \u03bb2 assigns a to variable x and b to variable y. Observe that DP (G) is read-once. \u02dc and set A\u02dc to be the set of those elements that appear (R \u2208 \u03c4 ), we add the pair (R, t) to A, in a tuple. Let D = (\u03c11 , \u03bb1 ), . . . , (\u03c1q , \u03bbq ) be a derivation. For each x that is in a rule \u03c1 for some \u2208 [q], call x the indexed version of x. We define an equivalence relation Eq(D) on the set of indexed variables of D. First we define a graph G = (V, E) as: \u2022 V is the set of all indexed versions of variables in D; \u2022 (x , y ) \u2208 E if = + 1, x is the i-th variable of the head IDB I of \u03c1 , and y is the i-th variable of the body IDB I of \u03c1 +1 . Two indexed variables x and y are related in Eq(D) if they are connected in G. Observe that if C = {x11 , x22 , . . . , xcc } is a connected component of G, then it must be that \u03bb 1 (x1 ) = \u03bb 2 (x2 ) = \u00b7 \u00b7 \u00b7 = \u03bb c (xc ). Definition 4 (Free Derivation). Let P be a linear Datalog program and D = (\u03c10 , \u03bb0 ), . . . , (\u03c1q , \u03bbq ) be a derivation for P. Then D is said to be free if for any two (x , y ) \u2208 Eq(D), \u03bb (x) = \u03bb (y). Intuitively, this definition says that D is free if any two variables in D which are not \u201cforced\u201d to have the same value are assigned different values. 2.4 Canonical Programs Fix a \u03c4 -structure B and j \u2264 k. Let Q1 , . . . , Qn be all possible at most j-ary relations over B. The canonical linear (j, k)-Datalog program for B ((j, k)-CanL(B)) contains an IDB Im of 7 \u00af the same arity as Qm for each m \u2208 [n]. 
The rule I_c(x) \u2190 I_d(y) \u2227 \u0112(z) belongs to the canonical program if it contains at most k variables, and the implication Q_c(x) \u2190 Q_d(y) \u2227 \u0112(z) is true for all possible instantiations of the variables to elements of B. The goal predicate of this program is the 0-ary IDB I_g, where Q_g = \u2205. The canonical symmetric (j, k)-Datalog program for B ((j, k)-CanS(B)) has the same definition as (j, k)-CanL(B), except that it has fewer rules due to the following additional restriction. If I_c(x) \u2190 I_d(y) \u2227 \u0112(z) is in the program, then both Q_c(x) \u2190 Q_d(y) \u2227 \u0112(z) and Q_d(y) \u2190 Q_c(x) \u2227 \u0112(z) must hold for all possible instantiations of the variables to elements of B. The program (j, k)-CanS(B) is obviously symmetric. When it is clear from the context, we write CanL(B) and CanS(B) instead of (j, k)-CanL(B) and (j, k)-CanS(B), respectively. 2.5 Defining CSPs The following discussion applies not just to Datalog but also to its symmetric and linear fragments. It is easy to see that the class of structures accepted by a Datalog program is homomorphism-closed, and therefore it is not possible to define CSP(B) in Datalog. However, co-CSP(B) is closed under homomorphisms, and in fact, it is often possible to define co-CSP(B) in Datalog. The following definition is key. Definition 5 (Obstruction Set). A set O of \u03c4-structures is called an obstruction set for B, if for any \u03c4-structure A, A \u219b B if and only if there exists S \u2208 O such that S \u2192 A. In other words, an obstruction set defines co-CSP(B) implicitly as A \u2208 co-CSP(B) if and only if there exists S \u2208 O such that S \u2192 A. If O above can be chosen to have property X, then we say that B has X-duality. In the next section we show that co-CSP(B) is definable in symmetric Datalog if and only if B has symmetric bounded pathwidth duality.
3 3.1 On CSPs in symmetric Datalog Definitions An oriented path is a digraph obtained by orienting the edges of an undirected path. In other words, an oriented path has vertices v0 , . . . , vq+1 and edges e0 , . . . , eq , where ei is either (vi , vi+1 ), or (vi+1 , vi ). The length of an oriented path is the number of edges it contains. We call (vi , vi+1 ) a forward edge and (vi+1 , vi ) a backward edge. Oriented paths can be thought of as relational structures over the vocabulary {E 2 }, so we denote them with boldface letters. For an oriented path P, we can find a mapping level : P \u2192 {0, 1, 2, . . . } such that level(b) = level(a)+1 whenever (a, b) is an edge of P. Clearly, there is a unique such mapping with the smallest possible values. The level of an edge (a, b) of P is level(a), i.e., the level of the starting vertex of (a, b). The height(P) of an oriented path P is maxa\u2208P level(a). Let P be an oriented path that has a vertex u with indegree 0 and outdegree 1, and a vertex v with indegree 1 and outdegree 0. We say that P is minimal if u is in the bottommost level and v is in the topmost level, and there are no other vertices of P in the bottommost or the topmost levels. 8 A zigzag operator \u03be takes a (j, k)-path representation S = (S0 , . . . , Sn\u22121 ) of a (j, k)-path S and a minimal oriented path P = e0 , . . . , eq such that height(P) = n, and it returns another (j, k)-path \u03be(S , P). Intuitively, \u03be(S , P) is the (j, k)-path S \u201cmodulated\u201d by P such that the forward and backward edges ei of P are mimicked in \u03be(S , P) by \u201cforward and backward\u201d copies of Slevel(ei ) . Before the formal definition, it could help the reader to look at the right side of Figure 2, where the oriented path used to modulate the (j, k)-path over the vocabulary E 2 (i.e., digraphs) with representation (S0 , S1 , S2 ) is P on the left side. 
The left side is a more abstract example, and the reader might find it useful after reading the definition. We inductively define the (j, k)-path \u03be(S , P) as (Se0 , Se1 , . . . , Seq ) together with a sequence of isomorphisms \u03d5e0 , \u03d5e1 , . . . , \u03d5eq , where \u03d5ei is an isomorphism from Sei to Slevel(ei ) , 0 \u2264 i \u2264 q. For the base case, we define Se0 to be an isomorphic copy of S0 , and \u03d5e0 to be the isomorphism that maps Se0 back to S0 . Assume inductively that Se0 , . . . , Sei\u22121 and \u03d5e0 , . . . , \u03d5ei\u22121 are already defined. Let Sei be an isomorphic copy of Slevel(ei ) with domain disjoint from Se0 \u222a \u00b7 \u00b7 \u00b7 \u222a Sei\u22121 , and fix \u03d5ei to be the isomorphism that maps back Sei to Slevel(ei ) . We \u201cglue\u201d Sei to Sei\u22121 by renaming some elements of Sei to elements of Sei\u22121 . To facilitate understanding, we can think of the already constructed structures Se0 , . . . , Sei\u22121 as labels of the edges e0 , . . . , ei\u22121 of P, respectively, and we want to determine Sei , the label of the next edge. The connection between Sei\u22121 and Sei will be defined such that Sei\u22121 and Sei \u201cmimic\u201d the orientation of the edges ei\u22121 and ei . We resume our formal definition. Set = level(ei ), and let = \u2212 1 if ei is a forward edge, and = + 1 if ei is a backward edge. If an element x \u2208 Sei and an element y \u2208 Sei\u22121 are both copies of the same element a \u2208 S \u2229 S , then rename x to y in Sei . After all such (a) in Sei elements are renamed, Sei becomes Sei . That is, for all a \u2208 S \u2229 S , rename \u03d5e\u22121 i (a) to obtain S . to \u03d5\u22121 ei ei\u22121 We define the isomorphism \u03d5ei from Sei to Slevel(ei ) as: \u03d5ei (x) = 3.2 if x \u2208 Sei and x \u2208 Sei\u22121 \u03d5ei (x) \u03d5ei\u22121 (x) if x \u2208 Sei \u2229 Sei\u22121 . 
Two Dualities for Symmetric Datalog The two main theorems (Theorems 9 and 15) of this section can be combined to obtain the equivalence of the statements (1), (3) and (4) in Theorem 6 below. The proof of the implication (1) \u2192 (2) is a direct adaptation of the proof of the result from [13] that if co-CSP(B) is defined by a (j, k)-Datalog program, then it is also defined by the canonical (j, k)-Datalog program (see also [9]). Note that (1) \u2192 (2) is also obvious from the proof of Theorem 9 below. Theorem 6. For a finite structure B, TFAE: 1. There is a symmetric Datalog program that defines co-CSP(B); 2. The canonical symmetric (j, k)-Datalog program defines co-CSP(B); 3. B has symmetric bounded pathwidth duality (for some parameters); 9 P S \u03be(S , P) e4 S2 Se4 cd c d e1 S1 ab S0 e2 e3 Se1 ab e0 Se0 \u03be(S , P) S Se4 S2 \u03be cd Se2 Se3 S1 Se1 Se2 Se3 a b S0 Se0 Figure 2: Left: Applying a zigzag operator to the (j, k)-path S with the (j, k)-representation S = (S0 , S1 , S2 ). Suppose that S0 \u2229 S1 = {a, b} and S1 \u2229 S2 = {c, d}. We demonstrate how Se0 and Se2 are obtained. Se0 is a disjoint copy of S0 (and the copy of a and b in Se0 are a and b , respectively). To obtain Se2 , first make a disjoint copy Se2 of Slevel(e2 ) = S1 . Set = level(e2 ) = 1. Since e1 is a forward edge and e2 is a backward edge, = + 1 = 2. Therefore to \u201cglue\u201d Se2 to Se1 , we need to look at S \u2229 S = {c, d}. Assume that the copy of c and d in Se1 are c and d , respectively. Furthermore, assume that the copy of c and d \u02dc respectively. To obtain Se , we rename c\u02dc to c , and d\u02dc to d in S . Right: in Se2 are c\u02dc and d, 2 e2 A specific example when S0 , S1 , S2 are the digraphs in the boxes. The dashed lines indicate identification of vertices. The level of (Se2 , Se3 ), for example, is 0 since e2 and e3 share a vertex at vertex level 1. 4. B has piecewise symmetric bounded pathwidth duality (for some parameters). 
3.2.1 Symmetric Bounded Pathwidth Duality Definition 7 ((j, k)-symmetric). Assume that O is a set of (j, k)-paths. Suppose furthermore that a (j, k)-path representation can be fixed for each structure in O such that the following holds. For every S \u2208 O with representation S of some length n, and every minimal oriented path P of height n, it holds that \u03be(S , P) \u2208 O. Then O is said to be (j, k)-symmetric. Definition 8 (SBPD). A structure B has (j, k)-symmetric bounded pathwidth duality ((j, k)-SBPD) if there is an obstruction set O for B that consists of (j, k)-paths, and in addition, O is (j, k)-symmetric. The following is our main duality theorem for symmetric Datalog: Theorem 9. For a finite structure B, co-CSP(B) can be defined by a symmetric (j, k)-Datalog program if and only if B has (j, k)-SBPD. We will use Lemma 10 in the proof of Theorem 9. Lemma 10 can be proved using the standard canonical Datalog program argument. Lemma 11 is also used in the proof of Theorem 9 and it is the main technical lemma of the section. 10 Lemma 10. If CanS(B) accepts a structure A, then A \u219b B. Proof. Structure B is not accepted by CanS(B) because a derivation could be translated into a valid chain of implications, which is not possible by the definition of CanS(B). If CanS(B) accepts A and A \u2192 B, then CanS(B) accepts B, a contradiction. Lemma 11. For any \u03c4 -structures A and B, if there exists a structure S with a (j, k)-path representation S of some length n such that S \u2192 A, and for any minimal oriented path P of height n, it holds that \u03be(S , P) \u219b B, then (j, k)-CanS(B) accepts A. To prove Lemma 11 we need to define an additional concept related to the zigzag operator. Once the (j, k)-path \u03be(S , P) = (Se0 , . . . , Seq ) is defined, where P is the path e0 , . . . , eq , each pair (Sei , Sei+1 ), \u2200i \u2208 {0, . . . 
, q \u2212 1} is assigned a level: level(Sei , Sei+1 ) is the level of the vertex v minus 1, where v is the vertex that ei and ei+1 share (see Figure 2). Proof of Lemma 11. For the rest of this proof, let CS denote (j, k)-CanS(B), and CL denote (j, k)-CanL(B). If program CS accepts structure S then because S \u2192 A, CS also accepts A. So it is sufficient to show that program CS accepts structure S. First we specify how to associate a CL-derivation with \u03be(S , P), where P is a minimal oriented path of height n. Assume that \u03be(S , P) = S0 \u222a \u00b7 \u00b7 \u00b7 \u222a Sq . For each i \u2208 {0, . . . , q \u2212 1}, fix an arbitrary order on the elements of Si \u2229 Si+1 . Assume that |Si \u2229 Si+1 | = j\u2032 (\u2264 j), and define the j\u2032-tuple si such that si [\u2113] is the \u2113-th element of Si \u2229 Si+1 . We define sq to be the empty tuple. It is good to keep in mind that later, si will be associated with the IDB Ji . The derivation will be DCL (\u03be(S , P)) = (\u03c10 , \u03bb0 ), . . . , (\u03c1q , \u03bbq ). We specify \u03c1i as the rule J_i(x_i) \u2190 J_{i\u22121}(x_{i\u22121}) \u2227 \u0112(y_i) if i \u2208 [q], and as the rule J_0(x_0) \u2190 \u0112(y_0) if i = 0. We begin with describing the EDBs of a rule \u03c1i together with their variables. Assume that Si = {d1 , . . . , dt }, and observe that t \u2264 k. The variables of \u03c1i are v1 , . . . , vt . For every R \u2208 \u03c4 , and every tuple (df (1) , . . . , df (r) ) \u2208 RSi , where r = ar(R), R(vf (1) , . . . , vf (r) ) is an EDB of \u03c1i . We describe the variables of the IDBs Ji\u22121 and Ji . Assume that si\u22121 = (dg(1) , . . . , dg(j1 ) ) and si = (dh(1) , . . . , dh(j2 ) ). Then the IDB in the body of \u03c1i together with its variables is Ji\u22121 (vg(1) , . . . , vg(j1 ) ), and the head IDB together with its variables is Ji (vh(1) , . . . , vh(j2 ) ). The function \u03bbi simply assigns the value dg to the variable vg , \u2200g \u2208 [t]. 
It remains to specify the IDBs, i.e., which IDBs of CL the Ji -s correspond to. For each i \u2208 {0, . . . , q}, Ii denotes IMiP , where MiP is a subset of B j for some j \u2264 j. We define the sequence M0P , M1P , . . . , MqP inductively. To define M0P , consider the nonrecursive \u00af 0 ). Assume that the arity of J0 is j , and that y0 contains k varirule J0 (x0 ) \u2190 E(y ables. (Note that the variables in x0 and y0 are not necessarily disjoint.) For all possible functions \u03b1 : x0 [1], . . . , x0 [j ], y0 [1], . . . , y0 [k ] \u2192 B such that the conjunction of EDBs P \u00af E(\u03b1(y 0 [1]), . . . , \u03b1(y0 [k ])) is true, place the tuple (\u03b1(x0 [1]), . . . , \u03b1(x0 [j ])) into M0 . P Assume that Mi\u22121 is already defined. Then similarly to the base case, for each possible P instantiation \u03b1 of the variables of \u03c1i over B with the restriction that \u03b1(xi\u22121 ) \u2208 Mi\u22121 , if the P conjunction of EDBs of \u03c1i is true, then add the tuple \u03b1(xi ) to Mi . It is not difficult to see that if MqP = \u2205, then we can construct a homomorphism from \u03be(P, P) to B which would be a contradiction. 11 For each i \u2208 {0, . . . , q \u2212 1}, assume that (Si , Si+1 ) has level i . Then we say that the IDB Ji has level i and we write level(Ji ) = i . We proceed to construct a CS-derivation DCS (S) for S. Let Q be a directed path of height n. We construct DCS (S) just like we would construct DCL (\u03be(S , Q)) above, except Q that we will define the subscripts of the IDBs, M0Q , . . . , Mn\u22121 , differently, so that every rule of the resulting derivation belongs to CS. From now on we write M0 , . . . , Mn\u22121 instead of Q . M0Q , . . . , Mn\u22121 To define M0 , . . . , Mn\u22121 , let P0 , P1 , . . . be an enumeration of all (finite) minimal oriented paths of height n. 
Intuitively, we will collect in Nm all subscripts (recall that a subscript is a relation) of all those IDBs which have the same level in DL (\u03be(S , Pm )). Formally, for each \u2208 {0, . . . , n \u2212 1} define Nm = {MtPm | level(Jt ) = }. Then we collect the subscripts at a fixed level in O over all derivations corresponding to P0 , P1 , . . . . Formally, for each \u2208 {0, . . . , n \u2212 1}, we define O = N0 \u222a N1 , . . . . We are ready to define M0 , . . . , Mn\u22121 . For each s \u2208 {0, . . . , n \u2212 1}, define Ms = W \u2208Os W . It remains to show that every rule of the derivation we defined is in S and that the last IDB is the goal IDB. If the last IDB is not the goal IDB of S, then Mn\u22121 = \u2205. By definition, m it must be that for some minimal oriented path Pm of height n and length qm , MP qm \u22121 = \u2205 m (note that the last IDB of DCL (\u03be(P, Pm )) has subscript MP qm \u22121 ). As noted before, this would mean that \u03be(P, Pm ) \u2192 B, a contradiction. We show that each rule of DCS (S) as defined above belongs to CanS(B). Suppose DCS (S) contains a rule \u03c1 \u00af i) Ji (xi ) \u2190 Ji\u22121 (xi\u22121 ) \u2227 E(y that is not in CanS(B). By definition, there cannot be an instantiation \u03b1 of variables of \u03c1 to elements of B such that \u03b1(xi\u22121 ) \u2208 Mi\u22121 , the conjunction of EDBs holds, but \u03b1(xi ) \u2208 Mi . Assume then that there is an \u03b1 such that \u03b1(xi ) \u2208 Mi , the conjunction of EDBs holds, but \u03b1(xi\u22121 ) \u2208 Mi\u22121 . It is also not difficult to see that this is not possible because we used all minimal oriented paths in the construction of DCS (S). Proof of Theorem 9. If CSP(B) is defined by a symmetric (j, k)-Datalog program P, then using the symmetric property of P, it is laborious but straightforward to show that O= D is a free derivation of P {Ex(D)} is a (j, k)-symmetric obstruction set for B. For the converse, assume that B has (j, k)-SBPD. 
Let O be a symmetric obstruction set of width (j, k) (i.e., the path decomposition of every structure in O has width (j, k)) for B. We claim that (j, k)-CanS(B) defines co-CSP(B). Assume that A \u2192 B. Then by Lemma 10, (j, k)-CanS(B) does not accept A. Suppose now that A \u219b B. Then by assumption, there exists a (j, k)-path S \u2208 O with a representation S of length n such that S \u2192 A. Furthermore, since O is symmetric, for any minimal oriented path P of height n, \u03be(S , P) \u219b B. It follows from Lemma 11 that CanS(B) accepts A. 12 3.2.2 Piecewise Symmetric Bounded Pathwidth Duality Piecewise symmetric bounded pathwidth duality (PSBPD) for symmetric Datalog is less stringent than SBPD; however, the price is larger program width. Although the following definitions might seem technical, the general idea is simple: a piecewise symmetric obstruction set O does not need to contain all (j, k)-paths obtained by \u201czigzagging\u201d (j, k)-paths in O in all possible ways. It is sufficient to zigzag a (j, k)-path S using only oriented paths which \u201cavoid\u201d certain segments of S: some constants c and d are fixed for O, and there are at most c fixed segments of S that are avoided by the zigzag operator, each of size at most d. We give the formal definitions. Definition 12 ((c, d)-filter). Let S be a (j, k)-path with a representation S = S0 , . . . , Sn\u22121 . A (c, d)-filter F for S is a set of intervals {[s_1, t_1], [s_2, t_2], . . . , [s_{c\u2032}, t_{c\u2032}]} such that \u2022 c\u2032 \u2264 c; 0 \u2264 s_1; t_{c\u2032} \u2264 n \u2212 1; s_i \u2264 t_i, \u2200i \u2208 [c\u2032]; and t_\u2113 + 2 \u2264 s_{\u2113+1}, \u2200\u2113 \u2208 [c\u2032 \u2212 1]; \u2022 |\u22c3_{i\u2208[s_\u2113, t_\u2113]} S_i| \u2264 d, \u2200\u2113 \u2208 [c\u2032]. Elements of F are called delimiters. An oriented path P of height n obeys a (c, d)-filter F if for any delimiter [si , ti ] \u2208 F , the set of edges e of P such that si \u2264 level(e) \u2264 ti form a (single) directed path. A demonstration is given in Figure 3. 
S P FS S9 S8 S7 S6 S5 S4 S3 S2 S1 S0 Figure 3: S is a (j, k)-path representation of S. FS is the (3, 2k)-filter {[0, 0], [3, 4], [7, 8]} for S . P is an oriented path that obeys the filter. For example, observe that the edges at levels 3 and 4 form a directed subpath, and that \u201czigzagging\u201d happens only at those parts of P that do not fall into the intervals of the filter. Definition 13 (Piecewise Symmetric). Assume that O is a set of (j, k)-paths, and c and d are nonnegative integers. Suppose furthermore that for each S \u2208 O, there is a (j, k)-path representation S , and a (c, d)-filter FS such that the following holds. For every S \u2208 O of some length n, and every minimal oriented path P of height n that obeys the filter FS , it holds that \u03be(S , P) \u2208 O. Then O is (j, k, c, d)-piecewise symmetric. Roughly speaking, an oriented path P is allowed to modulate only those segments of S which do not correspond to any delimiters in FS . Compare Definition 13 with Definition 7, and observe that the only difference is that in the piecewise case, the oriented paths must be of a restricted form. Therefore a set that is (j, k)-symmetric is also (j, k, c, d)-piecewise symmetric for any c and d. We simply associate the empty (c, d)-filter with each structure. 13 Definition 14 (PSBPD). A structure B has (j, k, c, d)-piecewise symmetric bounded pathwidth duality ((j, k, c, d)-PSBPD) if there is an obstruction set O for B that consists of (j, k)-paths, and in addition, O is (j, k, c, d)-piecewise symmetric. Theorem 15. For a finite structure B, B has SBPD (for some parameters) if and only if B has PSBPD (for some parameters). We need the corollary of the following lemma in the proof of the above theorem. Lemma 16. Let P be a minimal oriented path e0 , . . . , en\u22121 with the (1, 2)-path representation P = (e0 , . . . , en\u22121 ), where we think of ei as a structure with two domain elements and a binary relation that contains the tuple ei . 
Let Q be a minimal oriented path f0 , . . . , fm with n edge levels. Then the oriented path \u03be(P, Q) is minimal and has the same height as P. Proof. It is obvious that \u03be(P, Q) is an oriented path. Furthermore the map that assigns every vertex of \u03be(P, Q) to its original in P is a homomorphism. It is easy to check that this homomorphism maps the edges of \u03be(P, Q) back to their originals and the level of an edge in \u03be(P, Q) is the same as the level of the original of that edge. Checking the minimality of \u03be(P, Q) is also straightforward. Corollary 17. Let O be a set of (j, k)-paths, where a (j, k)-representation is fixed for each path. Let O\u2032 be the set that contains all (j, k)-paths that can be obtained from a (j, k)-path in O by applying some zigzag operator. Then O\u2032 is (j, k)-symmetric. Remark: A similar statement holds in the piecewise symmetric case. Proof. Let S\u2032 be an element of O\u2032. If we can show that applying an arbitrary zigzag operator to S\u2032 yields a (j, k)-path in O\u2032, then we are clearly done. So assume that S\u2032 was obtained from S \u2208 O by applying a zigzag operator. The (j, k)-path S\u2032 inherits the (j, k)-representation of S in a natural way. Then we apply any zigzag operator to S\u2032 to obtain S\u2032\u2032, and we need to show that S\u2032\u2032 is in O\u2032. We get from S to S\u2032 using a zigzag operator and from S\u2032 to S\u2032\u2032 using another zigzag operator. Using Lemma 16, we can see that we can replace these two zigzag operators by a single one to obtain S\u2032\u2032 from S directly. Proof of Theorem 15. Let O be a (j, k)-symmetric obstruction set for B. As observed above, for any c and d, O is also (j, k, c, d)-piecewise symmetric. For the converse, let O be a (j, k, c, d)-piecewise symmetric obstruction set. Our goal is to construct a (j\u2032, k\u2032)-symmetric obstruction set Osym for B as follows. For each structure S \u2208 O, let S = S0 \u222a \u00b7 \u00b7 \u00b7 \u222a Sn\u22121 be the corresponding (j, k)-path representation. Using the filter for S, we \u201cregroup\u201d S0 , . . . 
, Sn\u22121 to obtain (j , k )-path representation S = T0 \u222a \u00b7 \u00b7 \u00b7 \u222a Tm of S. We add each S together with its new representation to Osym , and also add every structure that is needed to ensure that Osym is symmetric. Finally, we show that Osym is a symmetric obstruction set for B. We begin with the regrouping procedure. Let S \u2208 O, S = S0 \u222a \u00b7 \u00b7 \u00b7 \u222a Sn\u22121 be the corresponding (j, k)-path representation, and {[s1 , t1 ], [s2 , t2 ], . . . , [sc , tc ]} be the (c, d)-filter FS . The regrouping procedure is quite pictorial and it is demonstrated in Figure 4. We define 14 S S S17 S16 S15 S14 S13 S12 S11 S10 S9 S8 S7 S6 S5 S4 S3 S2 S1 S0 S11 T5 S5 S10 T4 S4 S9 S16 T3 S3 S8 S15 T2 S2 S7 S14 T1 S13 S1 S6 S17 T0 S12 S0 Figure 4: An example regrouping for the proof of Theorem 15. The filter FS = {[0, 1], [6], [12, 13], [17]}. The structures corresponding to the filter are laying inside the rectangles with lines. The complement of the filter is F\u00afS = {[2, 3, 4], [7, 8, 9, 10, 11], [14, 15, 16]}. The structure corresponding to F\u00afS lay in the gray ovals. The new (j , k )-path representation S of S is on the right. Notice the following pattern: the segments of S determined by FS are placed next to each other in S . T0 = S. \u2208[a,b]: [a,b]\u2208FS This places all substructures in S which correspond to delimiters of FS into one big initial structure. Note though that |T0 | \u2264 c \u00b7 d. Define the complement of FS as F\u00afS = {[0, s1 \u2212 1], [t1 + 1, s2 \u2212 1], [t2 + 1, s3 \u2212 1], . . . , [tc , n \u2212 1]}, and set m = max (b \u2212 a). [a,b]\u2208F\u00afS Intuitively, m is the length of the longest interval in S between any two delimiters. We define T as follows. For each interval [a, b] \u2208 F\u00afS take the ( \u2212 1)-th structure Sa+ \u22121 in that interval and define T to be the union of these structures. Formally, for every \u2208 {1, . . . , m}, set T = Si . 
i=a+ \u22121\u2264b: [a,b]\u2208F\u00afS 15 S : the representation of S in O Q T : the representation of S in Ops \u03be(S , Q) \u03be(T , P) P S9 S9 S8 S8 S8 S7 S7 S6 S6 S5 S5 S4 S4 S3 , S8 , D S3 , S8 , D S3 S2 , S7 , D S2 , S7 , D S1 , S6 , D S1 , S6 , D S0 , S5 , D S0 , S5 , D S7 S8 S7 S4 , S9 , D S3 S3 S2 S2 S1 S1 S0 S0 S3 S2 S4 , S9 , D S3 , S8 , D S2 , S7 , D S3 , S8 , D S2 , S7 , D S2 Figure 5: Example in the proof of Theorem 15. See the proof for details. Observe that |T | \u2264 k \u00b7 (c + 1). We need to ensure property 2 in Definition 3, so we need to place some additional elements into the domains of the T . Let [x, y] \u2208 FS and [z, w] \u2208 F\u00afS be such that z = y + 1. Then the set of elements Sx \u222a \u00b7 \u00b7 \u00b7 \u222a Sw is called a column. (For the beginning and end of S a column is defined in the natural \u201ctruncated\u201d way.) Because S is a (j, k)-path representation, it follows from the definition that the intersection of any pair of columns has size at most j. Let C1 , . . . , Cr be an enumeration of all the columns. Set D = = C \u2229 C and observe that |D| \u2264 j \u00b7 2r . We add D to the domain of T0 , and also to the domain of Ti to obtain Ti , \u2200i \u2208 {1, . . . , m}. It is straightforward to see that the new representation T = (T0 , . . . , Tm ) satisfies property 2 of Definition 3. Using the remarks about the sizes of the sets, we observe that T is a (j , k )-path decomposition of S, where j and k are functions of j, k, c and d. We place all structures S \u2208 O into Osym but we associate the new representation with S. For a structure S \u2208 Osym , we also apply all valid zigzag operators to S (with respect to the new representation) and add all these structure to Osym . By Lemma 17, Ops is a (j , k )symmetric set. We need to establish that Ops is an obstruction set. Because O \u2286 Osym , it is sufficient to show that no structure in Osym maps to B. 
To do that we show that for any structure in Ops , there is a structure in O that homomorphically maps to it. Giving a formal proof would lead to unnecessary notational complications and therefore we give an example that is easier to follow and straightforward to generalize. The example is represented in Figure 5. Let S \u2208 Ops such that S is also in O. Assume that the (j , k )representation of S in Ops is T . We consider \u03be(T , P) for some minimal oriented path and show how to find a minimal oriented path Q such that \u03be(S , Q) \u2192 \u03be(T , P). To construct Q, we make a copy of P aligned with S0 , S1 , S2 , S3 , S4 in S . This is represented by the dashed lines in Figure 5. We also make a copy of P aligned with S5 , S6 , S7 , S8 , S9 . This is represented with the dash dotted lines. Note that the resulting minimal oriented path 16 respects the delimiters, i.e., the zigzag operator will not \u201czigzag\u201d S0 and S5 . (In general, we never need to \u201czigzag\u201d structures that were placed into T0 , i.e., the structures that correspond to the delimiters, because P is minimal.) In \u03be(T , P) we denote the copies of the Si with Si and primed Si . Using the definition of the zigzag operator, it follows that the function f that maps an element of S0 \u222a S1 \u222a S2 \u222a S3 \u222a S3 \u222aS2 \u222aS3 \u222aS4 in \u03be(S , Q) to the corresponding element in S0 \u222a S1 \u222a S2 \u222a S3 \u222a S3 \u222a S2 \u222a S3 \u222a S4 is a homomorphism. We similarly define a homomorphism h from S5 \u222a S6 \u222a S7 \u222a S8 \u222a S8 \u222a S7 \u222a S7 \u222a S8 \u222a S9 in \u03be(S , Q) to S5 \u222a S6 \u222a S7 \u222a S8 \u222a S8 \u222a S7 \u222a S7 \u222a S8 \u222a S9 in \u03be(T , P). If we can make sure that if an element x is in the domain of both f and h, and both homomorphisms map x to the same element then we have the desired homomorphism. 
Assume for example that the element x appears in S2 and also in S8 in \u03be(S , Q), and suppose that f (x) = y and h(x) = y . Let the originals of y and y be z and z in T , respectively. We also identify z and z in S2 and S8 in S . Observe that x in S2 in \u03be(S , Q) is a copy of z and x in S8 in \u03be(S , Q) is a copy of z . If z = z (in S ) then x could not appear both in S2 and S8 by the definition of the zigzag operator. Therefore z = z , z \u2208 D, and by definition, z is in every bag of T . The elements y and y are copies of z, and because z appears in every \u201cbag\u201d of T , all copies of z in \u03be(T , P) are identified to be the same element. In particular, f (x) = y = y = h(x). 3.3 3.3.1 Applications Datalog + Maltsev \u21d2 Symmetric Datalog Using SBPD, we give a short and simple re-proof of the main result of [9]: Theorem 18 ([9]). Let B be a finite core structure. If B is invariant under a Maltsev operation and co-CSP(B) is definable in Datalog, then co-CSP(B) is definable in symmetric Datalog (and therefore CSP(B) is in L by [11]). We only need to show that if co-CSP(B) is in linear Datalog and B is preserved by a Maltsev operation, then co-CSP(B) is in symmetric Datalog. The \u201cjump\u201d from Datalog to linear Datalog essentially follows from already established results, as observed in [9]. For the sake of completeness, we give an approximate outline of the argument without being too technical.4 If co-CSP(B) is definable in Datalog and B has a Maltsev polymorphism, then B also has a majority polymorphism. If B has a majority polymorphism, then co-CSP(B) is definable in linear Datalog [8]. Hence, to re-prove Theorem 18, it is sufficient to prove Lemma 19. Our proof relies on the notion of SBPD. Lemma 19. If co-CSP(B) is definable by a linear Datalog program and B is invariant under a Maltsev operation m, then co-CSP(B) is definable by a symmetric Datalog program. 
To get ready for the proof of Lemma 19, we define an N -digraph of size s as an oriented path that consists of s forward edges, followed by s backward edges, followed by another s forward edges. Proposition 20 is easy to prove, and the Maltsev properties are used in Lemma 21. 4 The interested reader can consult Lemma 6 (originally in [23]) and Lemma 9 in [9]. For Lemma 9, note that if B has a Maltsev polymorphism, then V(A(B)) is congruence permutable, see [5]. 17 Proposition 20. A minimal oriented path is either a directed path, or it contains a subpath which is an N -digraph. Lemma 21. Let B be a structure invariant under a Maltsev operation m, S be a (j, k)-path with a (j, k)-representation S = (S0 , . . . , Sn\u22121 ), and P = e0 , . . . , eq be a minimal oriented path of height n. If \u03be(S , P) \u2192 B, then S \u2192 B. Proof. Using Proposition 20, there is an index t such that Q = et , et+1 , . . . , et+(3s\u22121) is an N digraph of size s in P. Assume that the first and last vertices of Q are v and w, respectively. Let P be the oriented path obtained from P by removing Q, and adding a directed path Q = ft , ft+1 , . . . , ft+(s\u22121) of length s from v to w. We claim that there is a homomorphism \u03b3 from \u03be(S , P ) to B. Once this is established, repeating the argument sufficiently many times clearly yields that S \u2192 B. Let \u03be(S , P) = (Se0 , . . . , Seq ), and \u03d5e0 , . . . , \u03d5eq be the corresponding isomorphisms (recall the zigzag operator definition in Section 3.1). Similarly, let \u03be(S , P ) = (Sf0 , . . . , Sfq\u22122s ), and \u03c8f0 , . . . , \u03c8fq\u22122s be the corresponding isomorphisms. Because S[e0 ,et\u22121 ] and S[et+3s ,eq ] are isomorphic to S[f0 ,ft\u22121 ] and S[ft+s ,fq\u22122s ] , respectively, \u03b3 for elements in S[f0 ,ft\u22121 ] \u222a S[ft+s ,eq\u22122s ] is defined in the natural way. It remains to define \u03b3 for every d \u2208 S[ft ,ft+(s\u22121) ] . 
Assume that d \u2208 Sft+ for some \u2208 {0, . . . , s \u2212 1}. Find the original of d in S and let it be do , i.e., do = \u03c8ft+ (d). Then we find the three copies d1 , d2 , d3 of do in S[ft ,ft+(3s\u22121) ] . That is, first we find the three edges e 1 , e 2 , e 3 of Q which have the same level as ft+ (all levels are with respect to P and P ). Then di = \u03d5\u22121 e (do ), i \u2208 [3]. We define \u03b3(d) = m(d1 , d2 , d3 ). By the i \u03b3 Maltsev properties of m, \u03b3 is well-defined. As B is invariant under m, \u03be(S , P ) \u2212\u2192 B. Proof of Lemma 19. If co-CSP(B) can be defined by a linear (j, k)-Datalog program, then there is an obstruction set O for B in which every structure is a (j, k)-path by [7]. We construct a symmetric obstruction set Osym for B as follows. For every (j, k)-path S with a (j, k)-representation S = S0 , . . . , Sn\u22121 in O and for every minimal oriented path P of height n, place \u03be(S , P) into Osym . By Corollary 17, Osym is (j, k)-symmetric. Observe that O \u2286 Osym , so it remains to show that no element of Osym maps to B. But if T \u2208 Osym , then T = \u03be(S , P) for some S \u2208 O and P. By Lemma 21, if \u03be(S , P) \u2192 B, then S \u2192 B. This contradicts the assumption that O is an obstruction set for B. 3.3.2 A class of oriented paths for which the CSP is in L, and a class for which the CSP is NL-complete In this section we define a class C of oriented paths such that if B \u2208 C then co-CSP(B) is in symmetric Datalog. Our strategy is to find an obstruction set O for B \u2208 C, and then to show that our obstruction set is piecewise symmetric. We need some notation. We say that a directed path is forward to mean that its first and last vertices are the vertices with indegree zero and outdegree zero, respectively. Let P be an oriented path with \u00af is a copy of the first vertex v and last vertex w. 
Then the reverse of P, denoted by P, \u00af is a copy of w and its last oriented path P in the reverse direction, i.e., the first vertex of P vertex is a copy of v. Let Q be another oriented path. The concatenation of P and Q is the oriented path PQ in which the last vertex of P is identified with the first vertex of Q. For a 18 t a c P2 P1 E1 s e E2 P4 P3 b P5 d Figure 6: 2-wave in the proof of Theorem 23. nonnegative integer r, Pr denotes P1 P2 \u00b7 \u00b7 \u00b7 Pr , where the P are disjoint copies of P. Given two vertices v and w, we denote the presence of an edge from v to w with v \u2192 w. \u00af r PE2 , where Ei Definition 22 (Wave). If an oriented path Q can be expressed as E1 (PP) (i \u2208 [2]) denotes the forward directed path that is a single edge, P is a forward directed path of length , and r \u2265 0, then Q is called an r-wave. A 2-wave is shown in Figure 8, 1. Theorem 23. Let Q be a wave. Then Q has PSBPD, co-CSP(Q) is definable in symmetric Datalog, and CSP(Q) is in L. Proof. We prove the case when Q is an r-wave for r = 2. For larger r-s, the proof generalizes in a straightforward manner. Let P be a directed path of length h, P1 , P3 , P5 be disjoint copies of P, and P2 , P4 be copies of the reverse of P. Let E1 and E2 be forward edges. Assume the 2-wave Q is E1 P1 P2 P3 P4 P5 E2 (Figure 6). We will provide a piecewise symmetric obstruction set Ops for Q, such that every element of Ops is an oriented path. To do this, first we observe that by [17], Q has path duality, i.e., we can assume that the set O of all oriented paths that do not homomorphically map to Q form an obstruction set for Q. To construct Ops from O, we will place certain elements of O into Ops such that Ops is still an obstruction set for Q. We begin with some simple observations. Any oriented path that has height at most h + 1 maps to Q, so these oriented paths can be neither in O nor in Ops . 
Any oriented path that has height strictly larger than h + 2 obviously does not map to Q, so all such paths are in O and we also place these paths into Ops . Assume that P \u2208 O has height exactly h + 2. It is easy to see that if P is not minimal, then it contains a minimal subpath that does not map to Q. Therefore, it is sufficient to place only those oriented paths from O of height h + 2 into Ops which are minimal. Let P \u2208 Ops of height h + 2 (then P is minimal). Intuitively, any attempt to homomorphically map the vertices of P to Q starting by first mapping the first vertex of P to the first vertex of Q and then progressively finding the image of the vertices of P from left to right would get stuck at a or c. Formally, assume that the vertices of P are v1 , . . . , vn . Let P[i] denote the subpath of P \u03d5 on the first i vertices. Choose i to be the largest index such that P[i] \u2212\u2192 Q and \u03d5(v1 ) = s. Then \u03d5 cannot be extended to vi+1 for one of the following reasons. Clearly, \u03d5 must map vi to a source or a sink other than s or t, i.e., to a,b,c or d. Furthermore, we can assume that vi is not mapped to b or d. This is because if vi is mapped to b or d, then level(vi ) = 1, so the edge between vi and vi+1 is from vi to vi+1 , and therefore \u03d5 can be extended. So we can assume that vi is mapped to a or c. Because we cannot extend \u03d5, vi+1 must be at level 19 + 2, so it must be that vi+1 is the last vertex vn of P. Because P \u2192 Q, P[n\u22121] must be an oriented path such that any homomorphism \u03d5 from P[n\u22121] to Q such that \u03d5(v1 ) = s maps vn\u22121 to a or c but not to e. We assume first that any homomorphism \u03d5 from P[n\u22121] to Q maps vn\u22121 to a. We follow the vertices of P[n\u22121] from left to right. Let wa be the first vertex that is at level h + 1. 
If there is a vertex to the right of $w_a$ at level 1, then because $P[n-1]$ will have to reach level h + 1 again, we will be able to map $v_{n-1}$ to c, and that is not possible by assumption. So P must have the following form (Form 1): $(w_1 \to w_2) X (w_3 \to w_4) Y (w_5 \to w_6)$, where X is any oriented path of height h − 1 with first vertex at the bottom and last vertex at the top level of X, and Y is any oriented path of height h − 1 with both its first and last vertices being in the top level of Y. See Figure 7, left. For the second case, we assume that $P[n-1]$ is such that $v_{n-1}$ can be mapped to c. Again, we follow the vertices of $P[n-1]$ from left to right. Let $w_a$ be the first vertex that is at level h + 1. We must have a vertex going back to level 1 (otherwise we could not “pass” b and could not map $v_{n-1}$ to c). Let $w_b$ be the first such vertex. We will have to go back to level h + 1 again, so let $w_c$ be the first vertex at that level. Finally, we cannot go back to level 1 again, since then the last vertex of $P[n-1]$ can be mapped to e. We can “go down” to at most level 2 of $P[n-1]$. So P must have the form (Form 2) $(w_1 \to w_2) X (w_3 \to w_4) Y (w_5 \leftarrow w_6) Z (w_7 \to w_8) W (w_9 \to w_{10})$, where X (Z) is any oriented path of height h − 1 with first vertex at the bottom and last vertex at the top level of X (Z), Y is any oriented path of height h − 1 with first vertex at the top and last vertex at the bottom level of Y, and W is any oriented path of height h − 1 with both its first and last vertices being in the top level of W. See Figure 7, right. [Figure 7: Obstructions of height h + 2 for a 2-wave. Left (Form 1): vertices $w_1, \dots, w_6$ with $w_a = w_4$, and segments X, Y. Right (Form 2): vertices $w_1, \dots, w_{10}$ with $w_a = w_4$, $w_b = w_6$, $w_c = w_8$, and segments X, Y, Z, W.] Because $O_{ps} \subseteq O$ and for any structure $S \in O$, there is a structure $S' \in O_{ps}$ such that $S' \to S$, $O_{ps}$ is an obstruction set for Q. It remains to show that $O_{ps}$ is piecewise symmetric.
Let S be an oriented path of height more than h + 2, and assume the vertex set of S is $v_1, \dots, v_n$. We need to define a representation $\mathcal{S}$, and a filter $F_S$ for S. The representation $(S_0, S_1, \dots, S_{n-2})$ is $(v_1, v_2), (v_2, v_3), \dots, (v_{n-1}, v_n)$ (width (1, 2)). The filter $F_S$ is the empty filter. Note that if we apply a zigzag operation to S, we get an oriented path of the same height as S, so $O_{ps}$ is closed under zigzagging of obstructions of height greater than h + 2. Let S be an oriented path of height h + 2 of Form 1, and assume the vertex set of S is $v_1, \dots, v_n$. The representation $\mathcal{S} = (S_0, S_1, \dots, S_{n-2})$ is constructed as in the previous paragraph. We specify $F_S$ to be the following (3, 6)-filter. Assume that the edge $(w_3, w_4)$ is structure $S_i$. Then $F_S = \{[0, 0], [i, i], [n-2, n-2]\}$. Using the definitions it is easy to see that if P obeys the filter $F_S$, then $\xi(\mathcal{S}, P)$ is also an oriented path of Form 1. Therefore $O_{ps}$ is closed under zigzagging of obstructions of Form 1. Obstructions of Form 2 can be handled similarly. We state the following generalization of waves. Definition 24 (Staircase). A monotone wave is an oriented path of the form $(P\bar{P})^r P$, where P is a forward directed path and $r \ge 0$. We call the vertices of a monotone wave in the topmost level peaks, and the vertices in the bottommost level troughs. If a minimal oriented path Q can be expressed as $P_1 W_1 P_2 W_2 \dots P_{n-1} W_{n-1} P_n$, where $P_1, \dots, P_n$ are forward directed paths, $W_1, \dots, W_{n-1}$ are monotone waves, and for any $i \in [n-1]$, the troughs of $W_i$ are in a level strictly below the level of the troughs of $W_{i+1}$, and also, the peaks of $W_i$ are in a level strictly below the level of the peaks of $W_{i+1}$, then Q is called a staircase. An example is given in Figure 8, 2. Theorem 25. Let Q be a staircase. Then Q has PSBPD, co-CSP(Q) is definable in symmetric Datalog, and CSP(Q) is in L. Proof.
Assume that the height of Q is h. As for waves, we use [17] to conclude that Q has path duality. We will construct a piecewise symmetric obstruction set $O_{ps}$ for Q by placing three classes of oriented paths into $O_{ps}$. First, $O_{ps}$ contains all oriented paths which have height strictly greater than h. These oriented paths obviously do not map to Q. The next class of oriented paths we place into $O_{ps}$ are those which have height precisely h. Recall that Q consists of waves patched together with directed paths in between. Let the wave subpaths of Q be $W_1, \dots, W_n$, from left to right. For each $W_i$, we construct a class of oriented paths. Assume that $W_i$ has height $h_i$ and let $O_i$ be the set of minimal oriented paths of height $h_i$ which do not map to $W_i$. For each $R \in O_i$, we construct $C = B_1 R B_2$, where $B_1$ and $B_2$ are oriented paths (possibly empty) such that C has height h, and the level of R in C matches the level of $W_i$. Observe that there cannot be a homomorphism from C to Q. We place all such constructed C into $O_{ps}$. Let $\ell$ be the length of the longest directed subpath of Q. The third class of oriented paths are those that have height $h'$, where $\ell < h' < h$. For every such $h'$, we produce a set of obstructions. (Remark: we set $\ell < h'$ because any oriented path of length $\ell$ or less maps to Q.) Assume inductively (the base case is trivial) that we already have a piecewise symmetric obstruction set for every staircase of height strictly less than h. Consider every subpath $Q_1, \dots, Q_m$ of Q of height $h'$. Notice that $\mathrm{core}(Q_i)$ is a staircase which is not a directed path. By the inductive hypothesis we have a piecewise symmetric obstruction set $U_i$ for $Q_i$. We keep only those oriented paths in $U_i$ which have height at most $h'$; observe that $U_i \neq \emptyset$. Construct $D = B_1 T_1 \cdots B_m T_m B_{m+1}$, where $(T_1, \dots, T_m) \in U_1 \times \cdots \times U_m$ and the $B_j$ are arbitrary oriented paths such that the height of D is $h'$. Place all these D-s into $O_{ps}$.
Notice that D does not map to Q for the following. Assume for contradiction that D maps to a subpath S of Q. Then D also maps to the core of S which is a staircase. But by construction D contains a subpath that does not map to S. We show that Ops is an obstruction set for Q. If an structure Z \u2208 Ops homomorphically maps to an input structure A, then obviously, there cannot be a homomorphism from A to 21 Q. Assume for contradiction that no structure in Ops maps to A but A does not map to Q. Then O contains an oriented path P that maps to A. So if we show the following claim then we are done. Claim. For any oriented path P that does not homomorphically map to Q, there is an oriented path Z \u2208 Ops that homomorphically maps to P. Proof of Claim. Assume that P has height precisely h. We show that there exists Z \u2208 Ops of height h such that Z \u2192 P. Assume for contradiction that none of the oriented paths of height h in Ops map to P. As before, let W1 , . . . , Wn be the wave segments of Q, from left to right, and assume without loss of generality that none of the Wi is a directed path. Let the initial and final vertices of Wi be ai and bi respectively, i \u2208 [n]. For each i \u2208 [n], find the minimal oriented subpaths of P whose initial vertices have the same level as ai , and final vertices have the same level as bi , or vice versa (note that because of the structure of Q, no such oriented path could contain another as a subpath, however, these oriented paths could overlap). For any such subpath R of P associated with Wi , map the lowest vertex of R to ai , and the highest vertex of R to bi . Remark 1: In fact there is no other choice. The rest of the vertices of R can be mapped to Q as follows. If R does not map to Wi with first and last vertices matched then by definition, P is in Ops and we have a contradiction. Therefore let the homomorphism for R be \u03d5R . 
Remark 2: Also observe that $\varphi_R$ maps the inner vertices of R to vertices of the staircase which are between $a_i$ and $b_i$. We show that the partial homomorphisms $\varphi_R$ map the same vertex of P to the same vertex in Q, and furthermore we can also map those vertices of P to an element of Q that are not mapped anywhere by the $\varphi_R$. This way we obtain a homomorphism from P to Q and this would be a contradiction. First, any vertex v is assigned to a vertex of Q by at most two homomorphisms which correspond to consecutive wave segments of Q. This is because in Q, $W_i$ and $W_j$ are disjoint unless j = i + 1. Using Remarks 1 and 2, we can see that if a vertex v of P is in the domain of two “non-consecutive” homomorphisms, then because those homomorphisms could not agree on where to map v, it is not possible that $P \to Q$. This is a contradiction. Let $\varphi_{R_1}$ and $\varphi_{R_2}$ (assume without loss of generality that $R_1$ and $R_2$ correspond to $W_1$ and $W_2$, respectively) be two partial homomorphisms such that their domains overlap. Then the markers $a_1, b_1, a_2, b_2$ appear in the order $a_1, a_2, b_1, b_2$ when traversing P from left to right. The vertices that are in the domain of both homomorphisms are the ones from $a_2$ to $b_1$. By the choice of $a_1, b_1, a_2, b_2$, the segment of P from $a_2$ to $b_1$ is a minimal oriented path. Checking the images of the vertices going back from $b_1$ to $a_2$ under the map $\varphi_{R_1}$, we see that these vertices are mapped to the rightmost directed path segment of $W_1$. Similarly, the image of these vertices under $\varphi_{R_2}$ is the leftmost directed path segment of $W_2$. That is, the two homomorphisms coincide for the vertices from $a_2$ to $b_1$. Furthermore, some vertices of P are not in the domain of any partial homomorphisms. Consider the two minimal oriented paths S and $S'$ on the two sides of such a maximal continuous sequence of vertices in P. There are two cases. First, assume that S and $S'$ both correspond to the same $W_i$.
Let the markers for S be a and b and the markers for $S'$ be $a'$ and $b'$. Then following P from left to right, the markers appear in the order $a, b, b', a'$. The images of the vertices from b to $b'$ are not defined. (Observe that b and $b'$ are mapped to the same vertex.) Consider the last directed path segment of $W_i$ together with the first directed path segment of $W_{i+1}$ (or just the last edges of Q if i = n). Observe that the vertices from b to $b'$ can be mapped to this directed path. The case when S and $S'$ correspond to different waves of Q is handled similarly. Suppose lastly that P has height $h' < h$. Because P does not map to any of the subpaths of Q of height $h'$, for each subpath $Q_1, \dots, Q_m$ of Q of height $h'$, P contains a subpath $S_i$ such that $S_i \not\to Q_i$, $i \in [m]$. If $S_i \not\to Q_i$ then $S_i \not\to \mathrm{core}(Q_i)$. Recall that $\mathrm{core}(Q_i)$ is a staircase and by definition, $U_i$ contains an oriented path $S'_i$ such that $S'_i \to S_i$. It is clear that we can choose oriented paths $B_1, \dots, B_{m+1}$ such that $B_1 S'_1 B_2 \dots B_m S'_m B_{m+1} \to P$. Finally, it is not hard to see from the construction how to associate filters with the elements of $O_{ps}$ to establish that $O_{ps}$ is piecewise symmetric. We also give a large class of oriented paths for which the CSP is NL-complete. We need the following propositions to prove Theorem 28. Proposition 26. Let $P_1$ and $P_2$ be two minimal oriented paths of the same height h. Then there is a minimal oriented path Q of height h such that $Q \to P_1, P_2$. Proof. Not hard, see e.g. [16]. Proposition 27. A core oriented path has a single automorphism, i.e., it is rigid. Proof. Let P be a core oriented path and $P'$ be an isomorphic copy of P. There are at most two isomorphisms from P to $P'$ (because a vertex with indegree 0 must be mapped to a vertex with indegree 0, and similarly for a vertex with outdegree 0). One possibility is to map the first vertex of P to the first vertex of $P'$ and the last vertex of P to the last vertex of $P'$.
For contradiction, assume that the second possibility happens, i.e., there is an isomorphism $\varphi$ that maps the first vertex of P to the last vertex of $P'$ and the last vertex of P to the first vertex of $P'$. Assume that both the first vertex v and last vertex w of P have indegree zero (the other case is similar). Then $\mathrm{level}(v) = \mathrm{level}(w)$. This implies that the number of forward and backward edges in P is the same, so P has 2q edges for some q. By the existence of $\varphi$, P must have the form $Q\bar{Q}$, and such an oriented path is clearly not a core. Theorem 28. Let B be a core oriented path that contains a subpath $P_1 P_2 P_3$ of some height h with the following properties: $P_1$, $P_2$ and $P_3$ are minimal oriented paths, they all have height h, and there is a minimal oriented path Q of height h such that $Q \to P_1$, $Q \to P_3$ but $Q \not\to P_2$. Then CSP(B) is NL-complete. An example is given in Figure 8, 3 and 4. Proof of Theorem 28. We show that the less-than-or-equal-to relation on two elements, $R_{\le} = \{(0, 0), (0, 1), (1, 1)\}$, and the relations {0} and {1} can be expressed from P using a primitive positive (pp) formula (i.e., a first order formula with only existential quantification, conjunction and equality). It is easy to see and well known that $\mathrm{CSP}(\{R_{\le}, \{0\}, \{1\}\})$ is equivalent to the NL-complete directed st-Conn problem. Since P is a core, it is rigid by Proposition 27. Assume that the first vertex of $P_1$ is in a level lower than the level of the last vertex of $P_1$ (the other case can be handled similarly). See the illustration in Figure 9. [Figure 8: 1: A 2-wave. 2: A staircase. 3: An example oriented path for which the CSP is NL-complete. 4: The oriented path Q in Theorem 28 corresponding to the oriented path in 3. Labels appearing in the figure: $E_1, E_2, P_1, \dots, P_5$ and Q.] Assume that the first vertex of $P_1$ is 0 and the first vertex of $P_3$ is 1. We construct a structure G with two special vertices x and y such that $\{(h(x), h(y)) \mid G \xrightarrow{h} P\} = R_{\le}$.
It is well known and easy to show that then $R_{\le}$ can also be expressed from P using a pp-formula. Let $P'$ be an isomorphic copy of P. We refer to copies of $P_1, P_2, P_3$ as $P'_1, P'_2, P'_3$, respectively. [Figure 9: Construction of the gadget G. Labels appearing in the figure: P, G, $P_1, P_2, P_3$ (and their copies), $P_{23}$, $P_{123}$, Q, and the vertices 0, 1, c, x, y.] Using Proposition 26, we find a minimal oriented path $P_{23}$ of height h that maps to both $P_2$ and $P_3$. Similarly, we find a minimal oriented path $P_{123}$ that maps to each of $P_1, P_2, P_3$. We rename the first vertex of Q to x, and the first vertex of $P_{123}$ to y. To construct G, we identify the topmost vertices of the oriented paths $P_{23}$, Q and $P_{123}$. Then we identify the first vertex of $P_{23}$ with the vertex c of $P'$ that is shared by $P'_2$ and $P'_3$. Observe that any homomorphism from G to P must map c to 1. It is straightforward to verify that $\{(h(x), h(y)) \mid G \xrightarrow{h} P\} = R_{\le}$. Because P is rigid, any relation of the form {v} where $v \in P$ can be expressed by a pp-formula. 4 On CSPs in NL. 4.1 Definitions. Let $\tau$ be a vocabulary. A successor $\tau$-structure S is a relational structure with vocabulary $\tau \cup \{\mathit{first}, \mathit{last}, \mathit{suc}\}$, where first and last are unary symbols and suc is a binary symbol. Without loss of generality, the domain S is defined as $\{1, \dots, n\}$, $\mathit{first}^S = \{1\}$, $\mathit{last}^S = \{n\}$, and $\mathit{suc}^S$ contains all pairs $(i, i + 1)$, $i \in [n-1]$. Because $\mathit{first}^S$, $\mathit{last}^S$ and $\mathit{suc}^S$ depend only on n, they are called built-in relations. When we say that a class of successor structures is homomorphism/isomorphism-closed, all structures under consideration are successor structures, and we understand that homomorphism/isomorphism closure, respectively, is required only for non-built-in relations. Definition 29 (Split Operation). A split operation produces a $\tau$-structure $A'$ from a $\tau$-structure A as follows. For an element $a \in A$ let $T_a$ be defined as $T_a = \{(t, R, i) \mid t = (t_1, \dots, t_r) \in R^A \text{ where } R \in \tau, \text{ and } t_i = a\}$.
If |Ta | \u2264 1 for every a \u2208 A, then no split operation can be applied. Otherwise we choose a strict nonempty subset T of Ta , and for each triple (t, R, i) \u2208 T , we replace t = (t1 , . . . , tr ) in RA with (t1 , . . . , ti\u22121 , a , ti+1 , . . . , tr ) to obtain A (and A = A \u222a {a }). Definition 30 (Split-Minimal, Critical). Let C be a class of structures over the same vocabulary. We say that a structure A \u2208 C is split-minimal in C if for every possible nonempty sequence of split operations applied to A, the resulting structure is not in C. We say that a structure A \u2208 C is critical in C if no proper substructure of A is in C. For a class of successor \u03c4 -structures, criticality and split-minimality is meant only with respect to non-built-in relations. Definition 31 (Read-Once Datalog). Let P be a (linear, symmetric) Datalog program that defines a class of structures C. If for every critical and split-minimal element of C there is a P-derivation that is read-once, then we say that P is read-once. Definition 32 (Read-Once mnBP1). A monotone nondeterministic branching program (mnBP) H with variables X = {x1 , . . . , xn } computes a Boolean function fH : {0, 1}n \u2192 {0, 1}. H is a directed graph with distinguished nodes s and t and some arcs are labeled with variables from X (not all arcs must be labeled). An assignment \u03c3 to the variables in X defines a subgraph H\u03c3 of H as follows: an arc a belongs to H\u03c3 if \u03c3(x) = 1, where x is the label of a, or if a has no label. The function fH is defined as fH (\u03c3) = 1 if and only if there is a directed path in H\u03c3 from s to t (an accepting path). The size of an mnBP is |VH |. Let \u03c4 be a vocabulary and n \u2265 1. We assume without loss of generality that any relational structure whose domain has size n has domain {1, 2, . . . , n}. Let (R1 , t1 ), (R2 , t2 ), . . . , (Rq , tq ) be an enumeration of all pairs such that Ri \u2208 \u03c4 and ti \u2208 {1, 2, . . 
. , n}ar(Ri ) . We associate a variable xi with (Ri , ti ), for each i = 1, 2, . . . , q. Then if all labels of a branching program Hn are among x1 , x2 , . . . , xq , we say that Hn is over the vocabulary \u03c4 for input size n. We say that a family of branching programs F defines a class of \u03c4 -structures C, if for each n \u2265 1, F contains precisely one branching program Hn over \u03c4 for input size n such that fHn (x1 , x2 , . . . , xq ) = 1 if and only if the tuple structure with domain {1, 2, . . . , n} and containing precisely those pairs (Ri , ti ) for which xi = 1 is in C. Let F be a family of mnBP1s that contains precisely one branching program for each n \u2265 1. We say that F is a poly-size family if there is a polynomial p such that for each n \u2265 1, |V (Hn )| \u2264 p(n). Such a family is denoted by mnBP1(poly). If for every n and every 25 structure of domain size n in C, Hn contains an accepting path P such that any label on P is associated with at most one arc of P , then we say that F is read-once. (This read-once condition can be made a bit weaker.) 4.2 Examples We give some examples of problems definable by a 1-linDat(suc) program or by an mnBP1(poly). The program in Section 2.2, Figure 1 without rule 3 is a read-once linear Datalog(suc) program that defines the problem directed st-Conn. To see that this program Pst\u2212Conn is read-once, let G be any input that is accepted (we do not even need G to be critical and split-minimal). Then we find a directed path in E G connecting an element of S G to an element of T G without repeated edges. We build a Pst\u2212Conn -derivation for this path in the obvious way. For this section, by a clique we mean an ordinary undirected clique but each vertex may or may not have a self-loop. Let EvenCliques be the class of cliques of even size. The read-once linear Datalog(suc) program PEC below defines EvenCliques. 
The goal predicate of PEC is G2 , and E is the symbol for the edge relation of the input. The first part of PEC checks if the domain size n of the input is even. The second part goes through all pairs (x, y) \u2208 [n]2 , and at the same time, checks if (x, y) is an edge in E. This is achieved by accessing the order on the domain. Program PEC goes through every pair of vertices precisely once, so every PEC -derivation is read-once, and therefore PEC is read-once. I(y) \u2190 first(x) \u2227 suc(x, y) I(z) \u2190 I(x) \u2227 suc(x, y) \u2227 suc(y, z) G1 \u2190 I(x) \u2227 last(x) J(x, y) \u2190 G1 \u2227 first(x) \u2227 first(y) J(x, z) \u2190 J(x, y) \u2227 suc(y, z) \u2227 E(x, z) \u2227 E(z, x) J(z, w) \u2190 J(x, y) \u2227 last(y) \u2227 suc(x, z) \u2227 suc(z, w)\u2227 E(z, w) \u2227 E(w, z) G2 \u2190 J(x, y) \u2227 suc(x, y) \u2227 last(y). Figure 10: The read-once linear Datalog(suc) program PEC for EvenCliques. In fact, we can easily test much more complicated arithmetic properties than the property of being even (e.g., being a power of k) with a 1-linDat(suc) program. However, linear Datalog cannot define any set of cliques with a non-trivial domain size property in the following sense. Let K be a clique of size n, and K be the clique obtained by identifying any two vertices of K. Then K homomorphically maps to K , and therefore if a linear Datalog program accepts K, then it also accepts K . Therefore EvenCliques or, in fact, any set of cliques that contains a clique of size n but no clique of size n \u2212 1 cannot be defined by a 26 linear Datalog program. Since it is not difficult to convert a 1-linDat(suc) program into an mnBP1(poly), the aforementioned problems can also be defined with an mnBP1(poly). The additional power the successor relation gives to 1-linDat is at least twofold. For example, read-once linear Datalog(suc) can do some arithmetic, as demonstrated above. 
In addition, let’s define the density of a graph to be the number of edges divided by the number of vertices. The density of an n-clique is $\binom{n}{2}/n = \Theta(n)$. As demonstrated above, access to an order allows read-once linear Datalog(suc) to accept only structures of linear density. On the other hand, any linear Datalog program P accepts structures of arbitrarily low density. For let S be a structure accepted by P. Then adding sufficiently many new elements to the domain of S yields a structure $S'$ whose density is arbitrarily close to 0, and $S'$ is still accepted by P. One consequence of Corollary 34 is that if a read-once linear Datalog(suc) program defines co-CSP(B), then both aforementioned additional abilities are of no use. 4.3 Main Results. We begin with stating the results for 1-linDat(suc) and poly-size families of mnBP1s discussed in the Introduction. Theorem 33. Let C be a homomorphism-closed class of successor $\tau$-structures. If C can be defined by a 1-linDat(suc) program of width (j, k), then every critical and split-minimal element of C has a (j, k + j)-path decomposition. Corollary 34. If co-CSP(B) can be defined by a 1-linDat(suc) program of width (j, k), then co-CSP(B) can also be defined by a linear Datalog program of width (j, k + j). Theorem 35. Let C be a homomorphism-closed class of successor $\tau$-structures. If C can be defined by a family of mnBP1s of size $O(n^j)$, then every critical and split-minimal element of C has a (j, r + j)-path decomposition, where r is the maximum arity of the symbols in $\tau$. Corollary 36. If co-CSP(B) can be defined by a family of mnBP1s of size $O(n^j)$, then co-CSP(B) can also be defined by a linear Datalog program of width (j, r + j), where r is the maximum arity of the relation symbols in the vocabulary of B. As discussed before, a wide class of CSPs — CSPs whose associated variety admits the unary, affine or semilattice types — does not have bounded pathwidth duality [20].
It follows that all these CSPs are not definable by any 1-linDat(suc) program, or with any mnBP1 of poly-size. An example of such a CSP is the P-complete CSP Horn-3Sat. After some definitions, we give a high-level description of the proof of Theorem 33. Any \u03c4 -structure M with domain size n can be naturally converted into an isomorphic successor structure M(\u03c0), where \u03c0 is a bijective function \u03c0 : M \u2192 {1, . . . , n}. We define the domain M (\u03c0) as {1, . . . , n} (note that this automatically defines firstM\u03c0 , lastM\u03c0 and sucM\u03c0 ) and for any R \u2208 \u03c4 , and (t1 , . . . , tar(R) ) \u2208 RM , we place the tuple (\u03c0(t1 ), . . . , \u03c0(tar(R) )) into RM\u03c0 ). When we want to emphasize that a structure under consideration is a successor \u03c4 -structure, we use the subscript s, for example Ms . Given a successor \u03c4 -structure Ms , M denotes the structure Ms but with the relations firstMs , lastMs and sucMs removed. We make the simple but important observation that we are interested only in isomorphismclosed classes. For example, co-CSP(B) is obviously isomorphism-closed. We will crucially 27 use the fact that if Ms is accepted by a 1-linDat(suc) program P, then P must also accept M(\u03c0) for any bijective function \u03c0. We are ready to describe the intuition behind the proof of Theorem 33. A 1-linDat(suc) program that ensures that the class of successor-structures C it defines is homomorphism-closed (and therefore isomorphism-closed) does not have enough \u201cmemory\u201d\u2013 due to its restricted width\u2013to also ensure that some key structures in C are \u201cwell-connected\u201d. If these key structures are not too connected, then we can define co-CSP(B) in linear Datalog. The more detailed proof plan is the following. Assume that co-CSP(B), where the input is a successor structure, is defined by a linDat(suc) program P of width (j, k). 
We choose a “minimal” structure M in C that is accepted, and assume for contradiction that M does not have width (j, k). Then roughly speaking, for all possible “permutations of the domain elements of M”, M must be accepted; therefore for each of these isomorphic structures, P must be able to provide a derivation. Because this procedure will provide sufficiently many derivations, we will be able to find some derivations which are of a desired form. The identification of these “good” derivations also crucially uses the generalized Erdős–Ko–Rado theorem. Once these derivations are detected, they can be combined to produce a derivation that “encodes” a structure of bounded pathwidth. The structures of bounded pathwidth produced this way can be used to define co-CSP(B) in linear Datalog. We give the formal proofs. We need the following additional definitions related to linear Datalog. In addition to extracting Ex(D) from D, we can also extract a decomposition of Ex(D) reminiscent of a path decomposition. For each $\ell \in [q]$, we define a tuple structure $\tilde{B}_\ell$ by adding $(R, t)$ to $\tilde{B}_\ell$ if $R(t)$ appears in $\rho_\ell$. In such a representation of Ex(D), we call $\tilde{B}_\ell$ the $\ell$-th bag, and $(\tilde{B}_1, \dots, \tilde{B}_q)$ the tuple distribution of Ex(D). It will be useful to remove empty bags from the list of bags $(\tilde{B}_1, \dots, \tilde{B}_q)$ to obtain the sequence $(\tilde{B}_{i_1}, \dots, \tilde{B}_{i_t})$, where $i_\ell < i_{\ell'}$ if $\ell < \ell'$. For simpler notation, we renumber $(\tilde{B}_{i_1}, \tilde{B}_{i_2}, \dots, \tilde{B}_{i_t})$ to $(\tilde{B}_1, \tilde{B}_2, \dots, \tilde{B}_t)$. We call the sequence $(\tilde{B}_1, \dots, \tilde{B}_t)$ the pruned tuple distribution of D. The following is easy to prove. Proposition 37. Let $A'$ be a $\tau$-structure obtained from a $\tau$-structure A by applying a sequence of split operations. Then $A' \to A$. We recall the following theorem tailored a bit to our needs.
Theorem 38 (Erdős–Ko–Rado, general case; see, e.g., [14]). Suppose that F is a family of s-subsets of $\{1, \dots, n\}$, where $n \ge n_0(s, j + 1)$. Suppose that for any two sets $S_1, S_2 \in F$, $|S_1 \cap S_2| \ge j + 1$. Then $|F| \le \binom{n-(j+1)}{s-(j+1)} = O(n^{s-(j+1)})$. Proof of Theorem 33. Let the read-once linear Datalog(suc) program that defines C be P. Let M be a structure in C such that M is critical and split-minimal, but assume for contradiction that M has no (j, k)-path decomposition. Suppose that $M = \{m_1, \dots, m_s\}$. We choose a large enough n divisible by s (for convenience): how large n should be will become clear later. We begin with constructing a class of successor structures from M. Let $\varphi : M \to \{1, \dots, n\}$ be a function that for all $i \in [s]$, maps $m_i$ to one of the numbers in $\{(i-1) \cdot \frac{n}{s} + 1, \dots, i \cdot \frac{n}{s}\}$. We call such a function an embedder. Observe that there are $(\frac{n}{s})^s$ possible embedder functions. For each embedder $\varphi$, we define a successor structure $M_\varphi$ as follows. $M_\varphi$ is obtained from M by renaming $m_i$ to $\varphi(m_i)$ for each $i \in [s]$, and adding all numbers inside $\{1, \dots, n\}$ but not in the range of $\varphi$ to the domain of the structure. Obviously for any embedder $\varphi$, $M_\varphi$ contains an isomorphic copy of M, and therefore $M \to M_\varphi$. Since C is closed under homomorphisms (and successor-invariant), it follows that for any embedder $\varphi$, $M_\varphi$ is accepted by P. Our goal now is to show that P accepts a structure that can be obtained from M by applying a nonempty sequence of split operations. This would contradict the split-minimality of M with respect to C. Let $\varphi_1, \dots, \varphi_t$ be an enumeration of all $t = (\frac{n}{s})^s$ embedders, and $M_{\varphi_1}, \dots, M_{\varphi_t}$ the corresponding successor structures.
Since P is read-once, we can assume that for each i \u2208 [t], there is a read-once P-derivation for M\u03d5i : D(M\u03d5i ) = (\u03c1i1 , \u03bbi1 ), . . . , (\u03c1iqi , \u03bbiqi ). \u02dci,...,B \u02dc i ). Let \u03c8i (B \u02dci,...,B \u02dci ) For each D(M\u03d5i ) we denote its pruned tuple distribution as (B 1 wi 1 wi \u02dc i ), where M \u02dc i for each \u2208 [wi ] is obtained as follows. For every (R, t) \u2208 \u02dc i,...,M denote (M wi 1 \u02dc i , place (R, \u03d5\u22121 (t)) into M \u02dc i . We call \u03c8i (B \u02dci,...,B \u02dc i ) the prototype of (B \u02dci,...,B \u02dc i ). We B 1 wi 1 wi i \u02dci,...,B \u02dc i ) and (B \u02dci ,...,B \u02dc i ) are similar if they say that two pruned tuple distributions (B 1 wi 1 wi i i i i \u02dc \u02dc \u02dc \u02dc have the same prototypes, i.e., \u03c8i (B1 , . . . , Bwi ) = \u03c8i (B1 , . . . , Bwi ). Note that the codomain of \u03c8i , for any i, is a sequence S of bags such that a bag contains \u02dc Because by definition, every bag in S is nonempty, and D(M\u03d5 ) is readelements of M. i \u02dc once, we have that |S| \u2264 |M|. Therefore the number of possible bag sequences can be upper-bounded by a function of s; let this upper bound be cs . It follows that there must be \u02dc ) \u02dc ,...,B at least t = cts embedders \u03d5i1 , . . . , \u03d5it such that for any , \u2208 {i1 , i2 , . . . , it }, (B w 1 \u02dc ,...,B \u02dc ) are similar. Let the common prototype of all these similar pruned tuple and (B 1 w \u02dc 1, . . . , M \u02dc w ) (i.e., \u03c8i1 (B \u02dc i1 , . . . , B \u02dc i1 )). Because M \u02dc is critical, it follows distributions be (M 1 wi1 \u02dc =M \u02dc 1 \u222a \u00b7\u00b7\u00b7 \u222a M \u02dc w 5. that M To give a heads-up to the reader, our goal now is to construct a derivation D using the derivations D(M\u03d5i1 ), D(M\u03d5i2 ), . . . 
, D(M\u03d5i ), such that Ex(D ) is isomorphic to a structure t \u02dc that can be obtained from M \u02dc by a nonempty sequence of split operations. Because M \u02dc M is split-minimal, this contradiction will complete the proof. \u02dc1 \u222a \u00b7 \u00b7 \u00b7 \u222a M \u02dc g , and Yg = M \u02dcg \u222a \u00b7 \u00b7 \u00b7 \u222a M \u02dc w for g \u2208 [w]. If there is no g \u2208 [w \u2212 1] Define Xg = M such that |Xg \u2229 Yg+1 | > j, then we construct a (j, k + j)-path decomposition S1 , . . . , Sw for \u02dc 1 , Sw = M \u02dc w , and S = M \u02dc \u222a (X \u22121 \u2229 Y +1 ), for 2 \u2264 \u2264 w \u2212 1. M as follows. Define S1 = M The first condition of Definition 3 is obviously satisfied. For the second condition, take Si \u02dc i and a \u2208 M \u02dc i for some i \u2264 i and i \u2264 i , and Si and i < < i . If a \u2208 Si \u2229 Si then a \u2208 M so a \u2208 S . For the first part of the third condition observe that because P has width (j, k), \u02dc | \u2264 k. Because we added at most j new elements to M \u02dc to obtain S , |S | \u2264 k + j for |M any . For the second part of the third condition, observe that S \u2286 X and S +1 \u2286 Y +1 , so |S \u2229 S +1 | \u2264 j for any . For the other case, suppose that for some g, |Xg \u2229 Yg+1 | > j. Recall that for each \u02dc g was constructed from the bag B \u02dc , and B \u02dc was constructed from a \u2208 {i1 , i2 , . . . , it }, M g g rule \u03c1g for some g , i.e., the g -th rule in the derivation D(M\u03d5 ) = (\u03c11 , \u03bb1 ), . . . , (\u03c1q , \u03bbq ). Let \u03b9 be the number of IDBs of P and \u03ba the maximum arity of any IDB of P. Recall that \u02dc is critical and C is homomorphism closed, M \u02dc cannot contain isolated elements Note that because M \u02dc except when M is a structure with a single element and no tuples. In this case the only critical and split\u02dc and the empty set is a (0, 0)-path decomposition for M. 
\u02dc minimal element is M 5 29 since P has width (j, k), any IDB contains at most j variables. Assume that the head IDB of \u03c1g is Ig (x g ). Then there are at most \u03b9j \u03ba nj possibilities for the head IDB Ig together with its variables instantiated to numbers in [n]. This means that there is an IDB I and a tuple t such that for at least t = \u03b9j \u03bat nj values of \u2208 {i1 , i2 , . . . , it }, it holds that Ig = I, and \u03bbg (x g ) = t. Let these t values be { 1 , . . . , t }. We establish later that we can choose values a , b \u2208 { 1 , . . . , t } such that the following inequality holds: \u02dc b \u2264 j. \u02dca \u2229 B \u02dc1b \u222a \u00b7 \u00b7 \u00b7 \u222a B \u02dc1a \u222a \u00b7 \u00b7 \u00b7 \u222a B B w w Assuming that we have such a and b, (5) we define D as: (\u03c11a , \u03bb1a ), . . . , (\u03c1gaa , \u03bbgaa ), (\u03c1gb +1 , \u03bbg b +1 ), . . . , (\u03c1q b b b b , \u03bbqb ). b That is, we \u201ccut\u201d the derivations D(M\u03d5 a ) at the g a -th rule, and cut the derivation D(M\u03d5 b ) at the g b -th rule, and concatenate the first part of D(M\u03d5 a ) with the second part of D(M\u03d5 b ). D is a valid derivation because at the point of concatenation, the head IDB of \u03c1gaa is the same as the IDB in the body of \u03c1gb +1 , and the variables of this IDB are instantib ated to the same values in both rules. Observe that the pruned tuple distribution of D is \u02dc b \u222a \u00b7\u00b7\u00b7 \u222a B \u02dc b. \u02dc =B \u02dc a \u222a \u00b7\u00b7\u00b7 \u222a B \u02dc a \u222aB \u02dc b ,...,B \u02dc b ). Set B \u02dc a, . . . , B \u02dc a, B (B g+1 1 g+1 1 w g w g \u02dc is isomorphic to a structure that can be obtained from M \u02dc by a nonempty sequence Claim. B of split operations. \u02dc 1 \u222a \u00b7\u00b7\u00b7 \u222a M \u02dc g of M \u02dc is isomorphic to B \u02dc a\u222a Proof of Claim. 
Observe that the substructure M 1 b \u02dc g+1 \u222a \u00b7 \u00b7 \u00b7 \u222a M \u02dc w is isomorphic to B \u02dc \u02dc b \u02dc a through \u03d5 a . Similarly, M \u00b7\u00b7\u00b7\u222aB g+1 \u222a \u00b7 \u00b7 \u00b7 \u222a Bw through g \u02dc and B. \u02dc \u03d5 b . Our goal is to understand the difference between M Notice that because any embedder maps mi \u2208 M into the interval (i \u2212 1) \u00b7 ns + 1, . . . , i \u00b7 ns , and for any i = i , (i \u2212 1) \u00b7 ns + 1, . . . , i \u00b7 ns \u2229 (i \u2212 1) \u00b7 ns + 1, . . . , i \u00b7 ns = \u2205, if i = i , then \u03d5 a (mi ) = \u03d5 b (mi ). Therefore \u03d5 a and \u03d5 b can return the same value only if they both \u02dc where get the same input. The set Xg \u2229 Yg+1 can be thought of as those elements of M \u02dc 1 \u222a\u00b7 \u00b7 \u00b7\u222a M \u02dc g and M \u02dc g+1 \u222a\u00b7 \u00b7 \u00b7\u222a M \u02dc w are \u201cglued together\u201d to obtain M. \u02dc Let U = B \u02dc1a \u222a\u00b7 \u00b7 \u00b7\u222a B \u02dc ga M b \u02dc where \u02dcg+1 \u02dcwb . The set U \u2229 V can be thought of as those elements of B and V = B \u222a \u00b7\u00b7\u00b7 \u222a B \u02dc a \u222a \u00b7\u00b7\u00b7 \u222a B \u02dc a and B \u02dc b \u222a \u00b7\u00b7\u00b7 \u222a B \u02dc b are \u201cglued together\u201d to obtain B. \u02dc B 1 g+1 g w \u02dc would be isomorphic to M, \u02dc If for all elements m \u2208 Xg \u2229 Yg+1 , \u03d5 a (m) = \u03d5 b (m), then B \u02dc a \u222a\u00b7 \u00b7 \u00b7\u222aB \u02dc a would be glued to B \u02dc b \u222a\u00b7 \u00b7 \u00b7\u222aB \u02dc b to obtain B \u02dc the same way as M \u02dc 1 \u222a\u00b7 \u00b7 \u00b7\u222aM \u02dcg i.e., B 1 g+1 g w \u02dc g+1 \u222a\u00b7 \u00b7 \u00b7\u222a M \u02dc w to obtain M. \u02dc But by Inequality 5, |Xg \u2229Yg+1 | > |U \u2229V |. 
In other is glued to M words, there are some elements m \u2208 Xg \u2229Yg+1 which have one copy for \u03d5 a , and another copy \u02dc Identifying \u03d5 a (m) and \u03d5 (m) for all such m would convert B \u02dc to a structure for \u03d5 b in B. b \u02dc Now it is easy to see that going backwards, splitting elements of M \u02dc would isomorphic to M. \u02dc yield a structure isomorphic to B. It remains to show why we can choose a and b to satisfy Inequality 5. Note that ( n )s \u02dc \u222a \u00b7\u00b7\u00b7 \u222a B \u02dc is an st = cs \u03b9js \u03ba nj \u2265 \u2126(ns\u2212j ). Also note that for any \u2208 { 1 , . . . , t }, B 1 w a \u02dc \u02dcwa \u2229 subset of [n]. So by Theorem 38, if for every pair a , b \u2208 { 1 , . . . , t }, B1 \u222a \u00b7 \u00b7 \u00b7 \u222a B 30 \u02dcwb \u2265 j + 1, then t \u2264 O(ns\u2212j\u22121) ). But as observed t \u2265 \u2126(ns\u2212j ), so for a \u02dc1b \u222a \u00b7 \u00b7 \u00b7 \u222a B B large enough n (as a function of s,j, \u03b9 and \u03ba, so n can be chosen in advance) Inequality 5 must hold for some a , b \u2208 { 1 , . . . , t }. Proof of Corollary 34. Let O = co-CSP(B), i.e., the set of all those successor structures that do not homomorphically map to B. We construct an obstruction set O for B such that every structure in O has pathwidth (j, k + j). O is the set of all critical and split minimal structures of O. Theorem 33 tells us that every structure in O has a (j, k + j)-path decomposition. To see that O is an obstruction set for B, take any structure S \u2208 co-CSP(B) = O. Keep on applying split operations to S and taking substructures of S (again, these operations are with respect to non-built-in relations only), as long as the resulting structure is still in O. That is, if we apply any split operation to S , or if we take any substructure of it, then the resulting structure is not in O any more. Then S \u2208 O because S is critical and split minimal with respect to O. 
Using Proposition 37, we also see that S \u2192 S. Because O is an obstruction set for B such that every structure in O has width (j, k +j), it follows from results of Dalmau in [7] that co-CSP(B) is definable in linear (j, k + j)Datalog. These proofs can be adapted for mnBP1s to obtain Theorem 35 and Corollary 36. Acknowledgement We thank Benoit Larose and Pascal Tesson for useful discussions and comments on an earlier draft. We also thank the anonymous referees for their helpful comments. References [1] F. Afrati and S. S. Cosmadakis. Expressiveness of restricted recursive queries. In Proceedings of the 42th ACM Symposium on Theory of Computing (STOC), pages 113\u2013 126, 1989. [2] E. Allender, M. Bauland, N. Immerman, H. Schnoor, and H. Vollmer. The complexity of satisfiability problems: Refining Schaefer\u2019s theorem. Journal of Computer and System Sciences, 75(4):245\u2013254, 2009. [3] L. Barto and M. Kozik. Constraint satisfaction problems of bounded width. In Proceedings of The 50th Annual Symposium on Foundations of Computer Science (FOCS), 2009. [4] A. A. Bulatov, A. A. Krokhin, and B. Larose. Dualities for constraint satisfaction problems. In N. Creignou, P. G. Kolaitis, and H. Vollmer, editors, Complexity of Constraints, volume 5250 of Lecture Notes in Computer Science, pages 93\u2013124. Springer, 2008. [5] S. Burris and H. P. Sankappanavar. A Course in Universal Algebra. Number 78 in Graduate Texts in Mathematics. Springer-Verlag, 1981. 31 [6] C. Carvalho, L. Egri, M. Jackson, and T. Niven. On Maltsev digraphs. In Proceedings of the 6th International Computer Science Symposium in Russia (CSR), pages 181\u2013194, 2011. [7] V. Dalmau. Constraint satisfaction problems in non-deterministic logarithmic space. In Proceedings of the 29th International Colloquium on Automata, Languages and Programming, ICALP, pages 414\u2013425. Springer-Verlag, 2002. [8] V. Dalmau and A. Krokhin. Majority constraints have bounded pathwidth duality. 
European Journal of Combinatorics, 29(4):821\u2013837, 2008. [9] V. Dalmau and B. Larose. Maltsev + Datalog \u2192 symmetric Datalog. In IEEE Symposium on Logic in Computer Science (LICS), pages 297\u2013306, 2008. [10] L. Egri, A. A. Krokhin, B. Larose, and P. Tesson. The complexity of the list homomorphism problem for graphs. Theory of Computing Systems, 51(2):143\u2013178, 2012. [11] L. Egri, B. Larose, and P. Tesson. Symmetric Datalog and constraint satisfaction problems in logspace. In IEEE Symposium on Logic in Computer Science (LICS), pages 193\u2013202, 2007. [12] T. Feder. Classification of homomorphisms to oriented cycles and of k-partite satisfiability. SIAM Journal on Discrete Mathematics, 14(4):471\u2013480, 2001. [13] T. Feder and M. Y. Vardi. The computational structure of monotone monadic SNP and constraint satisfaction: A study through Datalog and group theory. SIAM Journal on Computing, 28(1):57\u2013104, 1999. [14] P. Frankl and R. L. Graham. Old and new proofs of the Erd\u0151s-Ko-Rado Theorem. Journal of Sichuan University Natural Science Edition, 26, 1989. [15] E. Gr\u00e4del. Capturing complexity classes by fragments of second-order logic. Theoretical Computer Science, 101(1):35\u201357, 1992. [16] R. H\u00e4ggkvist, P. Hell, D. J. Miller, and V. Neumann-Lara. On multiplicative graphs and the product conjecture. Combinatorica, 8:63\u201374, 1988. [17] P. Hell and X. Zhu. Homomorphisms to oriented paths. Discrete Mathematics, 132:107\u2013114, 1994. [18] D. Hobby and R. McKenzie. The Structure of Finite Algebras, volume 76 of Contemporary Mathematics. American Mathematical Society, Providence, R.I., 1988. [19] N. Immerman. Descriptive complexity. Graduate Texts in Computer Science. Springer, 1999. [20] B. Larose and P. Tesson. Universal algebra and hardness results for constraint satisfaction problems. Theoretical Computer Science, 410(18):1629\u20131647, 2009. 32 [21] B. Larose and L. Z\u00e1dori. Bounded width problems and algebras. 
Algebra Universalis, 56(3-4):439\u2013466, 2007. [22] L. Libkin. Elements of finite model theory. Springer, 2004. [23] A. F. Pixley. Distributivity and permutability of congruence relations in equational classes of algebras. Proceedings of the American Mathematical Society (AMC), 14:105\u2013 109, 1963. [24] T. Schaefer. The complexity of satisfiability problems. In Proceedings of the 10th ACM Symposium on Theory of Computing (STOC), pages 216\u2013226, 1978. [25] I. Wegener. Branching programs and binary decision diagrams: theory and applications. Society for Industrial and Applied Mathematics (SIAM), Philadelphia, PA, USA, 2000. 33"} +{"id":"Clusa-et-al.2016.PlosOne","text":"RESEARCH ARTICLE An Easy Phylogenetically Informative Method to Trace the Globally Invasive Potamopyrgus Mud Snail from River’s eDNA Laura Clusa1*, Alba Ardura2, Fiona Gower3, Laura Miralles1, Valentina Tsartsianidou1, Anastasija Zaiko3,4, Eva Garcia-Vazquez1 1 Department of Functional Biology, University of Oviedo, C/ Julian Claveria s/n 33006, Oviedo, Spain, 2 USR3278-CRIOBE-CNRS-EPHE-UPVD, Laboratoire d’Excellence “CORAIL”, Université de Perpignan CBETM, 58 rue Paul Alduy, 66860, Perpignan Cedex, France, 3 Coastal and Freshwater Group, Cawthron Institute, 98 Halifax Street East, 7010, Nelson, New Zealand, 4 Marine Science and Technology Centre, Klaipeda University, H. Manto 84, LT-92294, Klaipeda, Lithuania a11111 * lauraclusa@gmail.com Abstract OPEN ACCESS Citation: Clusa L, Ardura A, Gower F, Miralles L, Tsartsianidou V, Zaiko A, et al. (2016) An Easy Phylogenetically Informative Method to Trace the Globally Invasive Potamopyrgus Mud Snail from River’s eDNA. PLoS ONE 11(10): e0162899. doi:10.1371/journal.pone.0162899 Editor: Richard C. Willson, University of Houston, UNITED STATES Received: March 21, 2016 Accepted: August 30, 2016 Published: October 5, 2016 Copyright: 2016 Clusa et al. 
This is an open access article distributed under the terms of the Creative Commons Attribution License, which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited. Potamopyrgus antipodarum (New Zealand mud snail) is a prosobranch mollusk native to New Zealand with a wide invasive distribution range. Its non-indigenous populations are reported from Australia, Asia, Europe and North America. Being an extremely tolerant species, Potamopyrgus is capable to survive in a great range of salinity and temperature conditions, which explains its high invasiveness and successful spread outside the native range. Here we report the first finding of Potamopyrgus antipodarum in a basin of the Cantabrian corridor in North Iberia (Bay of Biscay, Spain). Two haplotypes already described in Europe were found in different sectors of River Nora (Nalon basin), suggesting the secondary introductions from earlier established invasive populations. To enhance the surveillance of the species and tracking its further spread in the region, we developed a specific set of primers for the genus Potamopyrgus that amplify a fragment of 16S rDNA. The sequences obtained from PCR on DNA extracted from tissue and water samples (environmental DNA, eDNA) were identical in each location, suggesting clonal reproduction of the introduced individuals. Multiple introduction events from different source populations were inferred from our sequence data. The eDNA tool developed here can serve for tracing New Zealand mud snail populations outside its native range, and for inventorying mud snail population assemblages in the native settings if high throughput sequencing methodologies are employed. Data Availability Statement: All sequences from this work are available in the Genbank database (accession numbers KU932989-KU933010). 
Funding: This work was supported by the Spanish project MINECO-13-CGL2013-42415-R and the Asturias Regional Grant GRUPIN-2014-093. Laura Clusa holds a PCTI Grant from the Asturias Regional Government, referenced BP14-145. Alba Ardura holds a regional postdoctoral Marie Curie grant COFUND-CLARIN. Introduction Human-mediated translocations of marine organisms have become a widely acknowledged global environmental issue nowadays [1, 2]. Maritime activities like merchant shipping or yachting aid the spread of many species out of their native distribution range, and global change may facilitate the success of exotic species in recipient ecosystems until they become PLOS ONE | DOI:10.1371/journal.pone.0162899 October 5, 2016 1 / 16 A Phylogenetically Informative Method to Trace Mud Snail Invasion in eDNA from Water Samples Competing Interests: The authors have declared that no competing interests exist. invasive with adverse effects on environment and economies [3, 4]. A successful invader must exhibit a set of differential features [5] allowing passing the different steps of the invasion process and involved barriers: transportation, establishment and spread [3, 6]. Such species usually become of a particular concern for environmental managers and interest for researchers studying patterns in biological invasions. Potamopyrgus antipodarum (New Zealand mud snail) is one of the extremely successful invaders in aquatic ecosystems worldwide. This ovoviviparous prosobranch is currently found in Australia [7], Asia [8–10], Europe [11, 12] and North America [13–15]. Being extremely tolerant, P. antipodarum is a good candidate to survive the transportation to a new region. The presumed vector of its initial transoceanic introduction to Europe and USA is ballast water [13]. Its further spread within the region could be aided by aquaculture (e.g. translocation of stock or equipment), fisheries (e.g. with boats or gear), recreational activities (e.g. 
with angling gear or pets) [16] or by natural vectors such as birds or fish [14, 17, 18]. Once it reaches the new region, it can colonize and adapt to a wide range of habitats: estuaries [14, 15], lakes [19], rivers [20], saltwater [21] and even open seas [22]. This mud snail competes with native invertebrates for resources in invaded habitats dominating the invertebrate communities [16, 23]. For example, it has caused the decrease of Pyrgulopsis robusta population in USA [24] and the decline of native benthos density and diversity in Poland [25]. They consume up to 75% of primary production, leading to altered nitrogen and carbon cycles in invaded ecosystems [26, 27]. It has been found to resist the impact of parasites [11], and also that of potential predators because it is a poor and often indigestible food for salmon and other fish species [17]. Moreover, Sanderson et al. [28] suggested that non-indigenous species like P. antipodarum are threatening the conservation of endangered salmon due to the alterations they cause in the trophic chain. Due to extremely fast population growth rate it can reach high densities in a short time after incursion, reducing the opportunities for control and mitigation measures. Therefore, early detection is in this case crucial for the efficient rapid response and prevention of the further invasion. In the last few years, the use of environmental DNA has become a promising tool to detect and survey invasive species in aquatic ecosystems. This method seems to be more sensitive and efficient than traditional surveillance approaches, like visual detection, and does not disturb the aquatic fauna [29–31]. The use of specific primers on eDNA has been successfully demonstrated for a number of species. Examples are fish Petromyzon marinus and Salmo trutta [32], molluscs such as Rangia cuneata in the Baltic Sea [33] and Xenostrobus securis in North Spain [34], and others. 
Potamopyrgus antipodarum has also been detected previously directly from water samples [35], as presence-absence based on positive or negative PCR amplification of a fragment of the cytochrome b gene. Städler et al. [36] suggested that the origin of European Potamopyrgus antipodarum is located in New Zealand. They found only two haplotypes of 16S rDNA across all Europe shared with snails from the North Island of New Zealand. The marked divergence among the two European haplotypes implies successful colonization by two distinct mitochondrial lineages. The aim of this study was to demonstrate a cost-effective surveillance strategy for the species and to explore its invasion history in the North Iberian region. We developed specific primers for Potamopyrgus based on 16S rDNA sequences, for detecting this mud snail and inferring its lineage directly from water samples. PLOS ONE | DOI:10.1371/journal.pone.0162899 October 5, 2016 2 / 16 A Phylogenetically Informative Method to Trace Mud Snail Invasion in eDNA from Water Samples Materials and Methods The species studied Potamopyrgus antipodarum is small in invaded regions (6–7 mm size in average), but can grow up to 12 mm in its native range (New Zealand). It has a solid operculum and an elongated shell [37]. It is capable of surviving in a great range of environmental conditions: salinities 0–38 PSU [38–40], water temperatures 0–28°C [41], and can even resist short times of desiccation [3, 42]. Non-native populations are generally parthenogenetic, consisting almost exclusively of females [3]. One adult in a new habitat can produce an average of 230 juveniles per year [13]. This high reproductive capacity helps Potamopyrgus to establish and disperse quickly in a new area. Indeed this capacity is the main reason for the large ecological impact of P. antipodarum. Even a single individual can result in a massive invasion just in a few months. 
Field sampling River Nora (Asturias, north of Spain) is a tributary of the River Nalon basin, in the central Bay of Biscay region, of 67 km long and with an average discharge of 20.98 m3/s. It is completely isolated from downstream by an impassable barrier and a reservoir for hydroelectric power supply (Priañes dam, 43°23′02″N 5°58′26″W) built in 1953. In February-March 2015, mud snails were sampled from three sites within the River Nora, separated from each other by three kilometers. From upstream to downstream, the sites were: Colloto (coordinates 43.379283, -5.788667); Lugones (coordinates 43.401321, -5.822816); and San Claudio (coordinates 43.382938, -5.931142). Ecological conditions were very similar in all sampling sites, with a bottom of stones and gravel, shallow depth, and resembling water flow. The sampling protocol was the following: a 1m2 quadrat was randomly selected, and all present Potamopyrgus individuals were manually collected from the stones (including the underneath sides). This was done simultaneously by three researchers from each site, thus three replicates of 1m2 (approx.) were obtained per site. The average number of individuals per replica is a rough but comparable proxy of the density of the Potamopyrgus population present in each site. Additionally three liters of water were collected with sterile bottles from the same sampling locations before the search of Potamopyrgus individuals. As negative field controls one liter of water was taken from Llanes beach (seawater), coordinates 43.420461, -4.752003 and mainstream River Nalon (freshwater), coordinates 43.180926, -5.341015. No Potamopyrgus individuals were found in these sites despite intensive exploration. No specific permissions were required for sampling in these locations. The River Nora is not within a national park or other protected area. It is of public access. The species Potamopyrgus antipodarum is not native to Spain. 
Moreover it is listed in the register of invasive species (Spanish Directive of 4 August 2013). DNA extraction From tissue samples DNA was extracted with mollusc DNA Kit (Omega Bio-Tek, USA) following the instructions provided by the manufacturer. 1 L of the water samples was filtered using the Supor®-200 Membrane Filter (Pall Corporation) with 0.2 μm pore size. The filtration apparatus was cleaned with 10% bleach, rinsed with distilled water and autoclaved between each sampling site. DNA was extracted with the PowerWater® DNA Isolation Kit (Mobio laboratories). The filtration process and eDNA extractions were done under sterile conditions, in a laboratory unit where there were no other tissue samples, to avoid any contamination of the environmental DNA. eDNA extractions also were done inside a PCR laminar flow cabinet prior to extractions treated with ultraviolet light. Blanks PLOS ONE | DOI:10.1371/journal.pone.0162899 October 5, 2016 3 / 16 A Phylogenetically Informative Method to Trace Mud Snail Invasion in eDNA from Water Samples containing only water were used as controls in DNA extraction, to confirm that contamination did not occur in the process. Design of specific primers The 16S rRNA gene was chosen for the design of the primer, based on reference nucleotide sequences of 16S rDNA from GenBank plus the sequences obtained in the laboratory from Potamopyrgus samples of different origins. Sequences of this gene (either individual 16S DNA sequences or complete mitochondrial genomes), available for Potamopyrgus and other mollusk species were downloaded and aligned with the ClustalW application included in BioEdit [43]. Polymorphisms were analyzed with the DNASP software [44]. The different haplotypes were visualized employing the BioEdit Sequence Alignment Editor software [45]. The universal primers designed by Palumbi et al. [46] amplifying a 16S rDNA region of approximately 600 nucleotides were used for species barcoding. 
A region within these amplicons conserved in the genus Potamopyrgus but different in the rest of mollusk species was searched. This region was used to design a Potamopyrgus genus-specific reverse primer. As forward primer we used the universal 16SAr from Palumbi et al. [46]. Markers employed and PCR conditions PCR amplification of 16S rDNA using the universal primers described by Palumbi [46] was done with the following protocol. The amplification reaction was performed in a total volume of 40 μl, including Green GoTaq® Buffer 1X, 2.5 mM MgCl2, 0.25 mM dNTPs, 1μM of each primer, 0.65 U of DNA Taq polymerase (Promega) and 4 μl of template DNA. PCR conditions were the following: an initial denaturation at 95°C for 5 min followed by 35 cycles of denaturation at 94°C for 1 min, annealing at 55°C for 1 min, extension at 72°C for 2 min and a final extension step at 72°C for 7 min. The PCR products were sequenced in the DNA sequencing service Macrogen Europe, and the species identifications were confirmed using the BLAST tool from the NCBI. PCR amplification of partial 16S rDNA from tissue DNA using the new primers set (the newly designed reverse primer and the universal Palumbi’s forward primer) was performed in a total volume of 20μl with the same conditions above, except for the annealing temperature. We assayed six different annealing temperatures: from 55°C to 60°C for selecting the best one (that provides clean and clear amplification products of the expected size with no extra bands). The assays of annealing temperatures showed that the best results were obtained at 60°C. All the PCR products were visualized in 2% agarose gels with 2.5 μl of SimplySafe™. 
PCR amplification of a fragment of 16S rDNA from the bulk DNA extracted from water samples (eDNA) with the specific primer was performed in a total volume of 20 μl, including Green GoTaq 1Buffer 1X, 2.5mM MgCl2, 0.25mM dNTPS, 1μM of each primer, 6 μl of template DNA, 200ng/μl of BSA (bovine serum albumin) and 0.65 U of DNA Taq polymerase (Promega). The PCR conditions were the same as described above, at the best annealing temperature, but with 45 cycles instead of 35. Amplification products from water samples were purified with the Agarose-Out DNA purification kit (EUR1X) and sequenced by Macrogen service. The cytochrome oxidase I (COI) gene was amplified from DNA extracted from tissue and water samples using the universal primers for invertebrates designed by Geller et al. [47] and following the protocol described therein. The difference between the protocols used for tissue and water DNA was the number of cycles in the PCR– 35 and 45 respectively. Negative controls containing only PCR reagents and distilled water were added in every PCR. PLOS ONE | DOI:10.1371/journal.pone.0162899 October 5, 2016 4 / 16 A Phylogenetically Informative Method to Trace Mud Snail Invasion in eDNA from Water Samples Marker validation The new primer was first tested in silico by an alignment with the BLAST tool in the NCBI database [48]. Adult individuals of brackish and freshwater mollusks (five per species) were collected for testing in vitro possible cross-species amplification of the new primer (Table 1). PCR amplification with the universal primers of Palumbi et al. [46] was done. The new set of specific primers was assayed on DNA extracted from eleven mollusk species described in Table 1. The sensitivity of the specific primers was determined in vitro with serial dilutions of Potamopyrgus antipodarum DNA from a known concentration (43μg/ml). PCR amplification and visualization of the PCR product in a 2% agarose gel were performed for each concentration. 
DNA concentration was measured with a spectrophotometer (SimpliNano™ GEHealthcare). From water eDNA, a fragment of the 16S rDNA was PCR-amplified with the new specific primers set using the protocol described in 2.4. As a positive control, the COI gene was amplified from each eDNA sample as described in 2.4, to test for the quality of the DNA and discard false negatives due to excessive DNA degradation, inhibitors or other reasons. Validation of negative results To confirm that the negative results of PCR with the specific primers performed on eDNA samples were true and not produced by any interference or inhibitor present in the template, the subsamples of the Llanes beach eDNA (6μl) were spiked with Potamopyrgus antipodarum DNA of two concentrations: 2μl of P. antipodarum stock DNA (43 μg/ml), and 2μl of the 1:50 000 dilution from the same stock. PCR amplifications were performed in the same conditions as explained before. Phylogenetic analysis Potamopyrgus individuals from River Nora and from different locations in New Zealand (as representatives of native populations), were collected and taxonomically classified de visu (Table 2). Three different sequences were obtained from these samples: COI gene [47], 16S rRNA gene [46] and partial 16S rDNA amplified with the specific primers set. Additional Table 1. Adult mollusks sequenced in this study for 16S rRNA and cytochrome oxidase I genes. 
Species Habitat Origin Common name Potamopyrgus antipodarum Mytilus galloprovincialis Collection site freshwater, brackish non- native New Zealand mudsnail Nora River marine Spanish native Mediterranean mussel Aviles estuary Mytilus trossulus marine non-native Foolish mussel Baltic Sea Ruditapes philippinarum marine non-native Japanese carpet Shell Aviles estuary Xenostrobus securis brackish non-native Axe-head mussel Aviles estuary Mya arenaria marine non-native Soft-shell clam Baltic Sea Crassostrea gigas marine non-native Giant oyster Aviles estuary Tylomelania kuli freshwater non-native Sulawesi snail pet shop Tylomelania toradjarum freshwater non-native Sulawesi snail pet shop Neritina canalis brackish non-native Nerite pet shop Neritina punctulata freshwater non-native Nerite pet shop Bivalves and gastropods (five individuals per species) employed for the evaluation of cross-amplification of the specific primers. The origin (native or nonnative) is given in relation with Spanish waters. doi:10.1371/journal.pone.0162899.t001 PLOS ONE | DOI:10.1371/journal.pone.0162899 October 5, 2016 5 / 16 A Phylogenetically Informative Method to Trace Mud Snail Invasion in eDNA from Water Samples Table 2. Potamopyrgus samples, collected from Asturias and New Zealand, sequenced in this study for 16S rRNA and cytochrome oxidase I genes. Sample Place Country Species Pa Ast1 01 Colloto- River Nora Spain P. antipodarum Pa Ast2 01 Lugones- River Nora Spain P. antipodarum Pa Ast2 02 Lugones- River Nora Spain P. antipodarum Pa Ast2 02 Lugones- River Nora Spain P. antipodarum Pa Ast3 01 San Claudio- River Nora Spain P. antipodarum Pa NZ1 01 Collins River New Zealand P. antipodarum Pa NZ2 01 Onomalutu River New Zealand P. antipodarum Pe NZ3 01 Maitai River Site 1 New Zealand P. estuarinus Pe NZ3 02 Maitai River Site 1 New Zealand P. estuarinus Pe NZ3 03 Maitai River Site 1 New Zealand P. estuarinus Pe NZ4 01 Maitai River Site 2 New Zealand P. 
estuarinus Pe NZ4 02 Maitai River Site 2 New Zealand P. estuarinus Pe NZ4 03 Maitai River Site 2 New Zealand P. estuarinus Pe NZ5 01 Mangroves Matua Rangarawa New Zealand P. estuarinus Pe NZ5 02 Mangroves Matua Rangarawa New Zealand P. estuarinus Pe NZ5 03 Mangroves Matua Rangarawa New Zealand P. estuarinus doi:10.1371/journal.pone.0162899.t002 sequences assigned to Potamopyrgus species were downloaded from GenBank. For each gene, the sequences were aligned with the ClustalW application included in BioEdit [43]. The alignment was converted to MEGA file and a phylogenetic neighbor-joining tree was built using MEGA 4.0 [49], with 10000 bootstrapping and the evolutionary distances were computed using the Tamura-Nei method [50]. Results Specific primers The new specific primer designed in silico within the 16S rDNA sequence was: Reverse primer: 16SPA-R (5’-TCAAAGATTTTGGATCATAGCT-3’). Using the 16SAr described by Palumbi et al. [46]: 16SAr (5’-CGCCTGTTTATCAAAAACA T-3’) as a forward primer and the new 16SPA-R as a reverse primer, a region of 380 nucleotides within the 16S rRNA gene was amplified. The region is located between sites 5350 and 5730 of the Potamopyrgus antipodarum mitochondrion complete genome with GenBank accession number GQ996421.1. Marker validation From BLAST assays in silico, the new primer retrieved significant alignments, with 100% identity, 100% coverage, 0.018 E-value and score of 44.1, with Potamopyrgus antipodarum, P. estuarinus, P. doci, P. opidanus, P. troglodytes sequences of 16S rRNA gene. The same values were also obtained with 16S rDNA sequences of Caldicochlea globosa, an Australian endemic aquatic snail, and several species of the genus Sororipyrgus that are Hydrobiidae snails endemic in New Zealand. All these species except for Potamopyrgus antipodarum are currently absent from European aquatic ecosystems. PCR reactions for assessing primers’ specificity discarded cross-amplification with other mollusks assayed in this study (Table 1A). 
Consistently with in silico results, in vitro tests showed that the specific primers give positive PCR amplification (amplicons visible in PLOS ONE | DOI:10.1371/journal.pone.0162899 October 5, 2016 6 / 16 A Phylogenetically Informative Method to Trace Mud Snail Invasion in eDNA from Water Samples Fig 1. Agarose gel showing PCR amplification products obtained with the species primers’ set for 16S rRNA gene from serial dilutions of Potamopyrgus antipodarum DNA (43μg/ml): 1 (no dilution), 1:5, 1:10, 1:25, 1:50, 1:100, 1:500, 1:1000, 1:5000, 1:10000, 1:50000, 1:100000 and a negative control. doi:10.1371/journal.pone.0162899.g001 agarose gels; data not shown) on the species listed in Table 1A only from DNA samples of Potamopyrgus. The threshold of detection for PCR-visualization in agarose gels was 0.86μg/l, because we obtained a weak but visible band of the amplicon size in the dilution 1 to 1:50 000 from a sample with a concentration of 43μg/ml (Fig 1). Potamopyrgus antipodarum population in Asturias In River Nora the Potamopyrgus antipodarum population was not identical in the three sampling sites. From the sampling results, the estimated population density was higher in the midstream location of Lugones (Table 3), with 63 individuals/m2; meanwhile downstream San Claudio has quite low density of 6 individuals/m2. None of the individuals was >7mm. In the downstream site the relative abundance of juveniles (<3mm) was clearly lower than in upstream areas (Table 3). In all eDNA samples obtained from water, PCR with the universal COI primers [47] yielded amplification products of the expected size around 650 nucleotides (Fig 2). The water samples from River Nora provided positive PCR amplification with the taxon-specific primers designed herein (Fig 3A). In the other two control sites, Llanes beach and River Nalón; no amplification was obtained with these primers as expected since Potamopyrgus mollusks are not present there. 
The positive bands observed in agarose gel for River Nora water samples were purified, sequenced and the sequences unequivocally identified as Potamopyrgus antipodarum, GenBank accession numbers KU933000-KU933002. The PCR products gave clear chromatograms directly readable, without any trace of nucleotide mixture in any site. Table 3. Potamopyrgus antipodarum specimens collected from different sites within Nora River, classed by size, and total density. The same sampling protocol from three replicates of 1m2 was employed in all sites. Site Density (individuals/m2) Individuals ≥ 3 mm Individuals < 3 mm Colloto (upstream) 18 33.3% 66.7% Lugones (midstream) 63 39.7% 60.3% San Claudio (downstream) 6 83.3% 16.7% doi:10.1371/journal.pone.0162899.t003 PLOS ONE | DOI:10.1371/journal.pone.0162899 October 5, 2016 7 / 16 A Phylogenetically Informative Method to Trace Mud Snail Invasion in eDNA from Water Samples Fig 2. Amplification products of the cytochrome oxidase I gene obtained from PCR with universal primers on water samples. Sampling sites: River Nora: Colloto, Lug (Lugones) and San Claudio, River Nalón and Llanes beach; – and + are negative and positive controls respectively. doi:10.1371/journal.pone.0162899.g002 In the agarose gel it can be seen that the band for Lugones is bigger and brighter than for San Claudio location (Fig 3A), concordantly with different population densities. The method can be considered quite sensitive because PCR product was detectable in agarose gel even for San Claudio sample where the observed density was only 6 individuals/m2 (Fig 3A). On the other hand, the negative results obtained from field water samples were confirmed by the additional validation test. Positive PCR amplification from Llanes beach water sample was obtained when Potamopyrgus DNA was added (Fig 3B). A clear band was seen in the two mixtures, one of high concentration with an amount of approximately 86 ng of P. antipodarum Fig 3. 
A) PCR products of the partial 16S rRNA gene obtained with the taxon-specific primers, on DNA extracted from water samples of River Nora (Colloto, Lugones and San Claudio sites), River Nalón and Llanes beach. Positive amplifications are marked with an arrow. B) Validation of negative results: amplification products of the same gene obtained from Llanes beach water DNA spiked with Potamopyrgus antipodarum DNA. Ll+PahC and Ll+PalC are high and low concentration of Potamopyrgus antipodarum respectively (43μg/ml and dilution 1:50000 respectively).–and +, negative and positive controls respectively. doi:10.1371/journal.pone.0162899.g003 PLOS ONE | DOI:10.1371/journal.pone.0162899 October 5, 2016 8 / 16 A Phylogenetically Informative Method to Trace Mud Snail Invasion in eDNA from Water Samples DNA and the other of low concentration with approximately 1.72pg of DNA. The positive results obtained in this last PCR indicate that there were no inhibitors in the environmental samples. This confirms that the negative results obtained from environmental DNA were not due to the presence of inhibitors in the water sample but to the absence of Potamopyrgus DNA in the samples. Therefore false negatives were discarded. Phylogenetic inferences From the individuals analyzed in this study a total of 26 haplotypes were found: 11 (two from Asturias P. antipodarum individuals), 8 (two from Asturias individuals) and 7 (also two from Asturias) for COI gene, long, and short 16S rDNA fragments respectively. The haplotypes obtained in this study are available in NCBI GenBank database with the accession numbers KU932989-KU932999 (COI gene), KU933003-KU933010 (16S rDNA large fragment). The shorter 16S rDNA amplicon obtained from taxon-specific primers corresponds to the sequence comprised between site 01 and site 325 on KU933003- KU933010. 
The tree reconstructed from the COI gene (Fig 4A) and 16S rDNA (Fig 4B) haplotypes obtained in our mud snail samples with universal primers separated consistently the samples from San Claudio (downstream) from those collected mid- and upstream. Downstream samples clustered with New Zealand samples (Onomalutu River) while the rest of River Nora samples, all with the same haplotype, clustered with River Collins samples, also from New Zealand. The Potamopyrgus estuarinus samples of Maitai River and the Matua Rangarawa Mangroves (New Zealand) clustered, as expected, in an independent branch for the two genes. They were separated by locations (Maitai River in one branch and Matua Rangarawa Mangroves in another) for 16S rRNA gene (Fig 4B), with apparent geographical differentiation. For the shorter 16S rDNA fragment amplified with the primers designed herein, the two haplotypes of P. antipodarum found in Asturias (from both water samples and mud snail individuals) were also separated in different clusters (Fig 5). Samples from Lugones and Colloto (purple diamond in Fig 5) formed a monophyletic group with one haplotype (JQ346709) found in Germany, France, Hungary, Poland, Lithuania and United Kingdom; with the haplotype AY955377 found in Australia (Tasmania), and the New Zealand haplotype AY955376. The haplotype found downstream River Nora (PaAst3-03, San Claudio location) was in a separate clade supported by a bootstrap value of 65, containing the haplotype JN639014 found in Estonia and Wales; the haplotype JN639014 found in Hammond Harbor in Oregon and Devils Lake in Wisconsin; and the New Zealand haplotype AY955393 (North Island). The haplotypes of New Zealand South Island were also separated in this tree, the haplotype from River Collins being monophyletic with the upstream and midstream Asturian samples and other European references, and the Onomalutu River haplotype exhibiting an intermediate and less clear position in the middle of the two branches (Fig 5). 
Indeed the haplotypes obtained from Asturias water samples (blue circles in Fig 5, GenBank accession numbers KU933000-KU933002) matched perfectly with the haplotypes of the individuals found from the same place. The Potamopyrgus estuarinus samples from New Zealand analyzed in this work formed a clearly differentiated clade with a P. estuarinus reference sequence from GenBank (AY634082), supported by a bootstrap value of 99. Since the two species are closely related, this confirms the phylogenetic value of this relatively short marker. Discussion This is the first record of Potamopyrgus antipodarum from the central basin of the Bay of Biscay (Asturias, North of Spain). In the Iberian Peninsula the species has been detected in Atlantic and Mediterranean basins [16, 20], but not in the Bay of Biscay façade. PLOS ONE | DOI:10.1371/journal.pone.0162899 October 5, 2016 9 / 16 A Phylogenetically Informative Method to Trace Mud Snail Invasion in eDNA from Water Samples Fig 4. Phylogenetic tree reconstructed from: A) cytochrome oxidase I gene (621 nucleotides), and B) 16S rDNA haplotypes (496 nucleotides), obtained with universal primers [46, 47] from the individuals analyzed in this study. Pa and Pe are Potamopyrgus antipodarum and P. estuarinus respectively. doi:10.1371/journal.pone.0162899.g004 The results obtained in this study are surprising in several ways. First, in a small river (River Nora) and at short distance among sampling locations we have found two different haplotypes. These haplotypes correspond to the haplotypes t and z described for European Potamopyrgus antipodarum by Städler et al. [36]. These authors found the two haplotypes together only in two locations: Loch of Stennes (Orkney, Scotland) and Slack estuary (Nord-Pas de Calais, France). In the rest of sites studied across Europe only one haplotype was present in each location. The two haplotypes do not, however, seem to be admixed in the same place. 
The sequences obtained from water samples did not exhibit any sign of overlapped chromatogram peaks in PLOS ONE | DOI:10.1371/journal.pone.0162899 October 5, 2016 10 / 16 A Phylogenetically Informative Method to Trace Mud Snail Invasion in eDNA from Water Samples Fig 5. Phylogenetic tree of partial 16S rDNA sequence (325 nucleotides) with the taxon-specific primer reconstructed from the Potamopyrgus haplotypes (Pa, P. antipodarum; Pe, P. estuarinus) obtained in this work and references obtained from GenBank (the accession number is indicated). The geographic origin of the voucher P. antipodarum specimens are: Ast1; Ast2; Ast3; USA; NZ; NZ1; NZ2; NEU and EU are: Colloto, Lugones, S. Claudio (Asturias 1, 2, and 3), Wisconsin, New Zealand; Collins River, Onomalutu River (New Zealand, South island); Estonia, France (European samples). Sequences obtained from the water samples, Asturias individuals and New Zealand individuals (south island) sampled in this study are indicated with a blue circle, a purple diamond and a yellow square respectively. doi:10.1371/journal.pone.0162899.g005 PLOS ONE | DOI:10.1371/journal.pone.0162899 October 5, 2016 11 / 16 A Phylogenetically Informative Method to Trace Mud Snail Invasion in eDNA from Water Samples the polymorphic sites described by Städler et al. [36]. Low densities and scarce juveniles found downstream suggest that a second and recent introduction occurred in San Claudio site. Another interesting result of this study was high sensitivity of the taxon-specific primers developed for detecting Potamopyrgus DNA in water samples. Goldberg et al. [35] designed a marker in the cytochrome b gene region that was able to detect Potamopyrgus individuals at densities as low as 11 individual/m2, filtering 4-L water samples. In our study successful amplification of the 16S rDNA based marker, with amplicons visible on agarose gel, was obtained from 1-L water samples and almost half density (6 individual/m2). 
This viviparous mud snail does not have a planktonic stage, so the DNA detected from water samples is most likely free-floating DNA. These results are really encouraging because they imply the usefulness of this method for early detection of the species when the population density is still low at the initial stage of invasion or on the edge of the range expansion area. This PCR method is economical (the estimated average cost was 10 euros per water sample) and faster in comparison to Metabarcoding [51], and also to qPCR [52] and could be easily added into routine surveillance programs. The method has a shortcoming, however. Simple positive amplification and visualization in agarose gel (or by capillary electrophoresis), that can serve for detecting the species in Europe and North America because it is unique in its genus there, are not enough for population monitoring in its native settings. In silico, and proved in vitro for P. estuarinus, the primers can anneal with other species of the genus Potamopyrgus that are present in Australia and New Zealand. The DNA region employed here as a marker has the phylogenetic power to discriminate between closely related species of this genus (Fig 4). The same primers could be used in native settings using high throughput methodologies [53], or simply cloning-sequencing to separate the different amplicons. Since the region amplifies well from water samples, after further development it could be employed as an additional method for surveys of native Potamopyrgus species assemblages. The origin of the Potamopyrgus antipodarum found in Asturias seems to be the same as for the rest of Europe, since the two haplotypes described by Städler et al. [36] were found. The particular introduction pathway to the region, however, is still unclear. Ballast water, one of the inferred vectors of this invader [13], can be reasonably discarded in our case because the invaded habitats are not accessible from the sea (isolated by an impassable dam). 
Upstream in River Nalón we found neither P. antipodarum individuals nor traces of its DNA in the water (negative controls). Aquaculture can also be disregarded because there are no aquaculture facilities in River Nora valley. Short-distance transport by fishermen as suggested by Alonso and Castro-Díez [16] is plausible. Casual hikers may contribute to short-distance transport as well. The bird-mediated transport suggested by Lassen [54] is also plausible, since the region is in the middle of the 600-km corridor of northern Spain that is an important and rich wintering ground for many birds [55, 56]. Another possibility, still unexplored, is that they could come from aquarium releases as accompanying fauna of fish pets, as already described for other species [57]. Loo et al. [58] predicted extremely fast spread of this species, forecasting a total invasion of North America freshwater ecosystem in a relatively short time if actions are not taken to prevent its expansion. It seems that in the Cantabrian range region, or at least in the river where it was detected for the first time, the population density is still not too high, especially downstream. Rapid application of containment measures and eradication efforts, as well as a close surveillance of the present populations could be strongly recommended. Conclusion We developed a specific set of primers to detect Potamopyrgus species directly from the water samples (environmental DNA). With this molecular tool it is possible to establish the species PLOS ONE | DOI:10.1371/journal.pone.0162899 October 5, 2016 12 / 16 A Phylogenetically Informative Method to Trace Mud Snail Invasion in eDNA from Water Samples identity and the phylogenetic characteristics of the invasion, sequencing PCR amplicons obtained from environmental samples. This powerful (and economical if limited to visualization on gel) tool can be useful for early detection of New Zealand mud snail in its expanded range of invasion. 
Acknowledgments We would like to thank the Government of the Principality of Asturias, the Cawthron Institute in New Zealand and the two anonymous Reviewers of PLoS One who helped to much improve the manuscript. Author Contributions Conceived and designed the experiments: EGV AZ. Performed the experiments: LC AA VT. Analyzed the data: LC LM AA VT. Contributed reagents/materials/analysis tools: LC LM AA VT. Wrote the paper: LC EGV AZ LM AA FG VT. Sampling: LC LM AA VT FG AZ EGV. References 1. Cambray JA. Impact of indigenous species biodiversity caused by the globalization of alien recreational freshwater fisheries. Hydrobiologia. 2003; 500: 217–230. 2. Leprieur F, Beauchard O, Blanchet S, Oberdorff T and Brosse S. Fish invasions in the world’s river systems: When natural processes are blurred by human activities. PLoS Biology. 2008; 6(2): e28. doi: 10. 1371/journal.pbio.0060028 PMID: 18254661 3. Alonso A and Castro-Dı́ez P. What explains the invading success of the aquatic mud snail Potamopyrgus antipodarum (Hydrobiidae, Mollusca)? Hydrobiologia. 2008; 614(1): 107–116. Available: http:// doi.org/10.1007/s10750-008-9529-3 4. Chown SL, Hodgins KA, Griffin PC, Oakeshott JG, Byrne M and Hoffmann AA. Biological invasions, climate change and genomics. Evolutionary Applications. 2015; 8(1): 23–46. Available: http://doi.org/10. 1111/eva.12234 doi: 10.1111/eva.12234 PMID: 25667601 5. Marchetti MP, Moyle PB, Levine R. Alien fishes in California watersheds: characteristics of successful and failed invaders. Ecological Applications. 2004; 14(2): 587–596. 6. Olenin S, Elliott M, Bysveen I, Culverhouse PF, Daunys D, Dubelaar GBJ et al. Recommendations on methods for the detection and control of biological pollution in marine coastal waters. Marine Pollution Bulletin. 2011; 62: 2598–2604. doi: 10.1016/j.marpolbul.2011.08.011 PMID: 21889171 7. Schreiber ESG, Lake PS and Quinn GP. Facilitation of native stream fauna by an invading species? 
Experimental investigations of the interaction of the snail, Potamopyrgus antipodarum (Hydrobiidae) with native benthic fauna. Biological Invasions. 2002; 4(3): 317–325. Available: http://doi.org/10.1023/ A:1020925022843 8. Shimada K and Urabe M. Comparative ecology of the alien freshwater snail Potamopyrgus antipodarum and the indigenous snail Semisulcospira spp. Venus. 2003; 62: 39–53. 9. Naser MD and Son MO. First record of the New Zealand mud snail Potamopyrgus antipodarum (Gray 1843) from Iraq: The start of expansion to western Asia? Aquatic Invasions. 2009; 4(2): 369–372. Available: http://doi.org/10.3391/ai.2009.4.2.11 10. Hamada K, Tatara Y and Urabe M. Survey of mitochondrial DNA haplotypes of Potamopyrgus antipodarum (Caenogastropoda: Hydrobiidae) introduced into Japan. Limnology. 2013; 14(3): 223–228. Available: http://doi.org/10.1007/s10201-013-0405-0 11. Gérard C, Blanc A and Costil K. Potamopyrgus antipodarum (Mollusca:Hydrobiidae) in continental aquatic gastropod communities: Impact of salinity and trematode parasitism. Hydrobiologia. 2003; 493: 167–172. Available: http://doi.org/10.1023/A:1025443910836 PLOS ONE | DOI:10.1371/journal.pone.0162899 October 5, 2016 13 / 16 A Phylogenetically Informative Method to Trace Mud Snail Invasion in eDNA from Water Samples 12. Radea C, Louvrou I and Economou-Amilli A. First record of the New Zealand mud snail Potamopyrgus antipodarum J.E. Gray, 1843 (Mollusca: Hydrobiidae) in Greece—Notes on its population structure and associated microalgae. Aquatic Invasions. 2008; 3(3): 341–344. Available: http://doi.org/10.3391/ ai.2008.3.3.10 13. Zaranko DT, Farara DG and Thompson FG. Another exotic mollusc in the Laurentian Great Lakes: the New Zealand native Potamopyrgus antipodarum (Gray 1843) (Gastropoda, Hydrobiidae). Canadian Journal of Fisheries and Aquatic Sciences. 1997; 54(4): 809–814. Available: http://doi.org/10.1139/ cjfas-54-4-809 14. Bersine K, Brenneis VEF, Draheim RC, Rub AMW, Zamon JE, Litton RK et al. 
Distribution of the invasive New Zealand mudsnail (Potamopyrgus antipodarum) in the Columbia River Estuary and its first recorded occurrence in the diet of juvenile Chinook salmon (Oncorhynchus tshawytscha). Biological Invasions. 2008; 10(8): 1381–1388. Available: http://doi.org/10.1007/s10530-007-9213-y 15. Davidson TM, Brenneis VEF, de Rivera C, Draheim R and Gillespie GE. Northern range expansion and coastal occurrences of the New Zealand mud snail Potamopyrgus antipodarum (Gray, 1843) in the northeast Pacific. Aquatic Invasions. 2008; 3(3): 349–353. Available: http://doi.org/10.3897/ neobiota.14.3140 16. Alonso A and Castro-Dı́ez P. The exotic aquatic mud snail Potamopyrgus antipodarum (Hydrobiidae, Mollusca): State of the art of a worldwide invasion. Aquatic Sciences. 2012; 74(3): 375–383. Available: http://doi.org/10.1007/s00027-012-0254-7 17. Vinson MR and Baker MA. Poor Growth of Rainbow Trout Fed New Zealand Mud Snails Potamopyrgus antipodarum. North American Journal of Fisheries Management. 2008; 28(3): 701–709. Available: http://doi.org/10.1577/M06-039.1 18. Bruce RL, Moffitt CM and Dennis B. Survival and Passage of Ingested New Zealand Mudsnails through the Intestinal Tract of Rainbow Trout. North American Journal of Aquaculture. 2009; 71(4): 287–301. 19. Schmidlin S, Schmera D and Baur B. Alien molluscs affect the composition and diversity of native macroinvertebrates in a sandy flat of Lake Neuchâtel, Switzerland. Hydrobiologia. 2012; 679: 233– 249. 20. Múrria C, Bonada N and Prat N. Effects of the invasive species Potamopyrgus antipodarum (Hydrobiidae, Mollusca) on community structure in a small Mediterranean stream. Fundamental and Applied Limnology. 2008; 171(2): 131–143. 21. Thomsen MS, Wernberg T, Silliman BR and Josefson AB. Broad scale patters of abundance of nonindigenous soft-bottom invertebrates in Denmark. Helgoland Marine Research. 2009; 63(2): 159–167. 22. Grant A, Briggs A. Toxicity of ivermectin to estuarine and marine invertebrates. 
Marine Pollution Bulletin. 1998; 36: 540–541. 23. Larson MD and Black RA. Assessing interactions among native snails and the invasive New Zealand mud snail, Potamopyrgus antipodarum, using grazing experiments and stable isotope analysis. Hydrobiologia. 2016; 763(1): 147–159. Available: http://doi.org/10.1007/s10750-015-2369-z 24. Riley LA and Dybdahl MF. The roles of resource availability and competition in mediating growth rates of invasive and native freshwater snails. Freshwater Biology. 2015; 60: 1308–1315. Available: http:// doi.org/10.1111/fwb.12566 25. Spyra A, Kubicka J and Strzelec M. The Influence of the Disturbed Continuity of the River and the Invasive Species—Potamopyrgus antipodarum (Gray, 1843), Gammarus tigrinus (Sexton, 1939) on Benthos Fauna: A Case Study on Urban Area in the River Ruda (Poland). Environmental Management. 2015; 56: 233–244. Available: http://doi.org/10.1007/s00267-015-0483-3 doi: 10.1007/s00267-0150483-3 PMID: 25868573 26. Hall RO, Tank JL and Dybdahl MF. Exotic Snails Dominate Nitrogen and Carbon Cycling in a highly productive stream. Frontiers in Ecology and the Environment. 2003; 1(8): 407–411. 27. Hall RO, Dybdahl MF and VanderLoop MC. Extremely High Secondary Production of Introduced Snails in Rivers. Ecological Applications. 2006; 16: 1121–1131. doi: 10.1890/1051-0761(2006)016 [1121:EHSPOI]2.0.CO;2 PMID: 16827007 28. Sanderson BL, Barnas KA and Rub AMW. Nonindigenous Species of the Pacific Northwest: An Overlooked Risk to Endangered Salmon? BioScience. 2009; 59(3): 245–256. Available: http://doi.org/10. 1525/bio.2009.59.3.9 29. Ficetola GF, Miaud C, Pompanon F and Taberlet P. Species detection using environmental DNA from water samples. Biology Letters. 2008; 4(4): 423–425. Available: http://doi.org/10.1098/rsbl.2008.0118 doi: 10.1098/rsbl.2008.0118 PMID: 18400683 30. Blanchet S. The use of molecular tools in invasion biology: An emphasis on freshwater ecosystems. Fisheries Management and Ecology. 2012; 19(2): 120–132. 
Available: http://doi.org/10.1111/j.13652400.2011.00832.x PLOS ONE | DOI:10.1371/journal.pone.0162899 October 5, 2016 14 / 16 A Phylogenetically Informative Method to Trace Mud Snail Invasion in eDNA from Water Samples 31. Thomsen PF, Kielgast J, Iversen LL, Møller PR, Rasmussen M and Willerslev E. Detection of a Diverse Marine Fish Fauna Using Environmental DNA from Seawater Samples. PLoS ONE. 2012; 7(8): 1–9. Available: http://doi.org/10.1371/journal.pone.0041732 32. Gustavson MS, Collins PC, Finarelli JA, Egan D, Conchúir RÓ, Wightman GD et al. An eDNA assay for Irish Petromyzon marinus and Salmo trutta and field validation in running water. Journal of Fish Biology. 2015; 87: 1254–1262. Available: http://doi.org/10.1111/jfb.12781 doi: 10.1111/jfb.12781 PMID: 26377304 33. Ardura A, Zaiko A, Martinez JL, Samulioviene A, Semenova A and Garcia-Vazquez E. eDNA and specific primers for early detection of invasive species–a case study on the bivalve Rangia cuneata, currently spreading in Europe. Marine Environmental Research. 2015; 112(B): 48–55. Available: http:// doi.org/10.1016/j.marenvres.2015.09.013 34. Devloo-Delva F, Miralles L, Ardura A, Borrell YJ, Pejovic I, Tsartsianidou V et al. Detection and characterisation of the biopollutant Xenostrobus securis (Lamarck 1819) Asturian population from DNA Barcoding and eBarcoding. Marine Pollution Bulletin. 2016. Available: http://dx.doi.org/10.1016/j. marpolbul.2016.03.008 35. Goldberg CS, Sepulveda A, Ray A, Baumgardt J and Waits LP. Environmental DNA as a new method for early detection of New Zealand mudsnails (Potamopyrgus antipodarum). Freshwater Science. 2013; 32(3): 792–800. Available: http://doi.org/10.1899/13-046.1 36. Städler T, Frye M, Neiman M and Lively CM. Mitochondrial haplotypes and the New Zealand origin of clonal European Potamopyrgus, an invasive aquatic snail. Molecular Ecology. 2005; 14(8): 2465– 2473. Available: http://doi.org/10.1111/j.1365-294X.2005.02603.x PMID: 15969728 37. Winterbourn M. 
The New Zealand species of Potamopyrgus (Gastropoda: Hydrobiidae). Malacologia. 1970; 10(2): 283–321. 38. Jacobsen R and Forbes VE. Clonal variation in life-history traits and feeding rates in the gastropod, Potamopyrgus antipodarum: performance across a salinity gradient. Functional Ecology. 1997; 11(2): 260–267. Available: http://doi.org/10.1046/j.1365-2435.1997.00082.x 39. Costil K, Dussart GBJ and Daguzan J. Biodiversity of aquatic gastropods in the Mont St-Michel basin (France) in relation to salinity and drying of habitats. Biodiversity and Conservation. 2001; 10(1): 1–18. Available: http://doi.org/10.1023/A:1016670708413 40. Hoy M, Boese BL, Taylor L, Reusser D and Rodriguez R. Salinity adaptation of the invasive New Zealand mud snail (Potamopyrgus antipodarum) in the Columbia River estuary (Pacific Northwest, USA): Physiological and molecular studies. Aquatic Ecology. 2012; 46(2): 249–260. Available: http://doi.org/ 10.1007/s10452-012-9396-x 41. Moffitt CM and James CA. Response of New Zealand mudsnails Potamopyrgus antipodarum to freezing and near-freezing fluctuating water temperatures. Freshwater Science. 2012; 31(4): 1035–1041. Available: http://doi.org/10.1899/11-160.1 42. Alonso A and Castro-Dı́ez P. Tolerance to air exposure of the New Zealand mudsnail Potamopyrgus antipodarum (Hydrobiidae, Mollusca) as a prerequisite to survival in overland translocations. NeoBiota. 2012; 14: 67–74. 43. Thompson JD, Higgins DG and Gibson TJ. Clustal-W, Improving the sensitivity of progressive multiple sequence alignment trough sequence weighting, position-specific gap penalties and weight matrix choice. Nucleic Acids Research. 1994; 22: 4673–4680. PMID: 7984417 44. Rozas J, Sánchez-Del Barrio JC, Messeguer X and Rozas R. DnaSP, DNA polymorphism analyses by the coalescent and other methods. Bioinformatics. 2003; 19(18): 2496–2497. PMID: 14668244 45. Hall T. BioEdit: a user-friendly biological sequence alignment editor and analysis program for Windows 95/98/NT. 
Nucleic Acids Symposium Series. 1999. 46. Palumbi SR, Martin A, Romano S, McMillan WO, Stice L and Grabowski G. The simple fool’s guide to PCR. version 2. Version 2.0, Department of Zoology and Kewalo Marine Laboratory. 1991; pp. 28. 47. Geller J, Meyer C, Parker M and Hawk H. Redesign of PCR primers for mitochondrial cytochrome c oxidase subunit I for marine invertebrates and application in all-taxa biotic surveys. Molecular Ecology Resources. 2013; 13: 851–861. doi: 10.1111/1755-0998.12138 PMID: 23848937 48. Altschul SF, Gish W, Miller W, Myers EW and Lipman DJ. Basic Local Alignment Search Tool. Journal of Molecular Biology. 1990; 215: 403–410. PMID: 2231712 49. Tamura K, Dudley J, Nei M and Kumar S. MEGA4: Molecular Evolutionary Genetics Analysis (MEGA) software version 4.0. Molecular Biology and Evolution. 2007; 24: 1596–1599. (Available: http://www. kumarlab.net/publications). PMID: 17488738 PLOS ONE | DOI:10.1371/journal.pone.0162899 October 5, 2016 15 / 16 A Phylogenetically Informative Method to Trace Mud Snail Invasion in eDNA from Water Samples 50. Tamura K and Nei M. Estimation of the number of nucleotide substitutions in the control region of mitochondrial DNA in humans and chimpanzees. Molecular Biology and Evolution. 1993; 10: 512–526. PMID: 8336541 51. Teletchea F. Molecular identification methods of fish species: Reassessment and possible applications. Reviews in Fish Biology and Fisheries. 2009; 19(3): 265–293. Available: http://doi.org/10.1007/ s11160-009-9107-4 52. Darling JA, Blum MJ. DNA-based methods for monitoring invasive species: a review and prospectus. Biological Invasions. 2007; 9: 751–765. 53. Zaiko A, Martinez JL, Ardura A, Clusa L, Borrell YJ, Samuiloviene Aet al. Detecting nuisance species using NGST: methodology shortcomings and possible application in ballast water monitoring. Marine Environmental Research. 2015; 112(PtB): 64–72. 54. Lassen HH. Diversity of freshwater snails in view of equilibrium theory of island biogeography. 
Oecologia. 1975; 19: 1–8. 55. Milwright RDP. Post-breeding dispersal, breeding site fidelity and migration/wintering areas of migratory populations of song thrush Turdus philomelos in the Western Palearctic. Ringing and Migration. 2006; 23: 21–32. 56. Tellerı́a JL, Ramı́rez Á, Galarza A, Carbonell R, Pérez-Tris J and Santos T. Do migratory pathways affect the regional abundance of wintering birds? A test in northern Spain. Journal of Biogeography. 2009; 36: 220–229 57. Strayer DL. Alien species in fresh waters: ecological effects, interactions with other stressors, and prospects for the future. Freshwater Biology. 2010; 55(Suppl. 1): 152–174. 58. Loo SE, Mac Nally R, Lake PS. Forecasting New Zealand Mudsnail invasion range: model comparisons using native and invaded ranges. Ecological Applications. 2007; 17(1): 181–189. PMID: 17479844 PLOS ONE | DOI:10.1371/journal.pone.0162899 October 5, 2016 16 / 16"} +{"id":"jmedgenet-2011-100468","text":"Genotype-phenotype correlations ORIGINAL ARTICLE Comprehensive sequence analysis of nine Usher syndrome genes in the UK National Collaborative Usher Study Polona Le Quesne Stabej,1 Zubin Saihan,2,3 Nell Rangesh,4 Heather B Steele-Stallard,1 John Ambrose,5 Alison Coffey,5 Jenny Emmerson,5 Elene Haralambous,1 Yasmin Hughes,1 Karen P Steel,5 Linda M Luxon,4,6 Andrew R Webster,2,3 Maria Bitner-Glindzicz1,6 < Additional materials are published online only. To view these files please visit the journal online (http://jmg.bmj. com/content/49/1.toc). 
1 Clinical and Molecular Genetics, Institute of Child Health, UCL, London, UK 2 Institute of Ophthalmology, UCL, London, UK 3 Moorfields Eye Hospital, London, UK 4 Audiovestibular Medicine, Institute of Child Health, UCL, London, UK 5 Wellcome Trust Sanger Institute, Hinxton, Cambridge, UK 6 UCL Ear Institute, London, UK Correspondence to Dr Maria Bitner-Glindzicz, Clinical and Molecular Genetics Unit, Institute of Child Health, UCL, 30 Guilford Street, London WC1N 1EH, UK; mbitnerg@ich.ucl.ac.uk Received 31 August 2011 Revised 13 October 2011 Accepted 15 October 2011 Published Online First 1 December 2011 ABSTRACT Background Usher syndrome (USH) is an autosomal recessive disorder comprising retinitis pigmentosa, hearing loss and, in some cases, vestibular dysfunction. It is clinically and genetically heterogeneous with three distinctive clinical types (I–III) and nine Usher genes identified. This study is a comprehensive clinical and genetic analysis of 172 Usher patients and evaluates the contribution of digenic inheritance. Methods The genes MYO7A, USH1C, CDH23, PCDH15, USH1G, USH2A, GPR98, WHRN, CLRN1 and the candidate gene SLC4A7 were sequenced in 172 UK Usher patients, regardless of clinical type. Results No subject had definite mutations (nonsense, frameshift or consensus splice site mutations) in two different USH genes. Novel missense variants were classified UV1-4 (unclassified variant): UV4 is ‘probably pathogenic’, based on control frequency <0.23%, identification in trans to a pathogenic/probably pathogenic mutation and segregation with USH in only one family; and UV3 (‘likely pathogenic’) as above, but no information on phase. Overall 79% of identified pathogenic/UV4/UV3 variants were truncating and 21% were missense changes. MYO7A accounted for 53.2%, and USH1C for 14.9% of USH1 families (USH1C: c.496+1G>A being the most common USH1 mutation in the cohort). USH2A was responsible for 79.3% of USH2 families and GPR98 for only 6.6%. 
No mutations were found in USH1G, WHRN or SLC4A7. Conclusions One or two pathogenic/likely pathogenic variants were identified in 86% of cases. No convincing cases of digenic inheritance were found. It is concluded that digenic inheritance does not make a significant contribution to Usher syndrome; the observation of multiple variants in different genes is likely to reflect polymorphic variation, rather than digenic effects. INTRODUCTION Usher syndrome (USH) is an autosomal recessive disease characterised by the association of sensorineural hearing loss, retinitis pigmentosa (RP) and in some cases by vestibular dysfunction. The disorder is divided into three clinical types: type I (USH1) characterised by profound congenital hearing loss, absent vestibular function and onset of RP usually within the first decade of life; type II (USH2), J Med Genet 2012;49:27–36. doi:10.1136/jmedgenet-2011-100468 characterised by congenital, moderate to severe hearing loss, with normal vestibular function and onset of RP around or after puberty; and type III (USH3), defined by postlingual progressive hearing loss and variable vestibular response together with RP.1 2 In addition there remain patients whose disease does not fit into any of these three subtypes, because of atypical audiovestibular or retinal findings, who are said to have ‘atypical Usher syndrome’. Eleven loci and nine genes are associated with USH and cases of digenic inheritance have been described.3–16 For USH1, five associated genes have been cloned—MYO7A (USH1B), USH1C, CDH23 (USH1D), PCDH15 (USH1F), and USH1G (USH1G). Three associated genes—USH2A (USH2A), GPR98 (USH2C), and WHRN (USH2D)—have been found to be responsible for USH2 and mutations in the CLARIN/CLRN1 gene were found in cases of USH3. 
Proteins encoded by Usher genes belong to different classes and are organised in the common synaptic and periciliary areas of the photoreceptors, and in the stereocilia or hair bundle of the inner ear hair cells.1 17 Digenic cases of Usher have been reported with description of possibly pathogenic variants in two different USH genes, but the pathogenicity of these variants is often questionable, with at least one of the genes often containing a missense variant.18e20 Only one study describes an Usher type II case, with one truncating mutation in GPR98 and one truncating mutation in a novel USH modifier gene, PDZD7.21 The National Collaborative Usher Study (NCUS) was initiated in order to examine the molecular epidemiology of USH in the UK in anticipation of treatments for the condition, and to look specifically for the contribution of digenic inheritance in Usher syndrome. This report describes sequence analysis of nine USH genes MYO7A, USH1C, CDH23, PCDH15, USH1G, USH2A, GPR98, WHRN, CLRN1 and a candidate gene SLC4A7 in the ethnically heterogeneous UK population. The SLC4A7 gene encodes the stilbene-insensitive electroneural sodium bicarbonate co-transporter (NBC3) and loss of NBC3 in mice causes degeneration of sensory cells in the inner ear and eye.22 At the beginning of this study SLC4A7 was considered as a potential positional candidate gene for type 2 27 Genotype-phenotype correlations Usher syndrome in humans. All 384 exons of these 10 genes were sequenced by bidirectional capillary sequencing in one affected individual from each family, regardless of their clinical type, as well as in 48 CEPH (Centre d’Etude du Polymorphisme Humain) controls. The study contributes 774 sequence variants to the locus specific database (LSDB) for USH genes.23 Molecular diagnosis with one or two pathogenic or likely pathogenic variants was established in 86% of USH families; 295 variants residing in the exons or exon/intron boundaries were novel. 
We found no convincing cases of digenic inheritance, although the polymorphic nature of some of the genes means that many individuals had variants in more than one gene. PATIENTS AND METHODS Patient and control DNAs A total of 188 probands and 456 family members (parents and sibs) were collected and studied as part of the UK NCUS. The protocol of the study adhered to the provisions of the Declaration of Helsinki and had multicentre research ethics approval granted for recruitment through Moorfields Eye Hospital, Great Ormond Street Hospital (who both also approved the study), the support organisation Sense, or as self-referrals. Informed consent to the study was obtained from all participants. Patients were classified as Usher type I (USH1), II (USH2), III (USH3) or atypical based on ophthalmologic, audiometric and vestibular tests. Control DNA cohorts consisted of 381 unrelated UK blood donors (European Collection of Cell Cultures, ECACC), 48 CEPH control DNAs (Caucasian, Utah, USA), and 57 individuals of Pakistani origin (courtesy of Professor Eamonn R Maher, Birmingham, UK). Ophthalmic examination was performed in all affected individuals to confirm the presence of RP and included best corrected visual acuities, slit lamp biomicroscopy, colour vision testing with Hardy-Rand-Rittler colour plates, and Goldmann perimetry using the V4e, II4e and I4e targets. Retinal imaging with digital colour fundus photography, optical coherence tomography (6 mm scans centred on the fovea; Stratus OCT3; Carl Zeiss Meditec, Dublin, California, USA) and fundus autofluorescence (FAF) imaging (HRA, Heidelberg, Germany) was also performed. 
Pattern and full field electroretinograms (ERGs) were performed in some cases using international standards.24 25 Audiologic evaluation included pure tone audiometry, tympanometry, stapedial reflex measurement, transient evoked otoacoustic emission recordings, and auditory brain stem evoked response recording using standard protocol.26–31 Subjective pure tone air and bone conduction thresholds were determined at 0.25, 0.5, 1, 2, 4, and 8 kHz using a GSI 61 audiometer (Guymark, Cradley Heath, UK), TDH39 supra aural earphones (Sennheiser UK, Ltd, High Wycombe, UK), and the British Society of Audiology recommended procedure. Audiometric descriptors of mild, moderate, severe, and profound hearing loss were calculated according to the British Society of Audiology descriptors. Vestibular function was evaluated with infrared video nystagmography, a rotary chair system (Neurokinetics, Pittsburgh, Pennsylvania, USA), and vestibulo-ocular reflex responses.30 Binaural bithermal caloric testing with water was undertaken using the British Society of Audiology recommended protocol (http://www.thebsa.org.uk/docs/RecPro/CTP.pdf),29 and the departmental normative data for peak slow component velocity were used to determine normality. Canal paresis (>17%) and directional preponderance (>16%) were calculated according to Jongkees formulae,32 and vestibular hypofunction was defined by total eye velocity <78°/s. All parameters were defined by departmental normative data. Bilateral horizontal 28 semicircular canal function was assessed using sinusoidal (60° peak velocity and 0.05 Hz) and step rotation testing (acceleration, 0°–60°/s constant velocity in <1 s). A gain of either <0.23 in test or time constant of <8 s on impulsive rotation was considered vestibular hypofunction. DNA sequencing Genomic DNA of patients and family members was extracted from peripheral blood by standard methods. 
The 188 NCUS probands and 48 CEPH controls underwent bidirectional DNA sequencing of nine Usher genes (MYO7A, CDH23, PCDH15, USH1C, USH1G, USH2A, GPR98, WHRN, CLRN1) and the candidate gene SLC4A7. Exons and flanking sequences were amplified by PCR using 578 primer sets and sequenced at the Wellcome Trust Sanger Institute as part of the ExoSeq project. Primers were designed for all the exons of the transcripts whose NM numbers are given below under Accession Numbers. Primers covered the entire region of each of these transcripts and primer sequences are available on request. Direct sequencing was performed using the BigDye Terminator Cycle Sequencing on an ABI 3100 (Applied Biosystems, Foster City, California, USA). Sequences were analysed using GAP4,33 and SeqMan softwares (DNASTAR Inc, Madison, Wisconsin, USA). Further details of the ExoSeq protocols and instructions on data access are available from: http://www. sanger.ac.uk/resources/downloads/human/exoseq.html All sequence variants except SLC4A7 will be submitted to the LSDB for Usher Syndrome (USHbases).23 Analysis of USH variants in parents, siblings and control DNAs A total of 365 variants were genotyped by Sequenom using Sequenom iPLEX Gold assay on the MassARRAY Platform (Sequenom, San Diego, California, USA) in probands and available family members to ascertain parental origin, and in controls to assess minimum allele frequency (MAF); 320 pathogenic and putatively pathogenic variants were identified in probands during Sanger sequencing and 45 were highly polymorphic single nucleotide polymorphisms (SNPs) selected from the SNP database (supplementary table 1). Genotyped variants and family pedigrees were imported into Progeny Lab software (Progeny, LLC) where haplotypes were constructed from a combination of intragenic variants and polymorphic SNPs in individual families. 
Assessment of pathogenicity To facilitate clear description of variants with respect to their pathogenicity, we have used a grading system along the guidelines provided by the Clinical and Molecular Genetics Society, a constituent member of the British Society of Human Genetics. Variants were graded using two definite pathogenicity grades, that is, pathogenic and neutral. Variants which could not be confidently classified as either pathogenic or neutral were called ‘unclassified variants’ (UV1eUV4), with UV4 being probably pathogenic and UV1 being probably neutral. All frame shift mutations, nonsense mutations, and mutations of the first two nucleotides of canonical intron splice acceptor or donor sites have been classified as pathogenic. A missense or intronic change was described as pathogenic if it fulfilled all of the following criteria: it occurred in controls with a frequency <0.236%, was identified in trans to a pathogenic/ probably pathogenic mutation and it was either novel and segregated with USH in more than two families, or was previously published as pathogenic/likely pathogenic. The benchmark frequency of 0.236% was determined based on the MAF of the J Med Genet 2012;49:27e36. doi:10.1136/jmedgenet-2011-100468 Genotype-phenotype correlations most common USH mutation USH2A:p.Glu767SerfsX21 in 846 control chromosomes assayed in this study. If a novel variant fulfilled the above criteria, but segregated with USH in only one family, it was deemed to be probably pathogenic and was classified as ‘UV4’. Missense variants were classed as ‘UV3’ (likely pathogenic) if the frequency in control chromosomes was <0.236%, but phase of the variant could not be ascertained due to missing family data. Missense and silent changes of the last nucleotide of the exon that are likely to affect splicing were also described as ‘UV3’ if they were found in the same gene as another ‘pathogenic’ or ‘UV4’ variant. 
Our determination of a variant as pathogenic is therefore stringent. Variants with uncertain pathogenicity were described as ‘UV2’. UV2 variants fulfilled the criteria described for UV3, but were only genotyped in 96 CEPH control chromosomes. A missense variant was also classified as UV2 if it was the only possibly pathogenic variant in the gene. Furthermore, novel intronic variants residing three nucleotides from the start/end of the exon and not found in 846 control chromosomes were also classified as UV2. ‘UV1’ (probably neutral) variants were found in patients who already had two other pathogenic/probably pathogenic mutations or did not segregate with disease. The MAF of UV1 in control chromosomes was either <0.236% or was not assessed. We cannot exclude the possibility that such variants may modify disease phenotype. ‘Neutral’ variants did not segregate with disease, were either previously published as neutral, or were found in controls with a frequency >0.236%. A box diagram detailing the grading system is in supplemental data 1. Novel missense variants, classified as UV2–UV4 and pathogenic, were also assessed using Usher Syndrome Missense Analysis. Usher Syndrome Missense Analysis is a web-based tool dedicated to analysis of missense variants in Usher genes available through USHbases (https://194.167.35.160/cgi-bin/USMA/USMA.fcgi). The software compiles orthologue analysis, alignment of protein domains, secondary structure and 3D predictions.23 34 Output is given by number and percentage of orthologues conserved to those surveyed. Accession numbers Sequence variants were described following Human Genome Variation Society’s recommendations according to the following accession numbers: NM_000260.3 (MYO7A), NM_153676.2 (USH1C), NM_022124.5 (CDH23), NM_033056.3 (PCDH15), NM_173477.2 (USH1G), NM_206933.2 (USH2A), NM_032119.3 (GPR98), NM_015404.3 (WHRN), NM_174878.2 (CLRN1), and NM_003615.3 (SLC4A7). 
USH1C:c.496+1G>A assay PCR primers were designed to target the splice mutation, USH1C:c.496+1G>A, using the amplification created restriction site method (see supplemental data 2). These primers were designed to avoid the VNTR in intron 5.5 they may be ligated to each other to form a single complete probe. PCR primers complementary to each arm of the probe amplify the hybridised oligonucleotide target. Each probe is of a unique length and can be resolved by capillary sequencing and quantified relative to standards of known copy number by ratio. In silico splice site prediction Novel synonymous changes in direct proximity to splice sites and novel synonymous changes which were absent in controls were evaluated in silico (Human Splicing Finder, http://www.umd.be/HSF/ and Splice Site Prediction by Neural Network, http://www.fruitfly.org/seq_tools/splice.html; supplementary tables 2 and 3). RESULTS Clinical evaluation A total of 188 families were recruited for the NCUS study: 47 were diagnosed with USH1 on the basis of having profound congenital hearing loss on audiometry, absent vestibular function on formal testing and typical RP; 121 were diagnosed with USH2 on the basis of sloping moderate to severe congenital sensorineural hearing loss on audiometry and normal vestibular function on formal testing with typical RP; four were diagnosed with USH3 on the basis of progressive sensorineural hearing loss and typical RP. Four NCUS probands had typical RP, but their hearing loss was not typical for any type of Usher syndrome. A further seven had both RP and hearing loss which were considered atypical for Usher syndrome. Another five NCUS probands were thought not to have Usher syndrome, but were diagnosed with autosomal-recessive RP (one proband), Alström syndrome (one proband), sector RP and hearing loss (one proband35), and an unknown syndromic disorder (two probands). All underwent sequencing. 
Analysis of variants We recorded a total of 774 sequence variants in exons and exon/intron boundaries of the 188 NCUS probands; 319 intronic variants resided more than 20 nucleotides from a splice donor or acceptor site and were not analysed in detail. The remaining 455 variants (295 novel and 160 previously published) were classified as ‘pathogenic’ (115 variants), ‘UV4’ (15 variants), ‘UV3’ (11 variants), ‘UV2’ (18 variants), ‘UV1’ (201 variants) and ‘neutral’ (95 variants). Novel variants were classified as UV4, based on a frequency <0.236% in controls, identification in trans to a pathogenic/probably pathogenic mutation and segregation with USH in only one family (ie, probably pathogenic), and UV3 (likely pathogenic) as above but phase of the variant could not be ascertained due to missing family data. Pathogenic and UV2–UV4 variants per patient are described in tables 1–3. Details of 295 novel variants are described in supplementary table 2 (UV2–UV4, pathogenic) and supplementary table 3 (UV1, neutral). Molecular diagnosis in Usher type 1 families (N=47) Multiplex ligation dependent probe amplification Multiplex ligation dependent probe amplification (MLPA) designed by MRC Holland was used to confirm suspected large genomic deletion in PCDH15. The SALSA MLPA 292-A1 kit was used according to manufacturer’s instructions. MLPA is a multiplex PCR based method of DNA copy number quantification. (http://www.mlpa.com/). Two oligonucleotides complementary to a target sequence hybridise next to each other on the target, separated by a single base. When both hybridise J Med Genet 2012;49:27–36. doi:10.1136/jmedgenet-2011-100468 Diagnosis with two pathogenic/UV4/UV3 variants was established in 37/47 (78.7%) USH1 families, and a single pathogenic/UV4 variant was found in four families (8.5%). In another three families (6.4%) we suspect involvement of USH1 genes based on haplotype analysis (family 168 shown in supplemental figure 1, and families 104, 206). 
Molecular diagnosis was unclear in a further three families (6.4%) with either UV2 (uncertain pathogenicity) variants (family 705) or no mutations identified (families 129, 340). Genotypes are detailed in table 1. 29 Genotype-phenotype correlations Table 1 Genotypes of Usher syndrome type 1 probands (novel variants are in bold) Family Gene Allele 1* Allele 2* Allele 1score Allele 2 score 107 140z 163 407 555 168z 30 146 444 68 100 111 132 93 262 287 435 578 731 732 516 69 35 79 257 706z 182 500 692 676 42 104z 206z 705 291z 313 399z 119 87 461 4 families 530z 129, 340 CDH23 CDH23 CDH23 CDH23 CDH23 CDH23{ MYO7A MYO7A MYO7A MYO7A MYO7A MYO7A MYO7A MYO7A MYO7A MYO7A MYO7A MYO7A MYO7A MYO7A MYO7A MYO7A MYO7A MYO7A MYO7A MYO7A MYO7A MYO7A MYO7A MYO7A MYO7A MYO7A{{ MYO7A{{ MYO7A PCDH15 PCDH15 PCDH15 PCDH15 USHIC USHIC USHIC USHIC Unknown c.6712+1G>A p.Arg2107X p.Arg2107X c.2177-2A>G c.7362G>Ax Unknown{ p.Lys1255ArgfsX8 p.Lys1255ArgfsX8 p.Lys1255ArgfsX8 p.Gly214Arg c.3108+1G>A p.Arg669X p.Arg972X c.3504-1G>C** p.Asp1613ValfsX32 p.Gly1942X p.Arg1240Gln p.Arg669X p.Ala2009ProfsX32 p.Glu117SerfsX33 p.Gly1378TrpfsX6 p.Arg241Pro p.Arg2024X p.Arg1240Gln p.Cys31X p.Phe1963del p.Pro2126LeufsX5 p.Trp1431X p.Arg212His p.Arg241Pro p.Arg1701X Unknown{{ p.Tyr2015His c.1793-3C>G p.Gly942ValfsX22 c.3717+1G>A c.3501+2T>C Exon 10 deletedyyy p.Arg80ProfsX69 p.Arg80ProfsX69 c.496+1G>A p.Glu149del Unknown p.Arg2107X p.Leu2436ThrfsX3 P.Leu3041Pro c.6254_6254-3delCAGGinsT Unknown{ p.Ala26Glu** p.Asp521GlufsX8 p.Lys542GlnfsX5 p.Arg212His c.3108+1G>A c.5944G>Ayy p.Arg972X p.Leu1858Pro p.Lys420X p.Gly25Arg c.133-2A>G p.Lys542GlnfsX5 c.-48A>Gzz c.592+1G>T p.Glu968Asp c.5944G>Ayy p.Asp75His p.Leu2193Phe p.Arg1883Gln p.Phe1963del p.Arg1240Trp p.Ala826Thrxx c.1798-3C>G p.Glu380Lys Unknown Unknown{{ p.Tyr2015His Unknown p.Gly942ValfsX22 Exon 9-18 deleted*** c.3501+2T>C Exon 10 deletedyyy c.496+1G>A p.Arg80ProfsX69 c.496+1G>A p.Glu149del Pathogenic Pathogenic Pathogenic Pathogenic UV3 Pathogenic 
Pathogenic UV4 Pathogenic Pathogenic Pathogenic Pathogenic Pathogenic Pathogenic Pathogenic Pathogenic Pathogenic Pathogenic Pathogenic Pathogenic Pathogenic Pathogenic Pathogenic Pathogenic UV4 Pathogenic Pathogenic Pathogenic UV4 Pathogenic Pathogenic Pathogenic UV4 Pathogenic Pathogenic Pathogenic Pathogenic Pathogenic Pathogenic Pathogenic Pathogenic Pathogenic Pathogenic Pathogenic Pathogenic Pathogenic Pathogenic Pathogenic UV4 Pathogenic UV4 UV4 UV4 UV4 UV3 UV4 UV2 UV2 UV2 UV2 Pathogenic Pathogenic Pathogenic Pathogenic Pathogenic Pathogenic Pathogenic Pathogenic UV2 Pathogenic Pathogenic Pathogenic Pathogenic Pathogenic Pathogenic Pathogenic Pathogenic Ethnicityy Caucasian Pakistani (Sindi) Caucasian Caucasian Caucasian Turkish Cypriot Caucasian Caucasian Caucasian Caucasian Indian Caucasian Iranian Caucasian Caucasian Caucasian Caucasian Caucasian Caucasian Caucasian Caucasian Caucasian Caucasian Caucasian Asian Black African (Ghana) Caucasian Caucasian Caucasian Caucasian Turkish Cypriot Caucasian Caucasian Greek Cypriot South Pacific Caucasian Caucasian Caucasian Caucasian Jewish Caucasian Indian Caucasian *Unless stated otherwise, the alleles were not observed in control chromosomes. yCaucasian: UK and European. zConsanguineous family. xLast nucleotide of exon (possibly affects splicing). {See supplemental figure 1. **Found in 1/872 (0.11%) control chromosomes. yyLast nucleotide of exon; causes MYO7A:p.Val1953GlufsX12.23 zzIVS1-2A>G. xxFound in 2/826 (0.23%) control chromosomes (both heterozygotes are Pakistani controls). {{Consanguineous family demonstrating linkage to Usher type1 genes. The causative mutations were either not found or were of uncertain pathogenicity (UV2). ***See supplemental figure 2. Deletion of PCDH15 exons 9e18 was confirmed by MLPA. yyyA homozygous deletion of PCDH15 exon 10 was confirmed by MLPA. The family is not knowingly consanguineous. MLPA, multiplex ligation dependent probe amplification; UV unclassified variant. 
Mutations in MYO7A were the most frequent cause of USH1 in our cohort, representing the molecular cause of USH in 25 (53.2%) out of 47 USH1 families (supplemental data 3). In 22 out of 47 families (46.8%) we identified two MYO7A pathogenic/ UV4/UV3 variants and in three families (6.4%) only one such variant. Twenty-eight out of 47 (59.6%) mutated alleles were predicted to code for prematurely truncated proteins, 17 (36.2%) were missense mutation, and two were in-frame deletions. 30 Overall, two pathogenic variants in USH1C were identified in seven out of 47 (14.9%) USH1 families, making this the second most common USH1 gene. Remarkably, USH1C:c.496+1G>A36 was the most frequent USH1 mutation in our cohort, accounting for 9.6% (9/94) of all expected USH1 mutations. Analysis of three polymorphic USH1C SNPs showed that the c.496+1G>A is associated with a common haplotypedthat is, ‘A-C-G’ (‘rs2072227-rs2240488-rs2883581’). It was found in J Med Genet 2012;49:27e36. doi:10.1136/jmedgenet-2011-100468 Genotype-phenotype correlations Table 2 Genotypes of Usher type 2 and of Usher type 3 probands (novel variants in bold) Family USH type Gene Allele 1* Allele 2* Allele 1 score Allele 2 score Ethnicityy 3 21 26, 46 29 32 38 45 53x 57 61 64 82 113 147 151 155 165 171 179 187 192 193 194 200 203x 205x 212, 702 215 219, 672 220 221 225 239 247 296 314x 321 332 334, 386 345 347 355 359 367, 17 369 374 377 385 387 389 394 398 401 408x 417 418 427 440 455 490 509 545 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A p.Glu767SerfsX21 
p.Glu767SerfsX21 p.Glu2288X p.Arg4192Hisz p.Glu767SerfsX21 p.Glu767SerfsX21 p.Glu767SerfsX21 p.Cys1452LeufsX25 p.Glu767SerfsX21 p.Pro560LeufsX31 p.Trp1607X{ c.11047+1G>A p.Cys982LeufsX2 p.Gln3959AsnfsX53 p.Gln1063SerfsX15 p.Glu767SerfsX21 p.Glu767SerfsX21 p.Glu767SerfsX21 p.Arg1504LysfsX26 p.Glu767SerfsX21 p.Pro746Ala p.His308SerfsX16 p.Thr4439Ile p.Glu767SerfsX21 p.Cys870X Unknown p.Glu767SerfsX21 p.Glu767SerfsX21 p.Glu767SerfsX21 p.Arg3689X Exon 47 deletedzz p.Asn346His p.Thr4809Ile p.Glu767SerfsX21 p.Glu767SerfsX21 c.1841-2A>G p.Thr4439Ile p.Glu767SerfsX21 p.Glu4458AspfsX3 p.Glu1492X p.Glu2288X p.Glu767SerfsX21 p.Cys5153X p.Glu767SerfsX21 p.Arg1504LysfsX26 p.Pro560LeufsX31 p.Arg1777Trp p.Glu3305ArgfsX41 p.Glu767SerfsX21 p.Glu767SerfsX21 p.Glu767SerfsX21 p.Gln675X p.Glu767SerfsX21 p.Cys419Phe p.Glu767SerfsX21 p.Arg63X p.Cys1452LeufsX25 p.Cys620Phe p.Ser4377X p.Arg1281X p.Glu767SerfsX21 p.Glu767SerfsX21 p.Arg4971X p.Arg626X Unknown p.Arg4192Hisz p.His308SerfsX16 p.Arg34X p.Ser1173X p.Cys1452LeufsX25 p.Cys536Arg p.Glu2265_Tyr2266del insAsp p.Cys620Phe p.Cys3575Tyr p.Trp3955X** p.Asn4762Ser Unknown p.Arg1946X p.Trp2945X p.Lys4816X p.Trp2744X p.Gln1063SerfsX15 c.7595-3C>Gyy c.9371+1G>C{ p.Cys3267Arg c.11390-1G>C p.Cys870X Unknown p.Cys419Phe p.Ala4153Thr{ p.Ala1872LeufsX58 p.Trp3521Arg Exon 47 deletedzz Unknown Unknown p.Gln3959AsnfsX53 p.Gln675X c.1841-2A>G p.Asn346His p.Cys620Phe c.7595-3C>Gyy c.11047+1G>A p.Gly268Arg p.Glu4458AspfsX3 p.Trp3521Arg p.Gln4541X p.Glu767SerfsX21 p.His340Leu p.Asn2285Ser p.Asn346His p.Ile2754AsnfsX15 p.Tyr4801X p.Glu284AspfsX38 p.Gln4541X p.Glu2288X p.Cys419Phe p.Thr4439Ile p.Arg1549X Unknown Unknown p.Cys419Phe p.Met1280Ile p.Trp4713X p.Leu1378Pro P P P UV2 P P P P P P P P P P P P P P P P UV4 P P P P P P Caucasian Caucasian Caucasian Caucasian (Italy) Caucasian Caucasian Caucasian Indian Caucasian Caucasian Caucasian Caucasian Arab Caucasian Caucasian Caucasian Caucasian Caucasian Caucasian Caucasian Caucasian Caucasian Caucasian 
Caucasian Turkish Cypriot Indian Caucasian Caucasian Caucasian Caucasian Greek Caucasian Caucasian Caucasian Caucasian Caucasian Caucasian Caucasian Caucasian Caucasian Unknown Caucasian Caucasian Caucasian Caucasian Caucasian Indian Caucasian Caucasian Caucasian Caucasian Caucasian Caucasian Caucasian Caucasian Caucasian Afro-Caribbean Caucasian Caucasian Caucasian Caucasian Caucasian P P P P P P P P P P P P P P P P P P P P UV3 P P P P P P P P P P P P P P P UV2 P P P P P P P UV4 P UV3 P P P P P P P UV4 P P P P P P P P P P P P P P UV3 P P P P UV3 UV2 P P P P P P P P P P UV4 P UV4 Continued J Med Genet 2012;49:27e36. doi:10.1136/jmedgenet-2011-100468 31 Genotype-phenotype correlations Table 2 Continued Family USH type Gene Allele 1* Allele 2* Allele 1 score 546 549 558x 568 591 595 601 611 620 644 648 651 657 670 680 683x 5 fams 465 432 531 669 9 fams 136x 170 271 275 300x 357 481 665 697 222 110 49 82 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 USH2A USH2A USH2Az USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A USH2A GPR98 GPR98 GPR98 GPR98 GPR98 GPR98 GPR98 GPR98 GPR98 GPR98*** MYO7A CLRN1 CLRN1 p.Cys3281Phe p.Gly4403ProfsX15 p.Thr281Lys p.Gly4403ProfsX15 p.Asn346His p.Glu767SerfsX21 p.Glu767SerfsX21 p.Glu767SerfsX21 p.Glu767SerfsX21 p.Asn1967TrpfsX5 p.Arg1504LysfsX26 p.Arg1946LeufsX22 p.Arg63X p.Gly1751Val p.Asn346His Exons 50e55 deletedzz p.Glu767SerfsX21 p.Glu767SerfsX21 p.Glu767SerfsX21 p.Glu767SerfsX21 p.Glu767SerfsX21 p.Glu767SerfsX21 Exon 83 deleted{{ p.Arg2286X p.Arg4802X p.Asp1375His p.Glu2103X p.Arg800X p.Ser5048ArgfsX29 p.Ala3579ValfsX6 c.9623+1G>A Unknown p.Gly1942X p.Ser50LeufsX12 p.Asn48Lys Unknown p.Ser1588HisfsX5 p.Thr281Lys Unknown p.Trp3521Arg p.Trp3521Arg p.Ser1136Asn c.651+1G>A p.Cys999LeufsX9 p.Arg1578Cys p.Cys419Phe Unknown Unknown p.Gly2017Cysxx p.Cys419Phe Exons 50e55 deletedzz p.Glu767SerfsX21 c.10585G>Axx c.12295-3T>A p.Gly257Arg p.Phe1868Cys Unknown Exon 
83 deleted{{ p.Ser3339Asn p.Ile3325Thr Unknown p.Gln2301X c.13433G>Txx p.Val2321AlafsX4 p.Val3363AspfsX11 Unknown Unknown Unknown p.Ser50LeufsX12 p.Asn48Lys UV2 P UV2 P P P P P P P P P P UV2 P P P P P P P P P P P UV2 P P P P P P P P Allele 2 score P UV2 P P UV4 P P UV4 P UV2 P P P UV3 UV2 UV2 UV2 P UV3 UV2 P UV3 P P P P Ethnicityy Caucasian Caucasian/Philippino Turkish Cypriot Caucasian Caucasian Caucasian Caucasian Caucasian Caucasian Caucasian Caucasian Caucasian Caucasian Indian Caucasian Kashmiri Caucasian Caucasian Indian Unknown Unknown Caucasian Arab Palestinian Caucasian Caucasian Caucasian Caucasian Caucasian Caucasian Caucasian Caucasian Indian Caucasian Caucasian Ashkenazi Jewish For family 29, USH2A haplotypes are not homozygous. It is possible they are p.Arg4192His hemizygous and have a deletion on the other allele. *Unless stated otherwise, the alleles were not observed in control chromosomes. yCaucasian: UK and European. zParental origin could not be determined. Patient and affected sib are homozygous for the mutation. xConsanguineous family. {Found in 1/872 (0.11%) control chromosomes. **Found in 2/860 (0.23%) control chromosomes. yySplice mutation; causes USH2A:p.Pro2533Asnfs*5.23 zzLarge deletion speculated based on patient’s homozygosity of USH2A haplotypes and failure to amplify exon. xxLast nucleotide of the exon. {{Deletion strongly suspected based on homozygosity for GPR98 markers (USH2A excluded based on haplotype analysis), and apparent non-inheritance of GPR98 SNPs in the family and PCR non-amplification of patient’s as well as affected sib’s DNAs (supplemental figure 3). ***Not reported as a consanguineous family. Usher is compatible with mutation in GPR98 (affected sibs are homozygous for a GPR98 haplotype); USH2A is excluded by haplotype analysis. P, Pathogenic; UV unclassified variant. a homozygous state in four families and in a compound heterozygous state with p.Arg80ProfsX69 in one family. 
It did not occur in 234 ethnically matching control chromosomes. Mutations in CDH23 were determined as the cause of USH1 in five (10.6%) families. In family 407 we identified a likely pathogenic (UV4) novel missense variant CDH23:p.Leu3041Pro in trans with a pathogenic splice mutation CDH23:c.2177-2A>G together with a previously published pathogenic mutation, MYO7A:p.Arg302His.37 38 Mutations of PCDH15 occurred in only four (8.5%) families; two pathogenic variants were detected in all four cases and involved deletions causing frame shifts and splice mutations, all predicted to result in a truncated protein. In two out of these four families (119 and 313) haplotype analysis indicated the presence of large genomic deletions which were confirmed by MLPA (table 1 and supplemental figure 2). 32 We did not find any pathogenic or potentially pathogenic variants in USH1G in the entire cohort. Molecular diagnosis in Usher type 2 families (N=121) Mutations in USH2A were the molecular cause of USH2 in 96 out of 121 (79.3%) of USH2 families; two pathogenic/UV4/UV3 variants were found in 73/121 (60.3%) of all USH2 families and only one pathogenic/UV4/UV3 variant was identified in 23/121 (19%) of USH2 families, in spite of full sequence analysis. In a further four families, mutations in USH2A were suspected as the cause of USH as we identified variants of uncertain pathogenicity (UV2) and one consanguineous family (family 205) was linked to USH2A, but we could not find any likely pathogenic variants (table 2). The USH2A:p.Glu767SerfsX21 mutation was the most common mutation in the entire USH cohort, accounting for J Med Genet 2012;49:27–36. 
doi:10.1136/jmedgenet-2011-100468 Genotype-phenotype correlations Table 3 Genotypes of atypical and non-Usher patients (novel variants are in bold) Family Diagnosis Gene 9 families 520 633 542 505z 448 569, 502 Atypical Usher Atypical Usher Atypical Usher ARRP Sector RP and hearing loss Alström syndrome Unknown, not Usher Unknown MYO7A USHIC USH2A USHIC Unknown Unknown Allele 1* Allele 2* Pathogenicity Allele_1 p.Leu326Gln p.Arg339Gln p.Cys759Phe p.Arg103His Unknown Unknown p.Cys3358Tyr c.2227-1G>T UV2 UV2 Pathogenic Pathogenic Pathogenicity Allele_2 Ethnicityy UV3 Pathogenic Indian Caucasian Caucasian Caucasian *Unless stated otherwise, the alleles were not observed in control chromosomes. yCaucasian: UK and European. zSaihan et al.35 ARRP, autosomal recessive retinitis pigmentosa; RP, retinitis pigmentosa; UV, unclassified variant. 31% of all USH2 alleles and 33.7% of identified pathogenic/UV4/ UV3 USH2A alleles. Five families were homozygous for p.Glu767SerfsX21, 35 families were compound heterozygotes, and in 12 families p.Glu767SerfsX21 was the only pathogenic variant identified. We also observed other mutations occurring multiple times; segregating in four families were previously published p.Glu2288X and p.Trp3521Arg. The latter was not found in control chromosomes (0/836) and always appeared in trans with a pathogenic variant. Novel changes segregating in three families were p.Cys620Phe and p.Gln4541X. The remaining mutations were private, appearing in one or two families only. GPR98 mutations accounted for eight (6.6%) of USH2 families, with two pathogenic/UV3 variants identified in six families and one variant in two families. In an additional ninth family (consanguineous family 222), we suspect mutations in the GPR98 as the cause of USH as the two affected sibs share a homozygous haplotype consisting of three informative GPR98 SNPs (rs1700510, rs10942605, rs2438351); USH2A was excluded based on haplotype analysis. 
Twelve out of 14 GPR98 mutations were novel: three deletions causing frame shift mutations, four nonsense mutations, two splice mutations, one missense, and a homozygous large deletion (table 2). In consanguineous family 136 we suspect a large deletion of exon 83 (supplemental figure 3) as the two affected sibs in this family shared a homozygous GPR98 haplotype; parents were heterozygous for the same haplotype. Despite affected sibs showing homozygosity for the GPR98 haplotype throughout the gene, the mother was ‘genotype C’ for rs3098356 residing in intron 83, while the father was ‘genotype A’; the assay failed in both affected sibs. PCR primers designed to amplify GPR98 exon 83 in the affected sibs also failed, supporting a homozygous deletion of exon 83, for which parents are likely to be hemizygous. Intron 83 is 101 kb in size and mapping of breakpoints is in progress. Interestingly, one patient (110) with an USH2 phenotype and no mutations in USH2 genes had a single MYO7A:p.Gly1942X nonsense mutation. In this family, we could not confirm association of the disease with any USH gene by SNP analysis as the proband has no sibs. The proband has good speech and bilateral severe hearing loss with an audiometric configuration more consistent with an USH2 phenotype. There was also no history of delayed motor milestones and vestibular testing demonstrated normal vestibular function.39 In nine out of 121 (7.4%) USH2 families, no possibly pathogenic variants were identified. gous mutation p.Asn48Lys,15 the prevalent mutation in Ashkenazi Jews, and family 49 segregated CLRN1:p.Ser50LeufsX12 in the homozygous state.40 Neither family was known to be consanguineous. While the proband from family 49 had moderate hearing loss and normal vestibular function, the proband from family 83 had vestibular hypofunction with moderate to severe hearing loss. 
In family 482, clinically classified as Usher type 3 because of progressive hearing loss, we found four missense variants of uncertain pathogenicity (two in WHRN and two in CDH23), none of which were found in 96 control chromosomes. The variants in WHRN were in trans (p.Glu137Gln and p.Gln252Arg), and so were the two variants in CDH23 (p.Glu1113Gln and p.Gly2908Arg). The predicted p.Gly2908Arg change (c.8722G>A) occurs at the last nucleotide of CDH23 exon 60 and might act by altering splicing (supplementary table 3). Based on haplotype segregation analysis, neither CDH23 nor WHRN could be excluded as the causative gene. Phenotypically, patient 482 had mild RP, profound hearing loss and vestibular function within normal limits. One proband (429) classified clinically as having Usher syndrome type 3 had no mutations in any USH genes or the candidate gene SLC4A7. Non-Usher cases and NCUS probands with atypical RP and hearing loss Four NCUS probands had typical RP, but their hearing loss was not typical for any type of Usher syndrome. None of these had any mutations in the 10 sequenced genes. A further seven had both RP and hearing loss which were considered atypical for Usher syndrome; one had a single MYO7A:p.Leu326Gln missense variant which was previously published as a likely pathogenic variant (UV3) in a Pakistani family30; a second had a novel USH1C:p.Arg339Gln UV2 variant (supplementary table 2) and five had no mutations. Of the remaining five NCUS probands who were not thought to have Usher syndrome, the patient with autosomal-recessive RP had two known USH2A mutations—that is, p.Cys759Phe and p.Cys3358Tyr; and the one with sector RP and hearing loss had two USH1C mutations. 
In this family the proband had moderate hearing loss and borderline bilateral vestibular hypofunction, whereas a younger affected sibling had severe hearing loss and normal vestibular function.35 We found no mutations in the patient diagnosed with Alström syndrome or the two patients with an unknown syndromic disorder. Molecular diagnosis in Usher type 3 families (N[4) Very few subjects in our cohort were found to have mutations in CLRN1. Molecular diagnosis was clear in two families (table 2). Family 83 segregated a previously published CLRN1 homozyJ Med Genet 2012;49:27e36. doi:10.1136/jmedgenet-2011-100468 DISCUSSION Molecular diagnosis in Usher syndrome is hindered by significant genetic heterogeneity, the large size of some of the Usher 33 Genotype-phenotype correlations genes, and the number of missense changes in genes such as MYO7A and USH2A. To this is added the further potential complexity of digenic inheritance which has been proposed in some cases of Usher syndrome and described in other retinal diseases.18 20 21 Although a major undertaking in terms of time and expense, we decided at the beginning of the study to sequence all the known Usher genes in all subjects, regardless of clinical subtype, in order to assess evidence for, and contribution of, digenic inheritance and the extent of polymorphic sequence variation within the genes. With digenic inheritance in mind, demonstrating that novel missense changes are truly pathogenic, rather than neutral variants is often difficult in the absence of functional studies; this is particularly so in the case of polymorphic genes in an ethnically diverse population. We applied a stringent assignment of pathogenicity to novel missense changes. A novel missense change was described as pathogenic only if it occurred in controls with a frequency <0.236%, was identified in trans to a pathogenic/probably pathogenic mutation, and it segregated with USH in more than two families. 
If the variant did not fulfil all of the mentioned criteria, it was classified as UV4/UV3 (supplemental data 1). Although a number of molecular studies of Usher cohorts have been published to date, only one smaller study has been designed in a way that would systematically detect digenic inheritance and whether or not this is a significant or recurring phenomenon.18 Bonnet et al described 10 (out of 54) USH patients with presumably pathogenic mutations in two different USH genes. Seven of them had biallelic mutations in one gene, and carried an additional mutation in a second and, for one of them, also a third USH gene. However, none of these had definite pathogenic mutations (ie, nonsense, frame shifting or splice) in two different genes. In all cases, one of the heterozygous mutations was a missense change which could have been a rare benign variant or possibly a disease modifier. For example, CDH23:p.R1060W, reported as presumably pathogenic in a digenic USH case,18 has previously been published as likely non-pathogenic.20 41 42 A possible case of digenic inheritance is reported in one (out of 75) USH patients who segregated CDH23:p.T1209A and PCDH15:p.T1867del variants20; however, the p.T1209A variant was also found in 48/904 (MAF=5.3%) alleles in the 1000 Genomes Project which suggests that it is unlikely to be pathogenic (http://browser.1000genomes.org/index.html). The polymorphic variation present in Usher genes means that multiple variants are likely to be found if multiple genes are sequenced. In our study, many patients had a number of variants across multiple genes, and there are several interesting examples of two pathogenic variants in one gene and a missense variant, previously reported as a missense mutation, in a different gene. For example, CDH23:p.Arg3175His, previously published as disease-causing,43 was identified in an USH2 family (219) together with two USH2A truncating mutations. 
Another variant, more recently published as a pathogenic missense change, CDH23:p.Ala366Thr,42 44–46 occurred in 1/96 CEPH chromosomes and was found in an Usher syndrome type 1 patient 146 who has two pathogenic MYO7A mutations (p.Asp521GlufsX8 and p.Lys1255ArgfsX8). Also CDH23:p.His755Tyr was regarded as pathogenic,18 44 but we identified it in a consanguineous USH2 family (203) segregating a homozygous USH2A nonsense mutation. So although the findings of others are similar to ours, their interpretation is different. We found no convincing evidence for digenic inheritance in this study; no subject had two definitely pathogenic alleles (nonsense, frameshifts or splice mutations) in different genes, 34 which given the overall spectrum of mutations in Usher syndrome (79% of identified pathogenic/UV4/UV3 variants were truncating mutations and 21% were missense changes) one might expect to find in genuine digenic inheritance. If digenic inheritance exists, it must be an occurrence too rare to be taken into account in genetic counselling. The only example of an USH2 patient described by Ebermann et al, who carried a single truncating mutation in GPR98 and a truncating mutation in PDZD721 explained as ‘digenic inheritance’, could also be accounted for by an unidentified second mutation in GPR98 in combination with a modifier allele in PDZD7. Since our study was completed before mutations in PDZD7 were published as a cause of USH, this gene was not sequenced in our cohort. We detected at least one pathogenic/likely pathogenic mutant allele in 86% of all Usher probands studied, indicating that there is no other Usher gene of major impact in the population. However, in the USH1 cohort, only a single pathogenic/UV4/UV3 variant was identified in 4/47 (8.5%) of families and in the USH2 cohort we observed a comparatively much higher number of missing alleles with only one pathogenic/UV4/UV3 variant identified in 26/121 (21.5%) of USH2 families. 
Undetected large genomic rearrangements, undetected pathogenic variants in the promoter and intronic regions, misdiagnosed USH syndrome, and human as well as computer software errors during sequence analysis are likely to underlie these ‘missing alleles’. Certainly gross deletions and duplications have been well documented in genes such as PCDH15 where they account for 37% of PCDH15 mutations,34 and 13% of USH1 cases.36 Large genomic deletions and duplications have also been reported in MYO7A, CDH23, GPR98, and USH2A.34 47 48 To analyse such rearrangements reliably, other methods such as MLPA and oligonucleotide array based comparative genomic hybridisation could be used in future.49 Our future research will focus on detection of large genomic rearrangements and mutations causing splicing aberrations at the mRNA level and will aim to clarify further the molecular diagnosis in the NCUS cohort. Although probands with a clinical classification of Usher syndrome type 1 were screened for all USH genes, the causative mutations were only found in USH1 genes. In probands clinically classified as USH2, only 1/121 patients had a nonsense mutation in MYO7A, an USH1 gene. In another family who entered the study with a diagnosis thought unlikely to be Usher syndrome, we identified two USH1C mutations and affected sibs were subsequently diagnosed as having sector RP and hearing loss.35 Therefore, regarding cases with atypical presentation, the mutation detection rate is low, but even these cases can harbour mutations in the known genes and produce unexpected phenotypes. 
Thus clinical classification, particularly that of type 1 Usher, is generally very robust, so screening all genes is unnecessary for molecular diagnosis in most cases and segregation analysis using haplotypes will be valuable for selecting candidate genes.34 50 Because 52.5% of pathogenic and likely pathogenic variants were novel, the use of microarray chips for molecular diagnosis in a disorder with a large number of private mutations such as USH is limited. It can, however, serve as a useful initial screen, although hybridisation techniques are being superseded by massively parallel sequencing, with the ability to generate large datasets. The existence of LSDBs for nine Usher genes (USHbases) combining international datasets is a valuable tool for molecular genetic studies of USH. The database enables integration of published and unpublished data, is regularly updated, and currently encompasses >4500 entries with 900 J Med Genet 2012;49:27e36. doi:10.1136/jmedgenet-2011-100468 Genotype-phenotype correlations 5. Electronic database information < USHbasesdLSDB < < < < < < for Usher syndrome genes: https:// grenada.lumc.nl/LOVD2/Usher_montpellier/USHbases.html Clinical and Molecular Genetics Society: http://www.cmgs. org/BPGs/pdfs%20current%20bpgs/UV%20GUIDELINES% 20ratified.pdf Usher Syndrome Missense Analysis: https://194.167.35.160/ cgi-bin/USMA/USMA.fcgi 1000 Genomes Browser: http://browser.1000genomes.org/ index.html British Society of Audiology Recommended Procedure for the Caloric Test: http://www.thebsa.org.uk/docs/RecPro/CTP.pdf. MRC Holland for MLPA: (http://www.mlpa.com/) Splice Site Prediction: Human Splicing Finder, http://www. umd.be/HSF/ and Splice Site Prediction by Neural Network, http://www.fruitfly.org/seq_tools/splice.html unique pathogenic, neutral and unclassified variants.23 We have 295 novel variants to submit to USHbases (137 are missense changes). 
Integration of large datasets such as this with data from all groups studying Usher syndrome, combined with haplotype and segregation analysis in families, and functional analysis of variants, will enable more reliable detection of truly pathogenic USH variants as well as the discovery of likely modifier genes. Acknowledgements We would like to thank Professor Alan Bird, Professor Tony Moore, numerous clinical colleagues throughout the UK, Mary Guest (Sense) and Liz Cook (Sense) for their help with patient recruitment, Morag Lewis and Anne Glazier for help with sequence analysis, the Sanger Institute’s Exoseq and Sequencing teams for large scale sequencing, and David Baux for LSDB data submission. We particularly thank the families and individuals with Usher syndrome for their time and support. Funding This work was supported by the Big Lottery (through Sense UK), Deafness Research UK, the RP Fighting Blindness charity (Retinitis Pigmentosa Society of the UK), Moorfields Special Trustees, Foundation Fighting Blindness, Jeans for Genes, NIHR Biomedical Research Centre (BMRC), Comprehensive Biomedical Research Centre (CBRC) and the Wellcome Trust. MB-G is supported by Great Ormond Street Hospital Children’s Charity. Competing interests None. 6. 7. 8. 9. 10. 11. 12. 13. 14. 15. 16. 17. Patient consent Obtained. Ethics approval Ethics approval was provided by NRES Committee London South East (MREC). 18. Contributors All authors contributed to the conception and design of the study, acquisition of data, or analysis and interpretation of data, and all were involved in critical review of the manuscript for intellectual content and reviewed the final version. Provenance and peer review Not commissioned; externally peer reviewed. Data sharing statement All data will be provided to Locus Specific Databases (LSDBs) Usher genes (USHbases) which is publicly accessible. 19. 20. REFERENCES 1. 2. 3. 4. Saihan Z, Webster AR, Luxon L, Bitner-Glindzicz M. Update on Usher syndrome. 
Curr Opin Neurol 2009;22:19e27. Millan JM, Aller E, Jaijo T, Blanco-Kelly F, Gimenez-Pardo A, Ayuso C. An update on the genetics of usher syndrome. J Ophthalmol 2011;2011:417217. Weil D, Blanchard S, Kaplan J, Guilford P, Gibson F, Walsh J, Mburu P, Varela A, Levilliers J, Weston MD. Defective myosin VIIA gene responsible for Usher syndrome type 1B. Nature 1995;374:60e1. Bitner-Glindzicz M, Lindley KJ, Rutland P, Blaydon D, Smith VV, Milla PJ, Hussain K, Furth-Lavi J, Cosgrove KE, Shepherd RM, Barnes PD, O’Brien RE, Farndon PA, Sowden J, Liu XZ, Scanlan MJ, Malcolm S, Dunne MJ, ynsley-Green A, Glaser B. A recessive contiguous gene deletion causing infantile hyperinsulinism, enteropathy and deafness identifies the Usher type 1C gene. Nat Genet 2000;26:56e60. J Med Genet 2012;49:27e36. doi:10.1136/jmedgenet-2011-100468 21. 22. 23. Verpy E, Leibovici M, Zwaenepoel I, Liu XZ, Gal A, Salem N, Mansour A, Blanchard S, Kobayashi I, Keats BJ, Slim R, Petit C. A defect in harmonin, a PDZ domaincontaining protein expressed in the inner ear sensory hair cells, underlies Usher syndrome type 1C. Nat Genet 2000;26:51e5. Bork JM, Peters LM, Riazuddin S, Bernstein SL, Ahmed ZM, Ness SL, Polomeno R, Ramesh A, Schloss M, Srisailpathy CR, Wayne S, Bellman S, Desmukh D, Ahmed Z, Khan SN, Kaloustian VM, Li XC, Lalwani A, Riazuddin S, Bitner-Glindzicz M, Nance WE, Liu XZ, Wistow G, Smith RJ, Griffith AJ, Wilcox ER, Friedman TB, Morell RJ. Usher syndrome 1D and nonsyndromic autosomal recessive deafness DFNB12 are caused by allelic mutations of the novel cadherin-like gene CDH23. Am J Hum Genet 2001;68:26e37. Bolz H, von BB, Ramirez A, Bryda EC, Kutsche K, Nothwang HG, Seeliger M, del C-Salcedó Cabrera M, Vila MC, Molina OP, Gal A, Kubisch C. Mutation of CDH23, encoding a new member of the cadherin gene family, causes Usher syndrome type 1D. Nat Genet 2001;27:108e12. Ahmed ZM, Riazuddin S, Bernstein SL, Ahmed Z, Khan S, Griffith AJ, Morell RJ, Friedman TB, Riazuddin S, Wilcox ER. 
Mutations of the protocadherin gene PCDH15 cause Usher syndrome type 1F. Am J Hum Genet 2001;69:25e34. Alagramam KN, Yuan H, Kuehn MH, Murcia CL, Wayne S, Srisailpathy CR, Lowry RB, Knaus R, Van LL, Bernier FP, Schwartz S, Lee C, Morton CC, Mullins RF, Ramesh A, Van CG, Hageman GS, Woychik RP, Smith RJ. Mutations in the novel protocadherin PCDH15 cause Usher syndrome type 1F. Hum Mol Genet 2001;10:1709e18. Weil D, El-Amraoui A, Masmoudi S, Mustapha M, Kikkawa Y, Laine S, Delmaghani S, Adato A, Nadifi S, Zina ZB, Hamel C, Gal A, Ayadi H, Yonekawa H, Petit C. Usher syndrome type I G (USH1G) is caused by mutations in the gene encoding USH1G, a protein that associates with the USH1C protein, harmonin. Hum Mol Genet 2003;12:463e71. Eudy JD, Weston MD, Yao S, Hoover DM, Rehm HL, Ma-Edmonds M, Yan D, Ahmad I, Cheng JJ, Ayuso C, Cremers C, Davenport S, Moller C, Talmadge CB, Beisel KW, Tamayo M, Morton CC, Swaroop A, Kimberling WJ, Sumegi J. Mutation of a gene encoding a protein with extracellular matrix motifs in Usher syndrome type IIa. Science 1998;280:1753e7. van Wijk E, Pennings RJ, Te BH, Claassen A, Yntema HG, Hoefsloot LH, Cremers FP, Cremers CW, Kremer H. Identification of 51 novel exons of the Usher syndrome type 2A (USH2A) gene that encode multiple conserved functional domains and that are mutated in patients with Usher syndrome type II. Am J Hum Genet 2004;74:738e44. Weston MD, Luijendijk MW, Humphrey KD, Moller C, Kimberling WJ. Mutations in the VLGR1 gene implicate G-protein signaling in the pathogenesis of Usher syndrome type II. Am J Hum Genet 2004;74:357e66. Ebermann I, Scholl HP, Charbel IP, Becirovic E, Lamprecht J, Jurklies B, Millan JM, Aller E, Mitter D, Bolz H. A novel gene for Usher syndrome type 2: mutations in the long isoform of whirlin are associated with retinitis pigmentosa and sensorineural hearing loss. Hum Genet 2007;121:203e11. 
Adato A, Vreugde S, Joensuu T, Avidan N, Hamalainen R, Belenkiy O, Olender T, Bonne-Tamir B, Ben-Asher E, Espinos C, Millan JM, Lehesjoki AE, Flannery JG, Avraham KB, Pietrokovski S, Sankila EM, Beckmann JS, Lancet D. CLRN1 transcripts encode clarin-1, a four-transmembrane-domain protein with a possible role in sensory synapses. Eur J Hum Genet 2002;10:339e50. Joensuu T, Hamalainen R, Yuan B, Johnson C, Tegelberg S, Gasparini P, Zelante L, Pirvola U, Pakarinen L, Lehesjoki AE, de la CA, Sankila EM. Mutations in a novel gene with transmembrane domains underlie Usher syndrome type 3. Am J Hum Genet 2001;69:673e84. Richardson GP, de Monvel JB, Petit C. How the genetics of deafness illuminates auditory physiology. Annu Rev Physiol 2011;73:311e34. Bonnet C, Grati M, Marlin S, Levilliers J, Hardelin JP, Parodi M, Niasme-Grare M, Zelenika D, Delepine M, Feldmann D, Jonard L, El-Amraoui A, Weil D, Delobel B, Vincent C, Dollfus H, Eliot MM, David A, Calais C, Vigneron J, Montaut-Verient B, Bonneau D, Dubin J, Thauvin C, Duvillard A, Francannet C, Mom T, Lacombe D, Duriez F, Drouin-Garraud V, Thuillier-Obstoy MF, Sigaudy S, Frances AM, Collignon P, Challe G, Couderc R, Lathrop M, Sahel JA, Weissenbach J, Petit C, Denoyelle F. Complete exon sequencing of all known Usher syndrome genes greatly improves molecular diagnosis. Orphanet J Rare Dis 2011;6:21. Zheng QY, Yan D, Ouyang XM, Du LL, Yu H, Chang B, Johnson KR, Liu XZ. Digenic inheritance of deafness caused by mutations in genes encoding cadherin 23 and protocadherin 15 in mice and humans. Hum Mol Genet 2005;14:103e11. Vozzi D, Aaspollu A, Athanasakis E, Berto A, Fabretto A, Licastro D, Kulm M, Testa F, Trevisi P, Vahter M, Ziviello C, Martini A, Simonelli F, Banfi S, Gasparini P. Molecular epidemiology of Usher syndrome in Italy. Mol Vis 2011;17:1662e8. 
Ebermann I, Phillips JB, Liebau MC, Koenekoop RK, Schermer B, Lopez I, Schafer E, Roux AF, Dafinger C, Bernd A, Zrenner E, Claustres M, Blanco B, Nurnberg G, Nurnberg P, Ruland R, Westerfield M, Benzing T, Bolz HJ. PDZD7 is a modifier of retinal disease and a contributor to digenic Usher syndrome. J Clin Invest 2010;120:1812e23. Bok D, Galbraith G, Lopez I, Woodruff M, Nusinowitz S, BeltrandelRio H, Huang W, Zhao S, Geske R, Montgomery C, Van Sligtenhorst I, Friddle C, Platt K, Sparks MJ, Pushkin A, Abuladze N, Ishiyama A, Dukkipati R, Liu W, Kurtz I. Blindness and auditory impairment caused by loss of the sodium bicarbonate cotransporter NBC3. Nat Genet 2003;34:313e19. Baux D, Faugere V, Larrieu L, Le Guedard-Mereuze S, Hamroun D, Beroud C, Malcolm S, Claustres M, Roux AF. UMD-USHbases: a comprehensive set of 35 Genotype-phenotype correlations 24. 25. 26. 27. 28. 29. 30. 31. 32. 33. 34. 35. 36. 37. 38. 39. databases to record and analyse pathogenic mutations and unclassified variants in seven Usher syndrome causing genes. Hum Mutat 2008;29:E76e87. Holder GE, Brigell MG, Hawlina M, Meigen T, Vaegan, Bach M; International Society for Clinical Electrophysiology of Vision. ISCEV standard for clinical pattern electroretinographye2007 update. Doc Ophthalmol 2007;114:111e16. Marmor MF, Fulton AB, Holder GE, Miyake Y, Brigell M, Bach M; International Society for Clinical Electrophysiology of Vision. ISCEV Standard for full-field clinical electroretinography (2008 update). Doc Ophthalmol 2009;118:69e77. Cohen M, Prasher D. The value of combining auditory brainstem responses and acoustic reflex threshold measurements in neuro-otological diagnosis. Scand Audiol 1988;17:153e62. Kemp DT, Ryan S, Bray P. A guide to the effective use of otoacoustic emissions. Ear Hear 1990;11:93e105. Anon. Recommended procedure for tympanometry. British Society of Audiology. Br J Audiol 1992;26:255e7. Anon. Caloric test protocol. Br J Audiol 1999;33:179e84. Pollak L, Luxon LM, Haskard DO. 
Labyrinthine involvement in Behcet’s syndrome. J Laryngol Otol 2001;115:522e9. Ceranic B, Luxon LM. Progressive auditory neuropathy in patients with Leber’s hereditary optic neuropathy. J Neurol Neurosurg Psychiatry 2004;75:626e30. Jongkees LB. The methods of examining the vestibular apparatus. Fortschr Hals Nasen Ohrenheilkd 1953;1:1e147. Bonfield JK, Smith K, Staden R. A new DNA sequence assembly program. Nucleic Acids Res 1995;23:4992e9. Roux AF, Faugere V, Vache C, Baux D, Besnard T, Leonard S, Blanchet C, Hamel C, Mondain M, Gilbert-Dussardier B, Edery P, Lacombe D, Bonneau D, Holder-Espinasse M, Ambrosetti U, Journel H, David A, Lina-Granade G, Malcolm S, Claustres M. Fouryear follow-up of diagnostic service in USH1 patients. Invest Ophthalmol Vis Sci 2011;52:4063e71. Saihan Z, Stabej PQ, Robson AG, Rangesh N, Holder GE, Moore Frcophth AT, Steel KP, Luxon LM, Bitner-Glindzicz M, Webster AR. Mutations in the ush1c gene associated with sector retinitis pigmentosa and hearing loss. Retina 2011;31:1708e16. Zwaenepoel I, Verpy E, Blanchard S, Meins M, pfelstedt-Sylla E, Gal A, Petit C. Identification of three novel mutations in the USH1C gene and detection of thirty-one polymorphisms used for haplotype analysis. Hum Mutat 2001;17:34e41. Weston MD, Kelley PM, Overbeck LD, Wagenaar M, Orten DJ, Hasson T, Chen ZY, Corey D, Mooseker M, Sumegi J, Cremers C, Moller C, Jacobson SG, Gorin MB, Kimberling WJ. Myosin VIIA mutation screening in 189 Usher syndrome type 1 patients. Am J Hum Genet 1996;59:1074e83. Pennings RJ, Huygen PL, Orten DJ, Wagenaar M, van AA, Kremer H, Kimberling WJ, Cremers CW, Deutman AF. Evaluation of visual impairment in Usher syndrome 1b and Usher syndrome 2a. Acta Ophthalmol Scand 2004;82:131e9. Liu XZ, Hope C, Walsh J, Newton V, Ke XM, Liang CY, Xu LR, Zhou JM, Trump D, Steel KP, Bundey S, Brown SD. Mutations in the myosin VIIA gene cause a wide 40. 41. 42. 43. 44. 45. 46. 47. 48. 49. 50. phenotypic spectrum, including atypical Usher syndrome. 
Am J Hum Genet 1998;63:909e12. Fields RR, Zhou G, Huang D, Davis JR, Moller C, Jacobson SG, Kimberling WJ, Sumegi J. Usher syndrome type III: revised genomic structure of the USH3 gene and identification of novel mutations. Am J Hum Genet 2002;71:607e17. Riazuddin S, Nazli S, Ahmed ZM, Yang Y, Zulfiqar F, Shaikh RS, Zafar AU, Khan SN, Sabar F, Javid FT, Wilcox ER, Tsilou E, Boger ET, Sellers JR, Belyantseva IA, Riazuddin S, Friedman TB. Mutation spectrum of MYO7A and evaluation of a novel nonsyndromic deafness DFNB2 allele with residual function. Hum Mutat 2008;29:502e11. Jaijo T, Aller E, Garcia-Garcia G, Aparisi MJ, Bernal S, vila-Fernandez A, Barragan I, Baiget M, Ayuso C, Antinolo G, az-Llopis M, Kulm M, Beneyto M, Najera C, Millan JM. Microarray-based mutation analysis of 183 Spanish families with Usher syndrome. Invest Ophthalmol Vis Sci 2010;51:1311e17. Astuto LM, Bork JM, Weston MD, Askew JW, Fields RR, Orten DJ, Ohliger SJ, Riazuddin S, Morell RJ, Khan S, Riazuddin S, Kremer H, van Hauwe P, Moller CG, Cremers CW, Ayuso C, Heckenlively JR, Rohrschneider K, Spandau U, Greenberg J, Ramesar R, Reardon W, Bitoun P, Millan J, Legge R, Friedman TB, Kimberling WJ. CDH23 mutation and phenotype heterogeneity: a profile of 107 diverse families with Usher syndrome and nonsyndromic deafness. Am J Hum Genet 2002;71:262e75. Oshima A, Jaijo T, Aller E, Millan JM, Carney C, Usami S, Moller C, Kimberling WJ. Mutation profile of the CDH23 gene in 56 probands with Usher syndrome type I. Hum Mutat 2008;29:E37e46. Kimberling WJ, Hildebrand MS, Shearer AE, Jensen ML, Halder JA, Trzupek K, Cohn ES, Weleber RG, Stone EM, Smith RJ. Frequency of Usher syndrome in two pediatric populations: implications for genetic screening of deaf and hard of hearing children. Genet Med 2010;12:512e16. Ouyang XM, Yan D, Du LL, Hejtmancik JF, Jacobson SG, Nance WE, Li AR, Angeli S, Kaiser M, Newton V, Brown SD, Balkany T, Liu XZ. 
Characterization of Usher syndrome type I gene mutations in an Usher syndrome patient population. Hum Genet 2005;116:292e9. Hilgert N, Kahrizi K, Dieltjens N, Bazazzadegan N, Najmabadi H, Smith RJ, Van CG. A large deletion in GPR98 causes type IIC Usher syndrome in male and female members of an Iranian family. J Med Genet 2009;46:272e6. LeGuedard-Mereuze S, Vache C, Baux D, Faugere V, Larrieu L, Abadie C, Janecke A, Claustres M, Roux AF, Tuffery-Giraud S. Ex vivo splicing assays of mutations at noncanonical positions of splice sites in USHER genes. Hum Mutat 2010;31:347e55. Aller E, Jaijo T, Garcia-Garcia G, Aparisi MJ, Blesa D, az-Llopis M, Ayuso C, Millan JM. Identification of large rearrangements of the PCDH15 gene by combined MLPA and a CGH: large duplications are responsible for Usher syndrome. Invest Ophthalmol Vis Sci 2010;51:5480e5. Roux AF, Faugere V, Le GS, Pallares-Ruiz N, Vielle A, Chambert S, Marlin S, Hamel C, Gilbert B, Malcolm S, Claustres M; French Usher Syndrome Collaboration. Survey of the frequency of USH1 gene mutations in a cohort of Usher patients shows the importance of cadherin 23 and protocadherin 15 genes and establishes a detection rate of above 90%. J Med Genet 2006;43:763e8. PAGE fraction trail=9.75 36 J Med Genet 2012;49:27e36. 
doi:10.1136/jmedgenet-2011-100468"} \ No newline at end of file diff --git a/interactive-mining-backend/madoap/src/static/exampleClarinProfile.oamp b/interactive-mining-backend/madoap/src/static/exampleClarinProfile.oamp index 315a84ca434583562949cf89c8ed7454b769faeb..7578c3d41ad27aa3b564d1ae15fd1477140579c1 100644 GIT binary patch delta 231 zcmZqBXwaA-CB$XKz`(!?#8AL8WulG|6PMA(g#Ap6o|6@rtr&eKM>Cr+dQYCfY|iL4 z`2@2K8zb{Q2IhO41zD~z>nSiYyK|jC?r)Xlw_nTIQuvTc{+M16cpvB7bWKM z0*z*3e#*f7o%tzH&rN1&0VZa3PG^6&P(K%4uwp-UMrKjY^whl6O+1U48M!7e;g979 zYGYx(z`*>4`6csh<_kcr2bc{wS(qhxeLVeK97A}RndNyQ+I9VWIGKPF5FQsJvowV1 G=m7wi{x*LA delta 254 zcmZqBXwaA-CB%83fq{V)h@pTbXQGY~6X*Sn3HzBCO(!cbTQOQpj%GGtG@m?y*__dA z@(E@eCI*(xf*eHq)$ delta 17 YcmZqBXwaA-%_u!l#+gxiW5NPq04x9m=l}o!