diff --git a/dnet-dedup-test/pom.xml b/dnet-dedup-test/pom.xml
index ff9b378..43a7c2b 100644
--- a/dnet-dedup-test/pom.xml
+++ b/dnet-dedup-test/pom.xml
@@ -59,6 +59,19 @@
<groupId>org.apache.spark</groupId>
<artifactId>spark-graphx_2.11</artifactId>
+
+ <dependency>
+ <groupId>eu.dnetlib</groupId>
+ <artifactId>dnet-openaireplus-mapping-utils</artifactId>
+ <scope>test</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <scope>test</scope>
+ </dependency>
+
\ No newline at end of file
diff --git a/dnet-dedup-test/src/main/java/eu/dnetlib/graph/GraphProcessor.scala b/dnet-dedup-test/src/main/java/eu/dnetlib/graph/GraphProcessor.scala
index 2d34afe..f893570 100644
--- a/dnet-dedup-test/src/main/java/eu/dnetlib/graph/GraphProcessor.scala
+++ b/dnet-dedup-test/src/main/java/eu/dnetlib/graph/GraphProcessor.scala
@@ -1,24 +1,21 @@
-package eu.dnetlib.graph
-import java.lang
+package eu.dnetlib.graph
import eu.dnetlib.ConnectedComponent
import eu.dnetlib.pace.model.MapDocument
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD
-import scala.collection.JavaConversions
-;
-
+import scala.collection.JavaConversions;
object GraphProcessor {
- def findCCs(vertexes: RDD[(VertexId,MapDocument)], edges:RDD[Edge[String]], maxIterations: Int): RDD[ConnectedComponent] = {
+ def findCCs(vertexes: RDD[(VertexId, MapDocument)], edges: RDD[Edge[String]], maxIterations: Int): RDD[ConnectedComponent] = {
val graph: Graph[MapDocument, String] = Graph(vertexes, edges)
val cc = graph.connectedComponents(maxIterations).vertices
val joinResult = vertexes.leftOuterJoin(cc).map {
case (id, (openaireId, cc)) => {
- if (cc.isEmpty){
+ if (cc.isEmpty) {
(id, openaireId)
}
else {
@@ -33,7 +30,7 @@ object GraphProcessor {
}
- def asConnectedComponent(group: (VertexId, Iterable[MapDocument])) : ConnectedComponent = {
+ def asConnectedComponent(group: (VertexId, Iterable[MapDocument])): ConnectedComponent = {
val docs = group._2.toSet[MapDocument]
val connectedComponent = new ConnectedComponent("empty", JavaConversions.setAsJavaSet[MapDocument](docs));
connectedComponent.initializeID();
diff --git a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/AbstractProtoPaceTest.java b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/AbstractProtoPaceTest.java
new file mode 100644
index 0000000..961fdd6
--- /dev/null
+++ b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/AbstractProtoPaceTest.java
@@ -0,0 +1,198 @@
+package eu.dnetlib.pace;
+
+import com.google.common.collect.Lists;
+import com.google.gson.Gson;
+import eu.dnetlib.data.proto.FieldTypeProtos.Author;
+import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;
+import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty;
+import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty.Builder;
+import eu.dnetlib.data.proto.OafProtos.Oaf;
+import eu.dnetlib.data.proto.OafProtos.OafEntity;
+import eu.dnetlib.data.proto.OrganizationProtos.Organization;
+import eu.dnetlib.data.proto.ResultProtos.Result;
+import eu.dnetlib.pace.config.Config;
+import eu.dnetlib.pace.config.DedupConfig;
+import eu.dnetlib.pace.config.Type;
+import eu.dnetlib.pace.model.Field;
+import eu.dnetlib.pace.model.FieldValueImpl;
+import eu.dnetlib.pace.model.MapDocument;
+import eu.dnetlib.pace.model.ProtoDocumentBuilder;
+import eu.dnetlib.pace.model.gt.GTAuthor;
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang.RandomStringUtils;
+import org.apache.commons.lang.StringUtils;
+
+import java.io.IOException;
+import java.io.StringWriter;
+import java.util.ArrayList;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
+public abstract class AbstractProtoPaceTest extends OafTest {
+
+ protected DedupConfig getResultFullConf() { // Loads the dedup configuration from the classpath resource result.full.pace.conf.
+ return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.full.pace.conf"));
+ }
+
+ protected DedupConfig getResultSimpleConf() { // Loads the dedup configuration from the classpath resource result.simple.pace.conf.
+ return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.simple.pace.conf"));
+ }
+
+ protected DedupConfig getResultConf() { // Loads the dedup configuration from the classpath resource result.pace.conf.
+ return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.pace.conf"));
+ }
+
+ protected DedupConfig getOrganizationSimpleConf() { // Loads the dedup configuration from organization.pace.conf (note: the resource name has no "simple" part).
+ return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/organization.pace.conf"));
+ }
+
+ protected DedupConfig getResultAuthorsConf() { // Loads the dedup configuration from the classpath resource result.authors.pace.conf.
+ return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.authors.pace.conf"));
+ }
+
+ protected DedupConfig getResultProdConf() { // Loads the dedup configuration from the classpath resource result.prod.pace.conf.
+ return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.prod.pace.conf"));
+ }
+
+ protected MapDocument author(final Config conf, final String id, final Oaf oaf) { // Builds a MapDocument with the given id from the entity carried by oaf, using the model of conf.
+ return ProtoDocumentBuilder.newInstance(id, oaf.getEntity(), conf.model());
+ }
+
+ protected GTAuthor getGTAuthor(final String path) {
+
+ final Gson gson = new Gson();
+
+ final String json = readFromClasspath(path);
+
+ final GTAuthor gta = gson.fromJson(json, GTAuthor.class);
+
+ return gta;
+ }
+
+ protected String readFromClasspath(final String filename) {
+ final StringWriter sw = new StringWriter();
+ try {
+ IOUtils.copy(getClass().getResourceAsStream(filename), sw);
+ return sw.toString();
+ } catch (final IOException e) {
+ throw new RuntimeException("cannot load resource from classpath: " + filename);
+ }
+ }
+
+    /** Builds a result document that only has a title: no date, no pids, no authors. */
+    protected MapDocument result(final Config config, final String id, final String title) {
+        return result(config, id, title, null, new ArrayList<>(), null);
+    }
+
+    /** Builds a result document with a title and a date of acceptance: no pids, no authors. */
+    protected MapDocument result(final Config config, final String id, final String title, final String date) {
+        return result(config, id, title, date, new ArrayList<>(), null);
+    }
+
+    /** Builds a result document with a title, a date of acceptance and a list of pids: no authors. */
+    protected MapDocument result(final Config config, final String id, final String title, final String date, final List<String> pid) {
+        return result(config, id, title, date, pid, null);
+    }
+
+    /** Builds a result document with a title, a date of acceptance and a single pid: no authors. */
+    protected MapDocument result(final Config config, final String id, final String title, final String date, final String pid) {
+        return result(config, id, title, date, pid, null);
+    }
+
+    /** Builds a result document with a title, a date of acceptance, a single pid and the given author names. */
+    protected MapDocument result(final Config config, final String id, final String title, final String date, final String pid, final List<String> authors) {
+        // the single pid is wrapped in a list so that the general overload does the work
+        return result(config, id, title, date, Lists.newArrayList(pid), authors);
+    }
+
+    /**
+     * Builds a result document from the given values.
+     *
+     * @param config  configuration whose model drives the document construction
+     * @param id      identifier of the entity and of the resulting document
+     * @param title   main title; when blank no title is added at all
+     * @param date    date of acceptance; skipped when blank
+     * @param pid     pid values, each added with type "doi"; may be null, blank entries are skipped
+     * @param authors author names, ranked by their position in the list; may be null
+     * @return the document built by {@link ProtoDocumentBuilder} from the assembled entity
+     */
+    protected MapDocument result(final Config config, final String id, final String title, final String date, final List<String> pid, final List<String> authors) {
+        final Result.Metadata.Builder metadata = Result.Metadata.newBuilder();
+        if (!StringUtils.isBlank(title)) {
+            metadata.addTitle(getStruct(title, getQualifier("main title", "dnet:titles")));
+            // a random alternative title always accompanies the main one
+            metadata.addTitle(getStruct(RandomStringUtils.randomAlphabetic(10), getQualifier("alternative title", "dnet:titles")));
+        }
+        if (!StringUtils.isBlank(date)) {
+            metadata.setDateofacceptance(sf(date));
+        }
+
+        final OafEntity.Builder entity = oafEntity(id, eu.dnetlib.data.proto.TypeProtos.Type.result);
+        final Result.Builder result = Result.newBuilder().setMetadata(metadata);
+
+        if (authors != null) {
+            // the index of each name in the list becomes the rank of the author
+            result.getMetadataBuilder().addAllAuthor(
+                    IntStream.range(0, authors.size())
+                            .mapToObj(i -> author(authors.get(i), i))
+                            .collect(Collectors.toCollection(LinkedList::new)));
+        }
+
+        entity.setResult(result);
+
+        if (pid != null) {
+            for (final String p : pid) {
+                if (!StringUtils.isBlank(p)) {
+                    entity.addPid(sp(p, "doi"));
+                    //entity.addPid(sp(RandomStringUtils.randomAlphabetic(10), "oai"));
+                }
+            }
+        }
+
+        final OafEntity build = entity.build();
+        return ProtoDocumentBuilder.newInstance(id, build, config.model());
+    }
+
+    /**
+     * Turns an author name into an {@link Author} message.
+     * Name and surname are only set when the parsed person is reported as accurate;
+     * the normalised full name and the rank are always set.
+     *
+     * @param s    the author name to parse
+     * @param rank the rank stored in the message
+     */
+    private Author author(final String s, int rank) {
+        final eu.dnetlib.pace.model.Person person = new eu.dnetlib.pace.model.Person(s, false);
+        final Author.Builder builder = Author.newBuilder();
+        if (person.isAccurate()) {
+            builder.setName(person.getNormalisedFirstName()).setSurname(person.getNormalisedSurname());
+        }
+        return builder.setFullname(person.getNormalisedFullname()).setRank(rank).build();
+    }
+
+    /** Starts an entity builder carrying only the given id and type. */
+    private OafEntity.Builder oafEntity(final String id, final eu.dnetlib.data.proto.TypeProtos.Type type) {
+        return OafEntity.newBuilder().setId(id).setType(type);
+    }
+
+    /** Builds an organization document that only has a legal name. */
+    protected MapDocument organization(final Config config, final String id, final String legalName) {
+        return organization(config, id, legalName, null);
+    }
+
+    /**
+     * Builds an organization document.
+     *
+     * @param config         configuration whose model drives the document construction
+     * @param id             identifier of the entity and of the resulting document
+     * @param legalName      legal name; skipped when null
+     * @param legalShortName legal short name; skipped when null
+     * @return the document built by {@link ProtoDocumentBuilder} from the assembled entity
+     */
+    protected MapDocument organization(final Config config, final String id, final String legalName, final String legalShortName) {
+        final Organization.Metadata.Builder metadata = Organization.Metadata.newBuilder();
+        if (legalName != null) {
+            metadata.setLegalname(sf(legalName));
+        }
+        if (legalShortName != null) {
+            metadata.setLegalshortname(sf(legalShortName));
+        }
+
+        // the entity carries an Organization payload, so it is typed as organization (it used to be typed as result)
+        final OafEntity.Builder entity = oafEntity(id, eu.dnetlib.data.proto.TypeProtos.Type.organization);
+        entity.setOrganization(Organization.newBuilder().setMetadata(metadata));
+
+        return ProtoDocumentBuilder.newInstance(id, entity.build(), config.model());
+    }
+
+    /**
+     * Builds a pid property: the value is the pid, the qualifier uses the given type as class id and
+     * class name, and "dnet:pid_types" as scheme id and scheme name.
+     */
+    private StructuredProperty sp(final String pid, final String type) {
+        final Qualifier.Builder qualifier = Qualifier.newBuilder()
+                .setClassid(type)
+                .setClassname(type)
+                .setSchemeid("dnet:pid_types")
+                .setSchemename("dnet:pid_types");
+        return StructuredProperty.newBuilder().setValue(pid).setQualifier(qualifier).build();
+    }
+
+ protected Field title(final String s) { // Wraps s in a String-typed field value named "title".
+ return new FieldValueImpl(Type.String, "title", s);
+ }
+
+ protected static Builder getStruct(final String value, final Qualifier.Builder qualifier) { // Starts a StructuredProperty builder with the given value and qualifier; the caller finishes the build.
+ return StructuredProperty.newBuilder().setValue(value).setQualifier(qualifier);
+ }
+
+ /*
+ * protected static StringField.Builder sf(final String s) { return StringField.newBuilder().setValue(s); }
+ *
+ * protected static Qualifier.Builder getQualifier(final String classname, final String schemename) { return
+ * Qualifier.newBuilder().setClassid(classname).setClassname(classname).setSchemeid(schemename).setSchemename(schemename); }
+ */
+
+}
diff --git a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/OafTest.java b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/OafTest.java
new file mode 100644
index 0000000..590c416
--- /dev/null
+++ b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/OafTest.java
@@ -0,0 +1,446 @@
+package eu.dnetlib.pace;
+
+import com.google.protobuf.GeneratedMessage;
+import com.google.protobuf.InvalidProtocolBufferException;
+import eu.dnetlib.data.mapreduce.util.OafDecoder;
+import eu.dnetlib.data.proto.DatasourceOrganizationProtos.DatasourceOrganization;
+import eu.dnetlib.data.proto.DatasourceOrganizationProtos.DatasourceOrganization.Provision;
+import eu.dnetlib.data.proto.DatasourceProtos.Datasource;
+import eu.dnetlib.data.proto.DedupProtos.Dedup;
+import eu.dnetlib.data.proto.FieldTypeProtos.*;
+import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty.Builder;
+import eu.dnetlib.data.proto.KindProtos.Kind;
+import eu.dnetlib.data.proto.OafProtos.Oaf;
+import eu.dnetlib.data.proto.OafProtos.OafEntity;
+import eu.dnetlib.data.proto.OafProtos.OafRel;
+import eu.dnetlib.data.proto.OrganizationOrganizationProtos.OrganizationOrganization;
+import eu.dnetlib.data.proto.OrganizationProtos.Organization;
+import eu.dnetlib.data.proto.ProjectOrganizationProtos.ProjectOrganization;
+import eu.dnetlib.data.proto.ProjectOrganizationProtos.ProjectOrganization.Participation;
+import eu.dnetlib.data.proto.ProjectProtos.Project;
+import eu.dnetlib.data.proto.RelMetadataProtos.RelMetadata;
+import eu.dnetlib.data.proto.RelTypeProtos.RelType;
+import eu.dnetlib.data.proto.RelTypeProtos.SubRelType;
+import eu.dnetlib.data.proto.ResultProjectProtos.ResultProject;
+import eu.dnetlib.data.proto.ResultProjectProtos.ResultProject.Outcome;
+import eu.dnetlib.data.proto.ResultProtos.Result;
+import eu.dnetlib.data.proto.ResultProtos.Result.Context;
+import eu.dnetlib.data.proto.ResultProtos.Result.Instance;
+import eu.dnetlib.data.proto.ResultResultProtos.ResultResult;
+import eu.dnetlib.data.proto.ResultResultProtos.ResultResult.Similarity;
+import eu.dnetlib.data.proto.TypeProtos.Type;
+
+public class OafTest {
+
+ public static final String CITATION_JSON =
+ "\n \n [10] M. Foret et al., Phys. Rev. B 66, 024204 (2002).\n \n \n [11] B. Ru\175404\264e et al., Phys. Rev. Lett. 90, 095502 (2003).\n \n \n [12] U. Buchenau et al., Phys. Rev. B 34, 5665 (1986).\n \n \n [13] S.N. Taraskin and S.R. Elliott, J. Phys.: Condens. Mat- ter 11, A219 (1999).\n \n \n [14] B. Hehlen et al., Phys. Rev. Lett. 84, 5355 (2000).\n \n \n [15] N.V. Surotsev et al., J. Phys.: Condens. Matter 10, L113 (1998).\n \n \n [16] D.A. Parshin and C. Laermans, Phys. Rev. B 63, 132203 (2001).\n \n \n [17] V.L. Gurevich et al., Phys. Rev. B 67, 094203 (2003).\n \n \n [18] A. Matic et al., Phys. Rev. Lett. 86, 3803 (2001).\n \n \n [19] E. Rat et al., arXiv:cond-mat/0505558, 23 May 2005.\n \n \n [1] R.C. Zeller and R.O. Pohl, Phys. Rev. B 4, 2029 (1971).\n \n \n [20] C.A. Angell, J. Non-Cryst. Solids 131\20023133, 13 (1991).\n \n \n [21] A.P. Sokolov et al., Phys. Rev. Lett. 71, 2062 (1993).\n \n \n [22] T. Matsuo et al., Solid State Ionics 154-155, 759 (2002).\n \n \n [23] V.K. Malinovsky et al., Europhys. Lett. 11, 43 (1990).\n \n \n [24] J. Lor\250osch et al., J. Non-Cryst. Solids 69, 1 (1984).\n \n \n [25] U. Buchenau, Z. Phys. B 58, 181 (1985).\n \n \n [26] A.F. Io\175400e and A.R. Regel, Prog. Semicond. 4, 237 (1960).\n \n \n [27] R. Dell\20031Anna et al., Phys. Rev. Lett. 80, 1236 (1998).\n \n \n [28] D. Fioretto et al., Phys. Rev. E 59, 4470 (1999).\n \n \n [29] U. Buchenau et al., Phys. Rev. Lett. 77, 4035 (1996).\n \n \n [2] M. Rothenfusser et al., Phys. Rev. B 27, 5196 (1983).\n \n \n [30] J. Mattsson et al., J. Phys.: Condens. Matter 15, S1259 (2003).\n \n \n [31] T. Scopigno et al., Phys. Rev. Lett. 92, 025503 (2004).\n \n \n [32] M. Foret et al., Phys. Rev. Lett. 81, 2100 (1998).\n \n \n [33] F. Sette et al., Science 280, 1550 (1998).\n \n \n [34] J. Wuttke et al., Phys. Rev. E 52, 4026 (1995).\n \n \n [35] M.A. Ramos et al., Phys. Rev. Lett. 78, 82 (1997).\n \n \n [36] G. Monaco et al., Phys. Rev. Lett. 80, 2161 (1998).\n \n \n [37] A. 
T\250olle, Rep. Prog. Phys. 64, 1473 (2001).\n \n \n [38] As the straight lines do not cross the origin, this does not 2 imply \1623 \21035 \1651 .\n \n \n [39] A. Matic et al., Europhys. Lett. 54, 77 (2001).\n \n \n [3] S. Hunklinger and W. Arnold, in Physical Acoustics, Vol. XII, W.P. Mason and R.N. Thurston Eds. (Academic Press, N.Y. 1976), p. 155.\n \n \n [40] IXS data are usually not available below \1651co, mostly for experimental reasons. E.g., that the rapid onset was not evidenced in vitreous silica [27], is not indicative of its absence but rather of a low qco \21074 1 nm\210221.\n \n \n [41] G. Ruocco et al., Phys. Rev. Lett. 83, 5583 (1999).\n \n \n [42] D. C\1307 iplys et al., J. Physique (Paris) 42, C6-184 (1981).\n \n \n [43] R. Vacher et al., Rev. Sci. Instrum. 51, 288 (1980).\n \n \n [44] R. Vacher et al., arXiv:cond-mat/0505560, 23 May 2005.\n \n \n [45] T.N. Claytor et al., Phys. Rev. B 18, 5842 (1978).\n \n \n [46] M. Arai et al., Physica B 263-264, 268 (1999).\n \n \n [4] R. Vacher et al., J. Non-Cryst. Solids 45, 397 (1981); T.C. Zhu et al., Phys. Rev. B 44, 4281 (1991).\n \n \n [5] J.E. Graebner et al., Phys. Rev. B 34, 5696 (1986).\n \n \n [6] E. Duval and A. Mermet, Phys. Rev. B 58, 8159 (1998).\n \n \n [7] A. Matic et al., Phys. Rev. Lett. 93, 145502 (2004).\n \n \n [8] Often alluded to, e.g. in the Encyclopedia of Materials: Science and Technology, K.H.J. Buschow et al., Eds., Vol. 1 (Elsevier, Oxford, 2001), articles by S.R. Elliott on pp. 171-174 and U. Buchenau on pp. 212-215.\n \n \n [9] E. Rat et al., Phys. Rev. Lett. 83, 1355 (1999).\n \n";
+
+ public static final String STATISTICS_JSON =
+ "[{ \"citationsPerYear\": \"many\", \"anotherCoolStatistic\": \"WoW\", \"nestedStat\": { \"firstNestedStat\" : \"value 1\", \"secondNestedStat\" : \"value 2\"}, \"listingStat\" : [ \"one\", \"two\" ] }]";
+
+    /** Same as the four-argument variant, without any data info. */
+    public static Builder getStructuredproperty(final String value, final String classname, final String schemename) {
+        return getStructuredproperty(value, classname, schemename, null);
+    }
+
+    /**
+     * Starts a structured property builder.
+     *
+     * @param value      value of the property
+     * @param classname  used as class id and class name of the qualifier
+     * @param schemename used as scheme id and scheme name of the qualifier
+     * @param dataInfo   optional data info; left unset when null
+     */
+    public static Builder getStructuredproperty(final String value, final String classname, final String schemename, final DataInfo dataInfo) {
+        final Builder property = StructuredProperty.newBuilder().setValue(value).setQualifier(getQualifier(classname, schemename));
+        return dataInfo == null ? property : property.setDataInfo(dataInfo);
+    }
+
+    /**
+     * Starts a qualifier builder where the class id and class name are both {@code classname}
+     * and the scheme id and scheme name are both {@code schemename}.
+     */
+    public static Qualifier.Builder getQualifier(final String classname, final String schemename) {
+        final Qualifier.Builder qualifier = Qualifier.newBuilder();
+        qualifier.setClassid(classname).setClassname(classname);
+        qualifier.setSchemeid(schemename).setSchemename(schemename);
+        return qualifier;
+    }
+
+ public static KeyValue getKV(final String id, final String name) { // Builds a KeyValue with id as the key and name as the value.
+ return KeyValue.newBuilder().setKey(id).setValue(name).build();
+ }
+
+ public static OafEntity getDatasource(final String datasourceId) { // Builds a datasource entity fixture: only the id comes from the caller, all metadata are fixed placeholder values.
+ return OafEntity
+ .newBuilder()
+ .setType(Type.datasource)
+ .setId(datasourceId)
+ .setDatasource(
+ Datasource.newBuilder().setMetadata(
+ Datasource.Metadata.newBuilder().setOfficialname(sf("officialname")).setEnglishname(sf("englishname"))
+ .setWebsiteurl(sf("websiteurl")).setContactemail(sf("contactemail")).addAccessinfopackage(sf("accessinforpackage"))
+ .setNamespaceprefix(sf("namespaceprofix")).setDescription(sf("description")).setOdnumberofitems(sf("numberofitems"))
+ .setOdnumberofitemsdate(sf("numberofitems date"))
+ // .addOdsubjects("subjects")
+ .setOdpolicies(sf("policies")).addOdlanguages(sf("languages")).addOdcontenttypes(sf("contenttypes"))
+ .setDatasourcetype(getQualifier("type class", "type scheme")))).build();
+ }
+
+ public static OafEntity getResult(final String id) { // Builds the result entity fixture produced by getResultBuilder for the given id.
+ return getResultBuilder(id).build();
+ }
+
+ public static OafEntity.Builder getResultBuilder(final String id) { // Starts a result entity fixture: only the id comes from the caller; titles, subject, contexts, instances, pids and provenance are fixed values.
+ return OafEntity
+ .newBuilder()
+ .setType(Type.result)
+ .setId(id)
+ .setResult(
+ Result.newBuilder()
+ .setMetadata(
+ Result.Metadata
+ .newBuilder()
+ .addTitle(
+ getStructuredproperty(
+ "Analysis of cell viability in intervertebral disc: Effect of endplate permeability on cell population",
+ "main title", "dnet:result_titles", getDataInfo()))
+ .addTitle(getStructuredproperty("Another title", "alternative title", "dnet:result_titles", getDataInfo()))
+ .addSubject(getStructuredproperty("Biophysics", "subject", "dnet:result_sujects"))
+ .setDateofacceptance(sf("2010-01-01")).addSource(sf("sourceA")).addSource(sf("sourceB"))
+ .addContext(Context.newBuilder().setId("egi::virtual::970"))
+ .addContext(Context.newBuilder().setId("egi::classification::natsc::math::applied"))
+ .addContext(Context.newBuilder().setId("egi::classification::natsc::math"))
+ .addContext(Context.newBuilder().setId("egi::classification::natsc"))
+ .addContext(Context.newBuilder().setId("egi::classification")).addContext(Context.newBuilder().setId("egi"))
+ .addDescription(sf("Responsible for making and maintaining the extracellular matrix ..."))
+ .addDescription(sf("Another description ...")).setPublisher(sf("ELSEVIER SCI LTD"))
+ .setResulttype(getQualifier("publication", "dnet:result_types"))
+ .setLanguage(getQualifier("eng", "dnet:languages"))).addInstance(getInstance("10|od__10", "Uk pubmed"))
+ .addInstance(getInstance("10|od__10", "arxiv")))
+ .addCollectedfrom(getKV("opendoar____::1064", "Oxford University Research Archive"))
+ .addPid(getStructuredproperty("doi:74293", "doi", "dnet:pids")).addPid(getStructuredproperty("oai:74295", "oai", "dnet:pids"))
+ .setDateofcollection(""); // the builder is returned unfinished so that callers can add further fields before build()
+ }
+
+    /** Returns the default data info fixture, with trust "0.4". */
+    public static DataInfo getDataInfo() {
+        return getDataInfo("0.4");
+    }
+
+    /**
+     * Builds a data info fixture: not deleted by inference, inference provenance "algo",
+     * provenance action qualifier ("xx", "yy").
+     *
+     * @param trust the trust value stored in the data info
+     */
+    public static DataInfo getDataInfo(final String trust) {
+        // the trust argument is honoured here; it used to be ignored in favour of a hard-coded "0.4"
+        return DataInfo.newBuilder().setDeletedbyinference(false).setTrust(trust).setInferenceprovenance("algo").setProvenanceaction(getQualifier("xx", "yy"))
+                .build();
+    }
+
+ public static Instance.Builder getInstance(final String hostedbyId, final String hostedbyName) { // Starts an instance builder hosted by the given key/value pair; access right, instance type and url are fixed values.
+ return Instance.newBuilder().setHostedby(getKV(hostedbyId, hostedbyName)).setAccessright(getQualifier("OpenAccess", "dnet:access_modes"))
+ .setInstancetype(getQualifier("publication", "dnet:result_typologies")).addUrl("webresource url");
+
+ }
+
+ public static OafRel getDedupRel(final String source, final String target, final RelType relType, final String relClass) { // Builds a dedup relation from source to target; the cached target is the result fixture built for the target id.
+ return OafRel.newBuilder().setSource(source).setTarget(target).setRelType(relType).setSubRelType(SubRelType.dedup).setRelClass(relClass)
+ .setChild(false).setCachedTarget(getResult(target))
+ .setResultResult(ResultResult.newBuilder().setDedup(Dedup.newBuilder().setRelMetadata(RelMetadata.getDefaultInstance()))) // the payload is always a ResultResult dedup with default relation metadata, whatever relType is passed
+ .build();
+ }
+
+ public static OafRel getProjectOrganization(final String source, final String target, final String relClass) throws InvalidProtocolBufferException { // Builds a project-organization participation relation; the cached target depends on relClass.
+ final OafRel.Builder oafRel = OafRel
+ .newBuilder()
+ .setSource(source)
+ .setTarget(target)
+ .setRelType(RelType.projectOrganization)
+ .setSubRelType(SubRelType.participation)
+ .setRelClass(relClass)
+ .setChild(false)
+ .setProjectOrganization(
+ ProjectOrganization.newBuilder().setParticipation(
+ Participation.newBuilder().setParticipantnumber("" + 1)
+ .setRelMetadata(relMetadata(relClass, "dnet:project_organization_relations"))));
+ switch (Participation.RelName.valueOf(relClass)) { // relClass must be the name of a RelName constant
+ case hasParticipant:
+ oafRel.setCachedTarget(getProjectFP7(target, "SP3")); // caches an FP7 project fixture built for the target id
+ break;
+ case isParticipant:
+ oafRel.setCachedTarget(getOrganization(target)); // caches an organization fixture built for the target id
+ break;
+ default: // any other relation name leaves the cached target unset
+ break;
+ }
+ return oafRel.build();
+ }
+
+ public static GeneratedMessage getOrganizationOrganization(final String source, final String target, final String relClass) { // Builds an organization-organization dedup relation; declared as GeneratedMessage although an OafRel is built.
+ final OafRel.Builder oafRel = OafRel
+ .newBuilder()
+ .setSource(source)
+ .setTarget(target)
+ .setRelType(RelType.organizationOrganization)
+ .setSubRelType(SubRelType.dedup)
+ .setRelClass(relClass)
+ .setChild(true)
+ .setOrganizationOrganization(
+ OrganizationOrganization.newBuilder().setDedup(
+ Dedup.newBuilder().setRelMetadata(relMetadata(relClass, "dnet:organization_organization_relations"))));
+
+ switch (Dedup.RelName.valueOf(relClass)) { // relClass must be the name of a RelName constant
+ case isMergedIn:
+ oafRel.setCachedTarget(getOrganization(source)); // NOTE(review): the cached entity is built from the source id here, unlike the merges case - confirm this is intended
+ break;
+ case merges:
+ oafRel.setCachedTarget(getOrganization(target));
+ break;
+ default: // any other relation name leaves the cached target unset
+ break;
+ }
+ return oafRel.build();
+ }
+
+ public static OafRel getDatasourceOrganization(final String source, final String target, final String relClass) throws InvalidProtocolBufferException { // Builds a datasource-organization provision relation; the cached target depends on relClass.
+ final OafRel.Builder oafRel = OafRel
+ .newBuilder()
+ .setSource(source)
+ .setTarget(target)
+ .setRelType(RelType.datasourceOrganization)
+ .setSubRelType(SubRelType.provision)
+ .setRelClass(relClass)
+ .setChild(false)
+ .setDatasourceOrganization(
+ DatasourceOrganization.newBuilder().setProvision(
+ Provision.newBuilder().setRelMetadata(relMetadata(relClass, "dnet:datasource_organization_relations"))));
+ switch (Provision.RelName.valueOf(relClass)) { // relClass must be the name of a RelName constant
+ case isProvidedBy:
+ oafRel.setCachedTarget(getOrganization(target)); // caches an organization fixture built for the target id
+ break;
+ case provides:
+ oafRel.setCachedTarget(getDatasource(target)); // caches a datasource fixture built for the target id
+ break;
+ default: // any other relation name leaves the cached target unset
+ break;
+ }
+ return oafRel.build();
+ }
+
+ public static OafRel getSimilarityRel(final String sourceId, final String targetId, final OafEntity result, final String relClass) { // Builds a result-result similarity relation with the given entity as cached target.
+ return OafRel
+ .newBuilder()
+ .setSource(sourceId)
+ .setTarget(targetId)
+ .setRelType(RelType.resultResult)
+ .setSubRelType(SubRelType.similarity)
+ .setRelClass(relClass)
+ .setChild(false)
+ .setCachedTarget(result)
+ .setResultResult(
+ ResultResult.newBuilder().setSimilarity(
+ Similarity.newBuilder().setRelMetadata(relMetadata(relClass, "dnet:resultResult_relations")).setSimilarity(.4f) // the similarity score is fixed to 0.4 with type STANDARD
+ .setType(Similarity.Type.STANDARD))).build();
+ }
+
+ public static RelMetadata.Builder relMetadata(final String classname, final String schemename) { // Starts relation metadata whose semantics is the qualifier built from classname and schemename.
+ return RelMetadata.newBuilder().setSemantics(getQualifier(classname, schemename));
+ }
+
+ public static OafEntity getOrganization(final String orgId) { // Builds an organization entity fixture: only the id comes from the caller, names, website, country and provenance are fixed values.
+ return OafEntity
+ .newBuilder()
+ .setType(Type.organization)
+ .setId(orgId)
+ .addCollectedfrom(getKV("opendoar_1234", "UK pubmed"))
+ .setOrganization(
+ Organization.newBuilder().setMetadata(
+ Organization.Metadata.newBuilder().setLegalname(sf("CENTRE D'APPUI A LA RECHERCHE ET A LA FORMATION GIE"))
+ .setLegalshortname(sf("CAREF")).setWebsiteurl(sf("www.caref-mali.org"))
+ .setCountry(getQualifier("ML", "dnet:countries")))).build();
+ }
+
+ public static OafRel getResultProject(final String from, final String to, final OafEntity project, final String relClass) // Builds a result-project outcome relation from 'from' to 'to', with the given project as cached target.
+ throws InvalidProtocolBufferException {
+ return OafRel
+ .newBuilder()
+ .setSource(from)
+ .setTarget(to)
+ .setRelType(RelType.resultProject)
+ .setSubRelType(SubRelType.outcome)
+ .setRelClass(relClass)
+ .setChild(false)
+ .setResultProject(
+ ResultProject.newBuilder().setOutcome(Outcome.newBuilder().setRelMetadata(relMetadata(relClass, "dnet:result_project_relations"))))
+ .setCachedTarget(project).build();
+ }
+
+ public static OafEntity getProjectFP7(final String projectId, final String fundingProgram) throws InvalidProtocolBufferException { // Builds a project entity fixture with fixed metadata; fundingProgram is only interpolated into the funding tree string.
+ return OafEntity
+ .newBuilder()
+ .setType(Type.project)
+ .setId(projectId)
+ .addCollectedfrom(getKV("opendoar_1234", "UK pubmed"))
+ .setProject(
+ Project.newBuilder()
+ .setMetadata(
+ Project.Metadata
+ .newBuilder()
+ .setAcronym(sf("5CYRQOL"))
+ .setTitle(sf("Cypriot Researchers Contribute to our Quality of Life"))
+ .setStartdate(sf("2007-05-01"))
+ .setEnddate(sf("2007-10-31"))
+ .setEcsc39(sf("false"))
+ .setContracttype(getQualifier("CSA", "ec:FP7contractTypes"))
+ .addFundingtree(
+ sf("ec__________::ECECEuropean Commissionec__________::EC::FP7::" // NOTE(review): the funding tree text looks like markup with its tags stripped - confirm against the original file
+ + fundingProgram
+ + "::PEOPLEMarie-Curie ActionsPEOPLEec:programec__________::EC::FP7::"
+ + fundingProgram
+ + ""
+ + fundingProgram
+ + "-People"
+ + fundingProgram
+ + "ec:specificprogramec__________::EC::FP7SEVENTH FRAMEWORK PROGRAMMEFP7ec:frameworkprogram"))))
+ .build();
+ }
+
+ public static OafEntity getProjectWT() throws InvalidProtocolBufferException { // Builds a fixed project entity fixture with id "project|wt::087536" and two funding trees.
+ return OafEntity
+ .newBuilder()
+ .setType(Type.project)
+ .setId("project|wt::087536")
+ .addCollectedfrom(getKV("wellcomeTrust", "wellcome trust"))
+ .setProject(
+ Project.newBuilder()
+ .setMetadata(
+ Project.Metadata
+ .newBuilder()
+ .setAcronym(sf("UNKNOWN"))
+ .setTitle(sf("Research Institute for Infectious Diseases of Poverty (IIDP)."))
+ .setStartdate(sf("2007-05-01"))
+ .setEnddate(sf("2007-10-31"))
+ .setEcsc39(sf("false"))
+ .setContracttype(getQualifier("UNKNOWN", "wt:contractTypes"))
+ .addFundingtree(
+ sf("wt__________::WTWTWellcome Trustwt__________::WT::UNKNOWNUNKNOWNUNKNOWNwt:fundingStream")) // NOTE(review): the funding tree text looks like markup with its tags stripped - confirm against the original file
+ .addFundingtree(
+ sf("wt__________::WTWTWellcome Trustwt__________::WT::Technology TransferTechnology TransferTechnology Transferwt:fundingStream"))))
+ .build();
+ }
+
+    /** Builds an {@link ExtraInfo} message whose fields are set one to one from the arguments. */
+    public static ExtraInfo extraInfo(final String name, final String provenance, final String trust, final String typology, final String value) {
+        return ExtraInfo.newBuilder()
+                .setName(name)
+                .setProvenance(provenance)
+                .setTrust(trust)
+                .setTypology(typology)
+                .setValue(value)
+                .build();
+    }
+
+ // public static DocumentClasses documentClasses() {
+ // DocumentClasses.Builder builder = DocumentClasses.newBuilder();
+ // for (int i = 0; i < RandomUtils.nextInt(N_DOCUMENT_CLASSES) + 1; i++) {
+ // builder.addArXivClasses(getDocumentClass()).addDdcClasses(getDocumentClass()).addWosClasses(getDocumentClass())
+ // .addMeshEuroPMCClasses(getDocumentClass());
+ // }
+ // return builder.build();
+ // }
+ //
+ // private static DocumentClass getDocumentClass() {
+ // DocumentClass.Builder builder = DocumentClass.newBuilder();
+ // for (int i = 0; i < RandomUtils.nextInt(N_DOCUMENT_CLASS_LABELS) + 1; i++) {
+ // builder.addClassLabels("test_class_" + i);
+ // }
+ // return builder.setConfidenceLevel(0.5F).build();
+ // }
+ //
+ // public static DocumentStatistics documentStatistics() {
+ // return
+ // DocumentStatistics.newBuilder().setCitationsFromAllPapers(basicCitationStatistics()).setCitationsFromPublishedPapers(basicCitationStatistics())
+ // .build();
+ // }
+ //
+ // private static BasicCitationStatistics basicCitationStatistics() {
+ // BasicCitationStatistics.Builder builder = BasicCitationStatistics.newBuilder();
+ // for (int i = 0; i < N_CITATION_STATS; i++) {
+ // builder.addNumberOfCitationsPerYear(statisticsKeyValue());
+ // builder.setNumberOfCitations(RandomUtils.nextInt(5) + 1);
+ // }
+ // return builder.build();
+ // }
+ //
+ // private static StatisticsKeyValue statisticsKeyValue() {
+ // return StatisticsKeyValue.newBuilder().setKey((RandomUtils.nextInt(30) + 1980) + "").setValue(RandomUtils.nextInt(5) + 1).build();
+ // }
+ //
+ // public static AuthorStatistics authorStatistics() {
+ // AuthorStatistics.Builder builder = AuthorStatistics.newBuilder();
+ // builder.setCore(commonCoreStatistics());
+ // for (int i = 0; i < N_COAUTHORS; i++) {
+ // builder.addCoAuthors(coAuthor());
+ // }
+ // return builder.build();
+ // }
+ //
+ // private static CoAuthor coAuthor() {
+ // CoAuthor.Builder builder = CoAuthor.newBuilder();
+ // builder.setId("30|od______2345::" + Hashing.md5(RandomStringUtils.random(10)));
+ // builder.setCoauthoredPapersCount(RandomUtils.nextInt(5) + 1);
+ // return builder.build();
+ // }
+ //
+ // public static CommonCoreStatistics commonCoreStatistics() {
+ // CommonCoreStatistics.Builder builder = CommonCoreStatistics.newBuilder();
+ //
+ // builder.setAllPapers(coreStatistics());
+ // builder.setPublishedPapers(coreStatistics());
+ //
+ // return builder.build();
+ // }
+ //
+ // private static CoreStatistics coreStatistics() {
+ // CoreStatistics.Builder builder = CoreStatistics.newBuilder();
+ //
+ // builder.setNumberOfPapers(RandomUtils.nextInt(10));
+ // builder.setCitationsFromAllPapers(extendedStatistics());
+ // builder.setCitationsFromPublishedPapers(extendedStatistics());
+ //
+ // return builder.build();
+ // }
+ //
+ // private static ExtendedStatistics extendedStatistics() {
+ // ExtendedStatistics.Builder builder = ExtendedStatistics.newBuilder();
+ //
+ // builder.setBasic(basicCitationStatistics());
+ // builder.setAverageNumberOfCitationsPerPaper(RandomUtils.nextFloat());
+ // for (int i = 0; i < N_CITATION_STATS; i++) {
+ // builder.addNumberOfPapersCitedAtLeastXTimes(statisticsKeyValue());
+ // }
+ //
+ // return builder.build();
+ // }
+
+    /** Wraps a string into a {@link StringField} without data info. */
+    public static StringField sf(final String s) {
+        return sf(s, null);
+    }
+
+    /**
+     * Wraps a string into a {@link StringField}.
+     *
+     * @param s        the value of the field
+     * @param dataInfo optional data info; left unset when null
+     */
+    public static StringField sf(final String s, final DataInfo dataInfo) {
+        final StringField.Builder field = StringField.newBuilder().setValue(s);
+        return (dataInfo == null ? field : field.setDataInfo(dataInfo)).build();
+    }
+
+    /**
+     * Wraps an entity or a relation into an Oaf envelope and decodes it.
+     * The envelope gets the current time as last update timestamp and a data info with trust "0.5".
+     *
+     * @param msg                the payload: cast to OafEntity when kind is entity, to OafRel when kind is relation
+     * @param kind               kind of the payload; for any other kind no payload is set
+     * @param deletedByInference stored in the data info
+     * @param inferred           stored in the data info
+     * @param provenance         inference provenance stored in the data info
+     * @param action             used both as class and as scheme of the provenance action qualifier
+     * @return the decoder obtained from the built envelope
+     */
+    public static OafDecoder embed(final GeneratedMessage msg,
+            final Kind kind,
+            final boolean deletedByInference,
+            final boolean inferred,
+            final String provenance,
+            final String action) {
+
+        final Oaf.Builder envelope = Oaf.newBuilder().setKind(kind).setLastupdatetimestamp(System.currentTimeMillis());
+
+        final DataInfo.Builder info = DataInfo.newBuilder()
+                .setDeletedbyinference(deletedByInference)
+                .setInferred(inferred)
+                .setTrust("0.5")
+                .setInferenceprovenance(provenance)
+                .setProvenanceaction(getQualifier(action, action));
+        envelope.setDataInfo(info);
+
+        if (kind == Kind.entity) {
+            envelope.setEntity((OafEntity) msg);
+        } else if (kind == Kind.relation) {
+            envelope.setRel((OafRel) msg);
+        }
+
+        return OafDecoder.decode(envelope.build());
+    }
+
+    /** Wraps the payload with default flags: not deleted by inference, not inferred, placeholder provenance values. */
+    public static OafDecoder embed(final GeneratedMessage msg, final Kind kind) {
+        return embed(msg, kind, false, false, "inference_provenance", "provenance_action");
+    }
+
+}
diff --git a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/clustering/BlacklistAwareClusteringCombinerTest.java b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/clustering/BlacklistAwareClusteringCombinerTest.java
new file mode 100644
index 0000000..c9fa084
--- /dev/null
+++ b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/clustering/BlacklistAwareClusteringCombinerTest.java
@@ -0,0 +1,42 @@
+package eu.dnetlib.pace.clustering;
+
+import eu.dnetlib.pace.AbstractProtoPaceTest;
+import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner;
+import eu.dnetlib.pace.config.Config;
+import eu.dnetlib.pace.config.Type;
+import eu.dnetlib.pace.model.FieldListImpl;
+import eu.dnetlib.pace.model.FieldValueImpl;
+import eu.dnetlib.pace.model.MapDocument;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.junit.Before;
+import org.junit.Test;
+
+public class BlacklistAwareClusteringCombinerTest extends AbstractProtoPaceTest {
+ // Smoke test: logs the output of BlacklistAwareClusteringCombiner.filterAndCombine; nothing is asserted.
+ private static final Log log = LogFactory.getLog(BlacklistAwareClusteringCombinerTest.class);
+
+ private Config config;
+
+ @Before
+ public void setUp() {
+ config = getResultFullConf();
+ }
+
+ @Test
+ public void testCombine() {
+ final MapDocument result =
+ result(config, "A", "Dipping in Cygnus X-2 in a multi-wavelength campaign due to absorption of extended ADC emission", "2013");
+ final FieldListImpl fl = new FieldListImpl();
+ fl.add(new FieldValueImpl(Type.String, "desc", "hello world description pipeline"));
+ // Register fl under "desc" in the document's field map.
+ result.getFieldMap().put("desc", fl);
+ // NOTE(review): fl is the instance just stored under "desc"; clearing and refilling it presumably changes that entry too — confirm the aliasing is intended.
+ fl.clear();
+ fl.add(new FieldValueImpl(Type.String, "title", "lorem ipsum cabalie qwerty"));
+ final FieldListImpl field = (FieldListImpl) result.getFieldMap().get("title");
+ field.add(fl);
+ // The combiner output is only logged.
+ log.info(BlacklistAwareClusteringCombiner.filterAndCombine(result, config));
+ }
+}
diff --git a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/clustering/ClusteringCombinerTest.java b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/clustering/ClusteringCombinerTest.java
new file mode 100644
index 0000000..125bf63
--- /dev/null
+++ b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/clustering/ClusteringCombinerTest.java
@@ -0,0 +1,39 @@
+package eu.dnetlib.pace.clustering;
+
+import eu.dnetlib.pace.AbstractProtoPaceTest;
+import eu.dnetlib.pace.clustering.ClusteringCombiner;
+import eu.dnetlib.pace.config.Config;
+import eu.dnetlib.pace.config.Type;
+import eu.dnetlib.pace.model.FieldListImpl;
+import eu.dnetlib.pace.model.FieldValueImpl;
+import eu.dnetlib.pace.model.MapDocument;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.junit.Before;
+import org.junit.Test;
+
+public class ClusteringCombinerTest extends AbstractProtoPaceTest {
+ // Smoke test: logs the output of ClusteringCombiner.combine for one document; nothing is asserted.
+ private static final Log log = LogFactory.getLog(ClusteringCombinerTest.class);
+
+ private Config config;
+
+ @Before
+ public void setUp() {
+ config = getResultFullConf();
+ }
+
+ @Test
+ public void testCombine() {
+ String title = "Dipping in Cygnus X-2 in a multi-wavelength campaign due to absorption of extended ADC emission";
+ MapDocument result = result(config, "A", title, "2013");
+ // Add a "desc" field list to the document before combining.
+ FieldListImpl fl = new FieldListImpl();
+ fl.add(new FieldValueImpl(Type.String, "desc", "lorem ipsum cabalie qwerty"));
+
+ result.getFieldMap().put("desc", fl);
+ log.info(title);
+ log.info(ClusteringCombiner.combine(result, config));
+ }
+
+}
diff --git a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/distance/DetectorTest.java b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/distance/DetectorTest.java
new file mode 100644
index 0000000..e2d3ad7
--- /dev/null
+++ b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/distance/DetectorTest.java
@@ -0,0 +1,405 @@
+package eu.dnetlib.pace.distance;
+
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+import com.google.common.collect.Sets;
+import com.googlecode.protobuf.format.JsonFormat;
+import eu.dnetlib.data.proto.OafProtos;
+import eu.dnetlib.pace.AbstractProtoPaceTest;
+import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner;
+import eu.dnetlib.pace.config.Config;
+import eu.dnetlib.pace.config.DedupConfig;
+import eu.dnetlib.pace.distance.eval.ScoreResult;
+import eu.dnetlib.pace.model.MapDocument;
+import eu.dnetlib.pace.model.ProtoDocumentBuilder;
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.junit.Ignore;
+import org.junit.Test;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+public class DetectorTest extends AbstractProtoPaceTest {
+
+ private static final Log log = LogFactory.getLog(DetectorTest.class);
+
+ @Test
+ public void testDistanceResultSimple() {
+ final Config config = getResultSimpleConf();
+ // Two results with different ids and the same title; no date argument is passed.
+ final MapDocument resA = result(config, "A", "Recent results from CDF");
+ final MapDocument resB = result(config, "B", "Recent results from CDF");
+
+ final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
+ final double d = sr.getScore();
+ log.info(String.format(" d ---> %s", d));
+ // Identical titles are expected to score exactly 1.0.
+ assertTrue(d == 1.0);
+ }
+
+ @Test
+ public void testDistanceResultSimpleMissingDates() {
+ final Config config = getResultSimpleConf();
+ // NOTE(review): both documents are created with id "A", unlike the sibling tests that use "A" and "B" — confirm this is intended.
+ final MapDocument resA = result(config, "A", "Recent results from BES");
+ final MapDocument resB = result(config, "A", "Recent results from CES");
+ // Titles differ by a single character and no date is passed: a score above 0.97 is expected.
+ final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
+ final double d = sr.getScore();
+ log.info(String.format(" d ---> %s", d));
+
+ assertTrue(d > 0.97);
+ }
+
+ @Test
+ public void testDistanceResultInvalidDate() {
+ final Config config = getResultConf();
+ // Same title; resA has a well-formed date while resB has the non-date string "qwerty".
+ final MapDocument resA = result(config, "A", "title title title 6BESR", "2013-01-05");
+ final MapDocument resB = result(config, "B", "title title title 6BESR", "qwerty");
+
+ final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
+ final double d = sr.getScore();
+ log.info(String.format(" d ---> %s", d));
+ // The pair is still expected to score exactly 1.0.
+ assertTrue(d == 1.0);
+ }
+
+ @Ignore
+ @Test
+ public void testDistanceResultMissingOneDate() {
+ final Config config = getResultConf();
+ // Disabled via @Ignore. resA has a null date, resB has "2012-02"; titles differ in the last token.
+ final MapDocument resA = result(config, "A", "title title title 6BESR", null);
+ final MapDocument resB = result(config, "B", "title title title 6CLER", "2012-02");
+
+ final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
+ double d = sr.getScore();
+ log.info(String.format(" d ---> %s", d));
+ // Expected: a high score that is strictly below 1.0.
+ assertTrue((d > 0.9) && (d < 1.0));
+ }
+
+ @Ignore
+ @Test
+ public void testDistanceResult() {
+ final Config config = getResultConf();
+ // Disabled via @Ignore. resA is given an empty date string, resB no date argument; titles differ in the last token.
+ final MapDocument resA = result(config, "A", "title title title BES", "");
+ final MapDocument resB = result(config, "B", "title title title CLEO");
+
+ final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
+ double d = sr.getScore();
+ log.info(String.format(" d ---> %s", d));
+ // Expected: a high score that is strictly below 1.0.
+ assertTrue((d > 0.9) && (d < 1.0));
+ }
+
+ @Ignore
+ @Test
+ public void testDistanceResultMissingTwoDate() {
+ final Config config = getResultConf();
+ // Disabled via @Ignore. Neither result is given a date argument; titles differ in the last token.
+ final MapDocument resA = result(config, "A", "title title title 6BESR");
+ final MapDocument resB = result(config, "B", "title title title 6CLER");
+
+ final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
+ double d = sr.getScore();
+ log.info(String.format(" d ---> %s", d));
+ // Expected: a high score that is strictly below 1.0.
+ assertTrue((d > 0.9) && (d < 1.0));
+ }
+
+ @Ignore
+ @Test
+ public void testDistanceOrganizationIgnoreMissing() {
+ // Disabled via @Ignore. Same organization name on both sides; only orgB is given the extra "CNR" argument.
+ final Config config = getOrganizationSimpleConf();
+
+ final MapDocument orgA = organization(config, "A", "CONSIGLIO NAZIONALE DELLE RICERCHE");
+ final MapDocument orgB = organization(config, "B", "CONSIGLIO NAZIONALE DELLE RICERCHE", "CNR");
+
+ final ScoreResult sr = new PaceDocumentDistance().between(orgA, orgB, config);
+ final double d = sr.getScore();
+ log.info(String.format(" d ---> %s", d));
+ // Expected: a score above 0.99 despite the argument present on one side only.
+ assertTrue(d > 0.99);
+ }
+
+ @Test
+ public void testDistanceResultCase1() {
+ // Titles differ by one word ("for") and by letter case; the year is the same.
+ final Config config = getResultConf();
+
+ final MapDocument resA = result(config, "A", "Search the Standard Model Higgs boson", "2003");
+ final MapDocument resB = result(config, "B", "Search for the Standard Model Higgs Boson", "2003");
+
+ final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
+ double d = sr.getScore();
+ log.info(String.format(" d ---> %s", d));
+ // Expected: a high score that is strictly below 1.0.
+ assertTrue((d > 0.9) && (d < 1.0));
+ }
+
+ @Test
+ public void testDistanceResultCaseDoiMatch1() {
+ final Config config = getResultConf();
+ // Same DOI argument and year; titles differ only in letter case ("boson" / "Boson").
+ final MapDocument resA = result(config, "A", "Search the Standard Model Higgs boson", "2003", "10.1594/PANGAEA.726855");
+ final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", "10.1594/PANGAEA.726855");
+
+ final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
+ double d = sr.getScore();
+ log.info(String.format(" d ---> %s", d));
+
+ assertTrue("exact DOIs will produce an exact match", d == 1.0);
+ }
+
+ @Test
+ public void testDistanceResultCaseDoiMatch2() {
+ final Config config = getResultConf();
+ // Same DOI argument, but different titles and different years.
+ final MapDocument resA = result(config, "A", "Conference proceedings on X. Appendix", "2003", "10.1594/PANGAEA.726855");
+ final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2005", "10.1594/PANGAEA.726855");
+
+ final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
+ double d = sr.getScore();
+ log.info(String.format(" d ---> %s", d));
+
+ assertTrue("exact DOIs will produce an exact match, regardless of different titles or publication years", d == 1.0);
+ }
+
+ @Test
+ public void testDistanceResultCaseDoiMatch3() {
+ final Config config = getResultConf();
+ // Same title and year; only resA is given a DOI argument.
+ final MapDocument resA = result(config, "A", "Conference proceedings on X. Appendix", "2003", "10.1016/j.jmb.2010.12.024");
+ final MapDocument resB = result(config, "B", "Conference proceedings on X. Appendix", "2003");
+
+ final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
+ double d = sr.getScore();
+ log.info(String.format(" d ---> %s", d));
+
+ assertTrue("a missing DOI will casue the comparsion to continue with the following conditions", d == 1.0);
+ }
+
+ @Test
+ public void testDistanceResultCaseDoiMatch4() {
+ final Config config = getResultConf();
+ // Same title but different years; only resA is given a DOI argument.
+ final MapDocument resA = result(config, "A", "Conference proceedings on X. Appendix", "2003", "10.1016/j.jmb.2010.12.024");
+ final MapDocument resB = result(config, "B", "Conference proceedings on X. Appendix", "2005");
+
+ final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
+ double d = sr.getScore();
+ log.info(String.format(" d ---> %s", d));
+
+ assertTrue("a missing DOI, comparsion continues with the following conditions, different publication years will drop the score to 0", d == 0.0);
+ }
+
+ @Test
+ public void testDistanceResultCaseDoiMatch5() {
+ // Titles differ by one word ("for"), same year; only resA is given a DOI argument.
+ final Config config = getResultConf();
+
+ final MapDocument resA = result(config, "A", "Search for the Standard Model Higgs Boson", "2003", "10.1016/j.jmb.2010.12.020");
+ final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003");
+
+ final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
+ double d = sr.getScore();
+ log.info(String.format(" d ---> %s", d));
+
+ assertTrue("a missing DOI, comparsion continues with the following conditions", (d > 0.9) && (d < 1.0));
+ }
+
+ @Test
+ public void testDistanceResultCaseDoiMatch6() {
+ final Config config = getResultConf();
+ // Same title and year; the two DOI arguments are different strings.
+ final MapDocument resA = result(config, "A", "Conference proceedings on X. Appendix", "2003", "10.1016/j.jmb.2010.12.024");
+ final MapDocument resB = result(config, "B", "Conference proceedings on X. Appendix", "2003", "anotherDifferentDOI");
+
+ final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
+ double d = sr.getScore();
+ log.info(String.format(" d ---> %s", d));
+
+ assertTrue("different DOIs will NOT drop the score to 0, then evaluate other fields", d == 1.0);
+ }
+
+ @Test
+ public void testDistanceResultCaseDoiMatch7() {
+ final Config config = getResultConf();
+ // resA carries two identifiers, one of which is resB's only identifier; the titles differ and the year is the same.
+ final MapDocument resA = result(config, "A", "Adrenal Insufficiency asd asd", "1951", Lists.newArrayList("PMC2037944", "axdsds"));
+ final MapDocument resB = result(config, "B", "Adrenal Insufficiency", "1951", "PMC2037944");
+
+ final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
+ double d = sr.getScore();
+ log.info(String.format(" d ---> %s", d));
+ // The message now states what is actually asserted; && replaces the non-short-circuit & on the two boolean operands.
+ assertTrue("a shared identifier with different titles is expected to score between 0.9 and 1.0 (exclusive)", d > 0.9 && d < 1);
+ }
+
+ // http://dx.doi.org/10.1594/PANGAEA.726855 doi:10.1594/PANGAEA.726855
+
+ @Test
+ public void testDistanceResultCaseAuthor1() {
+ // Same title and year, empty pid list; the author lists have different sizes (4 vs 3).
+ final Config config = getResultAuthorsConf();
+
+ final List authorsA = Lists.newArrayList("a", "b", "c", "d");
+ final List authorsB = Lists.newArrayList("a", "b", "c");
+ final List pid = Lists.newArrayList();
+
+ final MapDocument resA = result(config, "A", "Search the Standard Model Higgs Boson", "2003", pid, authorsA);
+ final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", pid, authorsB);
+
+ final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
+ final double d = sr.getScore();
+ log.info(String.format(" d ---> %s", d));
+ // Expected: a score of exactly 0.0.
+ assertTrue(d == 0.0);
+ }
+
+ @Test
+ public void testDistanceResultCaseAuthor2() {
+ // Same title, year and author lists, empty pid list.
+ final Config config = getResultAuthorsConf();
+
+ final List authorsA = Lists.newArrayList("a", "b", "c");
+ final List authorsB = Lists.newArrayList("a", "b", "c");
+ final List pid = Lists.newArrayList();
+
+ final MapDocument resA = result(config, "A", "Search the Standard Model Higgs Boson", "2003", pid, authorsA);
+ final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", pid, authorsB);
+
+ final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
+ final double d = sr.getScore();
+ log.info(String.format(" d ---> %s", d));
+ // Expected: a score of exactly 1.0.
+ assertTrue(d == 1.0);
+ }
+
+ @Test
+ public void testDistanceResultCaseAuthor3() {
+ // Same title and year; the same three authors are written as "Surname, I." on one side and "Surname Name" on the other.
+ final Config config = getResultAuthorsConf();
+
+ final List authorsA = Lists.newArrayList("Bardi, A.", "Manghi, P.", "Artini, M.");
+ final List authorsB = Lists.newArrayList("Bardi Alessia", "Manghi Paolo", "Artini Michele");
+ final List pid = Lists.newArrayList();
+
+ final MapDocument resA = result(config, "A", "Search the Standard Model Higgs Boson", "2003", pid, authorsA);
+ final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", pid, authorsB);
+
+ final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
+ double d = sr.getScore();
+ log.info(String.format(" d ---> %s", d));
+ // Expected: a high score that is strictly below 1.0.
+ assertTrue((d > 0.9) && (d < 1.0));
+ }
+
+ @Test
+ public void testDistanceResultCaseAuthor4() {
+ // Same title and year; authorsA has one extra entry ("a") and uses "Surname, Name" formatting.
+ final Config config = getResultAuthorsConf();
+
+ final List authorsA = Lists.newArrayList("Bardi, Alessia", "Manghi, Paolo", "Artini, Michele", "a");
+ final List authorsB = Lists.newArrayList("Bardi Alessia", "Manghi Paolo", "Artini Michele");
+ final List pid = Lists.newArrayList();
+
+ final MapDocument resA = result(config, "A", "Search the Standard Model Higgs Boson", "2003", pid, authorsA);
+ final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", pid, authorsB);
+
+ final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
+ final double d = sr.getScore();
+ log.info(String.format(" d ---> %s", d));
+ // NOTE(review): no active assertion; the one below is commented out and calls getScore() on d, which is a double.
+ // assertTrue(d.getScore() == 0.0);
+ }
+
+ @Test
+ public void testDistanceResultFullConf() {
+ // Same title, year and DOI argument; the author names are the same words in a different order.
+ final Config config = getResultFullConf();
+
+ final List authorsA = Lists.newArrayList("Nagarajan Pranesh", "Guy Vautier", "Punyanganie de Silva");
+ final List authorsB = Lists.newArrayList("Pranesh Nagarajan", "Vautier Guy", "de Silva Punyanganie");
+
+ final MapDocument resA =
+ result(config, "A", "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reports", "2010",
+ "10.1186/1752-1947-4-299", authorsA);
+
+ final MapDocument resB =
+ result(config, "B", "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reports", "2010",
+ "10.1186/1752-1947-4-299", authorsB);
+
+ final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
+ final double d = sr.getScore();
+ log.info(String.format(" d ---> %s", d));
+ // NOTE(review): no active assertion; the one below is commented out and calls getScore() on d, which is a double.
+ // assertTrue(d.getScore() == 0.0);
+ }
+
+ @Ignore
+ @Test
+ public void testDistance() throws IOException {
+ // Disabled via @Ignore. Scores the crossref and alicante fixtures against each other and logs the result; nothing is asserted.
+ final DedupConfig conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.prod.pace.json"));
+ // asMapDocument parses its argument as JSON text, so each fixture is read from the classpath first (as testDistanceOrgs does).
+ final MapDocument crossref = asMapDocument(conf, readFromClasspath("/eu/dnetlib/pace/crossref.json"));
+ final MapDocument alicante = asMapDocument(conf, readFromClasspath("/eu/dnetlib/pace/alicante.json"));
+
+ final ScoreResult result = new PaceDocumentDistance().between(crossref, alicante, conf);
+
+ log.info("score = " + result);
+
+ }
+
+ @Ignore
+ @Test
+ public void testDistanceOrgs() throws IOException {
+ // Disabled via @Ignore. Checks that the two organization fixtures share at least one grouping key, then logs their score.
+ final DedupConfig conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/organization.pace.conf"));
+
+ final MapDocument orgA = asMapDocument(conf, readFromClasspath("/eu/dnetlib/pace/organization1.json"));
+ final MapDocument orgB = asMapDocument(conf, readFromClasspath("/eu/dnetlib/pace/organization2.json"));
+
+ Set keysA = getGroupingKeys(conf, orgA);
+ Set keysB = getGroupingKeys(conf, orgB);
+ // Fails, printing both key sets, when the two sets have no key in common.
+ assertFalse(String.format("A: %s\nB: %s", keysA, keysB), Sets.intersection(keysA, keysB).isEmpty());
+
+ log.info("clustering keys A = " + getGroupingKeys(conf, orgA));
+ log.info("clustering keys B = " + getGroupingKeys(conf, orgB));
+
+ final ScoreResult result = new PaceDocumentDistance().between(orgA, orgB, conf);
+
+ log.info("score = " + result);
+ log.info("distance = " + result.getScore());
+ }
+ // Collects what BlacklistAwareClusteringCombiner.filterAndCombine returns for doc into a hash set.
+ private Set getGroupingKeys(DedupConfig conf, MapDocument doc) {
+ return Sets.newHashSet(BlacklistAwareClusteringCombiner.filterAndCombine(doc, conf));
+ }
+ // Merges the given JSON text into an OafEntity builder and builds a MapDocument from it with the model taken from conf; a JSON parse failure is rethrown as IllegalArgumentException.
+ private MapDocument asMapDocument(DedupConfig conf, final String json) {
+ OafProtos.OafEntity.Builder b = OafProtos.OafEntity.newBuilder();
+ try {
+ JsonFormat.merge(json, b);
+ } catch (JsonFormat.ParseException e) {
+ throw new IllegalArgumentException(e);
+ }
+ return ProtoDocumentBuilder.newInstance(b.getId(), b.build(), conf.getPace().getModel());
+ }
+
+
+}
diff --git a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/model/ProtoDocumentBuilderTest.java b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/model/ProtoDocumentBuilderTest.java
new file mode 100644
index 0000000..56ddc2c
--- /dev/null
+++ b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/model/ProtoDocumentBuilderTest.java
@@ -0,0 +1,50 @@
+package eu.dnetlib.pace.model;
+
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Sets;
+import com.google.common.collect.Sets.SetView;
+import eu.dnetlib.pace.AbstractProtoPaceTest;
+import eu.dnetlib.pace.config.Config;
+import eu.dnetlib.pace.distance.DetectorTest;
+import eu.dnetlib.pace.model.MapDocument;
+import eu.dnetlib.pace.model.MapDocumentSerializer;
+import eu.dnetlib.pace.model.ProtoDocumentBuilder;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.junit.Test;
+
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+public class ProtoDocumentBuilderTest extends AbstractProtoPaceTest {
+ // Round-trip check: build a MapDocument, serialise it to a string, decode it back and compare field names.
+ private static final Log log = LogFactory.getLog(ProtoDocumentBuilderTest.class);
+
+ @Test
+ public void test_serialise1() {
+
+ final String id = "12345";
+
+ final Config config = getResultFullConf();
+
+ final MapDocument document = ProtoDocumentBuilder.newInstance(id, getResult(id), config.model());
+ // The freshly built document must expose at least one field name and one field.
+ assertFalse(document.fieldNames().isEmpty());
+ assertFalse(Iterables.isEmpty(document.fields()));
+
+ log.info("original:\n" + document);
+
+ final String stringDoc = MapDocumentSerializer.toString(document);
+
+ log.info("srialization:\n" + stringDoc);
+ // NOTE(review): getBytes() without a charset uses the platform default — confirm it matches what MapDocumentSerializer.decode expects.
+ final MapDocument decoded = MapDocumentSerializer.decode(stringDoc.getBytes());
+ // One-directional check: field names of the original that are missing from the decoded document.
+ final SetView diff = Sets.difference(document.fieldNames(), decoded.fieldNames());
+
+ assertTrue(diff.isEmpty());
+
+ log.info("decoded:\n" + decoded);
+ }
+
+}
diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/alicante.json b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/alicante.json
new file mode 100644
index 0000000..be5ec28
--- /dev/null
+++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/alicante.json
@@ -0,0 +1,121 @@
+{
+ "dateoftransformation": "2018-08-07T06:48:42.668Z",
+ "originalId": [
+ "oai:rua.ua.es:10045/34236"
+ ],
+ "oaiprovenance": {
+ "originDescription": {
+ "metadataNamespace": "http://www.openarchives.org/OAI/2.0/oai_dc/",
+ "altered": true,
+ "baseURL": "http://rua.ua.es/dspace-oai/request",
+ "datestamp": "2016-04-28T11:28:35Z",
+ "harvestDate": "2018-06-14T13:53:42.185Z",
+ "identifier": "oai:rua.ua.es:10045/34236"
+ }
+ },
+ "result": {
+ "instance": [
+ {
+ "hostedby": {
+ "value": "Repositorio Institucional de la Universidad de Alicante",
+ "key": "10|opendoar____::e820a45f1dfc7b95282d10b6087e11c0"
+ },
+ "url": [
+ "http://hdl.handle.net/10045/34236"
+ ],
+ "dateofacceptance": {
+ "value": "2013-11-27"
+ },
+ "collectedfrom": {
+ "value": "Repositorio Institucional de la Universidad de Alicante",
+ "key": "10|opendoar____::e820a45f1dfc7b95282d10b6087e11c0"
+ },
+ "accessright": {
+ "classid": "OPEN",
+ "classname": "Open Access",
+ "schemename": "dnet:access_modes",
+ "schemeid": "dnet:access_modes"
+ },
+ "instancetype": {
+ "classid": "0010",
+ "classname": "Lecture",
+ "schemename": "dnet:publication_resource",
+ "schemeid": "dnet:publication_resource"
+ }
+ }
+ ],
+ "metadata": {
+ "language": {
+ "classid": "eng",
+ "classname": "English",
+ "schemename": "dnet:languages",
+ "schemeid": "dnet:languages"
+ },
+ "title": [
+ {
+ "qualifier": {
+ "classid": "main title",
+ "classname": "main title",
+ "schemename": "dnet:dataCite_title",
+ "schemeid": "dnet:dataCite_title"
+ },
+ "value": "Henry James (1843-1916)"
+ }
+ ],
+ "journal": {
+ "name": ""
+ },
+ "author": [
+ {
+ "fullname": "Gómez Reus, Teresa",
+ "surname": "Gómez Reus",
+ "name": "Teresa",
+ "rank": 1
+ }
+ ],
+ "resulttype": {
+ "classid": "other",
+ "classname": "other",
+ "schemename": "dnet:result_typologies",
+ "schemeid": "dnet:result_typologies"
+ },
+ "dateofacceptance": {
+ "value": "2013-11-27"
+ },
+ "contributor": [
+ {
+ "value": "Universidad de Alicante. Departamento de Filología Inglesa"
+ }
+ ],
+ "subject": [
+ {
+ "qualifier": {
+ "classid": "keyword",
+ "classname": "keyword",
+ "schemename": "dnet:result_subject",
+ "schemeid": "dnet:result_subject"
+ },
+ "value": "James, Henry"
+ },
+ {
+ "qualifier": {
+ "classid": "keyword",
+ "classname": "keyword",
+ "schemename": "dnet:result_subject",
+ "schemeid": "dnet:result_subject"
+ },
+ "value": "Filología Inglesa"
+ }
+ ]
+ }
+ },
+ "collectedfrom": [
+ {
+ "value": "Repositorio Institucional de la Universidad de Alicante",
+ "key": "10|opendoar____::e820a45f1dfc7b95282d10b6087e11c0"
+ }
+ ],
+ "dateofcollection": "2018-06-14T13:53:42.185Z",
+ "type": 50,
+ "id": "50|od_______935::2b908ad38030168759c568f49af50784"
+}
diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/crossref.json b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/crossref.json
new file mode 100644
index 0000000..669e394
--- /dev/null
+++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/crossref.json
@@ -0,0 +1,78 @@
+{
+ "pid": [
+ {
+ "qualifier": {
+ "classid": "doi",
+ "classname": "doi",
+ "schemename": "dnet:pid_types",
+ "schemeid": "dnet:pid_types"
+ },
+ "value": "10.1002/9781444393675.ch6"
+ }
+ ],
+ "result": {
+ "instance": [
+ {
+ "url": [
+ "http://dx.doi.org/10.1002/9781444393675.ch6"
+ ],
+ "collectedfrom": {
+ "value": "CrossRef",
+ "key": "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2"
+ },
+ "hostedby": {
+ "value": "Unknown Repository",
+ "key": "10|openaire____::55045bd2a65019fd8e6741a755395c8c"
+ },
+ "accessright": {
+ "classid": "CLOSED",
+ "classname": "Closed Access",
+ "schemename": "dnet:access_modes",
+ "schemeid": "dnet:access_modes"
+ },
+ "instancetype": {
+ "classid": "0013",
+ "classname": "Part of book or chapter of book",
+ "schemename": "dnet:publication_resource",
+ "schemeid": "dnet:publication_resource"
+ }
+ }
+ ],
+ "metadata": {
+ "title": [
+ {
+ "qualifier": {
+ "classid": "main title",
+ "classname": "main title",
+ "schemename": "dnet:dataCite_title",
+ "schemeid": "dnet:dataCite_title"
+ },
+ "value": "Henry James (1843-1916)"
+ }
+ ],
+ "resulttype": {
+ "classid": "publication",
+ "classname": "publication",
+ "schemename": "dnet:result_typologies",
+ "schemeid": "dnet:result_typologies"
+ }
+ }
+ },
+ "collectedfrom": [
+ {
+ "value": "Microsoft Academic Graph",
+ "key": "10|openaire____::5f532a3fc4f1ea403f37070f59a7a53a"
+ },
+ {
+ "value": "CrossRef",
+ "key": "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2"
+ },
+ {
+ "value": "UnpayWall",
+ "key": "10|openaire____::8ac8380272269217cb09a928c8caa993"
+ }
+ ],
+ "dateofcollection": "2018-08-07 12:24:48Z",
+ "type": 50,
+ "id": "50|crossref____::0000002a9885b7ec89b7b9d8ff3331a0"
+}
diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/organization.pace.conf b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/organization.pace.conf
new file mode 100644
index 0000000..0dcfe51
--- /dev/null
+++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/organization.pace.conf
@@ -0,0 +1,34 @@
+{
+ "wf" : {
+ "threshold" : "0.85",
+ "dedupRun" : "001",
+ "entityType" : "organization",
+ "orderField" : "legalname",
+ "queueMaxSize" : "20000",
+ "groupMaxSize" : "20",
+ "slidingWindowSize" : "400",
+ "rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ],
+ "includeChildren" : "true"
+ },
+ "pace" : {
+ "clustering" : [
+ { "name" : "ngrampairs", "fields" : [ "legalname" ], "params" : { "max" : 1, "ngramLen" : "3" } },
+ { "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } },
+ { "name" : "immutablefieldvalue", "fields" : [ "country" ], "params" : { } },
+ { "name" : "spacetrimmingfieldvalue", "fields" : [ "legalshortname" ], "params" : { "randomLength" : "5" } },
+ { "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } }
+ ],
+ "conditions" : [
+ { "name" : "exactMatch", "fields" : [ "country" ] },
+ { "name" : "mustBeDifferent", "fields" : [ "gridid" ] }
+ ],
+ "model" : [
+ { "name" : "legalname", "algo" : "LevensteinTitle", "type" : "String", "weight" : "0.2", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value" },
+ { "name" : "legalshortname", "algo" : "LevensteinTitle", "type" : "String", "weight" : "0.2", "ignoreMissing" : "true", "path" : "organization/metadata/legalshortname/value" },
+ { "name" : "websiteurl", "algo" : "urlMatcher", "type" : "URL", "weight" : "0.6", "ignoreMissing" : "true", "path" : "organization/metadata/websiteurl/value", "params" : { "host" : 0.5, "path" : 0.5 } },
+ { "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "organization/metadata/country/classid" },
+ { "name" : "gridid", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {grid}]/value" }
+ ],
+ "blacklists" : { }
+ }
+}
\ No newline at end of file
diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/organization1.json b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/organization1.json
new file mode 100644
index 0000000..80bbaa3
--- /dev/null
+++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/organization1.json
@@ -0,0 +1,34 @@
+{
+ "dateoftransformation": "2018-06-04",
+ "originalId": [
+ "opendoar____::Institute_of_Information_Science_and_Technology_\"A._Faedo\""
+ ],
+ "collectedfrom": [
+ {
+ "value": "OpenDOAR",
+ "key": "10|openaire____::47ce9e9f4fad46e732cff06419ecaabb"
+ }
+ ],
+ "organization": {
+ "metadata": {
+ "legalshortname": {
+ "value": "CNR-ISTI"
+ },
+ "websiteurl": {
+ "value": "http://www.isti.cnr.it/aaaaa"
+ },
+ "country": {
+ "classid": "IT",
+ "classname": "IT",
+ "schemename": "dnet:countries",
+ "schemeid": "dnet:countries"
+ },
+ "legalname": {
+ "value": "Institute of Information Science and Technology \"A. Faedo\""
+ }
+ }
+ },
+ "dateofcollection": "2015-08-24",
+ "type": 20,
+ "id": "20|opendoar____::68d8b122736484cb07f75885af22e82f"
+}
\ No newline at end of file
diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/organization2.json b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/organization2.json
new file mode 100644
index 0000000..dd91c26
--- /dev/null
+++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/organization2.json
@@ -0,0 +1,48 @@
+{
+ "collectedfrom": [
+ {
+ "value": "GRID - Global Research Identifier Database",
+ "key": "10|openaire____::ff4a008470319a22d9cf3d14af485977"
+ }
+ ],
+ "organization": {
+ "metadata": {
+ "legalshortname": {
+ "value": "ISTI"
+ },
+ "websiteurl": {
+ "value": "http://www.isti.cnr.it/aaaaaa"
+ },
+ "country": {
+ "classid": "IT",
+ "classname": "Italy",
+ "schemename": "dnet:countries",
+ "schemeid": "dnet:countries"
+ },
+ "alternativeNames": [
+ {
+ "value": "Istituto di Scienza e Tecnologie dell'Informazione \"A. Faedo\""
+ },
+ {
+ "value": "ISTI"
+ }
+ ],
+ "legalname": {
+ "value": "CNR - Institute of Information Science and Technologies"
+ }
+ }
+ },
+ "pid": [
+ {
+ "qualifier": {
+ "classid": "grid",
+ "classname": "grid",
+ "schemename": "dnet:pid_types",
+ "schemeid": "dnet:pid_types"
+ },
+ "value": "grid.451498.5"
+ }
+ ],
+ "type": 20,
+ "id": "20|grid________::e4095563f4e9d34dff7d47fb98af042f"
+}
\ No newline at end of file
diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.authors.pace.conf b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.authors.pace.conf
new file mode 100644
index 0000000..ee39fc0
--- /dev/null
+++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.authors.pace.conf
@@ -0,0 +1,25 @@
+{
+ "wf" : {
+ "threshold" : "0.99",
+ "run" : "001",
+ "entityType" : "result",
+ "orderField" : "title",
+ "queueMaxSize" : "2000",
+ "groupMaxSize" : "10",
+ "slidingWindowSize" : "200",
+ "rootBuilder" : [ "result" ],
+ "includeChildren" : "true"
+ },
+ "pace" : {
+ "conditions" : [
+ { "name" : "sizeMatch", "fields" : [ "authors" ] },
+ { "name" : "titleVersionMatch", "fields" : [ "title" ] }
+ ],
+ "model" : [
+ { "name" : "title", "algo" : "JaroWinkler", "type" : "String", "weight" : "0.5", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" },
+ { "name" : "authors", "algo" : "SortedLevel2JaroWinkler", "type" : "String", "weight" : "0.5", "ignoreMissing" : "true", "path" : "result/metadata/author/fullname" }
+ ],
+ "blacklists" : { }
+ }
+
+}
diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.full.pace.conf b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.full.pace.conf
new file mode 100644
index 0000000..80a5458
--- /dev/null
+++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.full.pace.conf
@@ -0,0 +1,51 @@
+{
+ "wf" : {
+ "threshold" : "0.99",
+ "run" : "001",
+ "entityType" : "result",
+ "orderField" : "title",
+ "queueMaxSize" : "2000",
+ "groupMaxSize" : "10",
+ "slidingWindowSize" : "200",
+ "rootBuilder" : [ "result" ],
+ "includeChildren" : "true"
+ },
+ "pace" : {
+ "clustering" : [
+ { "name" : "acronyms", "fields" : [ "title" ], "params" : { "max" : "1", "minLen" : "2", "maxLen" : "4"} },
+ { "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
+ { "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } }
+ ],
+ "conditions" : [
+ { "name" : "yearMatch", "fields" : [ "dateofacceptance" ] },
+ { "name" : "titleVersionMatch", "fields" : [ "title" ] },
+ { "name" : "sizeMatch", "fields" : [ "authors" ] } ,
+ { "name" : "pidMatch", "fields" : [ "pid" ] }
+ ],
+ "model" : [
+ { "name" : "pid", "algo" : "Null", "type" : "JSON", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid", "overrideMatch" : "true" },
+ { "name" : "title", "algo" : "JaroWinkler", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" },
+ { "name" : "dateofacceptance", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/metadata/dateofacceptance/value" } ,
+ { "name" : "authors", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/metadata/author/fullname" }
+ ],
+ "blacklists" : {
+ "title" : [
+ "^(Corpus Oral Dialectal \\(COD\\)\\.).*$",
+ "^(Kiri Karl Morgensternile).*$",
+ "^(\\[Eksliibris Aleksandr).*\\]$",
+ "^(\\[Eksliibris Aleksandr).*$",
+ "^(Eksliibris Aleksandr).*$",
+ "^(Kiri A\\. de Vignolles).*$",
+ "^(2 kirja Karl Morgensternile).*$",
+ "^(Pirita kloostri idaosa arheoloogilised).*$",
+ "^(Kiri tundmatule).*$",
+ "^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$",
+ "^(Eksliibris Nikolai Birukovile).*$",
+ "^(Eksliibris Nikolai Issakovile).*$",
+ "^(WHP Cruise Summary Information of section).*$",
+ "^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$",
+ "^(Measurement of the spin\\-dependent structure function).*"
+ ] }
+ }
+
+}
diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.pace.conf b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.pace.conf
new file mode 100644
index 0000000..86dd27f
--- /dev/null
+++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.pace.conf
@@ -0,0 +1,29 @@
+{
+ "wf" : {
+ "threshold" : "0.99",
+ "run" : "001",
+ "entityType" : "result",
+ "orderField" : "title",
+ "queueMaxSize" : "2000",
+ "groupMaxSize" : "10",
+ "slidingWindowSize" : "200",
+ "rootBuilder" : [ "result" ],
+ "includeChildren" : "true"
+ },
+ "pace" : {
+ "strictConditions" : [
+ { "name" : "pidMatch", "fields" : [ "pid" ] }
+ ],
+ "conditions" : [
+ { "name" : "yearMatch", "fields" : [ "dateofacceptance" ] },
+ { "name" : "titleVersionMatch", "fields" : [ "title" ] }
+ ],
+ "model" : [
+ { "name" : "pid", "algo" : "Null", "type" : "JSON", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid", "overrideMatch" : "true" },
+ { "name" : "title", "algo" : "JaroWinkler", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" },
+ { "name" : "dateofacceptance", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/metadata/dateofacceptance/value" }
+ ],
+ "blacklists" : { }
+ }
+
+}
diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.prod.pace.conf b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.prod.pace.conf
new file mode 100644
index 0000000..462f79b
--- /dev/null
+++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.prod.pace.conf
@@ -0,0 +1,273 @@
+{
+ "wf" : {
+ "threshold" : "0.99",
+ "dedupRun" : "001",
+ "entityType" : "result",
+ "orderField" : "title",
+ "queueMaxSize" : "4000",
+ "groupMaxSize" : "40",
+ "slidingWindowSize" : "200",
+ "rootBuilder" : [ "result", "personResult_authorship_hasAuthor", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments" ],
+ "includeChildren" : "true",
+ "maxChildren" : "40"
+ },
+ "pace" : {
+ "clustering" : [
+ { "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
+ { "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } },
+ { "name" : "lowercase", "fields" : [ "doi" ], "params" : { } }
+ ],
+ "strictConditions" : [
+ { "name" : "pidMatch", "fields" : [ "pid" ] }
+ ],
+ "conditions" : [
+ { "name" : "titleVersionMatch", "fields" : [ "title" ] },
+ { "name" : "sizeMatch", "fields" : [ "authors" ] }
+ ],
+ "model" : [
+ { "name" : "doi", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {doi}]/value" },
+ { "name" : "pid", "algo" : "Null", "type" : "JSON", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid", "overrideMatch" : "true" },
+ { "name" : "title", "algo" : "LevensteinTitle", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" },
+ { "name" : "authors", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/author/metadata/fullname/value" }
+ ],
+ "blacklists" : {
+ "title" : [
+ "^THE ASSOCIATION AND THE GENERAL MEDICAL COUNCIL$",
+ "^Problems with perinatal pathology\\.?$",
+ "(?i)^Cases? of Puerperal Convulsions$",
+ "(?i)^Operative Gyna?ecology$",
+ "(?i)^Mind the gap\\!?\\:?$",
+ "^Chronic fatigue syndrome\\.?$",
+ "^Cartas? ao editor Letters? to the Editor$",
+ "^Note from the Editor$",
+ "^Anesthesia Abstract$",
+
+ "^Annual report$",
+ "(?i)^“?THE RADICAL PREVENTION OF VENEREAL DISEASE\\.?”?$",
+ "(?i)^Graph and Table of Infectious Diseases?$",
+ "^Presentation$",
+ "(?i)^Reviews and Information on Publications$",
+ "(?i)^PUBLIC HEALTH SERVICES?$",
+ "(?i)^COMBINED TEXT-?BOOK OF OBSTETRICS AND GYN(Æ|ae)COLOGY$",
+ "(?i)^Adrese autora$",
+ "(?i)^Systematic Part .*\\. Catalogus Fossilium Austriae, Band 2: Echinoidea neogenica$",
+ "(?i)^Acknowledgement to Referees$",
+ "(?i)^Behçet's disease\\.?$",
+ "(?i)^Isolation and identification of restriction endonuclease.*$",
+ "(?i)^CEREBROVASCULAR DISEASES?.?$",
+ "(?i)^Screening for abdominal aortic aneurysms?\\.?$",
+ "^Event management$",
+ "(?i)^Breakfast and Crohn's disease.*\\.?$",
+ "^Cálculo de concentraciones en disoluciones acuosas. Ejercicio interactivo\\..*\\.$",
+ "(?i)^Genetic and functional analyses of SHANK2 mutations suggest a multiple hit model of Autism spectrum disorders?\\.?$",
+ "^Gushi hakubutsugaku$",
+
+ "^Starobosanski nadpisi u Bosni i Hercegovini \\(.*\\)$",
+ "^Intestinal spirocha?etosis$",
+ "^Treatment of Rodent Ulcer$",
+ "(?i)^\\W*Cloud Computing\\W*$",
+ "^Compendio mathematico : en que se contienen todas las materias mas principales de las Ciencias que tratan de la cantidad$",
+ "^Free Communications, Poster Presentations: Session [A-F]$",
+
+ "^“The Historical Aspects? of Quackery\\.?”$",
+ "^A designated centre for people with disabilities operated by St John of God Community Services (Limited|Ltd), Louth$",
+ "^P(er|re)-Mile Premiums for Auto Insurance\\.?$",
+ "(?i)^Case Report$",
+ "^Boletín Informativo$",
+ "(?i)^Glioblastoma Multiforme$",
+ "(?i)^Nuevos táxones animales descritos en la península Ibérica y Macaronesia desde 1994 \\(.*\\)$",
+ "^Zaměstnanecké výhody$",
+ "(?i)^The Economics of Terrorism and Counter-Terrorism: A Survey \\(Part .*\\)$",
+ "(?i)^Carotid body tumours?\\.?$",
+ "(?i)^\\[Españoles en Francia : La condición Emigrante.*\\]$",
+ "^Avant-propos$",
+ "(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Head(s)? and Capital(s)?$",
+ "(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Bases?$",
+ "(?i)^PUBLIC HEALTH VERSUS THE STATE$",
+ "^Viñetas de Cortázar$",
+ "(?i)^Search for heavy neutrinos and W(\\[|_|\\(|_\\{|-)?R(\\]|\\)|\\})? bosons with right-handed couplings in a left-right symmetric model in pp collisions at.*TeV(\\.)?$",
+ "(?i)^Measurement of the pseudorapidity and centrality dependence of the transverse energy density in Pb(-?)Pb collisions at.*tev(\\.?)$",
+ "(?i)^Search for resonances decaying into top-quark pairs using fully hadronic decays in pp collisions with ATLAS at.*TeV$",
+ "(?i)^Search for neutral minimal supersymmetric standard model Higgs bosons decaying to tau pairs in pp collisions at.*tev$",
+
+ "(?i)^Relatório de Estágio (de|em) Angiologia e Cirurgia Vascular$",
+ "^Aus der AGMB$",
+
+ "^Znanstveno-stručni prilozi$",
+ "(?i)^Zhodnocení finanční situace podniku a návrhy na zlepšení$",
+ "(?i)^Evaluation of the Financial Situation in the Firm and Proposals to its Improvement$",
+ "(?i)^Hodnocení finanční situace podniku a návrhy na její zlepšení$",
+ "^Finanční analýza podniku$",
+ "^Financial analysis( of business)?$",
+ "(?i)^Textbook of Gyn(a)?(Æ)?(e)?cology$",
+ "^Jikken nihon shūshinsho$",
+ "(?i)^CORONER('|s)(s|') INQUESTS$",
+ "(?i)^(Μελέτη παραγόντων )?risk management( για ανάπτυξη και εφαρμογή ενός πληροφοριακού συστήματος| και ανάπτυξη συστήματος)?$",
+ "(?i)^Consultants' contract(s)?$",
+ "(?i)^Upute autorima$",
+ "(?i)^Bijdrage tot de Kennis van den Godsdienst der Dajaks van Lan(d|f)ak en Tajan$",
+ "^Joshi shin kokubun$",
+ "^Kōtō shōgaku dokuhon nōson'yō$",
+ "^Jinjō shōgaku shōka$",
+ "^Shōgaku shūjichō$",
+ "^Nihon joshi dokuhon$",
+ "^Joshi shin dokuhon$",
+ "^Chūtō kanbun dokuhon$",
+ "^Wabun dokuhon$",
+ "(?i)^(Analysis of economy selected village or town|Rozbor hospodaření vybrané obce či města)$",
+ "(?i)^cardiac rehabilitation$",
+ "(?i)^Analytical summary$",
+ "^Thesaurus resolutionum Sacrae Congregationis Concilii$",
+ "(?i)^Sumario analítico(\\s{1})?(Analitic summary)?$",
+ "^Prikazi i osvrti$",
+ "^Rodinný dům s provozovnou$",
+ "^Family house with an establishment$",
+ "^Shinsei chūtō shin kokugun$",
+ "^Pulmonary alveolar proteinosis(\\.?)$",
+ "^Shinshū kanbun$",
+ "^Viñeta(s?) de Rodríguez$",
+ "(?i)^RUBRIKA UREDNIKA$",
+ "^A Matching Model of the Academic Publication Market$",
+ "^Yōgaku kōyō$",
+
+ "^Internetový marketing$",
+ "^Internet marketing$",
+ "^Chūtō kokugo dokuhon$",
+ "^Kokugo dokuhon$",
+ "^Antibiotic Cover for Dental Extraction(s?)$",
+ "^Strategie podniku$",
+ "^Strategy of an Enterprise$",
+ "(?i)^respiratory disease(s?)(\\.?)$",
+ "^Award(s?) for Gallantry in Civil Defence$",
+ "^Podniková kultura$",
+ "^Corporate Culture$",
+ "^Severe hyponatraemia in hospital inpatient(s?)(\\.?)$",
+ "^Pracovní motivace$",
+ "^Work Motivation$",
+ "^Kaitei kōtō jogaku dokuhon$",
+ "^Konsolidovaná účetní závěrka$",
+ "^Consolidated Financial Statements$",
+ "(?i)^intracranial tumour(s?)$",
+ "^Climate Change Mitigation Options and Directed Technical Change: A Decentralized Equilibrium Analysis$",
+ "^\\[CERVECERIAS MAHOU(\\.|\\:) INTERIOR\\] \\[Material gráfico\\]$",
+ "^Housing Market Dynamics(\\:|\\.) On the Contribution of Income Shocks and Credit Constraint(s?)$",
+ "^\\[Funciones auxiliares de la música en Radio París,.*\\]$",
+ "^Úroveň motivačního procesu jako způsobu vedení lidí$",
+ "^The level of motivation process as a leadership$",
+ "^Pay-beds in N(\\.?)H(\\.?)S(\\.?) Hospitals$",
+ "(?i)^news and events$",
+ "(?i)^NOVOSTI I DOGAĐAJI$",
+ "^Sansū no gakushū$",
+ "^Posouzení informačního systému firmy a návrh změn$",
+ "^Information System Assessment and Proposal for ICT Modification$",
+ "^Stresové zatížení pracovníků ve vybrané profesi$",
+ "^Stress load in a specific job$",
+
+ "^Sunday: Poster Sessions, Pt.*$",
+ "^Monday: Poster Sessions, Pt.*$",
+ "^Wednesday: Poster Sessions, Pt.*",
+ "^Tuesday: Poster Sessions, Pt.*$",
+
+ "^Analýza reklamy$",
+ "^Analysis of advertising$",
+
+ "^Shōgaku shūshinsho$",
+ "^Shōgaku sansū$",
+ "^Shintei joshi kokubun$",
+ "^Taishō joshi kokubun dokuhon$",
+ "^Joshi kokubun$",
+
+ "^Účetní uzávěrka a účetní závěrka v ČR$",
+ "(?i)^The \"?Causes\"? of Cancer$",
+ "^Normas para la publicación de artículos$",
+ "^Editor('|s)(s|') [Rr]eply$",
+ "^Editor(’|s)(s|’) letter$",
+ "^Redaktoriaus žodis$",
+ "^DISCUSSION ON THE PRECEDING PAPER$",
+ "^Kōtō shōgaku shūshinsho jidōyō$",
+ "^Shōgaku nihon rekishi$",
+ "^(Theory of the flow of action currents in isolated myelinated nerve fibers).*$",
+ "^Préface$",
+ "^Occupational [Hh]ealth [Ss]ervices.$",
+ "^In Memoriam Professor Toshiyuki TAKESHIMA$",
+ "^Účetní závěrka ve vybraném podniku.*$",
+ "^Financial statements in selected company$",
+ "^Abdominal [Aa]ortic [Aa]neurysms.*$",
+ "^Pseudomyxoma peritonei$",
+ "^Kazalo autora$",
+
+ "(?i)^uvodna riječ$",
+ "^Motivace jako způsob vedení lidí$",
+ "^Motivation as a leadership$",
+ "^Polyfunkční dům$",
+ "^Multi\\-funkcional building$",
+ "^Podnikatelský plán$",
+ "(?i)^Podnikatelský záměr$",
+ "(?i)^Business Plan$",
+ "^Oceňování nemovitostí$",
+ "^Marketingová komunikace$",
+ "^Marketing communication$",
+ "^Sumario Analítico$",
+ "^Riječ uredništva$",
+ "^Savjetovanja i priredbe$",
+ "^Índice$",
+ "^(Starobosanski nadpisi).*$",
+ "^Vzdělávání pracovníků v organizaci$",
+ "^Staff training in organization$",
+ "^(Life Histories of North American Geometridae).*$",
+ "^Strategická analýza podniku$",
+ "^Strategic Analysis of an Enterprise$",
+ "^Sadržaj$",
+ "^Upute suradnicima$",
+ "^Rodinný dům$",
+ "(?i)^Fami(l)?ly house$",
+ "^Upute autorima$",
+ "^Strategic Analysis$",
+ "^Finanční analýza vybraného podniku$",
+ "^Finanční analýza$",
+ "^Riječ urednika$",
+ "(?i)^Content(s?)$",
+ "(?i)^Inhalt$",
+ "^Jinjō shōgaku shūshinsho jidōyō$",
+ "(?i)^Index$",
+ "^Chūgaku kokubun kyōkasho$",
+ "^Retrato de una mujer$",
+ "^Retrato de un hombre$",
+ "^Kōtō shōgaku dokuhon$",
+ "^Shotōka kokugo$",
+ "^Shōgaku dokuhon$",
+ "^Jinjō shōgaku kokugo dokuhon$",
+ "^Shinsei kokugo dokuhon$",
+ "^Teikoku dokuhon$",
+ "^Instructions to Authors$",
+ "^KİTAP TAHLİLİ$",
+ "^PRZEGLĄD PIŚMIENNICTWA$",
+ "(?i)^Presentación$",
+ "^İçindekiler$",
+ "(?i)^Tabl?e of contents$",
+ "^(CODICE DEL BEATO DE LOS REYES FERNANDO I Y SANCHA).*$",
+ "^(\\[MADRID\\. BIBL\\. NAC\\. N.*KING FERDINAND I.*FROM SAN ISIDORO DE LEON\\. FACUNDUS SCRIPSIT DATED.*\\]).*",
+ "^Editorial( Board)?$",
+ "(?i)^Editorial \\(English\\)$",
+ "^Editörden$",
+ "^(Corpus Oral Dialectal \\(COD\\)\\.).*$",
+ "^(Kiri Karl Morgensternile).*$",
+ "^(\\[Eksliibris Aleksandr).*\\]$",
+ "^(\\[Eksliibris Aleksandr).*$",
+ "^(Eksliibris Aleksandr).*$",
+ "^(Kiri A\\. de Vignolles).*$",
+ "^(2 kirja Karl Morgensternile).*$",
+ "^(Pirita kloostri idaosa arheoloogilised).*$",
+ "^(Kiri tundmatule).*$",
+ "^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$",
+ "^(Eksliibris Nikolai Birukovile).*$",
+ "^(Eksliibris Nikolai Issakovile).*$",
+ "^(WHP Cruise Summary Information of section).*$",
+ "^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$",
+ "^(Measurement of the spin\\-dependent structure function).*",
+ "(?i)^.*authors['’′]? reply\\.?$",
+ "(?i)^.*authors['’′]? response\\.?$"
+ ]
+ }
+ }
+}
diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.prod.pace.json b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.prod.pace.json
new file mode 100644
index 0000000..4e99d6d
--- /dev/null
+++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.prod.pace.json
@@ -0,0 +1,275 @@
+{
+ "wf" : {
+ "threshold" : "0.99",
+ "dedupRun" : "001",
+ "entityType" : "result",
+ "orderField" : "title",
+ "queueMaxSize" : "4000",
+ "groupMaxSize" : "40",
+ "slidingWindowSize" : "200",
+ "rootBuilder" : [ "result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments" ],
+ "includeChildren" : "true",
+ "maxChildren" : "40"
+ },
+ "pace" : {
+ "clustering" : [
+ { "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
+ { "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } },
+ { "name" : "lowercase", "fields" : [ "doi" ], "params" : { } }
+ ],
+ "strictConditions" : [
+ { "name" : "pidMatch", "fields" : [ "pid" ] }
+ ],
+ "conditions" : [
+ { "name" : "titleVersionMatch", "fields" : [ "title" ] },
+ { "name" : "sizeMatch", "fields" : [ "authors" ] }
+ ],
+ "model" : [
+ { "name" : "doi", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {doi}]/value" },
+ { "name" : "pid", "algo" : "Null", "type" : "JSON", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid", "overrideMatch" : "true" },
+ { "name" : "title", "algo" : "LevensteinTitle", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" },
+ { "name" : "authors", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/metadata/author/fullname" }
+ ],
+ "blacklists" : {
+ "title" : [
+ "^Inside Front Cover$",
+ "(?i)^Poster presentations$",
+ "^THE ASSOCIATION AND THE GENERAL MEDICAL COUNCIL$",
+ "^Problems with perinatal pathology\\.?$",
+ "(?i)^Cases? of Puerperal Convulsions$",
+ "(?i)^Operative Gyna?ecology$",
+ "(?i)^Mind the gap\\!?\\:?$",
+ "^Chronic fatigue syndrome\\.?$",
+ "^Cartas? ao editor Letters? to the Editor$",
+ "^Note from the Editor$",
+ "^Anesthesia Abstract$",
+
+ "^Annual report$",
+ "(?i)^“?THE RADICAL PREVENTION OF VENEREAL DISEASE\\.?”?$",
+ "(?i)^Graph and Table of Infectious Diseases?$",
+ "^Presentation$",
+ "(?i)^Reviews and Information on Publications$",
+ "(?i)^PUBLIC HEALTH SERVICES?$",
+ "(?i)^COMBINED TEXT-?BOOK OF OBSTETRICS AND GYN(Æ|ae)COLOGY$",
+ "(?i)^Adrese autora$",
+ "(?i)^Systematic Part .*\\. Catalogus Fossilium Austriae, Band 2: Echinoidea neogenica$",
+ "(?i)^Acknowledgement to Referees$",
+ "(?i)^Behçet's disease\\.?$",
+ "(?i)^Isolation and identification of restriction endonuclease.*$",
+ "(?i)^CEREBROVASCULAR DISEASES?.?$",
+ "(?i)^Screening for abdominal aortic aneurysms?\\.?$",
+ "^Event management$",
+ "(?i)^Breakfast and Crohn's disease.*\\.?$",
+ "^Cálculo de concentraciones en disoluciones acuosas. Ejercicio interactivo\\..*\\.$",
+ "(?i)^Genetic and functional analyses of SHANK2 mutations suggest a multiple hit model of Autism spectrum disorders?\\.?$",
+ "^Gushi hakubutsugaku$",
+
+ "^Starobosanski nadpisi u Bosni i Hercegovini \\(.*\\)$",
+ "^Intestinal spirocha?etosis$",
+ "^Treatment of Rodent Ulcer$",
+ "(?i)^\\W*Cloud Computing\\W*$",
+ "^Compendio mathematico : en que se contienen todas las materias mas principales de las Ciencias que tratan de la cantidad$",
+ "^Free Communications, Poster Presentations: Session [A-F]$",
+
+ "^“The Historical Aspects? of Quackery\\.?”$",
+ "^A designated centre for people with disabilities operated by St John of God Community Services (Limited|Ltd), Louth$",
+ "^P(er|re)-Mile Premiums for Auto Insurance\\.?$",
+ "(?i)^Case Report$",
+ "^Boletín Informativo$",
+ "(?i)^Glioblastoma Multiforme$",
+ "(?i)^Nuevos táxones animales descritos en la península Ibérica y Macaronesia desde 1994 \\(.*\\)$",
+ "^Zaměstnanecké výhody$",
+ "(?i)^The Economics of Terrorism and Counter-Terrorism: A Survey \\(Part .*\\)$",
+ "(?i)^Carotid body tumours?\\.?$",
+ "(?i)^\\[Españoles en Francia : La condición Emigrante.*\\]$",
+ "^Avant-propos$",
+ "(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Head(s)? and Capital(s)?$",
+ "(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Bases?$",
+ "(?i)^PUBLIC HEALTH VERSUS THE STATE$",
+ "^Viñetas de Cortázar$",
+ "(?i)^Search for heavy neutrinos and W(\\[|_|\\(|_\\{|-)?R(\\]|\\)|\\})? bosons with right-handed couplings in a left-right symmetric model in pp collisions at.*TeV(\\.)?$",
+ "(?i)^Measurement of the pseudorapidity and centrality dependence of the transverse energy density in Pb(-?)Pb collisions at.*tev(\\.?)$",
+ "(?i)^Search for resonances decaying into top-quark pairs using fully hadronic decays in pp collisions with ATLAS at.*TeV$",
+ "(?i)^Search for neutral minimal supersymmetric standard model Higgs bosons decaying to tau pairs in pp collisions at.*tev$",
+
+ "(?i)^Relatório de Estágio (de|em) Angiologia e Cirurgia Vascular$",
+ "^Aus der AGMB$",
+
+ "^Znanstveno-stručni prilozi$",
+ "(?i)^Zhodnocení finanční situace podniku a návrhy na zlepšení$",
+ "(?i)^Evaluation of the Financial Situation in the Firm and Proposals to its Improvement$",
+ "(?i)^Hodnocení finanční situace podniku a návrhy na její zlepšení$",
+ "^Finanční analýza podniku$",
+ "^Financial analysis( of business)?$",
+ "(?i)^Textbook of Gyn(a)?(Æ)?(e)?cology$",
+ "^Jikken nihon shūshinsho$",
+ "(?i)^CORONER('|s)(s|') INQUESTS$",
+ "(?i)^(Μελέτη παραγόντων )?risk management( για ανάπτυξη και εφαρμογή ενός πληροφοριακού συστήματος| και ανάπτυξη συστήματος)?$",
+ "(?i)^Consultants' contract(s)?$",
+ "(?i)^Upute autorima$",
+ "(?i)^Bijdrage tot de Kennis van den Godsdienst der Dajaks van Lan(d|f)ak en Tajan$",
+ "^Joshi shin kokubun$",
+ "^Kōtō shōgaku dokuhon nōson'yō$",
+ "^Jinjō shōgaku shōka$",
+ "^Shōgaku shūjichō$",
+ "^Nihon joshi dokuhon$",
+ "^Joshi shin dokuhon$",
+ "^Chūtō kanbun dokuhon$",
+ "^Wabun dokuhon$",
+ "(?i)^(Analysis of economy selected village or town|Rozbor hospodaření vybrané obce či města)$",
+ "(?i)^cardiac rehabilitation$",
+ "(?i)^Analytical summary$",
+ "^Thesaurus resolutionum Sacrae Congregationis Concilii$",
+ "(?i)^Sumario analítico(\\s{1})?(Analitic summary)?$",
+ "^Prikazi i osvrti$",
+ "^Rodinný dům s provozovnou$",
+ "^Family house with an establishment$",
+ "^Shinsei chūtō shin kokugun$",
+ "^Pulmonary alveolar proteinosis(\\.?)$",
+ "^Shinshū kanbun$",
+ "^Viñeta(s?) de Rodríguez$",
+ "(?i)^RUBRIKA UREDNIKA$",
+ "^A Matching Model of the Academic Publication Market$",
+ "^Yōgaku kōyō$",
+
+ "^Internetový marketing$",
+ "^Internet marketing$",
+ "^Chūtō kokugo dokuhon$",
+ "^Kokugo dokuhon$",
+ "^Antibiotic Cover for Dental Extraction(s?)$",
+ "^Strategie podniku$",
+ "^Strategy of an Enterprise$",
+ "(?i)^respiratory disease(s?)(\\.?)$",
+ "^Award(s?) for Gallantry in Civil Defence$",
+ "^Podniková kultura$",
+ "^Corporate Culture$",
+ "^Severe hyponatraemia in hospital inpatient(s?)(\\.?)$",
+ "^Pracovní motivace$",
+ "^Work Motivation$",
+ "^Kaitei kōtō jogaku dokuhon$",
+ "^Konsolidovaná účetní závěrka$",
+ "^Consolidated Financial Statements$",
+ "(?i)^intracranial tumour(s?)$",
+ "^Climate Change Mitigation Options and Directed Technical Change: A Decentralized Equilibrium Analysis$",
+ "^\\[CERVECERIAS MAHOU(\\.|\\:) INTERIOR\\] \\[Material gráfico\\]$",
+ "^Housing Market Dynamics(\\:|\\.) On the Contribution of Income Shocks and Credit Constraint(s?)$",
+ "^\\[Funciones auxiliares de la música en Radio París,.*\\]$",
+ "^Úroveň motivačního procesu jako způsobu vedení lidí$",
+ "^The level of motivation process as a leadership$",
+ "^Pay-beds in N(\\.?)H(\\.?)S(\\.?) Hospitals$",
+ "(?i)^news and events$",
+ "(?i)^NOVOSTI I DOGAĐAJI$",
+ "^Sansū no gakushū$",
+ "^Posouzení informačního systému firmy a návrh změn$",
+ "^Information System Assessment and Proposal for ICT Modification$",
+ "^Stresové zatížení pracovníků ve vybrané profesi$",
+ "^Stress load in a specific job$",
+
+ "^Sunday: Poster Sessions, Pt.*$",
+ "^Monday: Poster Sessions, Pt.*$",
+ "^Wednesday: Poster Sessions, Pt.*",
+ "^Tuesday: Poster Sessions, Pt.*$",
+
+ "^Analýza reklamy$",
+ "^Analysis of advertising$",
+
+ "^Shōgaku shūshinsho$",
+ "^Shōgaku sansū$",
+ "^Shintei joshi kokubun$",
+ "^Taishō joshi kokubun dokuhon$",
+ "^Joshi kokubun$",
+
+ "^Účetní uzávěrka a účetní závěrka v ČR$",
+ "(?i)^The \"?Causes\"? of Cancer$",
+ "^Normas para la publicación de artículos$",
+ "^Editor('|s)(s|') [Rr]eply$",
+ "^Editor(’|s)(s|’) letter$",
+ "^Redaktoriaus žodis$",
+ "^DISCUSSION ON THE PRECEDING PAPER$",
+ "^Kōtō shōgaku shūshinsho jidōyō$",
+ "^Shōgaku nihon rekishi$",
+ "^(Theory of the flow of action currents in isolated myelinated nerve fibers).*$",
+ "^Préface$",
+ "^Occupational [Hh]ealth [Ss]ervices.$",
+ "^In Memoriam Professor Toshiyuki TAKESHIMA$",
+ "^Účetní závěrka ve vybraném podniku.*$",
+ "^Financial statements in selected company$",
+ "^Abdominal [Aa]ortic [Aa]neurysms.*$",
+ "^Pseudomyxoma peritonei$",
+ "^Kazalo autora$",
+
+ "(?i)^uvodna riječ$",
+ "^Motivace jako způsob vedení lidí$",
+ "^Motivation as a leadership$",
+ "^Polyfunkční dům$",
+ "^Multi\\-funkcional building$",
+ "^Podnikatelský plán$",
+ "(?i)^Podnikatelský záměr$",
+ "(?i)^Business Plan$",
+ "^Oceňování nemovitostí$",
+ "^Marketingová komunikace$",
+ "^Marketing communication$",
+ "^Sumario Analítico$",
+ "^Riječ uredništva$",
+ "^Savjetovanja i priredbe$",
+ "^Índice$",
+ "^(Starobosanski nadpisi).*$",
+ "^Vzdělávání pracovníků v organizaci$",
+ "^Staff training in organization$",
+ "^(Life Histories of North American Geometridae).*$",
+ "^Strategická analýza podniku$",
+ "^Strategic Analysis of an Enterprise$",
+ "^Sadržaj$",
+ "^Upute suradnicima$",
+ "^Rodinný dům$",
+ "(?i)^Fami(l)?ly house$",
+ "^Upute autorima$",
+ "^Strategic Analysis$",
+ "^Finanční analýza vybraného podniku$",
+ "^Finanční analýza$",
+ "^Riječ urednika$",
+ "(?i)^Content(s?)$",
+ "(?i)^Inhalt$",
+ "^Jinjō shōgaku shūshinsho jidōyō$",
+ "(?i)^Index$",
+ "^Chūgaku kokubun kyōkasho$",
+ "^Retrato de una mujer$",
+ "^Retrato de un hombre$",
+ "^Kōtō shōgaku dokuhon$",
+ "^Shotōka kokugo$",
+ "^Shōgaku dokuhon$",
+ "^Jinjō shōgaku kokugo dokuhon$",
+ "^Shinsei kokugo dokuhon$",
+ "^Teikoku dokuhon$",
+ "^Instructions to Authors$",
+ "^KİTAP TAHLİLİ$",
+ "^PRZEGLĄD PIŚMIENNICTWA$",
+ "(?i)^Presentación$",
+ "^İçindekiler$",
+ "(?i)^Tabl?e of contents$",
+ "^(CODICE DEL BEATO DE LOS REYES FERNANDO I Y SANCHA).*$",
+ "^(\\[MADRID\\. BIBL\\. NAC\\. N.*KING FERDINAND I.*FROM SAN ISIDORO DE LEON\\. FACUNDUS SCRIPSIT DATED.*\\]).*",
+ "^Editorial( Board)?$",
+ "(?i)^Editorial \\(English\\)$",
+ "^Editörden$",
+ "^(Corpus Oral Dialectal \\(COD\\)\\.).*$",
+ "^(Kiri Karl Morgensternile).*$",
+ "^(\\[Eksliibris Aleksandr).*\\]$",
+ "^(\\[Eksliibris Aleksandr).*$",
+ "^(Eksliibris Aleksandr).*$",
+ "^(Kiri A\\. de Vignolles).*$",
+ "^(2 kirja Karl Morgensternile).*$",
+ "^(Pirita kloostri idaosa arheoloogilised).*$",
+ "^(Kiri tundmatule).*$",
+ "^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$",
+ "^(Eksliibris Nikolai Birukovile).*$",
+ "^(Eksliibris Nikolai Issakovile).*$",
+ "^(WHP Cruise Summary Information of section).*$",
+ "^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$",
+ "^(Measurement of the spin\\-dependent structure function).*",
+ "(?i)^.*authors['’′]? reply\\.?$",
+ "(?i)^.*authors['’′]? response\\.?$"
+ ]
+ }
+ }
+}
\ No newline at end of file
diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.simple.pace.conf b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.simple.pace.conf
new file mode 100644
index 0000000..910fbcd
--- /dev/null
+++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.simple.pace.conf
@@ -0,0 +1,21 @@
+{
+ "wf" : {
+ "threshold" : "0.99",
+ "run" : "001",
+ "entityType" : "result",
+ "orderField" : "title",
+ "queueMaxSize" : "2000",
+ "groupMaxSize" : "10",
+ "slidingWindowSize" : "200",
+ "rootBuilder" : [ "result" ],
+ "includeChildren" : "true"
+ },
+ "pace" : {
+ "conditions" : [ ],
+ "model" : [
+ { "name" : "title", "algo" : "JaroWinkler", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" }
+ ],
+ "blacklists" : { }
+ }
+
+}
\ No newline at end of file
diff --git a/pom.xml b/pom.xml
index 3cbb5bc..e445041 100644
--- a/pom.xml
+++ b/pom.xml
@@ -109,6 +109,13 @@
dnet-openaire-data-protos
3.9.3-proto250
+
+ eu.dnetlib
+ dnet-openaireplus-mapping-utils
+ 6.2.17-SNAPSHOT
+
+
+
com.google.guava
guava