diff --git a/dnet-dedup-test/pom.xml b/dnet-dedup-test/pom.xml index ff9b378..43a7c2b 100644 --- a/dnet-dedup-test/pom.xml +++ b/dnet-dedup-test/pom.xml @@ -59,6 +59,19 @@ org.apache.spark spark-graphx_2.11 + + + eu.dnetlib + dnet-openaireplus-mapping-utils + test + + + + junit + junit + test + + \ No newline at end of file diff --git a/dnet-dedup-test/src/main/java/eu/dnetlib/graph/GraphProcessor.scala b/dnet-dedup-test/src/main/java/eu/dnetlib/graph/GraphProcessor.scala index 2d34afe..f893570 100644 --- a/dnet-dedup-test/src/main/java/eu/dnetlib/graph/GraphProcessor.scala +++ b/dnet-dedup-test/src/main/java/eu/dnetlib/graph/GraphProcessor.scala @@ -1,24 +1,21 @@ -package eu.dnetlib.graph -import java.lang +package eu.dnetlib.graph import eu.dnetlib.ConnectedComponent import eu.dnetlib.pace.model.MapDocument import org.apache.spark.graphx._ import org.apache.spark.rdd.RDD -import scala.collection.JavaConversions -; - +import scala.collection.JavaConversions; object GraphProcessor { - def findCCs(vertexes: RDD[(VertexId,MapDocument)], edges:RDD[Edge[String]], maxIterations: Int): RDD[ConnectedComponent] = { + def findCCs(vertexes: RDD[(VertexId, MapDocument)], edges: RDD[Edge[String]], maxIterations: Int): RDD[ConnectedComponent] = { val graph: Graph[MapDocument, String] = Graph(vertexes, edges) val cc = graph.connectedComponents(maxIterations).vertices val joinResult = vertexes.leftOuterJoin(cc).map { case (id, (openaireId, cc)) => { - if (cc.isEmpty){ + if (cc.isEmpty) { (id, openaireId) } else { @@ -33,7 +30,7 @@ object GraphProcessor { } - def asConnectedComponent(group: (VertexId, Iterable[MapDocument])) : ConnectedComponent = { + def asConnectedComponent(group: (VertexId, Iterable[MapDocument])): ConnectedComponent = { val docs = group._2.toSet[MapDocument] val connectedComponent = new ConnectedComponent("empty", JavaConversions.setAsJavaSet[MapDocument](docs)); connectedComponent.initializeID(); diff --git 
a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/AbstractProtoPaceTest.java b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/AbstractProtoPaceTest.java
new file mode 100644
index 0000000..961fdd6
--- /dev/null
+++ b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/AbstractProtoPaceTest.java
@@ -0,0 +1,240 @@
+package eu.dnetlib.pace;
+
+import com.google.common.collect.Lists;
+import com.google.gson.Gson;
+import eu.dnetlib.data.proto.FieldTypeProtos.Author;
+import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;
+import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty;
+import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty.Builder;
+import eu.dnetlib.data.proto.OafProtos.Oaf;
+import eu.dnetlib.data.proto.OafProtos.OafEntity;
+import eu.dnetlib.data.proto.OrganizationProtos.Organization;
+import eu.dnetlib.data.proto.ResultProtos.Result;
+import eu.dnetlib.pace.config.Config;
+import eu.dnetlib.pace.config.DedupConfig;
+import eu.dnetlib.pace.config.Type;
+import eu.dnetlib.pace.model.Field;
+import eu.dnetlib.pace.model.FieldValueImpl;
+import eu.dnetlib.pace.model.MapDocument;
+import eu.dnetlib.pace.model.ProtoDocumentBuilder;
+import eu.dnetlib.pace.model.gt.GTAuthor;
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang.RandomStringUtils;
+import org.apache.commons.lang.StringUtils;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.StringWriter;
+import java.util.ArrayList;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
+/**
+ * Base class for PACE deduplication tests: loads the dedup configurations available on the test
+ * classpath and builds {@link MapDocument} fixtures out of protobuf result/organization entities.
+ */
+public abstract class AbstractProtoPaceTest extends OafTest {
+
+    /** Classpath folder containing the PACE configuration profiles. */
+    private static final String CONF_BASE_PATH = "/eu/dnetlib/pace/";
+
+    protected DedupConfig getResultFullConf() {
+        return loadConf("result.full.pace.conf");
+    }
+
+    protected DedupConfig getResultSimpleConf() {
+        return loadConf("result.simple.pace.conf");
+    }
+
+    protected DedupConfig getResultConf() {
+        return loadConf("result.pace.conf");
+    }
+
+    protected DedupConfig getOrganizationSimpleConf() {
+        return loadConf("organization.pace.conf");
+    }
+
+    protected DedupConfig getResultAuthorsConf() {
+        return loadConf("result.authors.pace.conf");
+    }
+
+    protected DedupConfig getResultProdConf() {
+        return loadConf("result.prod.pace.conf");
+    }
+
+    /** Loads the named configuration profile from {@link #CONF_BASE_PATH}. */
+    private DedupConfig loadConf(final String confName) {
+        return DedupConfig.load(readFromClasspath(CONF_BASE_PATH + confName));
+    }
+
+    /** Builds the document for the entity wrapped by {@code oaf}, using the model of {@code conf}. */
+    protected MapDocument author(final Config conf, final String id, final Oaf oaf) {
+        return ProtoDocumentBuilder.newInstance(id, oaf.getEntity(), conf.model());
+    }
+
+    /** Parses the JSON classpath resource at {@code path} into a {@link GTAuthor}. */
+    protected GTAuthor getGTAuthor(final String path) {
+        return new Gson().fromJson(readFromClasspath(path), GTAuthor.class);
+    }
+
+    /**
+     * Reads a classpath resource into a string.
+     *
+     * @param filename resource path, resolved with {@link Class#getResourceAsStream(String)}
+     * @return the resource content, decoded as UTF-8
+     * @throws IllegalArgumentException if the resource does not exist
+     * @throws RuntimeException if reading fails; the I/O failure is preserved as the cause
+     */
+    protected String readFromClasspath(final String filename) {
+        // try-with-resources closes the stream even when the copy fails
+        try (final InputStream in = getClass().getResourceAsStream(filename)) {
+            if (in == null) {
+                // name the missing resource instead of failing with a bare NullPointerException
+                throw new IllegalArgumentException("resource not found on classpath: " + filename);
+            }
+            final StringWriter sw = new StringWriter();
+            // explicit charset: the result must not depend on the platform default encoding
+            IOUtils.copy(in, sw, "UTF-8");
+            return sw.toString();
+        } catch (final IOException e) {
+            throw new RuntimeException("cannot load resource from classpath: " + filename, e);
+        }
+    }
+
+    /** Builds a result document having only a title. */
+    protected MapDocument result(final Config config, final String id, final String title) {
+        return result(config, id, title, null, new ArrayList<>(), null);
+    }
+
+    /** Builds a result document having a title and a date of acceptance. */
+    protected MapDocument result(final Config config, final String id, final String title, final String date) {
+        return result(config, id, title, date, new ArrayList<>(), null);
+    }
+
+    /** Builds a result document having a title, a date of acceptance and a list of PIDs. */
+    protected MapDocument result(final Config config, final String id, final String title, final String date, final List<String> pid) {
+        return result(config, id, title, date, pid, null);
+    }
+
+    /** Builds a result document having a title, a date of acceptance and a single PID. */
+    protected MapDocument result(final Config config, final String id, final String title, final String date, final String pid) {
+        return result(config, id, title, date, pid, null);
+    }
+
+    /** Builds a result document having a title, a date of acceptance, a single PID and authors. */
+    protected MapDocument result(final Config config, final String id, final String title, final String date, final String pid, final List<String> authors) {
+        return result(config, id, title, date, Lists.newArrayList(pid), authors);
+    }
+
+    /**
+     * Builds a result document from the given properties.
+     *
+     * @param config  configuration providing the document model
+     * @param id      identifier of both the entity and the document
+     * @param title   main title; when not blank, a random 10-letter alternative title is added too
+     * @param date    date of acceptance; skipped when blank
+     * @param pid     identifiers, each added with the {@code doi} type; blank values are skipped, may be null
+     * @param authors author names; their position in the list is used as rank, may be null
+     * @return the document built by {@link ProtoDocumentBuilder}
+     */
+    protected MapDocument result(final Config config, final String id, final String title, final String date, final List<String> pid, final List<String> authors) {
+        final Result.Metadata.Builder metadata = Result.Metadata.newBuilder();
+        if (!StringUtils.isBlank(title)) {
+            metadata.addTitle(getStruct(title, getQualifier("main title", "dnet:titles")));
+            metadata.addTitle(getStruct(RandomStringUtils.randomAlphabetic(10), getQualifier("alternative title", "dnet:titles")));
+        }
+        if (!StringUtils.isBlank(date)) {
+            metadata.setDateofacceptance(sf(date));
+        }
+
+        final OafEntity.Builder entity = oafEntity(id, eu.dnetlib.data.proto.TypeProtos.Type.result);
+        final Result.Builder result = Result.newBuilder().setMetadata(metadata);
+
+        if (authors != null) {
+            result.getMetadataBuilder().addAllAuthor(
+                    IntStream.range(0, authors.size())
+                            .mapToObj(i -> author(authors.get(i), i))
+                            .collect(Collectors.toCollection(LinkedList::new)));
+        }
+
+        entity.setResult(result);
+
+        if (pid != null) {
+            for (final String p : pid) {
+                if (!StringUtils.isBlank(p)) {
+                    entity.addPid(sp(p, "doi"));
+                }
+            }
+        }
+
+        return ProtoDocumentBuilder.newInstance(id, entity.build(), config.model());
+    }
+
+    /** Builds an author from the name {@code s}, normalised through {@code Person}, with the given rank. */
+    private Author author(final String s, final int rank) {
+        final eu.dnetlib.pace.model.Person p = new eu.dnetlib.pace.model.Person(s, false);
+        final Author.Builder author = Author.newBuilder();
+        if (p.isAccurate()) {
+            // name and surname are set only when the parsed person is flagged as accurate
+            author.setName(p.getNormalisedFirstName());
+            author.setSurname(p.getNormalisedSurname());
+        }
+        author.setFullname(p.getNormalisedFullname());
+        author.setRank(rank);
+
+        return author.build();
+    }
+
+    /** Creates an entity builder carrying only the given id and type. */
+    private OafEntity.Builder oafEntity(final String id, final eu.dnetlib.data.proto.TypeProtos.Type type) {
+        return OafEntity.newBuilder().setId(id).setType(type);
+    }
+
+    /** Builds an organization document having only a legal name. */
+    protected MapDocument organization(final Config config, final String id, final String legalName) {
+        return organization(config, id, legalName, null);
+    }
+
+    /**
+     * Builds an organization document.
+     *
+     * @param config         configuration providing the document model
+     * @param id             identifier of both the entity and the document
+     * @param legalName      legal name; skipped when null
+     * @param legalShortName legal short name; skipped when null
+     * @return the document built by {@link ProtoDocumentBuilder}
+     */
+    protected MapDocument organization(final Config config, final String id, final String legalName, final String legalShortName) {
+        final Organization.Metadata.Builder metadata = Organization.Metadata.newBuilder();
+        if (legalName != null) {
+            metadata.setLegalname(sf(legalName));
+        }
+        if (legalShortName != null) {
+            metadata.setLegalshortname(sf(legalShortName));
+        }
+
+        // the entity type must match its payload: organization, not result
+        final OafEntity.Builder entity = oafEntity(id, eu.dnetlib.data.proto.TypeProtos.Type.organization);
+        entity.setOrganization(Organization.newBuilder().setMetadata(metadata));
+
+        return ProtoDocumentBuilder.newInstance(id, entity.build(), config.model());
+    }
+
+    /** Builds a PID property whose qualifier uses {@code type} as class and {@code dnet:pid_types} as scheme. */
+    private StructuredProperty sp(final String pid, final String type) {
+        return getStruct(pid, getQualifier(type, "dnet:pid_types")).build();
+    }
+
+    /** Wraps {@code s} in a string field named {@code title}. */
+    protected Field title(final String s) {
+        return new FieldValueImpl(Type.String, "title", s);
+    }
+
+    /** Creates a structured property builder with the given value and qualifier. */
+    protected static Builder getStruct(final String value, final Qualifier.Builder qualifier) {
+        return StructuredProperty.newBuilder().setValue(value).setQualifier(qualifier);
+    }
+
+}
diff --git a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/OafTest.java b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/OafTest.java new file mode
100644 index 0000000..590c416 --- /dev/null +++ b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/OafTest.java @@ -0,0 +1,446 @@ +package eu.dnetlib.pace; + +import com.google.protobuf.GeneratedMessage; +import com.google.protobuf.InvalidProtocolBufferException; +import eu.dnetlib.data.mapreduce.util.OafDecoder; +import eu.dnetlib.data.proto.DatasourceOrganizationProtos.DatasourceOrganization; +import eu.dnetlib.data.proto.DatasourceOrganizationProtos.DatasourceOrganization.Provision; +import eu.dnetlib.data.proto.DatasourceProtos.Datasource; +import eu.dnetlib.data.proto.DedupProtos.Dedup; +import eu.dnetlib.data.proto.FieldTypeProtos.*; +import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty.Builder; +import eu.dnetlib.data.proto.KindProtos.Kind; +import eu.dnetlib.data.proto.OafProtos.Oaf; +import eu.dnetlib.data.proto.OafProtos.OafEntity; +import eu.dnetlib.data.proto.OafProtos.OafRel; +import eu.dnetlib.data.proto.OrganizationOrganizationProtos.OrganizationOrganization; +import eu.dnetlib.data.proto.OrganizationProtos.Organization; +import eu.dnetlib.data.proto.ProjectOrganizationProtos.ProjectOrganization; +import eu.dnetlib.data.proto.ProjectOrganizationProtos.ProjectOrganization.Participation; +import eu.dnetlib.data.proto.ProjectProtos.Project; +import eu.dnetlib.data.proto.RelMetadataProtos.RelMetadata; +import eu.dnetlib.data.proto.RelTypeProtos.RelType; +import eu.dnetlib.data.proto.RelTypeProtos.SubRelType; +import eu.dnetlib.data.proto.ResultProjectProtos.ResultProject; +import eu.dnetlib.data.proto.ResultProjectProtos.ResultProject.Outcome; +import eu.dnetlib.data.proto.ResultProtos.Result; +import eu.dnetlib.data.proto.ResultProtos.Result.Context; +import eu.dnetlib.data.proto.ResultProtos.Result.Instance; +import eu.dnetlib.data.proto.ResultResultProtos.ResultResult; +import eu.dnetlib.data.proto.ResultResultProtos.ResultResult.Similarity; +import eu.dnetlib.data.proto.TypeProtos.Type; + +public class OafTest { + + public static final String 
CITATION_JSON = + "\n \n [10] M. Foret et al., Phys. Rev. B 66, 024204 (2002).\n \n \n [11] B. Ru\175404\264e et al., Phys. Rev. Lett. 90, 095502 (2003).\n \n \n [12] U. Buchenau et al., Phys. Rev. B 34, 5665 (1986).\n \n \n [13] S.N. Taraskin and S.R. Elliott, J. Phys.: Condens. Mat- ter 11, A219 (1999).\n \n \n [14] B. Hehlen et al., Phys. Rev. Lett. 84, 5355 (2000).\n \n \n [15] N.V. Surotsev et al., J. Phys.: Condens. Matter 10, L113 (1998).\n \n \n [16] D.A. Parshin and C. Laermans, Phys. Rev. B 63, 132203 (2001).\n \n \n [17] V.L. Gurevich et al., Phys. Rev. B 67, 094203 (2003).\n \n \n [18] A. Matic et al., Phys. Rev. Lett. 86, 3803 (2001).\n \n \n [19] E. Rat et al., arXiv:cond-mat/0505558, 23 May 2005.\n \n \n [1] R.C. Zeller and R.O. Pohl, Phys. Rev. B 4, 2029 (1971).\n \n \n [20] C.A. Angell, J. Non-Cryst. Solids 131\20023133, 13 (1991).\n \n \n [21] A.P. Sokolov et al., Phys. Rev. Lett. 71, 2062 (1993).\n \n \n [22] T. Matsuo et al., Solid State Ionics 154-155, 759 (2002).\n \n \n [23] V.K. Malinovsky et al., Europhys. Lett. 11, 43 (1990).\n \n \n [24] J. Lor\250osch et al., J. Non-Cryst. Solids 69, 1 (1984).\n \n \n [25] U. Buchenau, Z. Phys. B 58, 181 (1985).\n \n \n [26] A.F. Io\175400e and A.R. Regel, Prog. Semicond. 4, 237 (1960).\n \n \n [27] R. Dell\20031Anna et al., Phys. Rev. Lett. 80, 1236 (1998).\n \n \n [28] D. Fioretto et al., Phys. Rev. E 59, 4470 (1999).\n \n \n [29] U. Buchenau et al., Phys. Rev. Lett. 77, 4035 (1996).\n \n \n [2] M. Rothenfusser et al., Phys. Rev. B 27, 5196 (1983).\n \n \n [30] J. Mattsson et al., J. Phys.: Condens. Matter 15, S1259 (2003).\n \n \n [31] T. Scopigno et al., Phys. Rev. Lett. 92, 025503 (2004).\n \n \n [32] M. Foret et al., Phys. Rev. Lett. 81, 2100 (1998).\n \n \n [33] F. Sette et al., Science 280, 1550 (1998).\n \n \n [34] J. Wuttke et al., Phys. Rev. E 52, 4026 (1995).\n \n \n [35] M.A. Ramos et al., Phys. Rev. Lett. 78, 82 (1997).\n \n \n [36] G. Monaco et al., Phys. Rev. Lett. 
80, 2161 (1998).\n \n \n [37] A. T\250olle, Rep. Prog. Phys. 64, 1473 (2001).\n \n \n [38] As the straight lines do not cross the origin, this does not 2 imply \1623 \21035 \1651 .\n \n \n [39] A. Matic et al., Europhys. Lett. 54, 77 (2001).\n \n \n [3] S. Hunklinger and W. Arnold, in Physical Acoustics, Vol. XII, W.P. Mason and R.N. Thurston Eds. (Academic Press, N.Y. 1976), p. 155.\n \n \n [40] IXS data are usually not available below \1651co, mostly for experimental reasons. E.g., that the rapid onset was not evidenced in vitreous silica [27], is not indicative of its absence but rather of a low qco \21074 1 nm\210221.\n \n \n [41] G. Ruocco et al., Phys. Rev. Lett. 83, 5583 (1999).\n \n \n [42] D. C\1307 iplys et al., J. Physique (Paris) 42, C6-184 (1981).\n \n \n [43] R. Vacher et al., Rev. Sci. Instrum. 51, 288 (1980).\n \n \n [44] R. Vacher et al., arXiv:cond-mat/0505560, 23 May 2005.\n \n \n [45] T.N. Claytor et al., Phys. Rev. B 18, 5842 (1978).\n \n \n [46] M. Arai et al., Physica B 263-264, 268 (1999).\n \n \n [4] R. Vacher et al., J. Non-Cryst. Solids 45, 397 (1981); T.C. Zhu et al., Phys. Rev. B 44, 4281 (1991).\n \n \n [5] J.E. Graebner et al., Phys. Rev. B 34, 5696 (1986).\n \n \n [6] E. Duval and A. Mermet, Phys. Rev. B 58, 8159 (1998).\n \n \n [7] A. Matic et al., Phys. Rev. Lett. 93, 145502 (2004).\n \n \n [8] Often alluded to, e.g. in the Encyclopedia of Materials: Science and Technology, K.H.J. Buschow et al., Eds., Vol. 1 (Elsevier, Oxford, 2001), articles by S.R. Elliott on pp. 171-174 and U. Buchenau on pp. 212-215.\n \n \n [9] E. Rat et al., Phys. Rev. Lett. 
83, 1355 (1999).\n \n"; + + public static final String STATISTICS_JSON = + "[{ \"citationsPerYear\": \"many\", \"anotherCoolStatistic\": \"WoW\", \"nestedStat\": { \"firstNestedStat\" : \"value 1\", \"secondNestedStat\" : \"value 2\"}, \"listingStat\" : [ \"one\", \"two\" ] }]"; + + public static Builder getStructuredproperty(final String value, final String classname, final String schemename) { + return getStructuredproperty(value, classname, schemename, null); + } + + public static Builder getStructuredproperty(final String value, final String classname, final String schemename, final DataInfo dataInfo) { + final Builder sp = StructuredProperty.newBuilder().setValue(value).setQualifier(getQualifier(classname, schemename)); + if (dataInfo != null) { + sp.setDataInfo(dataInfo); + } + return sp; + } + + public static Qualifier.Builder getQualifier(final String classname, final String schemename) { + return Qualifier.newBuilder().setClassid(classname).setClassname(classname).setSchemeid(schemename).setSchemename(schemename); + } + + public static KeyValue getKV(final String id, final String name) { + return KeyValue.newBuilder().setKey(id).setValue(name).build(); + } + + public static OafEntity getDatasource(final String datasourceId) { + return OafEntity + .newBuilder() + .setType(Type.datasource) + .setId(datasourceId) + .setDatasource( + Datasource.newBuilder().setMetadata( + Datasource.Metadata.newBuilder().setOfficialname(sf("officialname")).setEnglishname(sf("englishname")) + .setWebsiteurl(sf("websiteurl")).setContactemail(sf("contactemail")).addAccessinfopackage(sf("accessinforpackage")) + .setNamespaceprefix(sf("namespaceprofix")).setDescription(sf("description")).setOdnumberofitems(sf("numberofitems")) + .setOdnumberofitemsdate(sf("numberofitems date")) + // .addOdsubjects("subjects") + .setOdpolicies(sf("policies")).addOdlanguages(sf("languages")).addOdcontenttypes(sf("contenttypes")) + .setDatasourcetype(getQualifier("type class", "type 
scheme")))).build(); + } + + public static OafEntity getResult(final String id) { + return getResultBuilder(id).build(); + } + + public static OafEntity.Builder getResultBuilder(final String id) { + return OafEntity + .newBuilder() + .setType(Type.result) + .setId(id) + .setResult( + Result.newBuilder() + .setMetadata( + Result.Metadata + .newBuilder() + .addTitle( + getStructuredproperty( + "Analysis of cell viability in intervertebral disc: Effect of endplate permeability on cell population", + "main title", "dnet:result_titles", getDataInfo())) + .addTitle(getStructuredproperty("Another title", "alternative title", "dnet:result_titles", getDataInfo())) + .addSubject(getStructuredproperty("Biophysics", "subject", "dnet:result_sujects")) + .setDateofacceptance(sf("2010-01-01")).addSource(sf("sourceA")).addSource(sf("sourceB")) + .addContext(Context.newBuilder().setId("egi::virtual::970")) + .addContext(Context.newBuilder().setId("egi::classification::natsc::math::applied")) + .addContext(Context.newBuilder().setId("egi::classification::natsc::math")) + .addContext(Context.newBuilder().setId("egi::classification::natsc")) + .addContext(Context.newBuilder().setId("egi::classification")).addContext(Context.newBuilder().setId("egi")) + .addDescription(sf("Responsible for making and maintaining the extracellular matrix ...")) + .addDescription(sf("Another description ...")).setPublisher(sf("ELSEVIER SCI LTD")) + .setResulttype(getQualifier("publication", "dnet:result_types")) + .setLanguage(getQualifier("eng", "dnet:languages"))).addInstance(getInstance("10|od__10", "Uk pubmed")) + .addInstance(getInstance("10|od__10", "arxiv"))) + .addCollectedfrom(getKV("opendoar____::1064", "Oxford University Research Archive")) + .addPid(getStructuredproperty("doi:74293", "doi", "dnet:pids")).addPid(getStructuredproperty("oai:74295", "oai", "dnet:pids")) + .setDateofcollection(""); + } + + public static DataInfo getDataInfo() { + return getDataInfo("0.4"); + } + + public static 
DataInfo getDataInfo(final String trust) { + return DataInfo.newBuilder().setDeletedbyinference(false).setTrust("0.4").setInferenceprovenance("algo").setProvenanceaction(getQualifier("xx", "yy")) + .build(); + } + + public static Instance.Builder getInstance(final String hostedbyId, final String hostedbyName) { + return Instance.newBuilder().setHostedby(getKV(hostedbyId, hostedbyName)).setAccessright(getQualifier("OpenAccess", "dnet:access_modes")) + .setInstancetype(getQualifier("publication", "dnet:result_typologies")).addUrl("webresource url"); + + } + + public static OafRel getDedupRel(final String source, final String target, final RelType relType, final String relClass) { + return OafRel.newBuilder().setSource(source).setTarget(target).setRelType(relType).setSubRelType(SubRelType.dedup).setRelClass(relClass) + .setChild(false).setCachedTarget(getResult(target)) + .setResultResult(ResultResult.newBuilder().setDedup(Dedup.newBuilder().setRelMetadata(RelMetadata.getDefaultInstance()))) + .build(); + } + + public static OafRel getProjectOrganization(final String source, final String target, final String relClass) throws InvalidProtocolBufferException { + final OafRel.Builder oafRel = OafRel + .newBuilder() + .setSource(source) + .setTarget(target) + .setRelType(RelType.projectOrganization) + .setSubRelType(SubRelType.participation) + .setRelClass(relClass) + .setChild(false) + .setProjectOrganization( + ProjectOrganization.newBuilder().setParticipation( + Participation.newBuilder().setParticipantnumber("" + 1) + .setRelMetadata(relMetadata(relClass, "dnet:project_organization_relations")))); + switch (Participation.RelName.valueOf(relClass)) { + case hasParticipant: + oafRel.setCachedTarget(getProjectFP7(target, "SP3")); + break; + case isParticipant: + oafRel.setCachedTarget(getOrganization(target)); + break; + default: + break; + } + return oafRel.build(); + } + + public static GeneratedMessage getOrganizationOrganization(final String source, final String 
target, final String relClass) { + final OafRel.Builder oafRel = OafRel + .newBuilder() + .setSource(source) + .setTarget(target) + .setRelType(RelType.organizationOrganization) + .setSubRelType(SubRelType.dedup) + .setRelClass(relClass) + .setChild(true) + .setOrganizationOrganization( + OrganizationOrganization.newBuilder().setDedup( + Dedup.newBuilder().setRelMetadata(relMetadata(relClass, "dnet:organization_organization_relations")))); + + switch (Dedup.RelName.valueOf(relClass)) { + case isMergedIn: + oafRel.setCachedTarget(getOrganization(source)); + break; + case merges: + oafRel.setCachedTarget(getOrganization(target)); + break; + default: + break; + } + return oafRel.build(); + } + + public static OafRel getDatasourceOrganization(final String source, final String target, final String relClass) throws InvalidProtocolBufferException { + final OafRel.Builder oafRel = OafRel + .newBuilder() + .setSource(source) + .setTarget(target) + .setRelType(RelType.datasourceOrganization) + .setSubRelType(SubRelType.provision) + .setRelClass(relClass) + .setChild(false) + .setDatasourceOrganization( + DatasourceOrganization.newBuilder().setProvision( + Provision.newBuilder().setRelMetadata(relMetadata(relClass, "dnet:datasource_organization_relations")))); + switch (Provision.RelName.valueOf(relClass)) { + case isProvidedBy: + oafRel.setCachedTarget(getOrganization(target)); + break; + case provides: + oafRel.setCachedTarget(getDatasource(target)); + break; + default: + break; + } + return oafRel.build(); + } + + public static OafRel getSimilarityRel(final String sourceId, final String targetId, final OafEntity result, final String relClass) { + return OafRel + .newBuilder() + .setSource(sourceId) + .setTarget(targetId) + .setRelType(RelType.resultResult) + .setSubRelType(SubRelType.similarity) + .setRelClass(relClass) + .setChild(false) + .setCachedTarget(result) + .setResultResult( + ResultResult.newBuilder().setSimilarity( + 
Similarity.newBuilder().setRelMetadata(relMetadata(relClass, "dnet:resultResult_relations")).setSimilarity(.4f) + .setType(Similarity.Type.STANDARD))).build(); + } + + public static RelMetadata.Builder relMetadata(final String classname, final String schemename) { + return RelMetadata.newBuilder().setSemantics(getQualifier(classname, schemename)); + } + + public static OafEntity getOrganization(final String orgId) { + return OafEntity + .newBuilder() + .setType(Type.organization) + .setId(orgId) + .addCollectedfrom(getKV("opendoar_1234", "UK pubmed")) + .setOrganization( + Organization.newBuilder().setMetadata( + Organization.Metadata.newBuilder().setLegalname(sf("CENTRE D'APPUI A LA RECHERCHE ET A LA FORMATION GIE")) + .setLegalshortname(sf("CAREF")).setWebsiteurl(sf("www.caref-mali.org")) + .setCountry(getQualifier("ML", "dnet:countries")))).build(); + } + + public static OafRel getResultProject(final String from, final String to, final OafEntity project, final String relClass) + throws InvalidProtocolBufferException { + return OafRel + .newBuilder() + .setSource(from) + .setTarget(to) + .setRelType(RelType.resultProject) + .setSubRelType(SubRelType.outcome) + .setRelClass(relClass) + .setChild(false) + .setResultProject( + ResultProject.newBuilder().setOutcome(Outcome.newBuilder().setRelMetadata(relMetadata(relClass, "dnet:result_project_relations")))) + .setCachedTarget(project).build(); + } + + public static OafEntity getProjectFP7(final String projectId, final String fundingProgram) throws InvalidProtocolBufferException { + return OafEntity + .newBuilder() + .setType(Type.project) + .setId(projectId) + .addCollectedfrom(getKV("opendoar_1234", "UK pubmed")) + .setProject( + Project.newBuilder() + .setMetadata( + Project.Metadata + .newBuilder() + .setAcronym(sf("5CYRQOL")) + .setTitle(sf("Cypriot Researchers Contribute to our Quality of Life")) + .setStartdate(sf("2007-05-01")) + .setEnddate(sf("2007-10-31")) + .setEcsc39(sf("false")) + 
.setContracttype(getQualifier("CSA", "ec:FP7contractTypes")) + .addFundingtree( + sf("ec__________::ECECEuropean Commissionec__________::EC::FP7::" + + fundingProgram + + "::PEOPLEMarie-Curie ActionsPEOPLEec:programec__________::EC::FP7::" + + fundingProgram + + "" + + fundingProgram + + "-People" + + fundingProgram + + "ec:specificprogramec__________::EC::FP7SEVENTH FRAMEWORK PROGRAMMEFP7ec:frameworkprogram")))) + .build(); + } + + public static OafEntity getProjectWT() throws InvalidProtocolBufferException { + return OafEntity + .newBuilder() + .setType(Type.project) + .setId("project|wt::087536") + .addCollectedfrom(getKV("wellcomeTrust", "wellcome trust")) + .setProject( + Project.newBuilder() + .setMetadata( + Project.Metadata + .newBuilder() + .setAcronym(sf("UNKNOWN")) + .setTitle(sf("Research Institute for Infectious Diseases of Poverty (IIDP).")) + .setStartdate(sf("2007-05-01")) + .setEnddate(sf("2007-10-31")) + .setEcsc39(sf("false")) + .setContracttype(getQualifier("UNKNOWN", "wt:contractTypes")) + .addFundingtree( + sf("wt__________::WTWTWellcome Trustwt__________::WT::UNKNOWNUNKNOWNUNKNOWNwt:fundingStream")) + .addFundingtree( + sf("wt__________::WTWTWellcome Trustwt__________::WT::Technology TransferTechnology TransferTechnology Transferwt:fundingStream")))) + .build(); + } + + public static ExtraInfo extraInfo(final String name, final String provenance, final String trust, final String typology, final String value) { + final ExtraInfo.Builder e = ExtraInfo.newBuilder().setName(name).setProvenance(provenance).setTrust(trust).setTypology(typology).setValue(value); + return e.build(); + } + + // public static DocumentClasses documentClasses() { + // DocumentClasses.Builder builder = DocumentClasses.newBuilder(); + // for (int i = 0; i < RandomUtils.nextInt(N_DOCUMENT_CLASSES) + 1; i++) { + // builder.addArXivClasses(getDocumentClass()).addDdcClasses(getDocumentClass()).addWosClasses(getDocumentClass()) + // .addMeshEuroPMCClasses(getDocumentClass()); + 
// } + // return builder.build(); + // } + // + // private static DocumentClass getDocumentClass() { + // DocumentClass.Builder builder = DocumentClass.newBuilder(); + // for (int i = 0; i < RandomUtils.nextInt(N_DOCUMENT_CLASS_LABELS) + 1; i++) { + // builder.addClassLabels("test_class_" + i); + // } + // return builder.setConfidenceLevel(0.5F).build(); + // } + // + // public static DocumentStatistics documentStatistics() { + // return + // DocumentStatistics.newBuilder().setCitationsFromAllPapers(basicCitationStatistics()).setCitationsFromPublishedPapers(basicCitationStatistics()) + // .build(); + // } + // + // private static BasicCitationStatistics basicCitationStatistics() { + // BasicCitationStatistics.Builder builder = BasicCitationStatistics.newBuilder(); + // for (int i = 0; i < N_CITATION_STATS; i++) { + // builder.addNumberOfCitationsPerYear(statisticsKeyValue()); + // builder.setNumberOfCitations(RandomUtils.nextInt(5) + 1); + // } + // return builder.build(); + // } + // + // private static StatisticsKeyValue statisticsKeyValue() { + // return StatisticsKeyValue.newBuilder().setKey((RandomUtils.nextInt(30) + 1980) + "").setValue(RandomUtils.nextInt(5) + 1).build(); + // } + // + // public static AuthorStatistics authorStatistics() { + // AuthorStatistics.Builder builder = AuthorStatistics.newBuilder(); + // builder.setCore(commonCoreStatistics()); + // for (int i = 0; i < N_COAUTHORS; i++) { + // builder.addCoAuthors(coAuthor()); + // } + // return builder.build(); + // } + // + // private static CoAuthor coAuthor() { + // CoAuthor.Builder builder = CoAuthor.newBuilder(); + // builder.setId("30|od______2345::" + Hashing.md5(RandomStringUtils.random(10))); + // builder.setCoauthoredPapersCount(RandomUtils.nextInt(5) + 1); + // return builder.build(); + // } + // + // public static CommonCoreStatistics commonCoreStatistics() { + // CommonCoreStatistics.Builder builder = CommonCoreStatistics.newBuilder(); + // + // builder.setAllPapers(coreStatistics()); 
+ // builder.setPublishedPapers(coreStatistics()); + // + // return builder.build(); + // } + // + // private static CoreStatistics coreStatistics() { + // CoreStatistics.Builder builder = CoreStatistics.newBuilder(); + // + // builder.setNumberOfPapers(RandomUtils.nextInt(10)); + // builder.setCitationsFromAllPapers(extendedStatistics()); + // builder.setCitationsFromPublishedPapers(extendedStatistics()); + // + // return builder.build(); + // } + // + // private static ExtendedStatistics extendedStatistics() { + // ExtendedStatistics.Builder builder = ExtendedStatistics.newBuilder(); + // + // builder.setBasic(basicCitationStatistics()); + // builder.setAverageNumberOfCitationsPerPaper(RandomUtils.nextFloat()); + // for (int i = 0; i < N_CITATION_STATS; i++) { + // builder.addNumberOfPapersCitedAtLeastXTimes(statisticsKeyValue()); + // } + // + // return builder.build(); + // } + + public static StringField sf(final String s) { + return sf(s, null); + } + + public static StringField sf(final String s, final DataInfo dataInfo) { + final StringField.Builder sf = StringField.newBuilder().setValue(s); + if (dataInfo != null) { + sf.setDataInfo(dataInfo); + } + return sf.build(); + } + + public static OafDecoder embed(final GeneratedMessage msg, + final Kind kind, + final boolean deletedByInference, + final boolean inferred, + final String provenance, + final String action) { + + final Oaf.Builder oaf = Oaf + .newBuilder() + .setKind(kind) + .setLastupdatetimestamp(System.currentTimeMillis()) + .setDataInfo( + DataInfo.newBuilder().setDeletedbyinference(deletedByInference).setInferred(inferred).setTrust("0.5") + .setInferenceprovenance(provenance).setProvenanceaction(getQualifier(action, action))); + switch (kind) { + case entity: + oaf.setEntity((OafEntity) msg); + break; + case relation: + oaf.setRel((OafRel) msg); + break; + default: + break; + } + + return OafDecoder.decode(oaf.build()); + } + + public static OafDecoder embed(final GeneratedMessage msg, final Kind 
kind) {
+        return embed(msg, kind, false, false, "inference_provenance", "provenance_action");
+    }
+
+}
diff --git a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/clustering/BlacklistAwareClusteringCombinerTest.java b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/clustering/BlacklistAwareClusteringCombinerTest.java
new file mode 100644
index 0000000..c9fa084
--- /dev/null
+++ b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/clustering/BlacklistAwareClusteringCombinerTest.java
@@ -0,0 +1,53 @@
+package eu.dnetlib.pace.clustering;
+
+import eu.dnetlib.pace.AbstractProtoPaceTest;
+import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner;
+import eu.dnetlib.pace.config.Config;
+import eu.dnetlib.pace.config.Type;
+import eu.dnetlib.pace.model.FieldListImpl;
+import eu.dnetlib.pace.model.FieldValueImpl;
+import eu.dnetlib.pace.model.MapDocument;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Smoke test for {@link BlacklistAwareClusteringCombiner#filterAndCombine}: builds a sample result document,
+ * adds extra field values to it and logs whatever the combiner returns.
+ */
+public class BlacklistAwareClusteringCombinerTest extends AbstractProtoPaceTest {
+
+    private static final Log log = LogFactory.getLog(BlacklistAwareClusteringCombinerTest.class);
+
+    /** Configuration obtained from {@code getResultFullConf()} before every test. */
+    private Config config;
+
+    @Before
+    public void setUp() {
+        config = getResultFullConf();
+    }
+
+    /**
+     * Stores a "desc" field on the document, appends an extra list to its "title" field and
+     * logs the value returned by {@code filterAndCombine}. The test makes no assertions.
+     */
+    @Test
+    public void testCombine() {
+        final MapDocument result =
+                result(config, "A", "Dipping in Cygnus X-2 in a multi-wavelength campaign due to absorption of extended ADC emission", "2013");
+
+        // One list per field: clearing and re-filling the instance that was just stored under "desc"
+        // would leave the "desc" entry holding the title value instead of the description.
+        final FieldListImpl desc = new FieldListImpl();
+        desc.add(new FieldValueImpl(Type.String, "desc", "hello world description pipeline"));
+        result.getFieldMap().put("desc", desc);
+
+        final FieldListImpl extraTitle = new FieldListImpl();
+        extraTitle.add(new FieldValueImpl(Type.String, "title", "lorem ipsum cabalie qwerty"));
+        final FieldListImpl field = (FieldListImpl) result.getFieldMap().get("title");
+        field.add(extraTitle);
+
+        log.info(BlacklistAwareClusteringCombiner.filterAndCombine(result, config));
+    }
+}
diff --git 
a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/clustering/ClusteringCombinerTest.java b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/clustering/ClusteringCombinerTest.java
new file mode 100644
index 0000000..125bf63
--- /dev/null
+++ b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/clustering/ClusteringCombinerTest.java
@@ -0,0 +1,46 @@
+package eu.dnetlib.pace.clustering;
+
+import eu.dnetlib.pace.AbstractProtoPaceTest;
+import eu.dnetlib.pace.clustering.ClusteringCombiner;
+import eu.dnetlib.pace.config.Config;
+import eu.dnetlib.pace.config.Type;
+import eu.dnetlib.pace.model.FieldListImpl;
+import eu.dnetlib.pace.model.FieldValueImpl;
+import eu.dnetlib.pace.model.MapDocument;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Runs {@link ClusteringCombiner#combine} on a sample result document and logs the outcome.
+ */
+public class ClusteringCombinerTest extends AbstractProtoPaceTest {
+
+    private static final Log log = LogFactory.getLog(ClusteringCombinerTest.class);
+
+    private Config config;
+
+    @Before
+    public void setUp() {
+        this.config = getResultFullConf();
+    }
+
+    /**
+     * Stores a "desc" field on a sample result, then logs the title followed by the value
+     * returned by {@code ClusteringCombiner.combine}. The test makes no assertions.
+     */
+    @Test
+    public void testCombine() {
+        final String sampleTitle = "Dipping in Cygnus X-2 in a multi-wavelength campaign due to absorption of extended ADC emission";
+        final MapDocument document = result(config, "A", sampleTitle, "2013");
+
+        final FieldListImpl description = new FieldListImpl();
+        description.add(new FieldValueImpl(Type.String, "desc", "lorem ipsum cabalie qwerty"));
+        document.getFieldMap().put("desc", description);
+
+        log.info(sampleTitle);
+        log.info(ClusteringCombiner.combine(document, config));
+    }
+
+}
diff --git a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/distance/DetectorTest.java b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/distance/DetectorTest.java
new file mode 100644
index 0000000..e2d3ad7
--- /dev/null
+++ b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/distance/DetectorTest.java
@@ -0,0 +1,405 @@
+package eu.dnetlib.pace.distance;
+
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+import 
com.google.common.collect.Sets; +import com.googlecode.protobuf.format.JsonFormat; +import eu.dnetlib.data.proto.OafProtos; +import eu.dnetlib.pace.AbstractProtoPaceTest; +import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner; +import eu.dnetlib.pace.config.Config; +import eu.dnetlib.pace.config.DedupConfig; +import eu.dnetlib.pace.distance.eval.ScoreResult; +import eu.dnetlib.pace.model.MapDocument; +import eu.dnetlib.pace.model.ProtoDocumentBuilder; +import org.apache.commons.io.IOUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.junit.Ignore; +import org.junit.Test; + +import java.io.IOException; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; + +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +public class DetectorTest extends AbstractProtoPaceTest { + + private static final Log log = LogFactory.getLog(DetectorTest.class); + + @Test + public void testDistanceResultSimple() { + final Config config = getResultSimpleConf(); + + final MapDocument resA = result(config, "A", "Recent results from CDF"); + final MapDocument resB = result(config, "B", "Recent results from CDF"); + + final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); + final double d = sr.getScore(); + log.info(String.format(" d ---> %s", d)); + + assertTrue(d == 1.0); + } + + @Test + public void testDistanceResultSimpleMissingDates() { + final Config config = getResultSimpleConf(); + + final MapDocument resA = result(config, "A", "Recent results from BES"); + final MapDocument resB = result(config, "A", "Recent results from CES"); + + final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); + final double d = sr.getScore(); + log.info(String.format(" d ---> %s", d)); + + assertTrue(d > 0.97); + } + + @Test + public void testDistanceResultInvalidDate() { + final Config config = 
getResultConf(); + + final MapDocument resA = result(config, "A", "title title title 6BESR", "2013-01-05"); + final MapDocument resB = result(config, "B", "title title title 6BESR", "qwerty"); + + final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); + final double d = sr.getScore(); + log.info(String.format(" d ---> %s", d)); + + assertTrue(d == 1.0); + } + + @Ignore + @Test + public void testDistanceResultMissingOneDate() { + final Config config = getResultConf(); + + final MapDocument resA = result(config, "A", "title title title 6BESR", null); + final MapDocument resB = result(config, "B", "title title title 6CLER", "2012-02"); + + final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); + double d = sr.getScore(); + log.info(String.format(" d ---> %s", d)); + + assertTrue((d > 0.9) && (d < 1.0)); + } + + @Ignore + @Test + public void testDistanceResult() { + final Config config = getResultConf(); + + final MapDocument resA = result(config, "A", "title title title BES", ""); + final MapDocument resB = result(config, "B", "title title title CLEO"); + + final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); + double d = sr.getScore(); + log.info(String.format(" d ---> %s", d)); + + assertTrue((d > 0.9) && (d < 1.0)); + } + + @Ignore + @Test + public void testDistanceResultMissingTwoDate() { + final Config config = getResultConf(); + + final MapDocument resA = result(config, "A", "title title title 6BESR"); + final MapDocument resB = result(config, "B", "title title title 6CLER"); + + final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); + double d = sr.getScore(); + log.info(String.format(" d ---> %s", d)); + + assertTrue((d > 0.9) && (d < 1.0)); + } + + @Ignore + @Test + public void testDistanceOrganizationIgnoreMissing() { + + final Config config = getOrganizationSimpleConf(); + + final MapDocument orgA = organization(config, "A", "CONSIGLIO NAZIONALE DELLE 
RICERCHE"); + final MapDocument orgB = organization(config, "B", "CONSIGLIO NAZIONALE DELLE RICERCHE", "CNR"); + + final ScoreResult sr = new PaceDocumentDistance().between(orgA, orgB, config); + final double d = sr.getScore(); + log.info(String.format(" d ---> %s", d)); + + assertTrue(d > 0.99); + } + + @Test + public void testDistanceResultCase1() { + + final Config config = getResultConf(); + + final MapDocument resA = result(config, "A", "Search the Standard Model Higgs boson", "2003"); + final MapDocument resB = result(config, "B", "Search for the Standard Model Higgs Boson", "2003"); + + final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); + double d = sr.getScore(); + log.info(String.format(" d ---> %s", d)); + + assertTrue((d > 0.9) && (d < 1.0)); + } + + @Test + public void testDistanceResultCaseDoiMatch1() { + final Config config = getResultConf(); + + final MapDocument resA = result(config, "A", "Search the Standard Model Higgs boson", "2003", "10.1594/PANGAEA.726855"); + final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", "10.1594/PANGAEA.726855"); + + final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); + double d = sr.getScore(); + log.info(String.format(" d ---> %s", d)); + + assertTrue("exact DOIs will produce an exact match", d == 1.0); + } + + @Test + public void testDistanceResultCaseDoiMatch2() { + final Config config = getResultConf(); + + final MapDocument resA = result(config, "A", "Conference proceedings on X. 
Appendix", "2003", "10.1594/PANGAEA.726855"); + final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2005", "10.1594/PANGAEA.726855"); + + final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); + double d = sr.getScore(); + log.info(String.format(" d ---> %s", d)); + + assertTrue("exact DOIs will produce an exact match, regardless of different titles or publication years", d == 1.0); + } + + @Test + public void testDistanceResultCaseDoiMatch3() { + final Config config = getResultConf(); + + final MapDocument resA = result(config, "A", "Conference proceedings on X. Appendix", "2003", "10.1016/j.jmb.2010.12.024"); + final MapDocument resB = result(config, "B", "Conference proceedings on X. Appendix", "2003"); + + final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); + double d = sr.getScore(); + log.info(String.format(" d ---> %s", d)); + + assertTrue("a missing DOI will casue the comparsion to continue with the following conditions", d == 1.0); + } + + @Test + public void testDistanceResultCaseDoiMatch4() { + final Config config = getResultConf(); + + final MapDocument resA = result(config, "A", "Conference proceedings on X. Appendix", "2003", "10.1016/j.jmb.2010.12.024"); + final MapDocument resB = result(config, "B", "Conference proceedings on X. 
Appendix", "2005"); + + final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); + double d = sr.getScore(); + log.info(String.format(" d ---> %s", d)); + + assertTrue("a missing DOI, comparsion continues with the following conditions, different publication years will drop the score to 0", d == 0.0); + } + + @Test + public void testDistanceResultCaseDoiMatch5() { + + final Config config = getResultConf(); + + final MapDocument resA = result(config, "A", "Search for the Standard Model Higgs Boson", "2003", "10.1016/j.jmb.2010.12.020"); + final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003"); + + final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); + double d = sr.getScore(); + log.info(String.format(" d ---> %s", d)); + + assertTrue("a missing DOI, comparsion continues with the following conditions", (d > 0.9) && (d < 1.0)); + } + + @Test + public void testDistanceResultCaseDoiMatch6() { + final Config config = getResultConf(); + + final MapDocument resA = result(config, "A", "Conference proceedings on X. Appendix", "2003", "10.1016/j.jmb.2010.12.024"); + final MapDocument resB = result(config, "B", "Conference proceedings on X. 
Appendix", "2003", "anotherDifferentDOI");
+
+        final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
+        double d = sr.getScore();
+        log.info(String.format(" d ---> %s", d));
+
+        assertTrue("different DOIs will NOT drop the score to 0, then evaluate other fields", d == 1.0);
+    }
+
+    @Test
+    public void testDistanceResultCaseDoiMatch7() { // resA carries two identifier values, one equal to resB's only one; the titles differ
+        final Config config = getResultConf();
+
+        final MapDocument resA = result(config, "A", "Adrenal Insufficiency asd asd", "1951", Lists.newArrayList("PMC2037944", "axdsds"));
+        final MapDocument resB = result(config, "B", "Adrenal Insufficiency", "1951", "PMC2037944");
+
+        final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
+        double d = sr.getScore();
+        log.info(String.format(" d ---> %s", d));
+
+        assertTrue("one shared identifier but different titles: score expected in (0.9, 1.0)", (d > 0.9) && (d < 1.0)); // message mirrors the asserted range
+    }
+
+    // http://dx.doi.org/10.1594/PANGAEA.726855 doi:10.1594/PANGAEA.726855
+
+    @Test
+    public void testDistanceResultCaseAuthor1() { // author lists of 4 vs 3 names, same title and date: a score of 0.0 is expected
+
+        final Config config = getResultAuthorsConf();
+
+        final List authorsA = Lists.newArrayList("a", "b", "c", "d");
+        final List authorsB = Lists.newArrayList("a", "b", "c");
+        final List pid = Lists.newArrayList();
+
+        final MapDocument resA = result(config, "A", "Search the Standard Model Higgs Boson", "2003", pid, authorsA);
+        final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", pid, authorsB);
+
+        final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
+        final double d = sr.getScore();
+        log.info(String.format(" d ---> %s", d));
+
+        assertTrue(d == 0.0);
+    }
+
+    @Test
+    public void testDistanceResultCaseAuthor2() {
+
+        final Config config = getResultAuthorsConf();
+
+        final List authorsA = Lists.newArrayList("a", "b", "c");
+        final List authorsB = Lists.newArrayList("a", "b", "c");
+        final List pid = Lists.newArrayList();
+
+        final MapDocument resA = result(config, "A", "Search the 
Standard Model Higgs Boson", "2003", pid, authorsA); + final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", pid, authorsB); + + final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); + final double d = sr.getScore(); + log.info(String.format(" d ---> %s", d)); + + assertTrue(d == 1.0); + } + + @Test + public void testDistanceResultCaseAuthor3() { + + final Config config = getResultAuthorsConf(); + + final List authorsA = Lists.newArrayList("Bardi, A.", "Manghi, P.", "Artini, M."); + final List authorsB = Lists.newArrayList("Bardi Alessia", "Manghi Paolo", "Artini Michele"); + final List pid = Lists.newArrayList(); + + final MapDocument resA = result(config, "A", "Search the Standard Model Higgs Boson", "2003", pid, authorsA); + final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", pid, authorsB); + + final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); + double d = sr.getScore(); + log.info(String.format(" d ---> %s", d)); + + assertTrue((d > 0.9) && (d < 1.0)); + } + + @Test + public void testDistanceResultCaseAuthor4() { + + final Config config = getResultAuthorsConf(); + + final List authorsA = Lists.newArrayList("Bardi, Alessia", "Manghi, Paolo", "Artini, Michele", "a"); + final List authorsB = Lists.newArrayList("Bardi Alessia", "Manghi Paolo", "Artini Michele"); + final List pid = Lists.newArrayList(); + + final MapDocument resA = result(config, "A", "Search the Standard Model Higgs Boson", "2003", pid, authorsA); + final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", pid, authorsB); + + final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); + final double d = sr.getScore(); + log.info(String.format(" d ---> %s", d)); + + // assertTrue(d.getScore() == 0.0); + } + + @Test + public void testDistanceResultFullConf() { + + final Config config = getResultFullConf(); + + 
final List authorsA = Lists.newArrayList("Nagarajan Pranesh", "Guy Vautier", "Punyanganie de Silva");
+        final List authorsB = Lists.newArrayList("Pranesh Nagarajan", "Vautier Guy", "de Silva Punyanganie");
+
+        final MapDocument resA =
+                result(config, "A", "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reports", "2010",
+                        "10.1186/1752-1947-4-299", authorsA);
+
+        final MapDocument resB =
+                result(config, "B", "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reports", "2010",
+                        "10.1186/1752-1947-4-299", authorsB);
+
+        final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
+        final double d = sr.getScore();
+        log.info(String.format(" d ---> %s", d));
+
+        // assertTrue(d.getScore() == 0.0);
+    }
+
+    @Ignore
+    @Test
+    public void testDistance() throws IOException { // scores the crossref.json fixture against alicante.json and only logs the result
+
+        final DedupConfig conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.prod.pace.json"));
+
+        final MapDocument crossref = asMapDocument(conf, readFromClasspath("/eu/dnetlib/pace/crossref.json")); // asMapDocument parses JSON text, so the fixture is read first
+        final MapDocument alicante = asMapDocument(conf, readFromClasspath("/eu/dnetlib/pace/alicante.json")); // same as above: pass the content, not the location
+
+        final ScoreResult result = new PaceDocumentDistance().between(crossref, alicante, conf);
+
+        log.info("score = " + result);
+
+    }
+
+    @Ignore
+    @Test
+    public void testDistanceOrgs() throws IOException {
+
+        final DedupConfig conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/organization.pace.conf"));
+
+        final MapDocument orgA = asMapDocument(conf, readFromClasspath("/eu/dnetlib/pace/organization1.json"));
+        final MapDocument orgB = asMapDocument(conf, readFromClasspath("/eu/dnetlib/pace/organization2.json"));
+
+        Set keysA = getGroupingKeys(conf, orgA);
+        Set keysB = getGroupingKeys(conf, orgB);
+
+        assertFalse(String.format("A: %s\nB: %s", keysA, keysB), Sets.intersection(keysA, keysB).isEmpty());
+
+        log.info("clustering keys A = " + getGroupingKeys(conf, orgA));
+        log.info("clustering keys B = 
" + getGroupingKeys(conf, orgB)); + + final ScoreResult result = new PaceDocumentDistance().between(orgA, orgB, conf); + + log.info("score = " + result); + log.info("distance = " + result.getScore()); + } + + private Set getGroupingKeys(DedupConfig conf, MapDocument doc) { + return Sets.newHashSet(BlacklistAwareClusteringCombiner.filterAndCombine(doc, conf)); + } + + private MapDocument asMapDocument(DedupConfig conf, final String json) { + OafProtos.OafEntity.Builder b = OafProtos.OafEntity.newBuilder(); + try { + JsonFormat.merge(json, b); + } catch (JsonFormat.ParseException e) { + throw new IllegalArgumentException(e); + } + return ProtoDocumentBuilder.newInstance(b.getId(), b.build(), conf.getPace().getModel()); + } + + +} diff --git a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/model/ProtoDocumentBuilderTest.java b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/model/ProtoDocumentBuilderTest.java new file mode 100644 index 0000000..56ddc2c --- /dev/null +++ b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/model/ProtoDocumentBuilderTest.java @@ -0,0 +1,50 @@ +package eu.dnetlib.pace.model; + +import com.google.common.collect.Iterables; +import com.google.common.collect.Sets; +import com.google.common.collect.Sets.SetView; +import eu.dnetlib.pace.AbstractProtoPaceTest; +import eu.dnetlib.pace.config.Config; +import eu.dnetlib.pace.distance.DetectorTest; +import eu.dnetlib.pace.model.MapDocument; +import eu.dnetlib.pace.model.MapDocumentSerializer; +import eu.dnetlib.pace.model.ProtoDocumentBuilder; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.junit.Test; + +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +public class ProtoDocumentBuilderTest extends AbstractProtoPaceTest { + + private static final Log log = LogFactory.getLog(ProtoDocumentBuilderTest.class); + + @Test + public void test_serialise1() { + + final String id = "12345"; + + final Config config = 
getResultFullConf(); + + final MapDocument document = ProtoDocumentBuilder.newInstance(id, getResult(id), config.model()); + + assertFalse(document.fieldNames().isEmpty()); + assertFalse(Iterables.isEmpty(document.fields())); + + log.info("original:\n" + document); + + final String stringDoc = MapDocumentSerializer.toString(document); + + log.info("srialization:\n" + stringDoc); + + final MapDocument decoded = MapDocumentSerializer.decode(stringDoc.getBytes()); + + final SetView diff = Sets.difference(document.fieldNames(), decoded.fieldNames()); + + assertTrue(diff.isEmpty()); + + log.info("decoded:\n" + decoded); + } + +} diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/alicante.json b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/alicante.json new file mode 100644 index 0000000..be5ec28 --- /dev/null +++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/alicante.json @@ -0,0 +1,121 @@ +{ + "dateoftransformation": "2018-08-07T06:48:42.668Z", + "originalId": [ + "oai:rua.ua.es:10045/34236" + ], + "oaiprovenance": { + "originDescription": { + "metadataNamespace": "http://www.openarchives.org/OAI/2.0/oai_dc/", + "altered": true, + "baseURL": "http://rua.ua.es/dspace-oai/request", + "datestamp": "2016-04-28T11:28:35Z", + "harvestDate": "2018-06-14T13:53:42.185Z", + "identifier": "oai:rua.ua.es:10045/34236" + } + }, + "result": { + "instance": [ + { + "hostedby": { + "value": "Repositorio Institucional de la Universidad de Alicante", + "key": "10|opendoar____::e820a45f1dfc7b95282d10b6087e11c0" + }, + "url": [ + "http://hdl.handle.net/10045/34236" + ], + "dateofacceptance": { + "value": "2013-11-27" + }, + "collectedfrom": { + "value": "Repositorio Institucional de la Universidad de Alicante", + "key": "10|opendoar____::e820a45f1dfc7b95282d10b6087e11c0" + }, + "accessright": { + "classid": "OPEN", + "classname": "Open Access", + "schemename": "dnet:access_modes", + "schemeid": "dnet:access_modes" + }, + "instancetype": { + "classid": "0010", + 
"classname": "Lecture", + "schemename": "dnet:publication_resource", + "schemeid": "dnet:publication_resource" + } + } + ], + "metadata": { + "language": { + "classid": "eng", + "classname": "English", + "schemename": "dnet:languages", + "schemeid": "dnet:languages" + }, + "title": [ + { + "qualifier": { + "classid": "main title", + "classname": "main title", + "schemename": "dnet:dataCite_title", + "schemeid": "dnet:dataCite_title" + }, + "value": "Henry James (1843-1916)" + } + ], + "journal": { + "name": "" + }, + "author": [ + { + "fullname": "Gómez Reus, Teresa", + "surname": "Gómez Reus", + "name": "Teresa", + "rank": 1 + } + ], + "resulttype": { + "classid": "other", + "classname": "other", + "schemename": "dnet:result_typologies", + "schemeid": "dnet:result_typologies" + }, + "dateofacceptance": { + "value": "2013-11-27" + }, + "contributor": [ + { + "value": "Universidad de Alicante. Departamento de Filología Inglesa" + } + ], + "subject": [ + { + "qualifier": { + "classid": "keyword", + "classname": "keyword", + "schemename": "dnet:result_subject", + "schemeid": "dnet:result_subject" + }, + "value": "James, Henry" + }, + { + "qualifier": { + "classid": "keyword", + "classname": "keyword", + "schemename": "dnet:result_subject", + "schemeid": "dnet:result_subject" + }, + "value": "Filología Inglesa" + } + ] + } + }, + "collectedfrom": [ + { + "value": "Repositorio Institucional de la Universidad de Alicante", + "key": "10|opendoar____::e820a45f1dfc7b95282d10b6087e11c0" + } + ], + "dateofcollection": "2018-06-14T13:53:42.185Z", + "type": 50, + "id": "50|od_______935::2b908ad38030168759c568f49af50784" +} diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/crossref.json b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/crossref.json new file mode 100644 index 0000000..669e394 --- /dev/null +++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/crossref.json @@ -0,0 +1,78 @@ +{ + "pid": [ + { + "qualifier": { + "classid": "doi", + "classname": 
"doi", + "schemename": "dnet:pid_types", + "schemeid": "dnet:pid_types" + }, + "value": "10.1002/9781444393675.ch6" + } + ], + "result": { + "instance": [ + { + "url": [ + "http://dx.doi.org/10.1002/9781444393675.ch6" + ], + "collectedfrom": { + "value": "CrossRef", + "key": "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2" + }, + "hostedby": { + "value": "Unknown Repository", + "key": "10|openaire____::55045bd2a65019fd8e6741a755395c8c" + }, + "accessright": { + "classid": "CLOSED", + "classname": "Closed Access", + "schemename": "dnet:access_modes", + "schemeid": "dnet:access_modes" + }, + "instancetype": { + "classid": "0013", + "classname": "Part of book or chapter of book", + "schemename": "dnet:publication_resource", + "schemeid": "dnet:publication_resource" + } + } + ], + "metadata": { + "title": [ + { + "qualifier": { + "classid": "main title", + "classname": "main title", + "schemename": "dnet:dataCite_title", + "schemeid": "dnet:dataCite_title" + }, + "value": "Henry James (1843-1916)" + } + ], + "resulttype": { + "classid": "publication", + "classname": "publication", + "schemename": "dnet:result_typologies", + "schemeid": "dnet:result_typologies" + } + } + }, + "collectedfrom": [ + { + "value": "Microsoft Academic Graph", + "key": "10|openaire____::5f532a3fc4f1ea403f37070f59a7a53a" + }, + { + "value": "CrossRef", + "key": "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2" + }, + { + "value": "UnpayWall", + "key": "10|openaire____::8ac8380272269217cb09a928c8caa993" + } + ], + "dateofcollection": "2018-08-07 12:24:48Z", + "type": 50, + "id": "50|crossref____::0000002a9885b7ec89b7b9d8ff3331a0" +} diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/organization.pace.conf b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/organization.pace.conf new file mode 100644 index 0000000..0dcfe51 --- /dev/null +++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/organization.pace.conf @@ -0,0 +1,34 @@ +{ + "wf" : { + "threshold" : "0.85", + 
"dedupRun" : "001", + "entityType" : "organization", + "orderField" : "legalname", + "queueMaxSize" : "20000", + "groupMaxSize" : "20", + "slidingWindowSize" : "400", + "rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ], + "includeChildren" : "true" + }, + "pace" : { + "clustering" : [ + { "name" : "ngrampairs", "fields" : [ "legalname" ], "params" : { "max" : 1, "ngramLen" : "3" } }, + { "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } }, + { "name" : "immutablefieldvalue", "fields" : [ "country" ], "params" : { } }, + { "name" : "spacetrimmingfieldvalue", "fields" : [ "legalshortname" ], "params" : { "randomLength" : "5" } }, + { "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } } + ], + "conditions" : [ + { "name" : "exactMatch", "fields" : [ "country" ] }, + { "name" : "mustBeDifferent", "fields" : [ "gridid" ] } + ], + "model" : [ + { "name" : "legalname", "algo" : "LevensteinTitle", "type" : "String", "weight" : "0.2", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value" }, + { "name" : "legalshortname", "algo" : "LevensteinTitle", "type" : "String", "weight" : "0.2", "ignoreMissing" : "true", "path" : "organization/metadata/legalshortname/value" }, + { "name" : "websiteurl", "algo" : "urlMatcher", "type" : "URL", "weight" : "0.6", "ignoreMissing" : "true", "path" : "organization/metadata/websiteurl/value", "params" : { "host" : 0.5, "path" : 0.5 } }, + { "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "organization/metadata/country/classid" }, + { "name" : "gridid", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {grid}]/value" } + ], + "blacklists" : { } + } +} \ No newline at end of file diff --git 
a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/organization1.json b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/organization1.json new file mode 100644 index 0000000..80bbaa3 --- /dev/null +++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/organization1.json @@ -0,0 +1,34 @@ +{ + "dateoftransformation": "2018-06-04", + "originalId": [ + "opendoar____::Institute_of_Information_Science_and_Technology_\"A._Faedo\"" + ], + "collectedfrom": [ + { + "value": "OpenDOAR", + "key": "10|openaire____::47ce9e9f4fad46e732cff06419ecaabb" + } + ], + "organization": { + "metadata": { + "legalshortname": { + "value": "CNR-ISTI" + }, + "websiteurl": { + "value": "http://www.isti.cnr.it/aaaaa" + }, + "country": { + "classid": "IT", + "classname": "IT", + "schemename": "dnet:countries", + "schemeid": "dnet:countries" + }, + "legalname": { + "value": "Institute of Information Science and Technology \"A. Faedo\"" + } + } + }, + "dateofcollection": "2015-08-24", + "type": 20, + "id": "20|opendoar____::68d8b122736484cb07f75885af22e82f" +} \ No newline at end of file diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/organization2.json b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/organization2.json new file mode 100644 index 0000000..dd91c26 --- /dev/null +++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/organization2.json @@ -0,0 +1,48 @@ +{ + "collectedfrom": [ + { + "value": "GRID - Global Research Identifier Database", + "key": "10|openaire____::ff4a008470319a22d9cf3d14af485977" + } + ], + "organization": { + "metadata": { + "legalshortname": { + "value": "ISTI" + }, + "websiteurl": { + "value": "http://www.isti.cnr.it/aaaaaa" + }, + "country": { + "classid": "IT", + "classname": "Italy", + "schemename": "dnet:countries", + "schemeid": "dnet:countries" + }, + "alternativeNames": [ + { + "value": "Istituto di Scienza e Tecnologie dell'Informazione \"A. 
Faedo\"" + }, + { + "value": "ISTI" + } + ], + "legalname": { + "value": "CNR - Institute of Information Science and Technologies" + } + } + }, + "pid": [ + { + "qualifier": { + "classid": "grid", + "classname": "grid", + "schemename": "dnet:pid_types", + "schemeid": "dnet:pid_types" + }, + "value": "grid.451498.5" + } + ], + "type": 20, + "id": "20|grid________::e4095563f4e9d34dff7d47fb98af042f" +} \ No newline at end of file diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.authors.pace.conf b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.authors.pace.conf new file mode 100644 index 0000000..ee39fc0 --- /dev/null +++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.authors.pace.conf @@ -0,0 +1,25 @@ +{ + "wf" : { + "threshold" : "0.99", + "run" : "001", + "entityType" : "result", + "orderField" : "title", + "queueMaxSize" : "2000", + "groupMaxSize" : "10", + "slidingWindowSize" : "200", + "rootBuilder" : [ "result" ], + "includeChildren" : "true" + }, + "pace" : { + "conditions" : [ + { "name" : "sizeMatch", "fields" : [ "authors" ] }, + { "name" : "titleVersionMatch", "fields" : [ "title" ] } + ], + "model" : [ + { "name" : "title", "algo" : "JaroWinkler", "type" : "String", "weight" : "0.5", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" }, + { "name" : "authors", "algo" : "SortedLevel2JaroWinkler", "type" : "String", "weight" : "0.5", "ignoreMissing" : "true", "path" : "result/metadata/author/fullname" } + ], + "blacklists" : { } + } + +} diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.full.pace.conf b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.full.pace.conf new file mode 100644 index 0000000..80a5458 --- /dev/null +++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.full.pace.conf @@ -0,0 +1,51 @@ +{ + "wf" : { + "threshold" : "0.99", + "run" : "001", + "entityType" : "result", + "orderField" : "title", + 
"queueMaxSize" : "2000", + "groupMaxSize" : "10", + "slidingWindowSize" : "200", + "rootBuilder" : [ "result" ], + "includeChildren" : "true" + }, + "pace" : { + "clustering" : [ + { "name" : "acronyms", "fields" : [ "title" ], "params" : { "max" : "1", "minLen" : "2", "maxLen" : "4"} }, + { "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} }, + { "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } } + ], + "conditions" : [ + { "name" : "yearMatch", "fields" : [ "dateofacceptance" ] }, + { "name" : "titleVersionMatch", "fields" : [ "title" ] }, + { "name" : "sizeMatch", "fields" : [ "authors" ] } , + { "name" : "pidMatch", "fields" : [ "pid" ] } + ], + "model" : [ + { "name" : "pid", "algo" : "Null", "type" : "JSON", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid", "overrideMatch" : "true" }, + { "name" : "title", "algo" : "JaroWinkler", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" }, + { "name" : "dateofacceptance", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/metadata/dateofacceptance/value" } , + { "name" : "authors", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/metadata/author/fullname" } + ], + "blacklists" : { + "title" : [ + "^(Corpus Oral Dialectal \\(COD\\)\\.).*$", + "^(Kiri Karl Morgensternile).*$", + "^(\\[Eksliibris Aleksandr).*\\]$", + "^(\\[Eksliibris Aleksandr).*$", + "^(Eksliibris Aleksandr).*$", + "^(Kiri A\\. 
de Vignolles).*$", + "^(2 kirja Karl Morgensternile).*$", + "^(Pirita kloostri idaosa arheoloogilised).*$", + "^(Kiri tundmatule).*$", + "^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$", + "^(Eksliibris Nikolai Birukovile).*$", + "^(Eksliibris Nikolai Issakovile).*$", + "^(WHP Cruise Summary Information of section).*$", + "^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$", + "^(Measurement of the spin\\-dependent structure function).*" + ] } + } + +} diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.pace.conf b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.pace.conf new file mode 100644 index 0000000..86dd27f --- /dev/null +++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.pace.conf @@ -0,0 +1,29 @@ +{ + "wf" : { + "threshold" : "0.99", + "run" : "001", + "entityType" : "result", + "orderField" : "title", + "queueMaxSize" : "2000", + "groupMaxSize" : "10", + "slidingWindowSize" : "200", + "rootBuilder" : [ "result" ], + "includeChildren" : "true" + }, + "pace" : { + "strictConditions" : [ + { "name" : "pidMatch", "fields" : [ "pid" ] } + ], + "conditions" : [ + { "name" : "yearMatch", "fields" : [ "dateofacceptance" ] }, + { "name" : "titleVersionMatch", "fields" : [ "title" ] } + ], + "model" : [ + { "name" : "pid", "algo" : "Null", "type" : "JSON", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid", "overrideMatch" : "true" }, + { "name" : "title", "algo" : "JaroWinkler", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" }, + { "name" : "dateofacceptance", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/metadata/dateofacceptance/value" } + ], + "blacklists" : { } + } + +} diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.prod.pace.conf 
b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.prod.pace.conf new file mode 100644 index 0000000..462f79b --- /dev/null +++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.prod.pace.conf @@ -0,0 +1,273 @@ +{ + "wf" : { + "threshold" : "0.99", + "dedupRun" : "001", + "entityType" : "result", + "orderField" : "title", + "queueMaxSize" : "4000", + "groupMaxSize" : "40", + "slidingWindowSize" : "200", + "rootBuilder" : [ "result", "personResult_authorship_hasAuthor", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments" ], + "includeChildren" : "true", + "maxChildren" : "40" + }, + "pace" : { + "clustering" : [ + { "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} }, + { "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } }, + { "name" : "lowercase", "fields" : [ "doi" ], "params" : { } } + ], + "strictConditions" : [ + { "name" : "pidMatch", "fields" : [ "pid" ] } + ], + "conditions" : [ + { "name" : "titleVersionMatch", "fields" : [ "title" ] }, + { "name" : "sizeMatch", "fields" : [ "authors" ] } + ], + "model" : [ + { "name" : "doi", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {doi}]/value" }, + { "name" : "pid", "algo" : "Null", "type" : "JSON", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid", "overrideMatch" : "true" }, + { "name" : "title", "algo" : "LevensteinTitle", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" }, + { "name" : "authors", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/author/metadata/fullname/value" } + ], + "blacklists" : { + "title" : [ + "^THE ASSOCIATION AND THE GENERAL MEDICAL 
COUNCIL$", + "^Problems with perinatal pathology\\.?$", + "(?i)^Cases? of Puerperal Convulsions$", + "(?i)^Operative Gyna?ecology$", + "(?i)^Mind the gap\\!?\\:?$", + "^Chronic fatigue syndrome\\.?$", + "^Cartas? ao editor Letters? to the Editor$", + "^Note from the Editor$", + "^Anesthesia Abstract$", + + "^Annual report$", + "(?i)^“?THE RADICAL PREVENTION OF VENEREAL DISEASE\\.?”?$", + "(?i)^Graph and Table of Infectious Diseases?$", + "^Presentation$", + "(?i)^Reviews and Information on Publications$", + "(?i)^PUBLIC HEALTH SERVICES?$", + "(?i)^COMBINED TEXT-?BOOK OF OBSTETRICS AND GYN(Æ|ae)COLOGY$", + "(?i)^Adrese autora$", + "(?i)^Systematic Part .*\\. Catalogus Fossilium Austriae, Band 2: Echinoidea neogenica$", + "(?i)^Acknowledgement to Referees$", + "(?i)^Behçet's disease\\.?$", + "(?i)^Isolation and identification of restriction endonuclease.*$", + "(?i)^CEREBROVASCULAR DISEASES?.?$", + "(?i)^Screening for abdominal aortic aneurysms?\\.?$", + "^Event management$", + "(?i)^Breakfast and Crohn's disease.*\\.?$", + "^Cálculo de concentraciones en disoluciones acuosas. Ejercicio interactivo\\..*\\.$", + "(?i)^Genetic and functional analyses of SHANK2 mutations suggest a multiple hit model of Autism spectrum disorders?\\.?$", + "^Gushi hakubutsugaku$", + + "^Starobosanski nadpisi u Bosni i Hercegovini \\(.*\\)$", + "^Intestinal spirocha?etosis$", + "^Treatment of Rodent Ulcer$", + "(?i)^\\W*Cloud Computing\\W*$", + "^Compendio mathematico : en que se contienen todas las materias mas principales de las Ciencias que tratan de la cantidad$", + "^Free Communications, Poster Presentations: Session [A-F]$", + + "^“The Historical Aspects?
of Quackery\\.?”$", + "^A designated centre for people with disabilities operated by St John of God Community Services (Limited|Ltd), Louth$", + "^P(er|re)-Mile Premiums for Auto Insurance\\.?$", + "(?i)^Case Report$", + "^Boletín Informativo$", + "(?i)^Glioblastoma Multiforme$", + "(?i)^Nuevos táxones animales descritos en la península Ibérica y Macaronesia desde 1994 \\(.*\\)$", + "^Zaměstnanecké výhody$", + "(?i)^The Economics of Terrorism and Counter-Terrorism: A Survey \\(Part .*\\)$", + "(?i)^Carotid body tumours?\\.?$", + "(?i)^\\[Españoles en Francia : La condición Emigrante.*\\]$", + "^Avant-propos$", + "(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Head(s)? and Capital(s)?$", + "(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Bases?$", + "(?i)^PUBLIC HEALTH VERSUS THE STATE$", + "^Viñetas de Cortázar$", + "(?i)^Search for heavy neutrinos and W(\\[|_|\\(|_\\{|-)?R(\\]|\\)|\\})? bosons with right-handed couplings in a left-right symmetric model in pp collisions at.*TeV(\\.)?$", + "(?i)^Measurement of the pseudorapidity and centrality dependence of the transverse energy density in Pb(-?)Pb collisions at.*tev(\\.?)$", + "(?i)^Search for resonances decaying into top-quark pairs using fully hadronic decays in pp collisions with ATLAS at.*TeV$", + "(?i)^Search for neutral minimal supersymmetric standard model Higgs bosons decaying to tau pairs in pp collisions at.*tev$", + + "(?i)^Relatório de Estágio (de|em) Angiologia e Cirurgia Vascular$", + "^Aus der AGMB$", + + "^Znanstveno-stručni prilozi$", + "(?i)^Zhodnocení finanční situace podniku a návrhy na zlepšení$", + "(?i)^Evaluation of the Financial Situation in the Firm and Proposals to its Improvement$", + "(?i)^Hodnocení finanční situace podniku a návrhy na její zlepšení$", + "^Finanční analýza podniku$", + "^Financial analysis( of business)?$", + "(?i)^Textbook of Gyn(a)?(Æ)?(e)?cology$", + "^Jikken nihon shūshinsho$", + "(?i)^CORONER('|s)(s|') INQUESTS$", + "(?i)^(Μελέτη παραγόντων )?risk
management( για ανάπτυξη και εφαρμογή ενός πληροφοριακού συστήματος| και ανάπτυξη συστήματος)?$", + "(?i)^Consultants' contract(s)?$", + "(?i)^Upute autorima$", + "(?i)^Bijdrage tot de Kennis van den Godsdienst der Dajaks van Lan(d|f)ak en Tajan$", + "^Joshi shin kokubun$", + "^Kōtō shōgaku dokuhon nōson'yō$", + "^Jinjō shōgaku shōka$", + "^Shōgaku shūjichō$", + "^Nihon joshi dokuhon$", + "^Joshi shin dokuhon$", + "^Chūtō kanbun dokuhon$", + "^Wabun dokuhon$", + "(?i)^(Analysis of economy selected village or town|Rozbor hospodaření vybrané obce či města)$", + "(?i)^cardiac rehabilitation$", + "(?i)^Analytical summary$", + "^Thesaurus resolutionum Sacrae Congregationis Concilii$", + "(?i)^Sumario analítico(\\s{1})?(Analitic summary)?$", + "^Prikazi i osvrti$", + "^Rodinný dům s provozovnou$", + "^Family house with an establishment$", + "^Shinsei chūtō shin kokugun$", + "^Pulmonary alveolar proteinosis(\\.?)$", + "^Shinshū kanbun$", + "^Viñeta(s?) de Rodríguez$", + "(?i)^RUBRIKA UREDNIKA$", + "^A Matching Model of the Academic Publication Market$", + "^Yōgaku kōyō$", + + "^Internetový marketing$", + "^Internet marketing$", + "^Chūtō kokugo dokuhon$", + "^Kokugo dokuhon$", + "^Antibiotic Cover for Dental Extraction(s?)$", + "^Strategie podniku$", + "^Strategy of an Enterprise$", + "(?i)^respiratory disease(s?)(\\.?)$", + "^Award(s?) for Gallantry in Civil Defence$", + "^Podniková kultura$", + "^Corporate Culture$", + "^Severe hyponatraemia in hospital inpatient(s?)(\\.?)$", + "^Pracovní motivace$", + "^Work Motivation$", + "^Kaitei kōtō jogaku dokuhon$", + "^Konsolidovaná účetní závěrka$", + "^Consolidated Financial Statements$", + "(?i)^intracranial tumour(s?)$", + "^Climate Change Mitigation Options and Directed Technical Change: A Decentralized Equilibrium Analysis$", + "^\\[CERVECERIAS MAHOU(\\.|\\:) INTERIOR\\] \\[Material gráfico\\]$", + "^Housing Market Dynamics(\\:|\\.)
On the Contribution of Income Shocks and Credit Constraint(s?)$", + "^\\[Funciones auxiliares de la música en Radio París,.*\\]$", + "^Úroveň motivačního procesu jako způsobu vedení lidí$", + "^The level of motivation process as a leadership$", + "^Pay-beds in N(\\.?)H(\\.?)S(\\.?) Hospitals$", + "(?i)^news and events$", + "(?i)^NOVOSTI I DOGAĐAJI$", + "^Sansū no gakushū$", + "^Posouzení informačního systému firmy a návrh změn$", + "^Information System Assessment and Proposal for ICT Modification$", + "^Stresové zatížení pracovníků ve vybrané profesi$", + "^Stress load in a specific job$", + + "^Sunday: Poster Sessions, Pt.*$", + "^Monday: Poster Sessions, Pt.*$", + "^Wednesday: Poster Sessions, Pt.*", + "^Tuesday: Poster Sessions, Pt.*$", + + "^Analýza reklamy$", + "^Analysis of advertising$", + + "^Shōgaku shūshinsho$", + "^Shōgaku sansū$", + "^Shintei joshi kokubun$", + "^Taishō joshi kokubun dokuhon$", + "^Joshi kokubun$", + + "^Účetní uzávěrka a účetní závěrka v ČR$", + "(?i)^The \"?Causes\"? 
of Cancer$", + "^Normas para la publicación de artículos$", + "^Editor('|s)(s|') [Rr]eply$", + "^Editor(’|s)(s|’) letter$", + "^Redaktoriaus žodis$", + "^DISCUSSION ON THE PRECEDING PAPER$", + "^Kōtō shōgaku shūshinsho jidōyō$", + "^Shōgaku nihon rekishi$", + "^(Theory of the flow of action currents in isolated myelinated nerve fibers).*$", + "^Préface$", + "^Occupational [Hh]ealth [Ss]ervices.$", + "^In Memoriam Professor Toshiyuki TAKESHIMA$", + "^Účetní závěrka ve vybraném podniku.*$", + "^Financial statements in selected company$", + "^Abdominal [Aa]ortic [Aa]neurysms.*$", + "^Pseudomyxoma peritonei$", + "^Kazalo autora$", + + "(?i)^uvodna riječ$", + "^Motivace jako způsob vedení lidí$", + "^Motivation as a leadership$", + "^Polyfunkční dům$", + "^Multi\\-funkcional building$", + "^Podnikatelský plán$", + "(?i)^Podnikatelský záměr$", + "(?i)^Business Plan$", + "^Oceňování nemovitostí$", + "^Marketingová komunikace$", + "^Marketing communication$", + "^Sumario Analítico$", + "^Riječ uredništva$", + "^Savjetovanja i priredbe$", + "^Índice$", + "^(Starobosanski nadpisi).*$", + "^Vzdělávání pracovníků v organizaci$", + "^Staff training in organization$", + "^(Life Histories of North American Geometridae).*$", + "^Strategická analýza podniku$", + "^Strategic Analysis of an Enterprise$", + "^Sadržaj$", + "^Upute suradnicima$", + "^Rodinný dům$", + "(?i)^Fami(l)?ly house$", + "^Upute autorima$", + "^Strategic Analysis$", + "^Finanční analýza vybraného podniku$", + "^Finanční analýza$", + "^Riječ urednika$", + "(?i)^Content(s?)$", + "(?i)^Inhalt$", + "^Jinjō shōgaku shūshinsho jidōyō$", + "(?i)^Index$", + "^Chūgaku kokubun kyōkasho$", + "^Retrato de una mujer$", + "^Retrato de un hombre$", + "^Kōtō shōgaku dokuhon$", + "^Shotōka kokugo$", + "^Shōgaku dokuhon$", + "^Jinjō shōgaku kokugo dokuhon$", + "^Shinsei kokugo dokuhon$", + "^Teikoku dokuhon$", + "^Instructions to Authors$", + "^KİTAP TAHLİLİ$", + "^PRZEGLĄD PIŚMIENNICTWA$", + "(?i)^Presentación$", + 
"^İçindekiler$", + "(?i)^Tabl?e of contents$", + "^(CODICE DEL BEATO DE LOS REYES FERNANDO I Y SANCHA).*$", + "^(\\[MADRID\\. BIBL\\. NAC\\. N.*KING FERDINAND I.*FROM SAN ISIDORO DE LEON\\. FACUNDUS SCRIPSIT DATED.*\\]).*", + "^Editorial( Board)?$", + "(?i)^Editorial \\(English\\)$", + "^Editörden$", + "^(Corpus Oral Dialectal \\(COD\\)\\.).*$", + "^(Kiri Karl Morgensternile).*$", + "^(\\[Eksliibris Aleksandr).*\\]$", + "^(\\[Eksliibris Aleksandr).*$", + "^(Eksliibris Aleksandr).*$", + "^(Kiri A\\. de Vignolles).*$", + "^(2 kirja Karl Morgensternile).*$", + "^(Pirita kloostri idaosa arheoloogilised).*$", + "^(Kiri tundmatule).*$", + "^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$", + "^(Eksliibris Nikolai Birukovile).*$", + "^(Eksliibris Nikolai Issakovile).*$", + "^(WHP Cruise Summary Information of section).*$", + "^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$", + "^(Measurement of the spin\\-dependent structure function).*", + "(?i)^.*authors['’′]? reply\\.?$", + "(?i)^.*authors['’′]?
response\\.?$" + ] + } + } +} diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.prod.pace.json b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.prod.pace.json new file mode 100644 index 0000000..4e99d6d --- /dev/null +++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.prod.pace.json @@ -0,0 +1,275 @@ +{ + "wf" : { + "threshold" : "0.99", + "dedupRun" : "001", + "entityType" : "result", + "orderField" : "title", + "queueMaxSize" : "4000", + "groupMaxSize" : "40", + "slidingWindowSize" : "200", + "rootBuilder" : [ "result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments" ], + "includeChildren" : "true", + "maxChildren" : "40" + }, + "pace" : { + "clustering" : [ + { "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} }, + { "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } }, + { "name" : "lowercase", "fields" : [ "doi" ], "params" : { } } + ], + "strictConditions" : [ + { "name" : "pidMatch", "fields" : [ "pid" ] } + ], + "conditions" : [ + { "name" : "titleVersionMatch", "fields" : [ "title" ] }, + { "name" : "sizeMatch", "fields" : [ "authors" ] } + ], + "model" : [ + { "name" : "doi", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {doi}]/value" }, + { "name" : "pid", "algo" : "Null", "type" : "JSON", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid", "overrideMatch" : "true" }, + { "name" : "title", "algo" : "LevensteinTitle", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" }, + { "name" : "authors", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/metadata/author/fullname" } + ], +
"blacklists" : { + "title" : [ + "^Inside Front Cover$", + "(?i)^Poster presentations$", + "^THE ASSOCIATION AND THE GENERAL MEDICAL COUNCIL$", + "^Problems with perinatal pathology\\.?$", + "(?i)^Cases? of Puerperal Convulsions$", + "(?i)^Operative Gyna?ecology$", + "(?i)^Mind the gap\\!?\\:?$", + "^Chronic fatigue syndrome\\.?$", + "^Cartas? ao editor Letters? to the Editor$", + "^Note from the Editor$", + "^Anesthesia Abstract$", + + "^Annual report$", + "(?i)^“?THE RADICAL PREVENTION OF VENEREAL DISEASE\\.?”?$", + "(?i)^Graph and Table of Infectious Diseases?$", + "^Presentation$", + "(?i)^Reviews and Information on Publications$", + "(?i)^PUBLIC HEALTH SERVICES?$", + "(?i)^COMBINED TEXT-?BOOK OF OBSTETRICS AND GYN(Æ|ae)COLOGY$", + "(?i)^Adrese autora$", + "(?i)^Systematic Part .*\\. Catalogus Fossilium Austriae, Band 2: Echinoidea neogenica$", + "(?i)^Acknowledgement to Referees$", + "(?i)^Behçet's disease\\.?$", + "(?i)^Isolation and identification of restriction endonuclease.*$", + "(?i)^CEREBROVASCULAR DISEASES?.?$", + "(?i)^Screening for abdominal aortic aneurysms?\\.?$", + "^Event management$", + "(?i)^Breakfast and Crohn's disease.*\\.?$", + "^Cálculo de concentraciones en disoluciones acuosas. Ejercicio interactivo\\..*\\.$", + "(?i)^Genetic and functional analyses of SHANK2 mutations suggest a multiple hit model of Autism spectrum disorders?\\.?$", + "^Gushi hakubutsugaku$", + + "^Starobosanski nadpisi u Bosni i Hercegovini \\(.*\\)$", + "^Intestinal spirocha?etosis$", + "^Treatment of Rodent Ulcer$", + "(?i)^\\W*Cloud Computing\\W*$", + "^Compendio mathematico : en que se contienen todas las materias mas principales de las Ciencias que tratan de la cantidad$", + "^Free Communications, Poster Presentations: Session [A-F]$", + + "^“The Historical Aspects?
of Quackery\\.?”$", + "^A designated centre for people with disabilities operated by St John of God Community Services (Limited|Ltd), Louth$", + "^P(er|re)-Mile Premiums for Auto Insurance\\.?$", + "(?i)^Case Report$", + "^Boletín Informativo$", + "(?i)^Glioblastoma Multiforme$", + "(?i)^Nuevos táxones animales descritos en la península Ibérica y Macaronesia desde 1994 \\(.*\\)$", + "^Zaměstnanecké výhody$", + "(?i)^The Economics of Terrorism and Counter-Terrorism: A Survey \\(Part .*\\)$", + "(?i)^Carotid body tumours?\\.?$", + "(?i)^\\[Españoles en Francia : La condición Emigrante.*\\]$", + "^Avant-propos$", + "(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Head(s)? and Capital(s)?$", + "(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Bases?$", + "(?i)^PUBLIC HEALTH VERSUS THE STATE$", + "^Viñetas de Cortázar$", + "(?i)^Search for heavy neutrinos and W(\\[|_|\\(|_\\{|-)?R(\\]|\\)|\\})? bosons with right-handed couplings in a left-right symmetric model in pp collisions at.*TeV(\\.)?$", + "(?i)^Measurement of the pseudorapidity and centrality dependence of the transverse energy density in Pb(-?)Pb collisions at.*tev(\\.?)$", + "(?i)^Search for resonances decaying into top-quark pairs using fully hadronic decays in pp collisions with ATLAS at.*TeV$", + "(?i)^Search for neutral minimal supersymmetric standard model Higgs bosons decaying to tau pairs in pp collisions at.*tev$", + + "(?i)^Relatório de Estágio (de|em) Angiologia e Cirurgia Vascular$", + "^Aus der AGMB$", + + "^Znanstveno-stručni prilozi$", + "(?i)^Zhodnocení finanční situace podniku a návrhy na zlepšení$", + "(?i)^Evaluation of the Financial Situation in the Firm and Proposals to its Improvement$", + "(?i)^Hodnocení finanční situace podniku a návrhy na její zlepšení$", + "^Finanční analýza podniku$", + "^Financial analysis( of business)?$", + "(?i)^Textbook of Gyn(a)?(Æ)?(e)?cology$", + "^Jikken nihon shūshinsho$", + "(?i)^CORONER('|s)(s|') INQUESTS$", + "(?i)^(Μελέτη παραγόντων )?risk
management( για ανάπτυξη και εφαρμογή ενός πληροφοριακού συστήματος| και ανάπτυξη συστήματος)?$", + "(?i)^Consultants' contract(s)?$", + "(?i)^Upute autorima$", + "(?i)^Bijdrage tot de Kennis van den Godsdienst der Dajaks van Lan(d|f)ak en Tajan$", + "^Joshi shin kokubun$", + "^Kōtō shōgaku dokuhon nōson'yō$", + "^Jinjō shōgaku shōka$", + "^Shōgaku shūjichō$", + "^Nihon joshi dokuhon$", + "^Joshi shin dokuhon$", + "^Chūtō kanbun dokuhon$", + "^Wabun dokuhon$", + "(?i)^(Analysis of economy selected village or town|Rozbor hospodaření vybrané obce či města)$", + "(?i)^cardiac rehabilitation$", + "(?i)^Analytical summary$", + "^Thesaurus resolutionum Sacrae Congregationis Concilii$", + "(?i)^Sumario analítico(\\s{1})?(Analitic summary)?$", + "^Prikazi i osvrti$", + "^Rodinný dům s provozovnou$", + "^Family house with an establishment$", + "^Shinsei chūtō shin kokugun$", + "^Pulmonary alveolar proteinosis(\\.?)$", + "^Shinshū kanbun$", + "^Viñeta(s?) de Rodríguez$", + "(?i)^RUBRIKA UREDNIKA$", + "^A Matching Model of the Academic Publication Market$", + "^Yōgaku kōyō$", + + "^Internetový marketing$", + "^Internet marketing$", + "^Chūtō kokugo dokuhon$", + "^Kokugo dokuhon$", + "^Antibiotic Cover for Dental Extraction(s?)$", + "^Strategie podniku$", + "^Strategy of an Enterprise$", + "(?i)^respiratory disease(s?)(\\.?)$", + "^Award(s?) for Gallantry in Civil Defence$", + "^Podniková kultura$", + "^Corporate Culture$", + "^Severe hyponatraemia in hospital inpatient(s?)(\\.?)$", + "^Pracovní motivace$", + "^Work Motivation$", + "^Kaitei kōtō jogaku dokuhon$", + "^Konsolidovaná účetní závěrka$", + "^Consolidated Financial Statements$", + "(?i)^intracranial tumour(s?)$", + "^Climate Change Mitigation Options and Directed Technical Change: A Decentralized Equilibrium Analysis$", + "^\\[CERVECERIAS MAHOU(\\.|\\:) INTERIOR\\] \\[Material gráfico\\]$", + "^Housing Market Dynamics(\\:|\\.)
On the Contribution of Income Shocks and Credit Constraint(s?)$", + "^\\[Funciones auxiliares de la música en Radio París,.*\\]$", + "^Úroveň motivačního procesu jako způsobu vedení lidí$", + "^The level of motivation process as a leadership$", + "^Pay-beds in N(\\.?)H(\\.?)S(\\.?) Hospitals$", + "(?i)^news and events$", + "(?i)^NOVOSTI I DOGAĐAJI$", + "^Sansū no gakushū$", + "^Posouzení informačního systému firmy a návrh změn$", + "^Information System Assessment and Proposal for ICT Modification$", + "^Stresové zatížení pracovníků ve vybrané profesi$", + "^Stress load in a specific job$", + + "^Sunday: Poster Sessions, Pt.*$", + "^Monday: Poster Sessions, Pt.*$", + "^Wednesday: Poster Sessions, Pt.*", + "^Tuesday: Poster Sessions, Pt.*$", + + "^Analýza reklamy$", + "^Analysis of advertising$", + + "^Shōgaku shūshinsho$", + "^Shōgaku sansū$", + "^Shintei joshi kokubun$", + "^Taishō joshi kokubun dokuhon$", + "^Joshi kokubun$", + + "^Účetní uzávěrka a účetní závěrka v ČR$", + "(?i)^The \"?Causes\"? 
of Cancer$", + "^Normas para la publicación de artículos$", + "^Editor('|s)(s|') [Rr]eply$", + "^Editor(’|s)(s|’) letter$", + "^Redaktoriaus žodis$", + "^DISCUSSION ON THE PRECEDING PAPER$", + "^Kōtō shōgaku shūshinsho jidōyō$", + "^Shōgaku nihon rekishi$", + "^(Theory of the flow of action currents in isolated myelinated nerve fibers).*$", + "^Préface$", + "^Occupational [Hh]ealth [Ss]ervices.$", + "^In Memoriam Professor Toshiyuki TAKESHIMA$", + "^Účetní závěrka ve vybraném podniku.*$", + "^Financial statements in selected company$", + "^Abdominal [Aa]ortic [Aa]neurysms.*$", + "^Pseudomyxoma peritonei$", + "^Kazalo autora$", + + "(?i)^uvodna riječ$", + "^Motivace jako způsob vedení lidí$", + "^Motivation as a leadership$", + "^Polyfunkční dům$", + "^Multi\\-funkcional building$", + "^Podnikatelský plán$", + "(?i)^Podnikatelský záměr$", + "(?i)^Business Plan$", + "^Oceňování nemovitostí$", + "^Marketingová komunikace$", + "^Marketing communication$", + "^Sumario Analítico$", + "^Riječ uredništva$", + "^Savjetovanja i priredbe$", + "^Índice$", + "^(Starobosanski nadpisi).*$", + "^Vzdělávání pracovníků v organizaci$", + "^Staff training in organization$", + "^(Life Histories of North American Geometridae).*$", + "^Strategická analýza podniku$", + "^Strategic Analysis of an Enterprise$", + "^Sadržaj$", + "^Upute suradnicima$", + "^Rodinný dům$", + "(?i)^Fami(l)?ly house$", + "^Upute autorima$", + "^Strategic Analysis$", + "^Finanční analýza vybraného podniku$", + "^Finanční analýza$", + "^Riječ urednika$", + "(?i)^Content(s?)$", + "(?i)^Inhalt$", + "^Jinjō shōgaku shūshinsho jidōyō$", + "(?i)^Index$", + "^Chūgaku kokubun kyōkasho$", + "^Retrato de una mujer$", + "^Retrato de un hombre$", + "^Kōtō shōgaku dokuhon$", + "^Shotōka kokugo$", + "^Shōgaku dokuhon$", + "^Jinjō shōgaku kokugo dokuhon$", + "^Shinsei kokugo dokuhon$", + "^Teikoku dokuhon$", + "^Instructions to Authors$", + "^KİTAP TAHLİLİ$", + "^PRZEGLĄD PIŚMIENNICTWA$", + "(?i)^Presentación$", + 
"^İçindekiler$", + "(?i)^Tabl?e of contents$", + "^(CODICE DEL BEATO DE LOS REYES FERNANDO I Y SANCHA).*$", + "^(\\[MADRID\\. BIBL\\. NAC\\. N.*KING FERDINAND I.*FROM SAN ISIDORO DE LEON\\. FACUNDUS SCRIPSIT DATED.*\\]).*", + "^Editorial( Board)?$", + "(?i)^Editorial \\(English\\)$", + "^Editörden$", + "^(Corpus Oral Dialectal \\(COD\\)\\.).*$", + "^(Kiri Karl Morgensternile).*$", + "^(\\[Eksliibris Aleksandr).*\\]$", + "^(\\[Eksliibris Aleksandr).*$", + "^(Eksliibris Aleksandr).*$", + "^(Kiri A\\. de Vignolles).*$", + "^(2 kirja Karl Morgensternile).*$", + "^(Pirita kloostri idaosa arheoloogilised).*$", + "^(Kiri tundmatule).*$", + "^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$", + "^(Eksliibris Nikolai Birukovile).*$", + "^(Eksliibris Nikolai Issakovile).*$", + "^(WHP Cruise Summary Information of section).*$", + "^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$", + "^(Measurement of the spin\\-dependent structure function).*", + "(?i)^.*authors['’′]? reply\\.?$", + "(?i)^.*authors['’′]?
response\\.?$" + ] + } + } +} \ No newline at end of file diff --git a/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.simple.pace.conf b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.simple.pace.conf new file mode 100644 index 0000000..910fbcd --- /dev/null +++ b/dnet-dedup-test/src/test/resources/eu/dnetlib/pace/result.simple.pace.conf @@ -0,0 +1,21 @@ +{ + "wf" : { + "threshold" : "0.99", + "run" : "001", + "entityType" : "result", + "orderField" : "title", + "queueMaxSize" : "2000", + "groupMaxSize" : "10", + "slidingWindowSize" : "200", + "rootBuilder" : [ "result" ], + "includeChildren" : "true" + }, + "pace" : { + "conditions" : [ ], + "model" : [ + { "name" : "title", "algo" : "JaroWinkler", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" } + ], + "blacklists" : { } + } + +} \ No newline at end of file diff --git a/pom.xml b/pom.xml index 3cbb5bc..e445041 100644 --- a/pom.xml +++ b/pom.xml @@ -109,6 +109,13 @@ dnet-openaire-data-protos 3.9.3-proto250 + + eu.dnetlib + dnet-openaireplus-mapping-utils + 6.2.17-SNAPSHOT + + + com.google.guava guava