From 101d9e830dc914bac5969317e78a12a3d5957045 Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Wed, 11 Dec 2024 15:59:13 +0100 Subject: [PATCH] JsonListMatch do not lower the extracted strings Fix test configurations and assertions --- .../eu/dnetlib/pace/tree/JsonListMatch.java | 2 +- .../dhp/oa/dedup/DecisionTreeTest.java | 8 +++--- .../dnetlib/dhp/oa/dedup/SparkDedupTest.java | 27 ++++++++++--------- .../dhp/oa/dedup/SparkOpenorgsDedupTest.java | 4 +-- .../oa/dedup/SparkPublicationRootsTest.java | 2 +- .../oa/dedup/SparkPublicationRootsTest2.java | 9 ++++--- .../dnetlib/dhp/oa/dedup/SparkStatsTest.java | 2 +- .../dhp/oa/dedup/jpath/JsonPathTest.java | 6 ++--- .../dnetlib/dhp/dedup/conf/org.curr.conf.json | 24 +++++++---------- 9 files changed, 39 insertions(+), 45 deletions(-) diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JsonListMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JsonListMatch.java index d9558df90b..2f845b0674 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JsonListMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JsonListMatch.java @@ -86,7 +86,7 @@ public class JsonListMatch extends AbstractListComparator { // for each path in the param list for (String key : params.keySet().stream().filter(k -> k.contains("jpath")).collect(Collectors.toList())) { String path = params.get(key); - String value = MapDocumentUtil.getJPathString(path, documentContext).toLowerCase(); + String value = MapDocumentUtil.getJPathString(path, documentContext); if (value == null || value.isEmpty()) value = ""; st.append(value); diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/DecisionTreeTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/DecisionTreeTest.java index 6acc65e052..5094317cb5 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/DecisionTreeTest.java +++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/DecisionTreeTest.java @@ -21,17 +21,15 @@ class DecisionTreeTest { void testJPath() throws IOException { DedupConfig conf = DedupConfig - .load(IOUtils.toString(getClass().getResourceAsStream("dedup_conf_organization.json"))); + .load(IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/dedup/jpath/dedup_conf_organization.json"))); - final String org = IOUtils.toString(getClass().getResourceAsStream("organization.json")); + final String org = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/dedup/jpath/organization.json")); Row row = SparkModel.apply(conf).rowFromJson(org); System.out.println("row = " + row); Assertions.assertNotNull(row); Assertions.assertTrue(StringUtils.isNotBlank(row.getAs("identifier"))); - - System.out.println("row = " + row.getAs("countrytitle")); } @Test @@ -44,7 +42,7 @@ class DecisionTreeTest { .getResourceAsStream( "/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json"))); - final String org = IOUtils.toString(getClass().getResourceAsStream("organization_example1.json")); + final String org = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/dedup/jpath/organization_example1.json")); Row row = SparkModel.apply(conf).rowFromJson(org); // to check that the same parsing returns the same row diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java index 3bfd861f83..d2d5af501a 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java +++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java @@ -190,7 +190,7 @@ public class SparkDedupTest implements Serializable { System.out.println("orp_simrel = " + orp_simrel); if (CHECK_CARDINALITIES) { - assertEquals(742, orgs_simrel); + assertEquals(720, orgs_simrel); assertEquals(566, pubs_simrel); assertEquals(113, sw_simrel); assertEquals(148, ds_simrel); @@ -251,7 +251,7 @@ public class SparkDedupTest implements Serializable { // entities simrels supposed to be equal to the number of previous step (no rels in whitelist) if (CHECK_CARDINALITIES) { - assertEquals(742, orgs_simrel); + assertEquals(720, orgs_simrel); assertEquals(566, pubs_simrel); assertEquals(148, ds_simrel); assertEquals(280, orp_simrel); @@ -440,25 +440,26 @@ public class SparkDedupTest implements Serializable { .count(); final List merges = pubs - .filter("source == '50|arXiv_dedup_::c93aeb433eb90ed7a86e29be00791b7c'") + .filter("source == '50|doi_dedup___::d5021b53204e4fdeab6ff5d5bc468032'")// and relClass = '"+ModelConstants.MERGES+"'") .collectAsList(); - assertEquals(1, merges.size()); + assertEquals(4, merges.size()); Set dups = Sets .newHashSet( "50|doi_________::3b1d0d8e8f930826665df9d6b82fbb73", "50|doi_________::d5021b53204e4fdeab6ff5d5bc468032", - "50|arXiv_______::c93aeb433eb90ed7a86e29be00791b7c"); + "50|arXiv_______::c93aeb433eb90ed7a86e29be00791b7c", + "50|arXiv_dedup_::c93aeb433eb90ed7a86e29be00791b7c"); merges.forEach(r -> { assertEquals(ModelConstants.RESULT_RESULT, r.getRelType()); assertEquals(ModelConstants.DEDUP, r.getSubRelType()); - assertEquals(ModelConstants.IS_MERGED_IN, r.getRelClass()); + assertEquals(ModelConstants.MERGES, r.getRelClass()); assertTrue(dups.contains(r.getTarget())); }); final List mergedIn = pubs - .filter("target == '50|arXiv_dedup_::c93aeb433eb90ed7a86e29be00791b7c'") + .filter("target == '50|doi_dedup___::d5021b53204e4fdeab6ff5d5bc468032'") .collectAsList(); - assertEquals(3, mergedIn.size()); + assertEquals(4, mergedIn.size()); mergedIn.forEach(r -> { assertEquals(ModelConstants.RESULT_RESULT, r.getRelType()); assertEquals(ModelConstants.DEDUP, r.getSubRelType()); @@ -473,8 +474,8 @@ public class SparkDedupTest implements Serializable { System.out.println("orp_mergerel = " + orp_mergerel); if (CHECK_CARDINALITIES) { - assertEquals(1268, orgs_mergerel); - assertEquals(1156, pubs.count()); + assertEquals(1280, orgs_mergerel); + assertEquals(1158, pubs.count()); assertEquals(292, sw_mergerel); assertEquals(476, ds_mergerel); assertEquals(742, orp_mergerel); @@ -561,7 +562,7 @@ public class SparkDedupTest implements Serializable { System.out.println("orp_mergerel = " + orp_mergerel); if (CHECK_CARDINALITIES) { - assertEquals(1278, orgs_mergerel); + assertEquals(1280, orgs_mergerel); assertEquals(1156, pubs.count()); assertEquals(292, sw_mergerel); assertEquals(476, ds_mergerel); @@ -618,7 +619,7 @@ public class SparkDedupTest implements Serializable { System.out.println("orp_deduprecord = " + orp_deduprecord); if (CHECK_CARDINALITIES) { - assertEquals(78, orgs_deduprecord); + assertEquals(87, orgs_deduprecord); assertEquals(96, pubs.count()); assertEquals(47, sw_deduprecord); assertEquals(97, ds_deduprecord); @@ -761,7 +762,7 @@ public class SparkDedupTest implements Serializable { if (CHECK_CARDINALITIES) { assertEquals(930, publications); - assertEquals(831, organizations); + assertEquals(840, organizations); assertEquals(100, projects); assertEquals(100, datasource); assertEquals(196, softwares); diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkOpenorgsDedupTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkOpenorgsDedupTest.java index b2b5d824b3..6b3ef8a209 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkOpenorgsDedupTest.java +++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkOpenorgsDedupTest.java @@ -146,7 +146,7 @@ public class SparkOpenorgsDedupTest implements Serializable { .load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "organization")) .count(); - assertEquals(92, orgs_simrel); + assertEquals(91, orgs_simrel); } @Test @@ -175,7 +175,7 @@ public class SparkOpenorgsDedupTest implements Serializable { .load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "organization")) .count(); - assertEquals(128, orgs_simrel); + assertEquals(127, orgs_simrel); } @Test diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkPublicationRootsTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkPublicationRootsTest.java index 9d73475be3..2efa26c020 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkPublicationRootsTest.java +++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkPublicationRootsTest.java @@ -324,7 +324,7 @@ public class SparkPublicationRootsTest implements Serializable { private void verifyRoot_case_3(Dataset roots, Dataset pubs) { Publication root = roots - .filter("id = '50|dedup_wf_001::31ca734cc22181b704c4aa8fd050062a'") + .filter("id = '50|dedup_wf_002::7143f4ff5708f3657db0b7e68ea74d55'") .first(); assertNotNull(root); diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkPublicationRootsTest2.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkPublicationRootsTest2.java index 9afe1e34b8..1e73801ff9 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkPublicationRootsTest2.java +++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkPublicationRootsTest2.java @@ -143,7 +143,9 @@ public class SparkPublicationRootsTest2 implements Serializable { "--graphBasePath", graphInputPath, "--actionSetId", testActionSetId, "--isLookUpUrl", "lookupurl", - "--workingPath", workingPath + "--workingPath", workingPath, + "--hiveMetastoreUris", "", + }), spark) .run(isLookUpService); @@ -153,7 +155,7 @@ public class SparkPublicationRootsTest2 implements Serializable { .as(Encoders.bean(Relation.class)); assertEquals( - 3, merges + 4, merges .filter("relclass == 'isMergedIn'") .map((MapFunction) Relation::getTarget, Encoders.STRING()) .distinct() @@ -178,7 +180,7 @@ public class SparkPublicationRootsTest2 implements Serializable { .textFile(workingPath + "/" + testActionSetId + "/publication_deduprecord") .map(asEntity(Publication.class), Encoders.bean(Publication.class)); - assertEquals(3, roots.count()); + assertEquals(4, roots.count()); final Dataset pubs = spark .read() @@ -195,7 +197,6 @@ public class SparkPublicationRootsTest2 implements Serializable { .collectAsList() .get(0); - assertEquals(crossref_duplicate.getDateofacceptance().getValue(), root.getDateofacceptance().getValue()); assertEquals(crossref_duplicate.getJournal().getName(), root.getJournal().getName()); assertEquals(crossref_duplicate.getJournal().getIssnPrinted(), root.getJournal().getIssnPrinted()); assertEquals(crossref_duplicate.getPublisher().getValue(), root.getPublisher().getValue()); diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkStatsTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkStatsTest.java index 19f2c81024..d21a0dcf41 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkStatsTest.java +++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkStatsTest.java @@ -168,7 +168,7 @@ public class SparkStatsTest implements Serializable { .load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_blockstats") .count(); - assertEquals(414, orgs_blocks); + assertEquals(406, orgs_blocks); assertEquals(221, pubs_blocks); assertEquals(134, sw_blocks); assertEquals(196, ds_blocks); diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/jpath/JsonPathTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/jpath/JsonPathTest.java index 18c9ce18df..0923f67fae 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/jpath/JsonPathTest.java +++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/jpath/JsonPathTest.java @@ -19,17 +19,15 @@ class JsonPathTest { void testJPath() throws IOException { DedupConfig conf = DedupConfig - .load(IOUtils.toString(getClass().getResourceAsStream("dedup_conf_organization.json"))); + .load(IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/dedup/jpath/dedup_conf_organization.json"))); - final String org = IOUtils.toString(getClass().getResourceAsStream("organization.json")); + final String org = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/dedup/jpath/organization.json")); Row row = SparkModel.apply(conf).rowFromJson(org); System.out.println("row = " + row); Assertions.assertNotNull(row); Assertions.assertTrue(StringUtils.isNotBlank(row.getAs("identifier"))); - - System.out.println("row = " + row.getAs("countrytitle")); } @Test diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json index f00f6198e1..388f0b6ff1 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json +++ b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json @@ -24,22 +24,19 @@ "start": { "fields": [ { - "field": "gridid", - "comparator": "exactMatch", + "field": "pid", + "comparator": "jsonListMatch", "weight": 1, "countIfUndefined": "false", - "params": {} - }, - { - "field": "rorid", - "comparator": "exactMatch", - "weight": 1, - "countIfUndefined": "false", - "params": {} + "params": { + "jpath_classid": "$.qualifier.classid", + "jpath_value": "$.value", + "mode": "type" + } } ], "threshold": 1, - "aggregation": "OR", + "aggregation": "MAX", "positive": "MATCH", "negative": "NO_MATCH", "undefined": "necessaryConditions", @@ -149,11 +146,10 @@ "model" : [ { "name" : "country", "type" : "String", "path" : "$.country.classid", "infer" : "country", "inferenceFrom" : "$.legalname.value"}, { "name" : "legalshortname", "type" : "String", "path" : "$.legalshortname.value", "infer" : "city_keyword"}, - { "name" : "original_legalname", "type" : "String", "path" : "$.legalname.value" }, + { "name" : "original_legalname", "type" : "String", "path" : "$.legalname.value", "clean": "title"}, { "name" : "legalname", "type" : "String", "path" : "$.legalname.value", "infer" : "city_keyword"}, { "name" : "websiteurl", "type" : "URL", "path" : "$.websiteurl.value" }, - { "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid =='grid')].value"}, - { "name" : "rorid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid =='ROR')].value"}, + { "name": "pid", "type": "JSON", "path": "$.pid[*]", "overrideMatch": "true"}, { "name" : "originalId", "type" : "String", "path" : "$.id" } ], "blacklists" : {},