From a10e8d9f05b32d59ec5b9feda1ca0d7e5e12ba70 Mon Sep 17 00:00:00 2001 From: miconis Date: Fri, 28 Jun 2024 16:46:52 +0200 Subject: [PATCH] implementation of countryMatch and addition of workflow parameters --- .../eu/dnetlib/pace/tree/CountryMatch.java | 47 +++++++++++++++++++ .../pace/comparators/ComparatorTest.java | 19 ++++++++ .../openorgs/oozie_app/config-default.xml | 8 ++++ .../oa/dedup/openorgs/oozie_app/workflow.xml | 2 + .../dnetlib/dhp/dedup/conf/org.curr.conf.json | 2 +- 5 files changed, 77 insertions(+), 1 deletion(-) create mode 100644 dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/CountryMatch.java diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/CountryMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/CountryMatch.java new file mode 100644 index 0000000000..9cf6163562 --- /dev/null +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/CountryMatch.java @@ -0,0 +1,47 @@ +package eu.dnetlib.pace.tree; + +import java.util.Map; + +import com.wcohen.ss.AbstractStringDistance; + +import eu.dnetlib.pace.config.Config; +import eu.dnetlib.pace.tree.support.AbstractStringComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; + +@ComparatorClass("countryMatch") +public class CountryMatch extends AbstractStringComparator { + + public CountryMatch(Map params) { + super(params, new com.wcohen.ss.JaroWinkler()); + } + + public CountryMatch(final double weight) { + super(weight, new com.wcohen.ss.JaroWinkler()); + } + + protected CountryMatch(final double weight, final AbstractStringDistance ssalgo) { + super(weight, ssalgo); + } + + @Override + public double distance(final String a, final String b, final Config conf) { + if (a.isEmpty() || b.isEmpty()) { + return -1.0; // return -1 if a field is missing + } + if (a.equalsIgnoreCase("unknown") || b.equalsIgnoreCase("unknown")) { + return -1.0; // return -1 if a country is UNKNOWN + } + + return a.equals(b) ? 1.0 : 0; + } + + @Override + public double getWeight() { + return super.weight; + } + + @Override + protected double normalize(final double d) { + return d; + } +} diff --git a/dhp-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java b/dhp-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java index b37e16cf5e..8e72f4efc9 100644 --- a/dhp-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java +++ b/dhp-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java @@ -336,4 +336,23 @@ public class ComparatorTest extends AbstractPaceTest { System.out.println("compare = " + compare); } + @Test + public void countryMatch() { + + CountryMatch countryMatch = new CountryMatch(params); + + double result = countryMatch.distance("UNKNOWN", "UNKNOWN", conf); + assertEquals(-1.0, result); + + result = countryMatch.distance("CHILE", "UNKNOWN", conf); + assertEquals(-1.0, result); + + result = countryMatch.distance("CHILE", "ITALY", conf); + assertEquals(0.0, result); + + result = countryMatch.distance("CHILE", "CHILE", conf); + assertEquals(1.0, result); + + } + } diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/openorgs/oozie_app/config-default.xml b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/openorgs/oozie_app/config-default.xml index 2e0ed9aeea..6d375f03f9 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/openorgs/oozie_app/config-default.xml +++ b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/openorgs/oozie_app/config-default.xml @@ -15,4 +15,12 @@ oozie.action.sharelib.for.spark spark2 + + hiveMetastoreUris + thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 + + + pivotHistoryDatabase + + \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/openorgs/oozie_app/workflow.xml b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/openorgs/oozie_app/workflow.xml index 6947019e8b..7c633facc3 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/openorgs/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/openorgs/oozie_app/workflow.xml @@ -198,6 +198,8 @@ --isLookUpUrl${isLookUpUrl} --actionSetId${actionSetId} --cutConnectedComponent${cutConnectedComponent} + --hiveMetastoreUris${hiveMetastoreUris} + --pivotHistoryDatabase${pivotHistoryDatabase} diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json index 726f2b8997..917c9426b9 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json +++ b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json @@ -49,7 +49,7 @@ }, { "field": "country", - "comparator": "exactMatch", + "comparator": "countryMatch", "weight": 1, "countIfUndefined": "true", "params": {}