diff --git a/dhp-common/pom.xml b/dhp-common/pom.xml
index 1dc3208b5..b295bc1f1 100644
--- a/dhp-common/pom.xml
+++ b/dhp-common/pom.xml
@@ -92,6 +92,17 @@
 			<groupId>com.squareup.okhttp3</groupId>
 			<artifactId>okhttp</artifactId>
 		</dependency>
+
+		<dependency>
+			<groupId>eu.dnetlib</groupId>
+			<artifactId>dnet-pace-core</artifactId>
+		</dependency>
+
+		<dependency>
+			<groupId>eu.dnetlib.dhp</groupId>
+			<artifactId>dhp-schemas</artifactId>
+			<version>${project.version}</version>
+		</dependency>
diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java b/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java
new file mode 100644
index 000000000..bc86a0245
--- /dev/null
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java
@@ -0,0 +1,168 @@
+package eu.dnetlib.dhp.oa.merge;
+import java.text.Normalizer;
+import java.util.*;
+import java.util.stream.Collectors;
+
+import org.apache.commons.lang3.StringUtils;
+
+import com.wcohen.ss.JaroWinkler;
+
+import eu.dnetlib.dhp.schema.oaf.Author;
+import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
+import eu.dnetlib.pace.model.Person;
+import scala.Tuple2;
+
+public class AuthorMerger {
+
+ private static final Double THRESHOLD = 0.95;
+
+	public static List<Author> merge(List<List<Author>> authors) {
+
+		authors.sort((o1, o2) -> -Integer.compare(countAuthorsPids(o1), countAuthorsPids(o2)));
+
+		List<Author> author = new ArrayList<>();
+
+		for (List<Author> a : authors) {
+ author = mergeAuthor(author, a);
+ }
+
+ return author;
+
+ }
+
+	public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b) {
+		int pa = countAuthorsPids(a);
+		int pb = countAuthorsPids(b);
+		List<Author> base, enrich;
+ int sa = authorsSize(a);
+ int sb = authorsSize(b);
+
+ if (pa == pb) {
+ base = sa > sb ? a : b;
+ enrich = sa > sb ? b : a;
+ } else {
+ base = pa > pb ? a : b;
+ enrich = pa > pb ? b : a;
+ }
+ enrichPidFromList(base, enrich);
+ return base;
+ }
+
+	private static void enrichPidFromList(List<Author> base, List<Author> enrich) {
+		if (base == null || enrich == null)
+			return;
+		final Map<String, Author> basePidAuthorMap = base
+ .stream()
+ .filter(a -> a.getPid() != null && a.getPid().size() > 0)
+ .flatMap(
+ a -> a
+ .getPid()
+ .stream()
+ .map(p -> new Tuple2<>(pidToComparableString(p), a)))
+ .collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1));
+
+		final List<Tuple2<StructuredProperty, Author>> pidToEnrich = enrich
+ .stream()
+ .filter(a -> a.getPid() != null && a.getPid().size() > 0)
+ .flatMap(
+ a -> a
+ .getPid()
+ .stream()
+ .filter(p -> !basePidAuthorMap.containsKey(pidToComparableString(p)))
+ .map(p -> new Tuple2<>(p, a)))
+ .collect(Collectors.toList());
+
+ pidToEnrich
+ .forEach(
+ a -> {
+					Optional<Tuple2<Double, Author>> simAuthor = base
+ .stream()
+ .map(ba -> new Tuple2<>(sim(ba, a._2()), ba))
+ .max(Comparator.comparing(Tuple2::_1));
+
+ if (simAuthor.isPresent()) {
+ double th = THRESHOLD;
+ // increase the threshold if the surname is too short
+ if (simAuthor.get()._2().getSurname() != null
+ && simAuthor.get()._2().getSurname().length() <= 3)
+ th = 0.99;
+
+ if (simAuthor.get()._1() > th) {
+ Author r = simAuthor.get()._2();
+ if (r.getPid() == null) {
+ r.setPid(new ArrayList<>());
+ }
+ r.getPid().add(a._1());
+ }
+ }
+ });
+ }
+
+ public static String pidToComparableString(StructuredProperty pid) {
+ return (pid.getQualifier() != null
+ ? pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase() : ""
+ : "")
+ + (pid.getValue() != null ? pid.getValue().toLowerCase() : "");
+ }
+
+	public static int countAuthorsPids(List<Author> authors) {
+ if (authors == null)
+ return 0;
+
+ return (int) authors.stream().filter(AuthorMerger::hasPid).count();
+ }
+
+	private static int authorsSize(List<Author> authors) {
+ if (authors == null)
+ return 0;
+ return authors.size();
+ }
+
+ private static Double sim(Author a, Author b) {
+
+ final Person pa = parse(a);
+ final Person pb = parse(b);
+
+ // if both are accurate (e.g. they have name and surname)
+ if (pa.isAccurate() & pb.isAccurate()) {
+ return new JaroWinkler().score(normalize(pa.getSurnameString()), normalize(pb.getSurnameString())) * 0.5
+ + new JaroWinkler().score(normalize(pa.getNameString()), normalize(pb.getNameString())) * 0.5;
+ } else {
+ return new JaroWinkler()
+ .score(normalize(pa.getNormalisedFullname()), normalize(pb.getNormalisedFullname()));
+ }
+ }
+
+ private static boolean hasPid(Author a) {
+ if (a == null || a.getPid() == null || a.getPid().size() == 0)
+ return false;
+ return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue()));
+ }
+
+ private static Person parse(Author author) {
+ if (StringUtils.isNotBlank(author.getSurname())) {
+ return new Person(author.getSurname() + ", " + author.getName(), false);
+ } else {
+ return new Person(author.getFullname(), false);
+ }
+ }
+
+ private static String normalize(final String s) {
+ return nfd(s)
+ .toLowerCase()
+ // do not compact the regexes in a single expression, would cause StackOverflowError
+ // in case
+ // of large input strings
+ .replaceAll("(\\W)+", " ")
+ .replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
+ .replaceAll("(\\p{Punct})+", " ")
+ .replaceAll("(\\d)+", " ")
+ .replaceAll("(\\n)+", " ")
+ .trim();
+ }
+
+ private static String nfd(final String s) {
+ return Normalizer.normalize(s, Normalizer.Form.NFD);
+ }
+
+}
diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/AuthorMerger.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/AuthorMerger.java
deleted file mode 100644
index ee5fd5165..000000000
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/AuthorMerger.java
+++ /dev/null
@@ -1,170 +0,0 @@
-
-package eu.dnetlib.dhp.oa.dedup;
-
-import java.text.Normalizer;
-import java.util.*;
-import java.util.stream.Collectors;
-
-import org.apache.commons.lang3.StringUtils;
-
-import com.wcohen.ss.JaroWinkler;
-
-import eu.dnetlib.dhp.schema.oaf.Author;
-import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
-import eu.dnetlib.pace.model.Person;
-import scala.Tuple2;
-
-public class AuthorMerger {
-
- private static final Double THRESHOLD = 0.95;
-
-	public static List<Author> merge(List<List<Author>> authors) {
-
-		authors.sort((o1, o2) -> -Integer.compare(countAuthorsPids(o1), countAuthorsPids(o2)));
-
-		List<Author> author = new ArrayList<>();
-
-		for (List<Author> a : authors) {
- author = mergeAuthor(author, a);
- }
-
- return author;
-
- }
-
-	public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b) {
-		int pa = countAuthorsPids(a);
-		int pb = countAuthorsPids(b);
-		List<Author> base, enrich;
- int sa = authorsSize(a);
- int sb = authorsSize(b);
-
- if (pa == pb) {
- base = sa > sb ? a : b;
- enrich = sa > sb ? b : a;
- } else {
- base = pa > pb ? a : b;
- enrich = pa > pb ? b : a;
- }
- enrichPidFromList(base, enrich);
- return base;
- }
-
-	private static void enrichPidFromList(List<Author> base, List<Author> enrich) {
-		if (base == null || enrich == null)
-			return;
-		final Map<String, Author> basePidAuthorMap = base
- .stream()
- .filter(a -> a.getPid() != null && a.getPid().size() > 0)
- .flatMap(
- a -> a
- .getPid()
- .stream()
- .map(p -> new Tuple2<>(pidToComparableString(p), a)))
- .collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1));
-
-		final List<Tuple2<StructuredProperty, Author>> pidToEnrich = enrich
- .stream()
- .filter(a -> a.getPid() != null && a.getPid().size() > 0)
- .flatMap(
- a -> a
- .getPid()
- .stream()
- .filter(p -> !basePidAuthorMap.containsKey(pidToComparableString(p)))
- .map(p -> new Tuple2<>(p, a)))
- .collect(Collectors.toList());
-
- pidToEnrich
- .forEach(
- a -> {
-					Optional<Tuple2<Double, Author>> simAuthor = base
- .stream()
- .map(ba -> new Tuple2<>(sim(ba, a._2()), ba))
- .max(Comparator.comparing(Tuple2::_1));
-
- if (simAuthor.isPresent()) {
- double th = THRESHOLD;
- // increase the threshold if the surname is too short
- if (simAuthor.get()._2().getSurname() != null
- && simAuthor.get()._2().getSurname().length() <= 3)
- th = 0.99;
-
- if (simAuthor.get()._1() > th) {
- Author r = simAuthor.get()._2();
- if (r.getPid() == null) {
- r.setPid(new ArrayList<>());
- }
- r.getPid().add(a._1());
- }
- }
- });
- }
-
- public static String pidToComparableString(StructuredProperty pid) {
- return (pid.getQualifier() != null
- ? pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase() : ""
- : "")
- + (pid.getValue() != null ? pid.getValue().toLowerCase() : "");
- }
-
-	public static int countAuthorsPids(List<Author> authors) {
- if (authors == null)
- return 0;
-
- return (int) authors.stream().filter(AuthorMerger::hasPid).count();
- }
-
-	private static int authorsSize(List<Author> authors) {
- if (authors == null)
- return 0;
- return authors.size();
- }
-
- private static Double sim(Author a, Author b) {
-
- final Person pa = parse(a);
- final Person pb = parse(b);
-
- // if both are accurate (e.g. they have name and surname)
- if (pa.isAccurate() & pb.isAccurate()) {
- return new JaroWinkler().score(normalize(pa.getSurnameString()), normalize(pb.getSurnameString())) * 0.5
- + new JaroWinkler().score(normalize(pa.getNameString()), normalize(pb.getNameString())) * 0.5;
- } else {
- return new JaroWinkler()
- .score(normalize(pa.getNormalisedFullname()), normalize(pb.getNormalisedFullname()));
- }
- }
-
- private static boolean hasPid(Author a) {
- if (a == null || a.getPid() == null || a.getPid().size() == 0)
- return false;
- return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue()));
- }
-
- private static Person parse(Author author) {
- if (StringUtils.isNotBlank(author.getSurname())) {
- return new Person(author.getSurname() + ", " + author.getName(), false);
- } else {
- return new Person(author.getFullname(), false);
- }
- }
-
- private static String normalize(final String s) {
- return nfd(s)
- .toLowerCase()
- // do not compact the regexes in a single expression, would cause StackOverflowError
- // in case
- // of large input strings
- .replaceAll("(\\W)+", " ")
- .replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
- .replaceAll("(\\p{Punct})+", " ")
- .replaceAll("(\\d)+", " ")
- .replaceAll("(\\n)+", " ")
- .trim();
- }
-
- private static String nfd(final String s) {
- return Normalizer.normalize(s, Normalizer.Form.NFD);
- }
-
-}
diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java
index 8028d5a94..6a030f376 100644
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java
@@ -1,11 +1,12 @@
package eu.dnetlib.dhp.oa.dedup;
-import java.io.Serializable;
+
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
+import eu.dnetlib.dhp.oa.merge.AuthorMerger;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.Dataset;
diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java
index 4fbd7c223..e00f6ac2a 100644
--- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/EntityMergerTest.java
@@ -10,6 +10,7 @@ import java.io.Serializable;
import java.nio.file.Paths;
import java.util.*;
+import eu.dnetlib.dhp.oa.merge.AuthorMerger;
import org.codehaus.jackson.map.ObjectMapper;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
diff --git a/dhp-workflows/dhp-graph-mapper/pom.xml b/dhp-workflows/dhp-graph-mapper/pom.xml
index 38c5c8af7..3e1d84c01 100644
--- a/dhp-workflows/dhp-graph-mapper/pom.xml
+++ b/dhp-workflows/dhp-graph-mapper/pom.xml
@@ -83,13 +83,6 @@
 			<artifactId>dhp-schemas</artifactId>
 			<version>${project.version}</version>
 		</dependency>
-
-		<dependency>
-			<groupId>eu.dnetlib.dhp</groupId>
-			<artifactId>dhp-dedup-openaire</artifactId>
-			<version>${project.version}</version>
-		</dependency>
-
 		<dependency>
 			<groupId>com.jayway.jsonpath</groupId>
 			<artifactId>json-path</artifactId>
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/EBIAggregator.scala b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/EBIAggregator.scala
index 90d665e0c..ee2dbadfd 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/EBIAggregator.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/ebi/EBIAggregator.scala
@@ -1,5 +1,5 @@
package eu.dnetlib.dhp.sx.ebi
-import eu.dnetlib.dhp.oa.dedup.AuthorMerger
+import eu.dnetlib.dhp.oa.merge.AuthorMerger
import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Dataset => OafDataset}
import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication, DLIUnknown}
import org.apache.spark.sql.{Encoder, Encoders}