From d6498278edc87aeb15ee61b33edf7f280829b56a Mon Sep 17 00:00:00 2001
From: Enrico Ottonello
Date: Thu, 25 Jun 2020 18:43:29 +0200
Subject: [PATCH] added workflow to generate seq(orcidId,work) and
 seq(orcidId,enrichedWork)

---
 .../orcid/ActivitiesDecompressor.java         |   2 +-
 .../doiboost/orcid/SummariesDecompressor.java |   2 +-
 .../doiboost/orcid/json/JsonHelper.java       |  16 +
 .../orcidnodoi/ActivitiesDumpReader.java      | 149 +++++
 .../orcidnodoi/GenOrcidAuthorWork.java        |  52 ++
 .../SparkGenEnrichedOrcidWorks.java           | 119 ++++
 .../json/JsonWriter.java                      |   2 +-
 .../orcidnodoi/model/Contributor.java         |   6 +-
 .../orcidnodoi/model/WorkDataNoDoi.java       |   1 -
 .../orcidnodoi/similarity/AuthorMatcher.java  | 204 +++++++
 .../oozie_app/config-default.xml              |  22 +
 .../oozie_app/workflow.xml                    | 524 ++++++++++++++++++
 .../gen_enriched_orcid_works_parameters.json  |   7 +
 .../orcidnodoi/xml/OrcidNoDoiTest.java        | 250 +--------
 14 files changed, 1125 insertions(+), 231 deletions(-)
 create mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/json/JsonHelper.java
 create mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java
 create mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java
 create mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java
 rename dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/{orcid => orcidnodoi}/json/JsonWriter.java (94%)
 create mode 100644 dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java
 create mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/config-default.xml
 create mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml
 create mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works_parameters.json

diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ActivitiesDecompressor.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ActivitiesDecompressor.java
index 570fdef17..80ccd71a1 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ActivitiesDecompressor.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ActivitiesDecompressor.java
@@ -19,7 +19,7 @@ import org.apache.hadoop.io.compress.CompressionCodec;
 import org.apache.hadoop.io.compress.CompressionCodecFactory;
 import org.mortbay.log.Log;
 
-import eu.dnetlib.doiboost.orcid.json.JsonWriter;
+import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter;
 import eu.dnetlib.doiboost.orcid.model.WorkData;
 import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser;
 
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java
index f0bbb5c32..603bfedf6 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/SummariesDecompressor.java
@@ -19,7 +19,7 @@ import org.apache.hadoop.io.compress.CompressionCodec;
 import org.apache.hadoop.io.compress.CompressionCodecFactory;
 import org.mortbay.log.Log;
 
-import eu.dnetlib.doiboost.orcid.json.JsonWriter;
+import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter;
 import eu.dnetlib.doiboost.orcid.model.AuthorData;
 import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser;
 
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/json/JsonHelper.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/json/JsonHelper.java
new file mode 100644
index 000000000..13a3cee8f
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/json/JsonHelper.java
@@ -0,0 +1,16 @@
+
+package eu.dnetlib.doiboost.orcid.json;
+
+import com.google.gson.Gson;
+import com.google.gson.JsonObject;
+import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
+
+public class JsonHelper {
+
+	public static String createOidWork(WorkDataNoDoi workData) {
+		JsonObject oidWork = new JsonObject();
+		oidWork.addProperty("oid", workData.getOid());
+		oidWork.addProperty("work", new Gson().toJson(workData));
+		return oidWork.toString();
+	}
+}
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java
new file mode 100644
index 000000000..7eb6faf54
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/ActivitiesDumpReader.java
@@ -0,0 +1,149 @@
+
+package eu.dnetlib.doiboost.orcidnodoi;
+
+import eu.dnetlib.doiboost.orcid.json.JsonHelper;
+import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter;
+import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
+import eu.dnetlib.doiboost.orcidnodoi.xml.XMLRecordParserNoDoi;
+import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
+import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.compress.CompressionCodec;
+import org.apache.hadoop.io.compress.CompressionCodecFactory;
+import org.mortbay.log.Log;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.net.URI;
+
+public class ActivitiesDumpReader {
+
+	private static final int MAX_XML_WORKS_PARSED = -1;
+	private static final int XML_WORKS_PARSED_COUNTER_LOG_INTERVAL = 100000;
+
+	public static void parseGzActivities(Configuration conf, String inputUri, Path outputPath)
+		throws Exception {
+		String uri = inputUri;
+		FileSystem fs = FileSystem.get(URI.create(uri), conf);
+		Path inputPath = new Path(uri);
+		CompressionCodecFactory factory = new CompressionCodecFactory(conf);
+		CompressionCodec codec = factory.getCodec(inputPath);
+		if (codec == null) {
+			System.err.println("No codec found for " + uri);
+			System.exit(1);
+		}
+		CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());
+		InputStream gzipInputStream = null;
+		try {
+			gzipInputStream = codec.createInputStream(fs.open(inputPath));
+			parseTarActivities(fs, conf, gzipInputStream, outputPath);
+
+		} finally {
+			Log.debug("Closing gzip stream");
+			IOUtils.closeStream(gzipInputStream);
+		}
+	}
+
+	private static void parseTarActivities(
+		FileSystem fs, Configuration conf, InputStream gzipInputStream, Path outputPath) {
+		int counter = 0;
+		int noDoiFound = 0;
+		int errorFromOrcidFound = 0;
+		int xmlParserErrorFound = 0;
+		try (TarArchiveInputStream tais = new TarArchiveInputStream(gzipInputStream)) {
+			TarArchiveEntry entry = null;
+
+			try (SequenceFile.Writer writer = SequenceFile
+				.createWriter(
+					conf,
+					SequenceFile.Writer.file(outputPath),
+					SequenceFile.Writer.keyClass(Text.class),
+					SequenceFile.Writer.valueClass(Text.class))) {
+				while ((entry = tais.getNextTarEntry()) != null) {
+					String filename = entry.getName();
+
+					try {
+						if (!entry.isDirectory() && filename.contains("works")) {
+							Log.debug("XML work entry name: " + entry.getName());
+							counter++;
+							// read the XML of the work directly from the tar stream
+							BufferedReader br = new BufferedReader(new InputStreamReader(tais));
+							String line;
+							StringBuilder buffer = new StringBuilder();
+							while ((line = br.readLine()) != null) {
+								buffer.append(line);
+							}
+							WorkDataNoDoi workDataNoDoi = XMLRecordParserNoDoi
+								.VTDParseWorkData(buffer.toString().getBytes());
+							if (workDataNoDoi != null) {
+								if (workDataNoDoi.getErrorCode() != null) {
+									errorFromOrcidFound += 1;
+									Log
+										.debug(
+											"error from Orcid with code "
+												+ workDataNoDoi.getErrorCode()
+												+ " for entry "
+												+ entry.getName());
+									continue;
+								}
+								boolean isDoiFound = workDataNoDoi
+									.getExtIds()
+									.stream()
+									.filter(e -> e.getType() != null)
+									.anyMatch(e -> e.getType().equals("doi"));
+								if (!isDoiFound) {
+									String jsonData = JsonHelper.createOidWork(workDataNoDoi);
+									Log.debug("oid: " + workDataNoDoi.getOid() + " data: " + jsonData);
+
+									final Text key = new Text(workDataNoDoi.getOid());
+									final Text value = new Text(jsonData);
+
+									try {
+										writer.append(key, value);
+									} catch (IOException e) {
+										Log.debug("Writing to sequence file: " + e.getMessage());
+										Log.debug(e);
+										throw new RuntimeException(e);
+									}
+									noDoiFound += 1;
+								}
+							} else {
+								Log.warn("Data not retrievable [" + entry.getName() + "] " + buffer.toString());
+								xmlParserErrorFound += 1;
+							}
+						}
+					} catch (Exception e) {
+						Log
+							.warn(
+								"Parsing work from tar archive and xml work: " + filename + " " + e.getMessage());
+						Log.warn(e);
+					}
+
+					if ((counter % XML_WORKS_PARSED_COUNTER_LOG_INTERVAL) == 0) {
+						Log.info("Current xml works parsed: " + counter);
+					}
+
+					if ((MAX_XML_WORKS_PARSED > -1) && (counter > MAX_XML_WORKS_PARSED)) {
+						break;
+					}
+				}
+			}
+		} catch (IOException e) {
+			Log.warn("Parsing work from gzip archive: " + e.getMessage());
+			Log.warn(e);
+			throw new RuntimeException(e);
+		}
+		Log.info("Activities parse completed");
+		Log.info("Total XML works parsed: " + counter);
+		Log.info("Total no doi work found: " + noDoiFound);
+		Log.info("Error from Orcid found: " + errorFromOrcidFound);
+		Log.info("Error parsing xml work found: " + xmlParserErrorFound);
+	}
+}
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java
new file mode 100644
index 000000000..b82f4bc4c
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/GenOrcidAuthorWork.java
@@ -0,0 +1,52 @@
+
+package eu.dnetlib.doiboost.orcidnodoi;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.doiboost.orcid.OrcidDSManager;
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.mortbay.log.Log;
+
+public class GenOrcidAuthorWork extends OrcidDSManager {
+
+	private String activitiesFileNameTarGz;
+	private String outputWorksPath;
+	private String workingPath;
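+
+	// Illustrative command line (a sketch based on gen_enriched_orcid_works_parameters.json;
+	// the path values are samples taken from the workflow below, not defaults baked in here):
+	//   GenOrcidAuthorWork -n hdfs://<namenode>:8020 -w /data/orcid_activities/
+	//       -f ORCID_2019_activites_0.tar.gz -ow no_doi_works/works_0.seq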
+	public static void main(String[] args) throws Exception {
+		GenOrcidAuthorWork genOrcidAuthorWork = new GenOrcidAuthorWork();
+		genOrcidAuthorWork.loadArgs(args);
+		genOrcidAuthorWork.generateAuthorsDOIsData();
+	}
+
+	public void generateAuthorsDOIsData() throws Exception {
+		Configuration conf = initConfigurationObject();
+		FileSystem fs = initFileSystemObject(conf);
+		String tarGzUri = hdfsServerUri.concat(workingPath).concat(activitiesFileNameTarGz);
+		Path outputPath = new Path(hdfsServerUri.concat(workingPath).concat(outputWorksPath));
+		ActivitiesDumpReader.parseGzActivities(conf, tarGzUri, outputPath);
+	}
+
+	private void loadArgs(String[] args) throws Exception {
+		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
+			IOUtils
+				.toString(
+					GenOrcidAuthorWork.class
+						.getResourceAsStream(
+							"/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works_parameters.json")));
+		parser.parseArgument(args);
+
+		hdfsServerUri = parser.get("hdfsServerUri");
+		Log.info("HDFS URI: " + hdfsServerUri);
+		workingPath = parser.get("workingPath");
+		Log.info("Working Path: " + workingPath);
+		activitiesFileNameTarGz = parser.get("activitiesFileNameTarGz");
+		Log.info("Activities File Name: " + activitiesFileNameTarGz);
+		outputWorksPath = parser.get("outputWorksPath");
+		Log.info("Output Author Work Data: " + outputWorksPath);
+	}
+}
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java
new file mode 100644
index 000000000..6bb31bcf6
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/SparkGenEnrichedOrcidWorks.java
@@ -0,0 +1,119 @@
+
+package eu.dnetlib.doiboost.orcidnodoi;
+
+import com.google.gson.Gson;
+import com.google.gson.JsonElement;
+import com.google.gson.JsonParser;
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.doiboost.orcid.model.AuthorData;
+import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
+import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher;
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.io.Text;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoders;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import scala.Tuple2;
+
+import java.util.Objects;
+import java.util.Optional;
+
+import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
+
+public class SparkGenEnrichedOrcidWorks {
+
+	public static void main(String[] args) throws Exception {
+		Logger logger = LoggerFactory.getLogger(SparkGenEnrichedOrcidWorks.class);
+		logger.info("[ SparkGenEnrichedOrcidWorks STARTED ]");
+
+		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
+			IOUtils
+				.toString(
+					SparkGenEnrichedOrcidWorks.class
+						.getResourceAsStream(
+							"/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works_parameters.json")));
+		parser.parseArgument(args);
+		Boolean isSparkSessionManaged = Optional
+			.ofNullable(parser.get("isSparkSessionManaged"))
+			.map(Boolean::valueOf)
+			.orElse(Boolean.TRUE);
+		logger.info("isSparkSessionManaged: {}", isSparkSessionManaged);
+		final String workingPath = parser.get("workingPath");
+		logger.info("workingPath: {}", workingPath);
+		final String outputEnrichedWorksPath = parser.get("outputEnrichedWorksPath");
+		logger.info("outputEnrichedWorksPath: {}", outputEnrichedWorksPath);
+		final String outputWorksPath = parser.get("outputWorksPath");
+		logger.info("outputWorksPath: {}", outputWorksPath);
+
+		SparkConf conf = new SparkConf();
+		runWithSparkSession(
+			conf,
+			isSparkSessionManaged,
+			spark -> {
+				JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
+
+				JavaPairRDD<Text, Text> summariesRDD = sc
+					.sequenceFile(workingPath + "../orcid_summaries/output/authors.seq", Text.class, Text.class);
+				Dataset<AuthorData> summariesDataset = spark
+					.createDataset(
+						summariesRDD.map(seq -> loadAuthorFromJson(seq._1(), seq._2())).rdd(),
+						Encoders.bean(AuthorData.class));
+
+				JavaPairRDD<Text, Text> activitiesRDD = sc
+					.sequenceFile(workingPath + outputWorksPath + "works*.seq", Text.class, Text.class);
+				Dataset<WorkDataNoDoi> activitiesDataset = spark
+					.createDataset(
+						activitiesRDD.map(seq -> loadWorkFromJson(seq._1(), seq._2())).rdd(),
+						Encoders.bean(WorkDataNoDoi.class));
+
+				activitiesDataset
+					.joinWith(
+						summariesDataset,
+						activitiesDataset.col("oid").equalTo(summariesDataset.col("oid")), "inner")
+					.map(
+						(MapFunction<Tuple2<WorkDataNoDoi, AuthorData>, Tuple2<String, WorkDataNoDoi>>) value -> {
+							WorkDataNoDoi w = value._1;
+							AuthorData a = value._2;
+							AuthorMatcher.match(a, w.getContributors());
+							return new Tuple2<>(a.getOid(), w);
+						},
+						Encoders.tuple(Encoders.STRING(), Encoders.bean(WorkDataNoDoi.class)))
+					.filter(Objects::nonNull)
+					.toJavaRDD()
+					.saveAsTextFile(workingPath + outputEnrichedWorksPath);
+			});
+	}
+
+	private static AuthorData loadAuthorFromJson(Text orcidId, Text json) {
+		AuthorData authorData = new AuthorData();
+		authorData.setOid(orcidId.toString());
+		JsonElement jElement = new JsonParser().parse(json.toString());
+		authorData.setName(getJsonValue(jElement, "name"));
+		authorData.setSurname(getJsonValue(jElement, "surname"));
+		authorData.setCreditName(getJsonValue(jElement, "creditname"));
+		return authorData;
+	}
+
+	private static WorkDataNoDoi loadWorkFromJson(Text orcidId, Text json) {
+		WorkDataNoDoi workData = new Gson().fromJson(json.toString(), WorkDataNoDoi.class);
+		return workData;
+	}
+
+	private static String getJsonValue(JsonElement jElement, String property) {
+		if (jElement.getAsJsonObject().has(property)) {
+			JsonElement name = null;
+			name = jElement.getAsJsonObject().get(property);
+			if (name != null && !name.isJsonNull()) {
+				return name.getAsString();
+			}
+		}
+		return null;
+	}
+}
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/json/JsonWriter.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/json/JsonWriter.java
similarity index 94%
rename from dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/json/JsonWriter.java
rename to dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/json/JsonWriter.java
index 35676d5ba..7f7e3a10a 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/json/JsonWriter.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/json/JsonWriter.java
@@ -1,5 +1,5 @@
 
-package eu.dnetlib.doiboost.orcid.json;
+package eu.dnetlib.doiboost.orcidnodoi.json;
 
 import com.google.gson.JsonObject;
 
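The handoff between the two jobs above is a Hadoop sequence file keyed by ORCID iD whose values are the JSON records built by JsonHelper.createOidWork. A minimal stand-alone reader for spot-checking one of those files might look like this (a sketch, not part of the patch; the path is an illustrative value matching the workflow defaults):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.SequenceFile;
    import org.apache.hadoop.io.Text;

    public class NoDoiWorksReader {

        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
            // illustrative path: one of the files produced by GenOrcidAuthorWork
            Path seqFile = new Path("no_doi_works/works_0.seq");
            try (SequenceFile.Reader reader = new SequenceFile.Reader(
                conf, SequenceFile.Reader.file(seqFile))) {
                Text key = new Text(); // the ORCID iD
                Text value = new Text(); // the {"oid":...,"work":...} json
                while (reader.next(key, value)) {
                    System.out.println(key + " -> " + value);
                }
            }
        }
    }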
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/Contributor.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/Contributor.java
index 42076de5d..8a170de09 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/Contributor.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/Contributor.java
@@ -8,9 +8,9 @@ import eu.dnetlib.doiboost.orcid.model.AuthorData;
 public class Contributor extends AuthorData implements Serializable {
 	private String sequence;
 	private String role;
-	private boolean simpleMatch = false;
-	private Double score = 0.0;
-	private boolean bestMatch = false;
+	private transient boolean simpleMatch = false;
+	private transient Double score = 0.0;
+	private transient boolean bestMatch = false;
 
 	public String getSequence() {
 		return sequence;
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/WorkDataNoDoi.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/WorkDataNoDoi.java
index ee13454e1..5756521e7 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/WorkDataNoDoi.java
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/model/WorkDataNoDoi.java
@@ -97,5 +97,4 @@ public class WorkDataNoDoi implements Serializable {
 	public void setContributors(List<Contributor> contributors) {
 		this.contributors = contributors;
 	}
-
 }
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java
new file mode 100644
index 000000000..09fd8b36b
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcidnodoi/similarity/AuthorMatcher.java
@@ -0,0 +1,204 @@
+
+package eu.dnetlib.doiboost.orcidnodoi.similarity;
+
+import java.text.Normalizer;
+import java.util.*;
+
+import org.apache.commons.text.similarity.JaroWinklerSimilarity;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.gson.Gson;
+import com.google.gson.GsonBuilder;
+
+import eu.dnetlib.doiboost.orcid.model.AuthorData;
+import eu.dnetlib.doiboost.orcidnodoi.model.Contributor;
+import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
+
+public class AuthorMatcher {
+
+	private static final Logger logger = LoggerFactory.getLogger(AuthorMatcher.class);
+	private static final Double threshold = 0.8;
+
+	public static void match(AuthorData author, List<Contributor> contributors) {
+
+		int matchCounter = 0;
+		List<Integer> matchCounters = Arrays.asList(matchCounter);
+		contributors.forEach(c -> {
+			if (normalize(c.getCreditName()).contains(normalize(author.getName())) ||
+				normalize(c.getCreditName()).contains(normalize(author.getSurname())) ||
+				((author.getOtherName() != null)
+					&& normalize(c.getCreditName()).contains(normalize(author.getOtherName())))) {
+				matchCounters.set(0, matchCounters.get(0) + 1);
+				c.setSimpleMatch(true);
+			}
+		});
+		logger.info("match counter: " + Integer.toString(matchCounters.get(0)));
+		if (matchCounters.get(0) == 1) {
+			updateAuthorsSimpleMatch(contributors, author);
+		} else if (matchCounters.get(0) > 1) {
+			Optional<Contributor> optCon = contributors
+				.stream()
+				.filter(c -> c.isSimpleMatch())
+				.map(c -> {
+					c.setScore(bestMatch(author.getSurname(), author.getName(), c.getCreditName()));
+					logger.debug("in map: " + c.getCreditName() + " score: " + c.getScore());
+					return c;
+				})
+				.filter(c -> c.getScore() >= threshold)
+				.max(Comparator.comparing(c -> c.getScore()));
+			Contributor bestMatchContributor = null;
+			if (optCon.isPresent()) {
+				bestMatchContributor = optCon.get();
+				bestMatchContributor.setBestMatch(true);
+				logger.info("best match: " + bestMatchContributor.getCreditName());
+				updateAuthorsSimilarityMatch(contributors, author);
+			}
+
+		}
+
+		logger.info("UPDATED contributors: ");
+		contributors.forEach(c -> {
+			logger
+				.info(
+					c.getOid() + " - " + c.getCreditName() + " - " +
+						c.getName() + " - " + c.getSurname() + " - " +
+						c.getRole() + " - " + c.getSequence());
+		});
+	}
+
+	private static Double bestMatch(String authorSurname, String authorName, String contributor) {
+		logger.debug(authorSurname + " " + authorName + " vs " + contributor);
+		String[] contributorSplitted = contributor.split(" ");
+		if (contributorSplitted.length == 0) {
+			return 0.0;
+		}
+		final String contributorName = contributorSplitted[contributorSplitted.length - 1];
+		String contributorSurname = "";
+		if (contributorSplitted.length > 1) {
+			StringJoiner joiner = new StringJoiner(" ");
+			for (int i = 0; i < contributorSplitted.length - 1; i++) {
+				joiner.add(contributorSplitted[i]);
+			}
+			contributorSurname = joiner.toString();
+		}
+		logger
+			.debug(
+				"contributorName: " + contributorName +
+					" contributorSurname: " + contributorSurname);
+		String authorNameNrm = normalize(authorName);
+		String authorSurnameNrm = normalize(authorSurname);
+		String contributorNameNrm = normalize(contributorName);
+		String contributorSurnameNrm = normalize(contributorSurname);
+		Double sm1 = similarity(authorNameNrm, authorSurnameNrm, contributorNameNrm, contributorSurnameNrm);
+		Double sm2 = similarity(authorNameNrm, authorSurnameNrm, contributorSurnameNrm, contributorNameNrm);
+		if (sm1.compareTo(sm2) >= 0) {
+			return sm1;
+		}
+		return sm2;
+	}
+
+	private static Double similarity(String nameA, String surnameA, String nameB, String surnameB) {
+		Double score = similarityJaroWinkler(nameA, surnameA, nameB, surnameB);
+		logger
+			.debug(nameA + ", " + surnameA + " <> " + nameB + ", " + surnameB + " score: " + Double.toString(score));
+		return score;
+	}
+
+	private static Double similarityJaroWinkler(String nameA, String surnameA, String nameB, String surnameB) {
+		return new JaroWinklerSimilarity().apply(normalize(parse(nameA, surnameA)), normalize(parse(nameB, surnameB)));
+	}
+
+	private static String normalize(final String s) {
+		return nfd(s)
+			.toLowerCase()
+			// do not compact the regexes in a single expression: large input
+			// strings would cause a StackOverflowError
+			.replaceAll("(\\W)+", " ")
+			.replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
+			.replaceAll("(\\p{Punct})+", " ")
+			.replaceAll("(\\d)+", " ")
+			.replaceAll("(\\n)+", " ")
+			.trim();
+	}
+
+	private static String nfd(final String s) {
+		return Normalizer.normalize(s, Normalizer.Form.NFD);
+	}
+
+	private static String parse(String name, String surname) {
+		return surname + " " + name;
+	}
+
+	private static void updateAuthorsSimpleMatch(List<Contributor> contributors, AuthorData author) {
+		contributors.forEach(c -> {
+			if (c.isSimpleMatch()) {
+				logger.info("simple match on : " + c.getCreditName());
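+				// exactly one candidate matched: inherit the ORCID author's
+				// name, surname and oid on the contributor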
+				c.setName(author.getName());
+				c.setSurname(author.getSurname());
+				c.setOid(author.getOid());
+			}
+		});
+		updateRanks(contributors);
+	}
+
+	private static void updateAuthorsSimilarityMatch(List<Contributor> contributors, AuthorData author) {
+		logger.info("inside updateAuthorsSimilarityMatch ...");
+		contributors.forEach(c -> {
+			logger
+				.info(
+					c.getOid() + " - " + c.getCreditName() + " - " +
+						c.getName() + " - " + c.getSurname() + " - " +
+						c.getRole() + " - " + c.getSequence() + " - best: " + c.isBestMatch() + " - simple: "
+						+ c.isSimpleMatch());
+		});
+
+		contributors
+			.stream()
+			.filter(c -> c.isBestMatch())
+			.forEach(c -> {
+				logger.info("similarity match on : " + c.getCreditName());
+				c.setName(author.getName());
+				c.setSurname(author.getSurname());
+				c.setOid(author.getOid());
+			});
+		updateRanks(contributors);
+	}
+
+	private static void updateRanks(List<Contributor> contributors) {
+		boolean seqFound = false;
+		if (contributors
+			.stream()
+			.anyMatch(
+				c -> c.getRole() != null && c.getSequence() != null &&
+					c.getRole().equals("author") && (c.getSequence().equals("first") ||
+						c.getSequence().equals("additional")))) {
+			seqFound = true;
+			logger.info("sequence data found");
+		}
+		if (!seqFound) {
+			List<Integer> seqIds = Arrays.asList(0);
+			contributors.forEach(c -> {
+				int currentSeq = seqIds.get(0) + 1;
+				seqIds.set(0, currentSeq);
+				c.setSequence(Integer.toString(seqIds.get(0)));
+			});
+		}
+	}
+
+	private static String toJson(WorkDataNoDoi work) {
+		GsonBuilder builder = new GsonBuilder();
+		Gson gson = builder.create();
+		return gson.toJson(work);
+	}
+}
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/config-default.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/config-default.xml
new file mode 100644
index 000000000..f2d51e260
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/config-default.xml
@@ -0,0 +1,22 @@
+<configuration>
+    <property>
+        <name>oozie.action.sharelib.for.java</name>
+        <value>spark2</value>
+    </property>
+    <property>
+        <name>oozie.launcher.mapreduce.user.classpath.first</name>
+        <value>true</value>
+    </property>
+    <property>
+        <name>oozie.launcher.mapreduce.map.java.opts</name>
+        <value>-Xmx4g</value>
+    </property>
+    <property>
+        <name>jobTracker</name>
+        <value>hadoop-rm3.garr-pa1.d4science.org:8032</value>
+    </property>
+    <property>
+        <name>nameNode</name>
+        <value>hdfs://hadoop-rm1.garr-pa1.d4science.org:8020</value>
+    </property>
+</configuration>
\ No newline at end of file
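For reference, a workflow like this is usually launched with a small client-side job.properties plus the Oozie CLI; the snippet below is a sketch with illustrative values (the deployment path and Spark sizes are assumptions, not part of the patch):

    nameNode=hdfs://hadoop-rm1.garr-pa1.d4science.org:8020
    jobTracker=hadoop-rm3.garr-pa1.d4science.org:8032
    oozie.wf.application.path=${nameNode}/user/oozie/apps/gen_enriched_orcid_works/oozie_app
    workingPath_activities=/data/orcid_activities
    workingPath=/data/orcid_activities
    sparkExecutorMemory=4G
    sparkExecutorCores=2
    sparkDriverMemory=4G

    oozie job -oozie http://<oozie-host>:11000/oozie -config job.properties -run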
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml
new file mode 100644
index 000000000..2486bdb24
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works/oozie_app/workflow.xml
@@ -0,0 +1,524 @@
+<workflow-app name="gen_enriched_orcid_works" xmlns="uri:oozie:workflow:0.5">
+    <parameters>
+        <property>
+            <name>workingPath_activities</name>
+            <description>the working dir base path</description>
+        </property>
+        <property>
+            <name>shell_cmd_0</name>
+            <value>wget -O /tmp/ORCID_2019_activites_0.tar.gz https://orcid.figshare.com/ndownloader/files/18017660 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_0.tar.gz /data/orcid_activities/ORCID_2019_activites_0.tar.gz ; rm -f /tmp/ORCID_2019_activites_0.tar.gz</value>
+            <description>the shell command that downloads and puts to hdfs orcid activity file 0</description>
+        </property>
+        <property>
+            <name>shell_cmd_1</name>
+            <value>wget -O /tmp/ORCID_2019_activites_1.tar.gz https://orcid.figshare.com/ndownloader/files/18017675 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_1.tar.gz /data/orcid_activities/ORCID_2019_activites_1.tar.gz ; rm -f /tmp/ORCID_2019_activites_1.tar.gz</value>
+            <description>the shell command that downloads and puts to hdfs orcid activity file 1</description>
+        </property>
+        <property>
+            <name>shell_cmd_2</name>
+            <value>wget -O /tmp/ORCID_2019_activites_2.tar.gz https://orcid.figshare.com/ndownloader/files/18017717 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_2.tar.gz /data/orcid_activities/ORCID_2019_activites_2.tar.gz ; rm -f /tmp/ORCID_2019_activites_2.tar.gz</value>
+            <description>the shell command that downloads and puts to hdfs orcid activity file 2</description>
+        </property>
+        <property>
+            <name>shell_cmd_3</name>
+            <value>wget -O /tmp/ORCID_2019_activites_3.tar.gz https://orcid.figshare.com/ndownloader/files/18017765 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_3.tar.gz /data/orcid_activities/ORCID_2019_activites_3.tar.gz ; rm -f /tmp/ORCID_2019_activites_3.tar.gz</value>
+            <description>the shell command that downloads and puts to hdfs orcid activity file 3</description>
+        </property>
+        <property>
+            <name>shell_cmd_4</name>
+            <value>wget -O /tmp/ORCID_2019_activites_4.tar.gz https://orcid.figshare.com/ndownloader/files/18017831 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_4.tar.gz /data/orcid_activities/ORCID_2019_activites_4.tar.gz ; rm -f /tmp/ORCID_2019_activites_4.tar.gz</value>
+            <description>the shell command that downloads and puts to hdfs orcid activity file 4</description>
+        </property>
+        <property>
+            <name>shell_cmd_5</name>
+            <value>wget -O /tmp/ORCID_2019_activites_5.tar.gz https://orcid.figshare.com/ndownloader/files/18017987 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_5.tar.gz /data/orcid_activities/ORCID_2019_activites_5.tar.gz ; rm -f /tmp/ORCID_2019_activites_5.tar.gz</value>
+            <description>the shell command that downloads and puts to hdfs orcid activity file 5</description>
+        </property>
+        <property>
+            <name>shell_cmd_6</name>
+            <value>wget -O /tmp/ORCID_2019_activites_6.tar.gz https://orcid.figshare.com/ndownloader/files/18018053 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_6.tar.gz /data/orcid_activities/ORCID_2019_activites_6.tar.gz ; rm -f /tmp/ORCID_2019_activites_6.tar.gz</value>
+            <description>the shell command that downloads and puts to hdfs orcid activity file 6</description>
+        </property>
+        <property>
+            <name>shell_cmd_7</name>
+            <value>wget -O /tmp/ORCID_2019_activites_7.tar.gz https://orcid.figshare.com/ndownloader/files/18018023 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_7.tar.gz /data/orcid_activities/ORCID_2019_activites_7.tar.gz ; rm -f /tmp/ORCID_2019_activites_7.tar.gz</value>
+            <description>the shell command that downloads and puts to hdfs orcid activity file 7</description>
+        </property>
+        <property>
+            <name>shell_cmd_8</name>
+            <value>wget -O /tmp/ORCID_2019_activites_8.tar.gz https://orcid.figshare.com/ndownloader/files/18018248 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_8.tar.gz /data/orcid_activities/ORCID_2019_activites_8.tar.gz ; rm -f /tmp/ORCID_2019_activites_8.tar.gz</value>
+            <description>the shell command that downloads and puts to hdfs orcid activity file 8</description>
+        </property>
+        <property>
+            <name>shell_cmd_9</name>
+            <value>wget -O /tmp/ORCID_2019_activites_9.tar.gz https://orcid.figshare.com/ndownloader/files/18018029 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_9.tar.gz /data/orcid_activities/ORCID_2019_activites_9.tar.gz ; rm -f /tmp/ORCID_2019_activites_9.tar.gz</value>
+            <description>the shell command that downloads and puts to hdfs orcid activity file 9</description>
+        </property>
+        <property>
+            <name>shell_cmd_X</name>
+            <value>wget -O /tmp/ORCID_2019_activites_X.tar.gz https://orcid.figshare.com/ndownloader/files/18018182 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_X.tar.gz /data/orcid_activities/ORCID_2019_activites_X.tar.gz ; rm -f /tmp/ORCID_2019_activites_X.tar.gz</value>
+            <description>the shell command that downloads and puts to hdfs orcid activity file X</description>
+        </property>
+    </parameters>
+
+    <start to="Check_Activities_0"/>
+
+    <kill name="Kill">
+        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+    </kill>
+
+    <decision name="Check_Activities_0">
+        <switch>
+            <case to="Gen_Works_0">${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_0.tar.gz'))}</case>
+            <default to="Download_Activities_0"/>
+        </switch>
+    </decision>
+
+    <action name="Download_Activities_0">
+        <shell xmlns="uri:oozie:shell-action:0.2">
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <exec>bash</exec>
+            <argument>-c</argument>
+            <argument>${shell_cmd_0}</argument>
+        </shell>
+        <ok to="Gen_Works_0"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="Gen_Works_0">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
+            <arg>-w</arg><arg>${workingPath_activities}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>ORCID_2019_activites_0.tar.gz</arg>
+            <arg>-ow</arg><arg>no_doi_works/works_0.seq</arg>
+        </java>
+        <ok to="Check_Activities_1"/>
+        <error to="Kill"/>
+    </action>
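+    <!-- activity files 1 to 9 and X below repeat the same
+         check / download / import chain used for file 0 -->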
+    <decision name="Check_Activities_1">
+        <switch>
+            <case to="Gen_Works_1">${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_1.tar.gz'))}</case>
+            <default to="Download_Activities_1"/>
+        </switch>
+    </decision>
+
+    <action name="Download_Activities_1">
+        <shell xmlns="uri:oozie:shell-action:0.2">
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <exec>bash</exec>
+            <argument>-c</argument>
+            <argument>${shell_cmd_1}</argument>
+        </shell>
+        <ok to="Gen_Works_1"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="Gen_Works_1">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
+            <arg>-w</arg><arg>${workingPath_activities}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>ORCID_2019_activites_1.tar.gz</arg>
+            <arg>-ow</arg><arg>no_doi_works/works_1.seq</arg>
+        </java>
+        <ok to="Check_Activities_2"/>
+        <error to="Kill"/>
+    </action>
+
+    <decision name="Check_Activities_2">
+        <switch>
+            <case to="Gen_Works_2">${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_2.tar.gz'))}</case>
+            <default to="Download_Activities_2"/>
+        </switch>
+    </decision>
+
+    <action name="Download_Activities_2">
+        <shell xmlns="uri:oozie:shell-action:0.2">
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <exec>bash</exec>
+            <argument>-c</argument>
+            <argument>${shell_cmd_2}</argument>
+        </shell>
+        <ok to="Gen_Works_2"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="Gen_Works_2">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
+            <arg>-w</arg><arg>${workingPath_activities}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>ORCID_2019_activites_2.tar.gz</arg>
+            <arg>-ow</arg><arg>no_doi_works/works_2.seq</arg>
+        </java>
+        <ok to="Check_Activities_3"/>
+        <error to="Kill"/>
+    </action>
+
+    <decision name="Check_Activities_3">
+        <switch>
+            <case to="Gen_Works_3">${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_3.tar.gz'))}</case>
+            <default to="Download_Activities_3"/>
+        </switch>
+    </decision>
+
+    <action name="Download_Activities_3">
+        <shell xmlns="uri:oozie:shell-action:0.2">
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <exec>bash</exec>
+            <argument>-c</argument>
+            <argument>${shell_cmd_3}</argument>
+        </shell>
+        <ok to="Gen_Works_3"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="Gen_Works_3">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
+            <arg>-w</arg><arg>${workingPath_activities}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>ORCID_2019_activites_3.tar.gz</arg>
+            <arg>-ow</arg><arg>no_doi_works/works_3.seq</arg>
+        </java>
+        <ok to="Check_Activities_4"/>
+        <error to="Kill"/>
+    </action>
+
+    <decision name="Check_Activities_4">
+        <switch>
+            <case to="Gen_Works_4">${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_4.tar.gz'))}</case>
+            <default to="Download_Activities_4"/>
+        </switch>
+    </decision>
+
+    <action name="Download_Activities_4">
+        <shell xmlns="uri:oozie:shell-action:0.2">
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <exec>bash</exec>
+            <argument>-c</argument>
+            <argument>${shell_cmd_4}</argument>
+        </shell>
+        <ok to="Gen_Works_4"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="Gen_Works_4">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
+            <arg>-w</arg><arg>${workingPath_activities}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>ORCID_2019_activites_4.tar.gz</arg>
+            <arg>-ow</arg><arg>no_doi_works/works_4.seq</arg>
+        </java>
+        <ok to="Check_Activities_5"/>
+        <error to="Kill"/>
+    </action>
+
+    <decision name="Check_Activities_5">
+        <switch>
+            <case to="Gen_Works_5">${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_5.tar.gz'))}</case>
+            <default to="Download_Activities_5"/>
+        </switch>
+    </decision>
+
+    <action name="Download_Activities_5">
+        <shell xmlns="uri:oozie:shell-action:0.2">
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <exec>bash</exec>
+            <argument>-c</argument>
+            <argument>${shell_cmd_5}</argument>
+        </shell>
+        <ok to="Gen_Works_5"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="Gen_Works_5">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
+            <arg>-w</arg><arg>${workingPath_activities}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>ORCID_2019_activites_5.tar.gz</arg>
+            <arg>-ow</arg><arg>no_doi_works/works_5.seq</arg>
+        </java>
+        <ok to="Check_Activities_6"/>
+        <error to="Kill"/>
+    </action>
+
+    <decision name="Check_Activities_6">
+        <switch>
+            <case to="Gen_Works_6">${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_6.tar.gz'))}</case>
+            <default to="Download_Activities_6"/>
+        </switch>
+    </decision>
+
+    <action name="Download_Activities_6">
+        <shell xmlns="uri:oozie:shell-action:0.2">
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <exec>bash</exec>
+            <argument>-c</argument>
+            <argument>${shell_cmd_6}</argument>
+        </shell>
+        <ok to="Gen_Works_6"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="Gen_Works_6">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
+            <arg>-w</arg><arg>${workingPath_activities}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>ORCID_2019_activites_6.tar.gz</arg>
+            <arg>-ow</arg><arg>no_doi_works/works_6.seq</arg>
+        </java>
+        <ok to="Check_Activities_7"/>
+        <error to="Kill"/>
+    </action>
+
+    <decision name="Check_Activities_7">
+        <switch>
+            <case to="Gen_Works_7">${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_7.tar.gz'))}</case>
+            <default to="Download_Activities_7"/>
+        </switch>
+    </decision>
+
+    <action name="Download_Activities_7">
+        <shell xmlns="uri:oozie:shell-action:0.2">
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <exec>bash</exec>
+            <argument>-c</argument>
+            <argument>${shell_cmd_7}</argument>
+        </shell>
+        <ok to="Gen_Works_7"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="Gen_Works_7">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
+            <arg>-w</arg><arg>${workingPath_activities}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>ORCID_2019_activites_7.tar.gz</arg>
+            <arg>-ow</arg><arg>no_doi_works/works_7.seq</arg>
+        </java>
+        <ok to="Check_Activities_8"/>
+        <error to="Kill"/>
+    </action>
+
+    <decision name="Check_Activities_8">
+        <switch>
+            <case to="Gen_Works_8">${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_8.tar.gz'))}</case>
+            <default to="Download_Activities_8"/>
+        </switch>
+    </decision>
+
+    <action name="Download_Activities_8">
+        <shell xmlns="uri:oozie:shell-action:0.2">
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <exec>bash</exec>
+            <argument>-c</argument>
+            <argument>${shell_cmd_8}</argument>
+        </shell>
+        <ok to="Gen_Works_8"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="Gen_Works_8">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
+            <arg>-w</arg><arg>${workingPath_activities}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>ORCID_2019_activites_8.tar.gz</arg>
+            <arg>-ow</arg><arg>no_doi_works/works_8.seq</arg>
+        </java>
+        <ok to="Check_Activities_9"/>
+        <error to="Kill"/>
+    </action>
+
+    <decision name="Check_Activities_9">
+        <switch>
+            <case to="Gen_Works_9">${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_9.tar.gz'))}</case>
+            <default to="Download_Activities_9"/>
+        </switch>
+    </decision>
+
+    <action name="Download_Activities_9">
+        <shell xmlns="uri:oozie:shell-action:0.2">
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <exec>bash</exec>
+            <argument>-c</argument>
+            <argument>${shell_cmd_9}</argument>
+        </shell>
+        <ok to="Gen_Works_9"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="Gen_Works_9">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
+            <arg>-w</arg><arg>${workingPath_activities}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>ORCID_2019_activites_9.tar.gz</arg>
+            <arg>-ow</arg><arg>no_doi_works/works_9.seq</arg>
+        </java>
+        <ok to="Check_Activities_X"/>
+        <error to="Kill"/>
+    </action>
+    <decision name="Check_Activities_X">
+        <switch>
+            <case to="Gen_Works_X">${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_X.tar.gz'))}</case>
+            <default to="Download_Activities_X"/>
+        </switch>
+    </decision>
+
+    <action name="Download_Activities_X">
+        <shell xmlns="uri:oozie:shell-action:0.2">
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <exec>bash</exec>
+            <argument>-c</argument>
+            <argument>${shell_cmd_X}</argument>
+        </shell>
+        <ok to="Gen_Works_X"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="Gen_Works_X">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
+            <arg>-w</arg><arg>${workingPath_activities}/</arg>
+            <arg>-n</arg><arg>${nameNode}</arg>
+            <arg>-f</arg><arg>ORCID_2019_activites_X.tar.gz</arg>
+            <arg>-ow</arg><arg>no_doi_works/works_X.seq</arg>
+        </java>
+        <ok to="Gen_Enriched_Orcid_Works"/>
+        <error to="Kill"/>
+    </action>
+
+    <action name="Gen_Enriched_Orcid_Works">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>Gen_Enriched_Orcid_Works</name>
+            <class>eu.dnetlib.doiboost.orcidnodoi.SparkGenEnrichedOrcidWorks</class>
+            <jar>dhp-doiboost-1.2.3-SNAPSHOT.jar</jar>
+            <spark-opts>--num-executors 10 --conf spark.yarn.jars="hdfs://hadoop-rm1.garr-pa1.d4science.org:8020/user/oozie/share/lib/lib_20180405103059/spark2" --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory}</spark-opts>
+            <arg>-w</arg><arg>${workingPath}/</arg>
+            <arg>-ow</arg><arg>no_doi_works/</arg>
+            <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
+        </spark>
+        <ok to="End"/>
+        <error to="Kill"/>
+    </action>
+
+    <end name="End"/>
+</workflow-app>
\ No newline at end of file
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works_parameters.json b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works_parameters.json
new file mode 100644
index 000000000..c3a8f92ec
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works_parameters.json
@@ -0,0 +1,7 @@
+[
+	{"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true},
+	{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the default work path", "paramRequired": true},
+	{"paramName":"f", "paramLongName":"activitiesFileNameTarGz", "paramDescription": "the name of the activities orcid file", "paramRequired": true},
+	{"paramName":"ow", "paramLongName":"outputWorksPath", "paramDescription": "the relative folder of the sequential file to write", "paramRequired": true},
+	{"paramName":"oew", "paramLongName":"outputEnrichedWorksPath", "paramDescription": "the relative folder of the sequential file to write the data", "paramRequired": true}
+]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java
index 31f8432ac..6a5faddbd 100644
--- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java
+++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcidnodoi/xml/OrcidNoDoiTest.java
@@ -1,15 +1,12 @@
 
 package eu.dnetlib.doiboost.orcidnodoi.xml;
 
-import com.ximpleware.NavException;
-import com.ximpleware.ParseException;
-import com.ximpleware.XPathEvalException;
-import com.ximpleware.XPathParseException;
-import eu.dnetlib.dhp.parser.utility.VtdException;
-import eu.dnetlib.doiboost.orcid.model.AuthorData;
-import eu.dnetlib.doiboost.orcidnodoi.model.Contributor;
-import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
-import jdk.nashorn.internal.ir.annotations.Ignore;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+
+import java.io.IOException;
+import java.text.Normalizer;
+import java.util.*;
+
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.text.similarity.JaccardSimilarity;
 import org.apache.commons.text.similarity.JaroWinklerSimilarity;
@@ -17,11 +14,20 @@ import org.junit.jupiter.api.Test;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import java.io.IOException;
-import java.text.Normalizer;
-import java.util.*;
+import com.google.gson.Gson;
+import com.google.gson.GsonBuilder;
+import com.ximpleware.NavException;
+import com.ximpleware.ParseException;
+import com.ximpleware.XPathEvalException;
+import com.ximpleware.XPathParseException;
 
-import static org.junit.jupiter.api.Assertions.assertNotNull;
+import eu.dnetlib.dhp.parser.utility.VtdException;
+import eu.dnetlib.dhp.schema.oaf.Author;
+import eu.dnetlib.doiboost.orcid.model.AuthorData;
+import eu.dnetlib.doiboost.orcidnodoi.model.Contributor;
+import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
+import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher;
+import jdk.nashorn.internal.ir.annotations.Ignore;
 
 public class OrcidNoDoiTest {
 
@@ -33,100 +39,10 @@ public class OrcidNoDoiTest {
 	String nameB = "K";
 	String surnameB = "Abdel-Dayem";
 	String orcidIdA = "0000-0003-2760-1191";
-	Double threshold = 0.8;
 
 	@Test
 	@Ignore
-	private void similarityTest() throws Exception {
-		logger.info("running testSimilarity ....");
-		logger
-			.info(
-				"JaroWinklerSimilarity: "
-					+ Double.toString(similarityJaroWinkler(nameA, surnameA, nameB, surnameB)));
-		logger
-			.info(
-				"JaccardSimilarity: " + Double.toString(similarityJaccard(nameA, surnameA, nameB, surnameB)));
-	}
-
-	@Test
-	@Ignore
-	private void bestMatchTest() throws Exception {
-		logger.info("running bestMatchTest ....");
-		String contributor = surnameB + ", " + nameB;
-		logger.info("score: " + Double.toString(bestMatch(surnameA, nameA, contributor)));
-	}
-
-	private static Double bestMatch(String authorSurname, String authorName, String contributor) {
-		logger.debug(authorSurname + " " + authorName + " vs " + contributor);
-		String[] contributorSplitted = contributor.split(" ");
-		if (contributorSplitted.length == 0) {
-			return 0.0;
-		}
-		final String contributorName = contributorSplitted[contributorSplitted.length - 1];
-		String contributorSurname = "";
-		if (contributorSplitted.length > 1) {
-			StringJoiner joiner = new StringJoiner(" ");
-			for (int i = 0; i < contributorSplitted.length - 1; i++) {
-				joiner.add(contributorSplitted[i]);
-			}
-			contributorSurname = joiner.toString();
-		}
-		logger
-			.debug(
-				"contributorName: " + contributorName +
-					" contributorSurname: " + contributorSurname);
-		String authorNameNrm = normalize(authorName);
-		String authorSurnameNrm = normalize(authorSurname);
-		String contributorNameNrm = normalize(contributorName);
-		String contributorSurnameNrm = normalize(contributorSurname);
-		Double sm1 = similarity(authorNameNrm, authorSurnameNrm, contributorNameNrm, contributorSurnameNrm);
-		Double sm2 = similarity(authorNameNrm, authorSurnameNrm, contributorSurnameNrm, contributorNameNrm);
-		if (sm1.compareTo(sm2) >= 0) {
-			return sm1;
-		}
-		return sm2;
-	}
-
-	private static Double similarity(String nameA, String surnameA, String nameB, String surnameB) {
-		Double score = similarityJaroWinkler(nameA, surnameA, nameB, surnameB);
-		logger
-			.debug(nameA + ", " + surnameA + " <> " + nameB + ", " + surnameB + " score: " + Double.toString(score));
-		return score;
-	}
-
-	private static Double similarityJaccard(String nameA, String surnameA, String nameB, String surnameB) {
-		return new JaccardSimilarity().apply(normalize(parse(nameA, surnameA)), normalize(parse(nameB, surnameB)));
-	}
-
-	private static Double similarityJaroWinkler(String nameA, String surnameA, String nameB, String surnameB) {
-		return new JaroWinklerSimilarity().apply(normalize(parse(nameA, surnameA)), normalize(parse(nameB, surnameB)));
-	}
-
-	private static String parse(String name, String surname) {
-		return surname + " " + name;
-	}
-
-	private static String normalize(final String s) {
-		return nfd(s)
-			.toLowerCase()
-			// do not compact the regexes in a single expression, would cause StackOverflowError
-			// in case
-			// of large input strings
-			.replaceAll("(\\W)+", " ")
-			.replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
-			.replaceAll("(\\p{Punct})+", " ")
-			.replaceAll("(\\d)+", " ")
-			.replaceAll("(\\n)+", " ")
-			.trim();
-	}
-
-	private static String nfd(final String s) {
-		return Normalizer.normalize(s, Normalizer.Form.NFD);
-	}
-
-	@Test
-	@Ignore
-	public void readPublicationFieldsTest()
+	private void readPublicationFieldsTest()
 		throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException {
 		logger.info("running loadPublicationFieldsTest ....");
 		String xml = IOUtils
@@ -178,78 +94,10 @@ public class OrcidNoDoiTest {
 
 	}
 
-	private void updateRanks(List<Contributor> contributors) {
-		boolean seqFound = false;
-		if (contributors
-			.stream()
-			.filter(
-				c -> c.getRole() != null && c.getSequence() != null &&
-					c.getRole().equals("author") && (c.getSequence().equals("first") ||
-						c.getSequence().equals("additional")))
-			.count() > 0) {
-			seqFound = true;
-			logger.info("sequence data found");
-		}
-		if (!seqFound) {
-			List<Integer> seqIds = Arrays.asList(0);
-			contributors.forEach(c -> {
-				int currentSeq = seqIds.get(0) + 1;
-				seqIds.set(0, currentSeq);
-				c.setSequence(Integer.toString(seqIds.get(0)));
-			});
-		}
-	}
-
-	private void updateAuthorsSimpleMatch(List<Contributor> contributors, AuthorData author) {
-		contributors.forEach(c -> {
-			if (c.isSimpleMatch()) {
-				logger.info("simple match on : " + c.getCreditName());
-				c.setName(author.getName());
-				c.setSurname(author.getSurname());
-				c.setOid(author.getOid());
-			}
-		});
-		updateRanks(contributors);
-	}
-
-	private void updateAuthorsSimilarityMatch(List<Contributor> contributors, AuthorData author) {
-		logger.info("inside updateAuthorsSimilarityMatch ...");
-		contributors.forEach(c -> {
-			logger
-				.info(
-					c.getOid() + " - " + c.getCreditName() + " - " +
-						c.getName() + " - " + c.getSurname() + " - " +
-						c.getRole() + " - " + c.getSequence() + " - best: " + c.isBestMatch() + " - simpe: "
-						+ c.isSimpleMatch());
-		});
-
-		contributors
-			.stream()
-			.filter(c -> c.isBestMatch())
-			.forEach(c -> {
-				logger.info("similarity match on : " + c.getCreditName());
-				c.setName(author.getName());
-				c.setSurname(author.getSurname());
-				c.setOid(author.getOid());
-			});
-		updateRanks(contributors);
-	}
-
-	@Test
-	@Ignore
-	public void authorSimilarityMatchTest() throws Exception {
-		logger.info("running authorSimilarityMatchTest ....");
-		authorMatchTest("activity_work_0000-0003-2760-1191-similarity.xml");
-	}
-
 	@Test
-	private void authorSimpleMatchTest() throws Exception {
+	public void authorMatchTest() throws Exception {
-		logger.info("running authorSimpleMatchTest ....");
+		logger.info("running authorMatchTest ....");
-		authorMatchTest("activity_work_0000-0003-2760-1191.xml");
-	}
-
-	private void authorMatchTest(String orcidWork)
-		throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException {
+		String orcidWork = "activity_work_0000-0003-2760-1191-similarity.xml";
 		AuthorData author = new AuthorData();
 		author.setName(nameA);
 		author.setSurname(surnameA);
@@ -272,55 +120,9 @@ public class OrcidNoDoiTest {
 			logger.error("parsing xml", e);
 		}
 		assertNotNull(workData);
-		int matchCounter = 0;
-		List<Integer> matchCounters = Arrays.asList(matchCounter);
-		Contributor contributor = null;
-		workData.getContributors().forEach(c -> {
-			if (normalize(c.getCreditName()).contains(normalize(author.getName())) ||
-				normalize(c.getCreditName()).contains(normalize(author.getSurname())) ||
-				((author.getOtherName() != null)
-					&& normalize(c.getCreditName()).contains(normalize(author.getOtherName())))) {
-				matchCounters.set(0, matchCounters.get(0) + 1);
-				c.setSimpleMatch(true);
-			}
-		});
-		logger.info("match counter: " + Integer.toString(matchCounters.get(0)));
-		if (matchCounters.get(0) == 1) {
-			updateAuthorsSimpleMatch(workData.getContributors(), author);
-		} else if (matchCounters.get(0) > 1) {
-			Optional<Contributor> optCon = workData
-				.getContributors()
-				.stream()
-				.filter(c -> c.isSimpleMatch())
-				.map(c -> {
-					c.setScore(bestMatch(nameA, surnameA, c.getCreditName()));
-					logger.debug("nella map: " + c.getCreditName() + " score: " + c.getScore());
-					return c;
-				})
-				.filter(c -> c.getScore() >= threshold)
-				.max(Comparator.comparing(c -> c.getScore()));
-			Contributor bestMatchContributor = null;
-			if (optCon.isPresent()) {
-				bestMatchContributor = optCon.get();
-				bestMatchContributor.setBestMatch(true);
-				logger.info("best match: " + bestMatchContributor.getCreditName());
-				updateAuthorsSimilarityMatch(workData.getContributors(), author);
-			}
-
-		}
-
-		logger.info("UPDATED contributors: ");
-		workData.getContributors().forEach(c -> {
-			logger
-				.info(
-					c.getOid() + " - " + c.getCreditName() + " - " +
-						c.getName() + " - " + c.getSurname() + " - " +
-						c.getRole() + " - " + c.getSequence());
-		});
+		AuthorMatcher.match(author, workData.getContributors());
+		GsonBuilder builder = new GsonBuilder();
+		Gson gson = builder.create();
+		logger.info(gson.toJson(workData));
 	}
 }
-
-//
-// orcid_RDD = sc.textFile(ORCID_DUMP_PATH)
-// no_doi_works_RDD = orcid_RDD.map(orcid_map).filter(lambda x:x is not None).map(lambda x: json.dumps(x)).saveAsTextFile(path=ORCID_OPENAIRE_PATH,compressionCodecClass="org.apache.hadoop.io.compress.GzipCodec")
-//
\ No newline at end of file
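Outside of the Spark job, the matching logic introduced by this patch can be exercised in isolation. The following stand-alone sketch (not part of the patch; the fixture values are illustrative, echoing the test data above) builds one work record by hand and runs AuthorMatcher.match on it:

    import java.util.Arrays;

    import eu.dnetlib.doiboost.orcid.model.AuthorData;
    import eu.dnetlib.doiboost.orcidnodoi.model.Contributor;
    import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
    import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher;

    public class AuthorMatcherSketch {

        public static void main(String[] args) throws Exception {
            AuthorData author = new AuthorData();
            author.setOid("0000-0003-2760-1191");
            author.setName("Khairy");
            author.setSurname("Abdel Dayem");

            Contributor contributor = new Contributor();
            contributor.setCreditName("Abdel-Dayem K");

            WorkDataNoDoi work = new WorkDataNoDoi();
            work.setOid(author.getOid());
            work.setContributors(Arrays.asList(contributor));

            // a single simple match: the contributor inherits the author's
            // name, surname and oid, and updateRanks assigns sequence "1"
            AuthorMatcher.match(author, work.getContributors());
            System.out.println(contributor.getOid() + " - " + contributor.getName());
        }
    }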