orcid-no-doi #43

Merged
claudio.atzori merged 45 commits from enrico.ottonello/dnet-hadoop:orcid-no-doi into master 2020-12-02 10:55:12 +01:00
14 changed files with 1125 additions and 231 deletions
Showing only changes of commit d6498278ed - Show all commits

View File

@ -19,7 +19,7 @@ import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.mortbay.log.Log;
import eu.dnetlib.doiboost.orcid.json.JsonWriter;
import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter;
import eu.dnetlib.doiboost.orcid.model.WorkData;
import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser;

View File

@ -19,7 +19,7 @@ import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.mortbay.log.Log;
import eu.dnetlib.doiboost.orcid.json.JsonWriter;
import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter;
import eu.dnetlib.doiboost.orcid.model.AuthorData;
import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser;

View File

@ -0,0 +1,16 @@
package eu.dnetlib.doiboost.orcid.json;
import com.google.gson.Gson;
import com.google.gson.JsonObject;
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
public class JsonHelper {
public static String createOidWork(WorkDataNoDoi workData) {
JsonObject oidWork = new JsonObject();
oidWork.addProperty("oid", workData.getOid());
oidWork.addProperty("work", new Gson().toJson(workData));
return oidWork.toString();
}
}

View File

@ -0,0 +1,149 @@
package eu.dnetlib.doiboost.orcidnodoi;
import eu.dnetlib.doiboost.orcid.json.JsonHelper;
import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter;
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
import eu.dnetlib.doiboost.orcidnodoi.xml.XMLRecordParserNoDoi;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.mortbay.log.Log;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URI;
public class ActivitiesDumpReader {
private static final int MAX_XML_WORKS_PARSED = -1;
private static final int XML_WORKS_PARSED_COUNTER_LOG_INTERVAL = 100000;
public static void parseGzActivities(Configuration conf, String inputUri, Path outputPath)
throws Exception {
String uri = inputUri;
FileSystem fs = FileSystem.get(URI.create(uri), conf);
Path inputPath = new Path(uri);
CompressionCodecFactory factory = new CompressionCodecFactory(conf);
CompressionCodec codec = factory.getCodec(inputPath);
if (codec == null) {
System.err.println("No codec found for " + uri);
System.exit(1);
}
CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());
InputStream gzipInputStream = null;
try {
gzipInputStream = codec.createInputStream(fs.open(inputPath));
parseTarActivities(fs, conf, gzipInputStream, outputPath);
} finally {
Log.debug("Closing gzip stream");
IOUtils.closeStream(gzipInputStream);
}
}
private static void parseTarActivities(
FileSystem fs, Configuration conf, InputStream gzipInputStream, Path outputPath) {
int counter = 0;
int noDoiFound = 0;
int errorFromOrcidFound = 0;
int xmlParserErrorFound = 0;
try (TarArchiveInputStream tais = new TarArchiveInputStream(gzipInputStream)) {
TarArchiveEntry entry = null;
try (SequenceFile.Writer writer = SequenceFile
.createWriter(
conf,
SequenceFile.Writer.file(outputPath),
SequenceFile.Writer.keyClass(Text.class),
SequenceFile.Writer.valueClass(Text.class))) {
while ((entry = tais.getNextTarEntry()) != null) {
String filename = entry.getName();
try {
if (entry.isDirectory() || !filename.contains("works")) {
} else {
Log.debug("XML work entry name: " + entry.getName());
counter++;
BufferedReader br = new BufferedReader(new InputStreamReader(tais)); // Read directly from
// tarInput
String line;
StringBuffer buffer = new StringBuffer();
while ((line = br.readLine()) != null) {
buffer.append(line);
}
WorkDataNoDoi workDataNoDoi = XMLRecordParserNoDoi.VTDParseWorkData(buffer.toString().getBytes());
if (workDataNoDoi != null) {
if (workDataNoDoi.getErrorCode() != null) {
errorFromOrcidFound += 1;
Log
.debug(
"error from Orcid with code "
+ workDataNoDoi.getErrorCode()
+ " for entry "
+ entry.getName());
continue;
}
boolean isDoiFound = workDataNoDoi.getExtIds().stream()
.filter(e -> e.getType()!=null)
.anyMatch(e -> e.getType().equals("doi"));
if (!isDoiFound) {
String jsonData = JsonHelper.createOidWork(workDataNoDoi);
Log.debug("oid: " + workDataNoDoi.getOid() + " data: " + jsonData);
final Text key = new Text(workDataNoDoi.getOid());
final Text value = new Text(jsonData);
try {
writer.append(key, value);
} catch (IOException e) {
Log.debug("Writing to sequence file: " + e.getMessage());
Log.debug(e);
throw new RuntimeException(e);
}
noDoiFound += 1;
}
} else {
Log.warn("Data not retrievable [" + entry.getName() + "] " + buffer.toString());
xmlParserErrorFound += 1;
}
}
} catch (Exception e) {
Log
.warn(
"Parsing work from tar archive and xml work: " + filename + " " + e.getMessage());
Log.warn(e);
}
if ((counter % XML_WORKS_PARSED_COUNTER_LOG_INTERVAL) == 0) {
Log.info("Current xml works parsed: " + counter);
}

What is the reason for not handling nor let propagate this exception? I imagine that a malformed entry in the tar file could cause it, but in that case we should interrupt the procedure and deepen the analysis to spot the error. In this way the error would likely be unnoticed, but causing a drop in the number of output records.

What is the reason for not handling nor let propagate this exception? I imagine that a malformed entry in the tar file could cause it, but in that case we should interrupt the procedure and deepen the analysis to spot the error. In this way the error would likely be unnoticed, but causing a drop in the number of output records.
if ((MAX_XML_WORKS_PARSED > -1) && (counter > MAX_XML_WORKS_PARSED)) {
break;
}
}
}
} catch (IOException e) {
Log.warn("Parsing work from gzip archive: " + e.getMessage());
Log.warn(e);
throw new RuntimeException(e);
}
Log.info("Activities parse completed");
Log.info("Total XML works parsed: " + counter);
Log.info("Total no doi work found: " + noDoiFound);
Log.info("Error from Orcid found: " + errorFromOrcidFound);
Log.info("Error parsing xml work found: " + xmlParserErrorFound);
}
}

View File

@ -0,0 +1,52 @@
package eu.dnetlib.doiboost.orcidnodoi;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.doiboost.orcid.OrcidDSManager;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.mortbay.log.Log;
import java.io.IOException;
public class GenOrcidAuthorWork extends OrcidDSManager {
private String activitiesFileNameTarGz;
private String outputWorksPath;
private String workingPath;
public static void main(String[] args) throws IOException, Exception {
GenOrcidAuthorWork genOrcidAuthorWork = new GenOrcidAuthorWork();
genOrcidAuthorWork.loadArgs(args);
genOrcidAuthorWork.generateAuthorsDOIsData();
}
public void generateAuthorsDOIsData() throws Exception {
Configuration conf = initConfigurationObject();
FileSystem fs = initFileSystemObject(conf);
String tarGzUri = hdfsServerUri.concat(workingPath).concat(activitiesFileNameTarGz);
Path outputPath = new Path(hdfsServerUri.concat(workingPath).concat(outputWorksPath));
ActivitiesDumpReader.parseGzActivities(conf, tarGzUri, outputPath);
}
private void loadArgs(String[] args) throws IOException, Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
GenOrcidAuthorWork.class
.getResourceAsStream(
"/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works_parameters.json")));
parser.parseArgument(args);
hdfsServerUri = parser.get("hdfsServerUri");
Log.info("HDFS URI: " + hdfsServerUri);
workingPath = parser.get("workingPath");
Log.info("Working Path: " + workingPath);
activitiesFileNameTarGz = parser.get("activitiesFileNameTarGz");
Log.info("Activities File Name: " + activitiesFileNameTarGz);
outputWorksPath = parser.get("outputWorksPath");
Log.info("Output Author Work Data: " + outputWorksPath);
}
}

View File

@ -0,0 +1,119 @@
package eu.dnetlib.doiboost.orcidnodoi;
import com.google.gson.Gson;
import com.google.gson.JsonElement;
import com.google.gson.JsonParser;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.doiboost.orcid.model.AuthorData;
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Tuple2;
import java.io.IOException;
import java.util.Objects;
import java.util.Optional;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
public class SparkGenEnrichedOrcidWorks {
public static void main(String[] args) throws IOException, Exception {
Logger logger = LoggerFactory.getLogger(SparkGenEnrichedOrcidWorks.class);
logger.info("[ SparkGenerateDoiAuthorList STARTED]");
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
SparkGenEnrichedOrcidWorks.class
.getResourceAsStream(
"/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works_parameters.json")));
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
logger.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String workingPath = parser.get("workingPath");
logger.info("workingPath: ", workingPath);
final String outputEnrichedWorksPath = parser.get("outputEnrichedWorksPath");
logger.info("outputEnrichedWorksPath: ", outputEnrichedWorksPath);
final String outputWorksPath = parser.get("outputWorksPath");
logger.info("outputWorksPath: ", outputWorksPath);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaPairRDD<Text, Text> summariesRDD = sc
.sequenceFile(workingPath + "../orcid_summaries/output/authors.seq", Text.class, Text.class);
Dataset<AuthorData> summariesDataset = spark
.createDataset(
summariesRDD.map(seq -> loadAuthorFromJson(seq._1(), seq._2())).rdd(),
Encoders.bean(AuthorData.class));
JavaPairRDD<Text, Text> activitiesRDD = sc
.sequenceFile(workingPath + outputWorksPath + "works_X.seq" , Text.class, Text.class);
Dataset<WorkDataNoDoi> activitiesDataset = spark
.createDataset(
activitiesRDD.map(seq -> loadWorkFromJson(seq._1(), seq._2())).rdd(),
Encoders.bean(WorkDataNoDoi.class));
activitiesDataset
.joinWith(
summariesDataset,
activitiesDataset.col("oid").equalTo(summariesDataset.col("oid")), "inner")
.map(
(MapFunction<Tuple2<WorkDataNoDoi, AuthorData>, Tuple2<String, WorkDataNoDoi>>) value -> {
WorkDataNoDoi w = value._1;
AuthorData a = value._2;
AuthorMatcher.match(a, w.getContributors());
return new Tuple2<>(a.getOid(), w);
},
Encoders.tuple(Encoders.STRING(), Encoders.bean(WorkDataNoDoi.class)))
.filter(Objects::nonNull)
.toJavaRDD()
.saveAsTextFile(workingPath + outputEnrichedWorksPath);;
});
}
private static AuthorData loadAuthorFromJson(Text orcidId, Text json) {
AuthorData authorData = new AuthorData();
authorData.setOid(orcidId.toString());
JsonElement jElement = new JsonParser().parse(json.toString());
authorData.setName(getJsonValue(jElement, "name"));
authorData.setSurname(getJsonValue(jElement, "surname"));
authorData.setCreditName(getJsonValue(jElement, "creditname"));
return authorData;
}
private static WorkDataNoDoi loadWorkFromJson(Text orcidId, Text json) {
WorkDataNoDoi workData = new Gson().fromJson(json.toString(), WorkDataNoDoi.class);
return workData;
}
private static String getJsonValue(JsonElement jElement, String property) {
if (jElement.getAsJsonObject().has(property)) {
JsonElement name = null;
name = jElement.getAsJsonObject().get(property);
if (name != null && !name.isJsonNull()) {
return name.getAsString();
}
}
return null;
}
}

View File

@ -1,5 +1,5 @@
package eu.dnetlib.doiboost.orcid.json;
package eu.dnetlib.doiboost.orcidnodoi.json;
import com.google.gson.JsonObject;

View File

@ -8,9 +8,9 @@ import eu.dnetlib.doiboost.orcid.model.AuthorData;
public class Contributor extends AuthorData implements Serializable {
private String sequence;
private String role;
private boolean simpleMatch = false;
private Double score = 0.0;
private boolean bestMatch = false;
private transient boolean simpleMatch = false;
private transient Double score = 0.0;
private transient boolean bestMatch = false;
public String getSequence() {
return sequence;

View File

@ -97,5 +97,4 @@ public class WorkDataNoDoi implements Serializable {
public void setContributors(List<Contributor> contributors) {
this.contributors = contributors;
}
}

View File

@ -0,0 +1,204 @@
package eu.dnetlib.doiboost.orcidnodoi.similarity;
import java.io.IOException;
import java.text.Normalizer;
import java.util.*;
import org.apache.commons.text.similarity.JaroWinklerSimilarity;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.ximpleware.NavException;
import com.ximpleware.ParseException;
import com.ximpleware.XPathEvalException;
import com.ximpleware.XPathParseException;
import eu.dnetlib.dhp.parser.utility.VtdException;
import eu.dnetlib.doiboost.orcid.model.AuthorData;
import eu.dnetlib.doiboost.orcidnodoi.model.Contributor;
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
public class AuthorMatcher {
private static final Logger logger = LoggerFactory.getLogger(AuthorMatcher.class);
private static final Double threshold = 0.8;
public static void match(AuthorData author, List<Contributor> contributors)
throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException {
int matchCounter = 0;
List<Integer> matchCounters = Arrays.asList(matchCounter);
Contributor contributor = null;
contributors.forEach(c -> {
if (normalize(c.getCreditName()).contains(normalize(author.getName())) ||
normalize(c.getCreditName()).contains(normalize(author.getSurname())) ||
((author.getOtherName() != null)
&& normalize(c.getCreditName()).contains(normalize(author.getOtherName())))) {
matchCounters.set(0, matchCounters.get(0) + 1);
c.setSimpleMatch(true);
}
});
logger.info("match counter: " + Integer.toString(matchCounters.get(0)));
if (matchCounters.get(0) == 1) {
updateAuthorsSimpleMatch(contributors, author);
} else if (matchCounters.get(0) > 1) {
Optional<Contributor> optCon = contributors
.stream()
.filter(c -> c.isSimpleMatch())
.map(c -> {
c.setScore(bestMatch(author.getName(), author.getSurname(), c.getCreditName()));
logger.debug("nella map: " + c.getCreditName() + " score: " + c.getScore());
return c;
})
.filter(c -> c.getScore() >= threshold)
.max(Comparator.comparing(c -> c.getScore()));
Contributor bestMatchContributor = null;
if (optCon.isPresent()) {
bestMatchContributor = optCon.get();
bestMatchContributor.setBestMatch(true);
logger.info("best match: " + bestMatchContributor.getCreditName());
updateAuthorsSimilarityMatch(contributors, author);
}
}
logger.info("UPDATED contributors: ");
contributors.forEach(c -> {
logger
.info(
c.getOid() + " - " + c.getCreditName() + " - " +
c.getName() + " - " + c.getSurname() + " - " +
c.getRole() + " - " + c.getSequence());
});
}
private static Double bestMatch(String authorSurname, String authorName, String contributor) {
logger.debug(authorSurname + " " + authorName + " vs " + contributor);
String[] contributorSplitted = contributor.split(" ");
if (contributorSplitted.length == 0) {
return 0.0;
}
final String contributorName = contributorSplitted[contributorSplitted.length - 1];
String contributorSurname = "";
if (contributorSplitted.length > 1) {
StringJoiner joiner = new StringJoiner(" ");
for (int i = 0; i < contributorSplitted.length - 1; i++) {
joiner.add(contributorSplitted[i]);
}
contributorSurname = joiner.toString();
}
logger
.debug(
"contributorName: " + contributorName +
" contributorSurname: " + contributorSurname);
String authorNameNrm = normalize(authorName);
String authorSurnameNrm = normalize(authorSurname);
String contributorNameNrm = normalize(contributorName);
String contributorSurnameNrm = normalize(contributorSurname);
Double sm1 = similarity(authorNameNrm, authorSurnameNrm, contributorNameNrm, contributorSurnameNrm);
Double sm2 = similarity(authorNameNrm, authorSurnameNrm, contributorSurnameNrm, contributorNameNrm);
if (sm1.compareTo(sm2) >= 0) {
return sm1;
}
return sm2;
}
private static Double similarity(String nameA, String surnameA, String nameB, String surnameB) {
Double score = similarityJaroWinkler(nameA, surnameA, nameB, surnameB);
logger
.debug(nameA + ", " + surnameA + " <> " + nameB + ", " + surnameB + " score: " + Double.toString(score));
return score;
}
private static Double similarityJaroWinkler(String nameA, String surnameA, String nameB, String surnameB) {
return new JaroWinklerSimilarity().apply(normalize(parse(nameA, surnameA)), normalize(parse(nameB, surnameB)));
}
private static String normalize(final String s) {
return nfd(s)
.toLowerCase()
// do not compact the regexes in a single expression, would cause StackOverflowError
// in case
// of large input strings
.replaceAll("(\\W)+", " ")
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
.replaceAll("(\\p{Punct})+", " ")
.replaceAll("(\\d)+", " ")
.replaceAll("(\\n)+", " ")
.trim();
}
private static String nfd(final String s) {
return Normalizer.normalize(s, Normalizer.Form.NFD);
}
private static String parse(String name, String surname) {
return surname + " " + name;
}
private static void updateAuthorsSimpleMatch(List<Contributor> contributors, AuthorData author) {
contributors.forEach(c -> {
if (c.isSimpleMatch()) {
logger.info("simple match on : " + c.getCreditName());
c.setName(author.getName());
c.setSurname(author.getSurname());
c.setOid(author.getOid());
}
});
updateRanks(contributors);
}
private static void updateAuthorsSimilarityMatch(List<Contributor> contributors, AuthorData author) {
logger.info("inside updateAuthorsSimilarityMatch ...");
contributors.forEach(c -> {
logger
.info(
c.getOid() + " - " + c.getCreditName() + " - " +
c.getName() + " - " + c.getSurname() + " - " +
c.getRole() + " - " + c.getSequence() + " - best: " + c.isBestMatch() + " - simpe: "
+ c.isSimpleMatch());
});
contributors
.stream()
.filter(c -> c.isBestMatch())
.forEach(c -> {
logger.info("similarity match on : " + c.getCreditName());
c.setName(author.getName());
c.setSurname(author.getSurname());
c.setOid(author.getOid());
});
updateRanks(contributors);
}
private static void updateRanks(List<Contributor> contributors) {
boolean seqFound = false;
if (contributors
.stream()
.filter(
c -> c.getRole() != null && c.getSequence() != null &&
c.getRole().equals("author") && (c.getSequence().equals("first") ||
c.getSequence().equals("additional")))
.count() > 0) {
seqFound = true;
logger.info("sequence data found");
}
if (!seqFound) {
List<Integer> seqIds = Arrays.asList(0);
contributors.forEach(c -> {
int currentSeq = seqIds.get(0) + 1;
seqIds.set(0, currentSeq);
c.setSequence(Integer.toString(seqIds.get(0)));
});
}
}
private static String toJson(WorkDataNoDoi work) {
GsonBuilder builder = new GsonBuilder();
Gson gson = builder.create();
return gson.toJson(work);
}
}

View File

@ -0,0 +1,22 @@
<configuration>
<property>
<name>oozie.action.sharelib.for.java</name>
<value>spark2</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
<property>
<name>oozie.launcher.mapreduce.map.java.opts</name>
<value>-Xmx4g</value>
</property>
<property>
<name>jobTracker</name>
<value>hadoop-rm3.garr-pa1.d4science.org:8032</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://hadoop-rm1.garr-pa1.d4science.org:8020</value>
</property>
</configuration>

View File

@ -0,0 +1,524 @@
<workflow-app name="Gen Enriched Orcid Works" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>workingPath_activities</name>
<description>the working dir base path</description>
</property>
<property>
<name>shell_cmd_0</name>
<value>wget -O /tmp/ORCID_2019_activites_0.tar.gz https://orcid.figshare.com/ndownloader/files/18017660 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_0.tar.gz /data/orcid_activities/ORCID_2019_activites_0.tar.gz ; rm -f /tmp/ORCID_2019_activites_0.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 0</description>
</property>
<property>
<name>shell_cmd_1</name>
<value>wget -O /tmp/ORCID_2019_activites_1.tar.gz https://orcid.figshare.com/ndownloader/files/18017675 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_1.tar.gz /data/orcid_activities/ORCID_2019_activites_1.tar.gz ; rm -f /tmp/ORCID_2019_activites_1.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 1</description>
</property>
<property>
<name>shell_cmd_2</name>
<value>wget -O /tmp/ORCID_2019_activites_2.tar.gz https://orcid.figshare.com/ndownloader/files/18017717 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_2.tar.gz /data/orcid_activities/ORCID_2019_activites_2.tar.gz ; rm -f /tmp/ORCID_2019_activites_2.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 2</description>
</property>
<property>
<name>shell_cmd_3</name>
<value>wget -O /tmp/ORCID_2019_activites_3.tar.gz https://orcid.figshare.com/ndownloader/files/18017765 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_3.tar.gz /data/orcid_activities/ORCID_2019_activites_3.tar.gz ; rm -f /tmp/ORCID_2019_activites_3.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 3</description>
</property>
<property>
<name>shell_cmd_4</name>
<value>wget -O /tmp/ORCID_2019_activites_4.tar.gz https://orcid.figshare.com/ndownloader/files/18017831 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_4.tar.gz /data/orcid_activities/ORCID_2019_activites_4.tar.gz ; rm -f /tmp/ORCID_2019_activites_4.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 4</description>
</property>
<property>
<name>shell_cmd_5</name>
<value>wget -O /tmp/ORCID_2019_activites_5.tar.gz https://orcid.figshare.com/ndownloader/files/18017987 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_5.tar.gz /data/orcid_activities/ORCID_2019_activites_5.tar.gz ; rm -f /tmp/ORCID_2019_activites_5.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 5</description>
</property>
<property>
<name>shell_cmd_6</name>
<value>wget -O /tmp/ORCID_2019_activites_6.tar.gz https://orcid.figshare.com/ndownloader/files/18018053 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_6.tar.gz /data/orcid_activities/ORCID_2019_activites_6.tar.gz ; rm -f /tmp/ORCID_2019_activites_6.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 6</description>
</property>
<property>
<name>shell_cmd_7</name>
<value>wget -O /tmp/ORCID_2019_activites_7.tar.gz https://orcid.figshare.com/ndownloader/files/18018023 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_7.tar.gz /data/orcid_activities/ORCID_2019_activites_7.tar.gz ; rm -f /tmp/ORCID_2019_activites_7.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 7</description>
</property>
<property>
<name>shell_cmd_8</name>
<value>wget -O /tmp/ORCID_2019_activites_8.tar.gz https://orcid.figshare.com/ndownloader/files/18018248 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_8.tar.gz /data/orcid_activities/ORCID_2019_activites_8.tar.gz ; rm -f /tmp/ORCID_2019_activites_8.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 8</description>
</property>
<property>
<name>shell_cmd_9</name>
<value>wget -O /tmp/ORCID_2019_activites_9.tar.gz https://orcid.figshare.com/ndownloader/files/18018029 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_9.tar.gz /data/orcid_activities/ORCID_2019_activites_9.tar.gz ; rm -f /tmp/ORCID_2019_activites_9.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 9</description>
</property>
<property>
<name>shell_cmd_X</name>
<value>wget -O /tmp/ORCID_2019_activites_X.tar.gz https://orcid.figshare.com/ndownloader/files/18018182 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_X.tar.gz /data/orcid_activities/ORCID_2019_activites_X.tar.gz ; rm -f /tmp/ORCID_2019_activites_X.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file X</description>
</property>
</parameters>
<start to="ResetWorkingPath"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="ResetWorkingPath">
<fs>
<delete path='${workingPath_activities}/no_doi_works/*'/>
<delete path='${workingPath_activities}/no_doi_enriched_works/*'/>
</fs>
<ok to="fork_gen_orcid_author_work"/>
<error to="Kill"/>
</action>
<fork name = "fork_gen_orcid_author_work">
<path start = "check_exist_on_hdfs_activities_0"/>
<path start = "check_exist_on_hdfs_activities_1"/>
<path start = "check_exist_on_hdfs_activities_2"/>
<path start = "check_exist_on_hdfs_activities_3"/>
<path start = "check_exist_on_hdfs_activities_4"/>
<path start = "check_exist_on_hdfs_activities_5"/>
<path start = "check_exist_on_hdfs_activities_6"/>
<path start = "check_exist_on_hdfs_activities_7"/>
<path start = "check_exist_on_hdfs_activities_8"/>
<path start = "check_exist_on_hdfs_activities_9"/>
<path start = "check_exist_on_hdfs_activities_X"/>
</fork>
<decision name="check_exist_on_hdfs_activities_0">
<switch>
<case to="GenOrcidAuthorWork_0">
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_0.tar.gz'))}
</case>
<default to="Download_0" />
</switch>
</decision>
<action name="Download_0">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_0}</argument>
<capture-output/>
</shell>
<ok to="GenOrcidAuthorWork_0"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_0">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath_activities}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_0.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_0.seq</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_1">
<switch>
<case to="GenOrcidAuthorWork_1">
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_1.tar.gz'))}
</case>
<default to="Download_1" />
</switch>
</decision>
<action name="Download_1">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_1}</argument>
<capture-output/>
</shell>
<ok to="GenOrcidAuthorWork_1"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_1">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath_activities}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_1.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_1.seq</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_2">
<switch>
<case to="GenOrcidAuthorWork_2">
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_2.tar.gz'))}
</case>
<default to="Download_2" />
</switch>
</decision>
<action name="Download_2">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_2}</argument>
<capture-output/>
</shell>
<ok to="GenOrcidAuthorWork_2"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_2">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath_activities}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_2.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_2.seq</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_3">
<switch>
<case to="GenOrcidAuthorWork_3">
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_3.tar.gz'))}
</case>
<default to="Download_3" />
</switch>
</decision>
<action name="Download_3">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_3}</argument>
<capture-output/>
</shell>
<ok to="GenOrcidAuthorWork_3"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_3">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath_activities}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_3.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_3.seq</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_4">
<switch>
<case to="GenOrcidAuthorWork_4">
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_4.tar.gz'))}
</case>
<default to="Download_4" />
</switch>
</decision>
<action name="Download_4">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_4}</argument>
<capture-output/>
</shell>
<ok to="GenOrcidAuthorWork_4"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_4">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath_activities}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_4.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_4.seq</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_5">
<switch>
<case to="GenOrcidAuthorWork_5">
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_5.tar.gz'))}
</case>
<default to="Download_5" />
</switch>
</decision>
<action name="Download_5">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_5}</argument>
<capture-output/>
</shell>
<ok to="GenOrcidAuthorWork_5"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_5">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath_activities}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_5.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_5.seq</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_6">
<switch>
<case to="GenOrcidAuthorWork_6">
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_6.tar.gz'))}
</case>
<default to="Download_6" />
</switch>
</decision>
<action name="Download_6">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_6}</argument>
<capture-output/>
</shell>
<ok to="GenOrcidAuthorWork_6"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_6">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath_activities}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_6.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_6.seq</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_7">
<switch>
<case to="GenOrcidAuthorWork_7">
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_7.tar.gz'))}
</case>
<default to="Download_7" />
</switch>
</decision>
<action name="Download_7">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_7}</argument>
<capture-output/>
</shell>
<ok to="GenOrcidAuthorWork_7"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_7">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath_activities}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_7.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_7.seq</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_8">
<switch>
<case to="GenOrcidAuthorWork_8">
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_8.tar.gz'))}
</case>
<default to="Download_8" />
</switch>
</decision>
<action name="Download_8">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_8}</argument>
<capture-output/>
</shell>
<ok to="GenOrcidAuthorWork_8"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_8">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath_activities}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_8.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_8.seq</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_9">
<switch>
<case to="GenOrcidAuthorWork_9">
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_9.tar.gz'))}
</case>
<default to="Download_9" />
</switch>
</decision>
<action name="Download_9">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_9}</argument>
<capture-output/>
</shell>
<ok to="GenOrcidAuthorWork_9"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_9">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath_activities}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_9.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_9.seq</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_X">
<switch>
<case to="GenOrcidAuthorWork_X">
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_X.tar.gz'))}
</case>
<default to="Download_X" />
</switch>
</decision>
<action name="Download_X">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_X}</argument>
<capture-output/>
</shell>
<ok to="GenOrcidAuthorWork_X"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_X">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath_activities}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_activites_X.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_X.seq</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<join name = "join_node" to = "Gen_Enriched_Orcid_Works"/>
<action name="Gen_Enriched_Orcid_Works">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn</master>
<mode>cluster</mode>
<name>Gen_Enriched_Orcid_Works</name>
<class>eu.dnetlib.doiboost.orcidnodoi.SparkGenEnrichedOrcidWorks</class>
<jar>dhp-doiboost-1.2.3-SNAPSHOT.jar</jar>
<spark-opts>--num-executors 10 --conf spark.yarn.jars=&quot;hdfs://hadoop-rm1.garr-pa1.d4science.org:8020/user/oozie/share/lib/lib_20180405103059/spark2&quot; --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory}
</spark-opts>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-ow</arg><arg>no_doi_works/</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -0,0 +1,7 @@
[
{"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true},
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the default work path", "paramRequired": true},
{"paramName":"f", "paramLongName":"activitiesFileNameTarGz", "paramDescription": "the name of the activities orcid file", "paramRequired": true},
{"paramName":"ow", "paramLongName":"outputWorksPath", "paramDescription": "the relative folder of the sequencial file to write", "paramRequired": true},
{"paramName":"oew", "paramLongName":"outputEnrichedWorksPath", "paramDescription": "the relative folder of the sequencial file to write the data", "paramRequired": true}
]

View File

@ -1,15 +1,12 @@
package eu.dnetlib.doiboost.orcidnodoi.xml;
import com.ximpleware.NavException;
import com.ximpleware.ParseException;
import com.ximpleware.XPathEvalException;
import com.ximpleware.XPathParseException;
import eu.dnetlib.dhp.parser.utility.VtdException;
import eu.dnetlib.doiboost.orcid.model.AuthorData;
import eu.dnetlib.doiboost.orcidnodoi.model.Contributor;
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
import jdk.nashorn.internal.ir.annotations.Ignore;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import java.io.IOException;
import java.text.Normalizer;
import java.util.*;
import org.apache.commons.io.IOUtils;
import org.apache.commons.text.similarity.JaccardSimilarity;
import org.apache.commons.text.similarity.JaroWinklerSimilarity;
@ -17,11 +14,20 @@ import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.text.Normalizer;
import java.util.*;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.ximpleware.NavException;
import com.ximpleware.ParseException;
import com.ximpleware.XPathEvalException;
import com.ximpleware.XPathParseException;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import eu.dnetlib.dhp.parser.utility.VtdException;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.doiboost.orcid.model.AuthorData;
import eu.dnetlib.doiboost.orcidnodoi.model.Contributor;
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher;
import jdk.nashorn.internal.ir.annotations.Ignore;
public class OrcidNoDoiTest {

Why are the tests in this class marked as @Ignore? I see you spent some effort implementing them, I didn't dive much into the details, but I see assertions properly defined so if they are valuable I suggest to keep the test as active.

Why are the tests in this class marked as `@Ignore`? I see you spent some effort implementing them, I didn't dive much into the details, but I see assertions properly defined so if they are valuable I suggest to keep the test as active.
@ -33,100 +39,10 @@ public class OrcidNoDoiTest {
String nameB = "K";
String surnameB = "Abdel-Dayem";
String orcidIdA = "0000-0003-2760-1191";
Double threshold = 0.8;
@Test
@Ignore
private void similarityTest() throws Exception {
logger.info("running testSimilarity ....");
logger
.info(
"JaroWinklerSimilarity: "
+ Double.toString(similarityJaroWinkler(nameA, surnameA, nameB, surnameB)));
logger
.info(
"JaccardSimilarity: " + Double.toString(similarityJaccard(nameA, surnameA, nameB, surnameB)));
}
@Test
@Ignore
private void bestMatchTest() throws Exception {
logger.info("running bestMatchTest ....");
String contributor = surnameB + ", " + nameB;
logger.info("score: " + Double.toString(bestMatch(surnameA, nameA, contributor)));
}
private static Double bestMatch(String authorSurname, String authorName, String contributor) {
logger.debug(authorSurname + " " + authorName + " vs " + contributor);
String[] contributorSplitted = contributor.split(" ");
if (contributorSplitted.length == 0) {
return 0.0;
}
final String contributorName = contributorSplitted[contributorSplitted.length - 1];
String contributorSurname = "";
if (contributorSplitted.length > 1) {
StringJoiner joiner = new StringJoiner(" ");
for (int i = 0; i < contributorSplitted.length - 1; i++) {
joiner.add(contributorSplitted[i]);
}
contributorSurname = joiner.toString();
}
logger
.debug(
"contributorName: " + contributorName +
" contributorSurname: " + contributorSurname);
String authorNameNrm = normalize(authorName);
String authorSurnameNrm = normalize(authorSurname);
String contributorNameNrm = normalize(contributorName);
String contributorSurnameNrm = normalize(contributorSurname);
Double sm1 = similarity(authorNameNrm, authorSurnameNrm, contributorNameNrm, contributorSurnameNrm);
Double sm2 = similarity(authorNameNrm, authorSurnameNrm, contributorSurnameNrm, contributorNameNrm);
if (sm1.compareTo(sm2) >= 0) {
return sm1;
}
return sm2;
}
private static Double similarity(String nameA, String surnameA, String nameB, String surnameB) {
Double score = similarityJaroWinkler(nameA, surnameA, nameB, surnameB);
logger
.debug(nameA + ", " + surnameA + " <> " + nameB + ", " + surnameB + " score: " + Double.toString(score));
return score;
}
private static Double similarityJaccard(String nameA, String surnameA, String nameB, String surnameB) {
return new JaccardSimilarity().apply(normalize(parse(nameA, surnameA)), normalize(parse(nameB, surnameB)));
}
private static Double similarityJaroWinkler(String nameA, String surnameA, String nameB, String surnameB) {
return new JaroWinklerSimilarity().apply(normalize(parse(nameA, surnameA)), normalize(parse(nameB, surnameB)));
}
private static String parse(String name, String surname) {
return surname + " " + name;
}
private static String normalize(final String s) {
return nfd(s)
.toLowerCase()
// do not compact the regexes in a single expression, would cause StackOverflowError
// in case
// of large input strings
.replaceAll("(\\W)+", " ")
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
.replaceAll("(\\p{Punct})+", " ")
.replaceAll("(\\d)+", " ")
.replaceAll("(\\n)+", " ")
.trim();
}
private static String nfd(final String s) {
return Normalizer.normalize(s, Normalizer.Form.NFD);
}
@Test
@Ignore
public void readPublicationFieldsTest()
private void readPublicationFieldsTest()
throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException {
logger.info("running loadPublicationFieldsTest ....");
String xml = IOUtils
@ -178,78 +94,10 @@ public class OrcidNoDoiTest {
}
private void updateRanks(List<Contributor> contributors) {
boolean seqFound = false;
if (contributors
.stream()
.filter(
c -> c.getRole() != null && c.getSequence() != null &&
c.getRole().equals("author") && (c.getSequence().equals("first") ||
c.getSequence().equals("additional")))
.count() > 0) {
seqFound = true;
logger.info("sequence data found");
}
if (!seqFound) {
List<Integer> seqIds = Arrays.asList(0);
contributors.forEach(c -> {
int currentSeq = seqIds.get(0) + 1;
seqIds.set(0, currentSeq);
c.setSequence(Integer.toString(seqIds.get(0)));
});
}
}
private void updateAuthorsSimpleMatch(List<Contributor> contributors, AuthorData author) {
contributors.forEach(c -> {
if (c.isSimpleMatch()) {
logger.info("simple match on : " + c.getCreditName());
c.setName(author.getName());
c.setSurname(author.getSurname());
c.setOid(author.getOid());
}
});
updateRanks(contributors);
}
private void updateAuthorsSimilarityMatch(List<Contributor> contributors, AuthorData author) {
logger.info("inside updateAuthorsSimilarityMatch ...");
contributors.forEach(c -> {
logger
.info(
c.getOid() + " - " + c.getCreditName() + " - " +
c.getName() + " - " + c.getSurname() + " - " +
c.getRole() + " - " + c.getSequence() + " - best: " + c.isBestMatch() + " - simpe: "
+ c.isSimpleMatch());
});
contributors
.stream()
.filter(c -> c.isBestMatch())
.forEach(c -> {
logger.info("similarity match on : " + c.getCreditName());
c.setName(author.getName());
c.setSurname(author.getSurname());
c.setOid(author.getOid());
});
updateRanks(contributors);
}
@Test
@Ignore
public void authorSimilarityMatchTest() throws Exception {
logger.info("running authorSimilarityMatchTest ....");
authorMatchTest("activity_work_0000-0003-2760-1191-similarity.xml");
}
@Test
private void authorSimpleMatchTest() throws Exception {
public void authorMatchTest() throws Exception {
logger.info("running authorSimpleMatchTest ....");
authorMatchTest("activity_work_0000-0003-2760-1191.xml");
}
private void authorMatchTest(String orcidWork)
throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException {
String orcidWork = "activity_work_0000-0003-2760-1191-similarity.xml";
AuthorData author = new AuthorData();
author.setName(nameA);
author.setSurname(surnameA);
@ -272,55 +120,9 @@ public class OrcidNoDoiTest {
logger.error("parsing xml", e);
}
assertNotNull(workData);
int matchCounter = 0;
List<Integer> matchCounters = Arrays.asList(matchCounter);
Contributor contributor = null;
workData.getContributors().forEach(c -> {
if (normalize(c.getCreditName()).contains(normalize(author.getName())) ||
normalize(c.getCreditName()).contains(normalize(author.getSurname())) ||
((author.getOtherName() != null)
&& normalize(c.getCreditName()).contains(normalize(author.getOtherName())))) {
matchCounters.set(0, matchCounters.get(0) + 1);
c.setSimpleMatch(true);
}
});
logger.info("match counter: " + Integer.toString(matchCounters.get(0)));
if (matchCounters.get(0) == 1) {
updateAuthorsSimpleMatch(workData.getContributors(), author);
} else if (matchCounters.get(0) > 1) {
Optional<Contributor> optCon = workData
.getContributors()
.stream()
.filter(c -> c.isSimpleMatch())
.map(c -> {
c.setScore(bestMatch(nameA, surnameA, c.getCreditName()));
logger.debug("nella map: " + c.getCreditName() + " score: " + c.getScore());
return c;
})
.filter(c -> c.getScore() >= threshold)
.max(Comparator.comparing(c -> c.getScore()));
Contributor bestMatchContributor = null;
if (optCon.isPresent()) {
bestMatchContributor = optCon.get();
bestMatchContributor.setBestMatch(true);
logger.info("best match: " + bestMatchContributor.getCreditName());
updateAuthorsSimilarityMatch(workData.getContributors(), author);
}
}
logger.info("UPDATED contributors: ");
workData.getContributors().forEach(c -> {
logger
.info(
c.getOid() + " - " + c.getCreditName() + " - " +
c.getName() + " - " + c.getSurname() + " - " +
c.getRole() + " - " + c.getSequence());
});
AuthorMatcher.match(author, workData.getContributors());
GsonBuilder builder = new GsonBuilder();
Gson gson = builder.create();
logger.info(gson.toJson(workData));
}
}
//
// orcid_RDD = sc.textFile(ORCID_DUMP_PATH)
// no_doi_works_RDD = orcid_RDD.map(orcid_map).filter(lambda x:x is not None).map(lambda x: json.dumps(x)).saveAsTextFile(path=ORCID_OPENAIRE_PATH,compressionCodecClass="org.apache.hadoop.io.compress.GzipCodec")
//