orcid-no-doi #43
|
@ -19,7 +19,7 @@ import org.apache.hadoop.io.compress.CompressionCodec;
|
||||||
import org.apache.hadoop.io.compress.CompressionCodecFactory;
|
import org.apache.hadoop.io.compress.CompressionCodecFactory;
|
||||||
import org.mortbay.log.Log;
|
import org.mortbay.log.Log;
|
||||||
|
|
||||||
import eu.dnetlib.doiboost.orcid.json.JsonWriter;
|
import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter;
|
||||||
import eu.dnetlib.doiboost.orcid.model.WorkData;
|
import eu.dnetlib.doiboost.orcid.model.WorkData;
|
||||||
import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser;
|
import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser;
|
||||||
|
|
||||||
|
|
|
@ -19,7 +19,7 @@ import org.apache.hadoop.io.compress.CompressionCodec;
|
||||||
import org.apache.hadoop.io.compress.CompressionCodecFactory;
|
import org.apache.hadoop.io.compress.CompressionCodecFactory;
|
||||||
import org.mortbay.log.Log;
|
import org.mortbay.log.Log;
|
||||||
|
|
||||||
import eu.dnetlib.doiboost.orcid.json.JsonWriter;
|
import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter;
|
||||||
import eu.dnetlib.doiboost.orcid.model.AuthorData;
|
import eu.dnetlib.doiboost.orcid.model.AuthorData;
|
||||||
import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser;
|
import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser;
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,16 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.doiboost.orcid.json;
|
||||||
|
|
||||||
|
import com.google.gson.Gson;
|
||||||
|
import com.google.gson.JsonObject;
|
||||||
|
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
|
||||||
|
|
||||||
|
public class JsonHelper {
|
||||||
|
|
||||||
|
public static String createOidWork(WorkDataNoDoi workData) {
|
||||||
|
JsonObject oidWork = new JsonObject();
|
||||||
|
oidWork.addProperty("oid", workData.getOid());
|
||||||
|
oidWork.addProperty("work", new Gson().toJson(workData));
|
||||||
|
return oidWork.toString();
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,149 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.doiboost.orcidnodoi;
|
||||||
|
|
||||||
|
import eu.dnetlib.doiboost.orcid.json.JsonHelper;
|
||||||
|
import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter;
|
||||||
|
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
|
||||||
|
import eu.dnetlib.doiboost.orcidnodoi.xml.XMLRecordParserNoDoi;
|
||||||
|
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
|
||||||
|
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
|
||||||
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
|
import org.apache.hadoop.fs.Path;
|
||||||
|
import org.apache.hadoop.io.IOUtils;
|
||||||
|
import org.apache.hadoop.io.SequenceFile;
|
||||||
|
import org.apache.hadoop.io.Text;
|
||||||
|
import org.apache.hadoop.io.compress.CompressionCodec;
|
||||||
|
import org.apache.hadoop.io.compress.CompressionCodecFactory;
|
||||||
|
import org.mortbay.log.Log;
|
||||||
|
|
||||||
|
import java.io.BufferedReader;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.io.InputStreamReader;
|
||||||
|
import java.net.URI;
|
||||||
|
|
||||||
|
public class ActivitiesDumpReader {
|
||||||
|
|
||||||
|
private static final int MAX_XML_WORKS_PARSED = -1;
|
||||||
|
private static final int XML_WORKS_PARSED_COUNTER_LOG_INTERVAL = 100000;
|
||||||
|
|
||||||
|
public static void parseGzActivities(Configuration conf, String inputUri, Path outputPath)
|
||||||
|
throws Exception {
|
||||||
|
String uri = inputUri;
|
||||||
|
FileSystem fs = FileSystem.get(URI.create(uri), conf);
|
||||||
|
Path inputPath = new Path(uri);
|
||||||
|
CompressionCodecFactory factory = new CompressionCodecFactory(conf);
|
||||||
|
CompressionCodec codec = factory.getCodec(inputPath);
|
||||||
|
if (codec == null) {
|
||||||
|
System.err.println("No codec found for " + uri);
|
||||||
|
System.exit(1);
|
||||||
|
}
|
||||||
|
CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());
|
||||||
|
InputStream gzipInputStream = null;
|
||||||
|
try {
|
||||||
|
gzipInputStream = codec.createInputStream(fs.open(inputPath));
|
||||||
|
parseTarActivities(fs, conf, gzipInputStream, outputPath);
|
||||||
|
|
||||||
|
} finally {
|
||||||
|
Log.debug("Closing gzip stream");
|
||||||
|
IOUtils.closeStream(gzipInputStream);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void parseTarActivities(
|
||||||
|
FileSystem fs, Configuration conf, InputStream gzipInputStream, Path outputPath) {
|
||||||
|
int counter = 0;
|
||||||
|
int noDoiFound = 0;
|
||||||
|
int errorFromOrcidFound = 0;
|
||||||
|
int xmlParserErrorFound = 0;
|
||||||
|
try (TarArchiveInputStream tais = new TarArchiveInputStream(gzipInputStream)) {
|
||||||
|
TarArchiveEntry entry = null;
|
||||||
|
|
||||||
|
try (SequenceFile.Writer writer = SequenceFile
|
||||||
|
.createWriter(
|
||||||
|
conf,
|
||||||
|
SequenceFile.Writer.file(outputPath),
|
||||||
|
SequenceFile.Writer.keyClass(Text.class),
|
||||||
|
SequenceFile.Writer.valueClass(Text.class))) {
|
||||||
|
while ((entry = tais.getNextTarEntry()) != null) {
|
||||||
|
String filename = entry.getName();
|
||||||
|
|
||||||
|
try {
|
||||||
|
if (entry.isDirectory() || !filename.contains("works")) {
|
||||||
|
|
||||||
|
} else {
|
||||||
|
Log.debug("XML work entry name: " + entry.getName());
|
||||||
|
counter++;
|
||||||
|
BufferedReader br = new BufferedReader(new InputStreamReader(tais)); // Read directly from
|
||||||
|
// tarInput
|
||||||
|
String line;
|
||||||
|
StringBuffer buffer = new StringBuffer();
|
||||||
|
while ((line = br.readLine()) != null) {
|
||||||
|
buffer.append(line);
|
||||||
|
}
|
||||||
|
WorkDataNoDoi workDataNoDoi = XMLRecordParserNoDoi.VTDParseWorkData(buffer.toString().getBytes());
|
||||||
|
if (workDataNoDoi != null) {
|
||||||
|
if (workDataNoDoi.getErrorCode() != null) {
|
||||||
|
errorFromOrcidFound += 1;
|
||||||
|
Log
|
||||||
|
.debug(
|
||||||
|
"error from Orcid with code "
|
||||||
|
+ workDataNoDoi.getErrorCode()
|
||||||
|
+ " for entry "
|
||||||
|
+ entry.getName());
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
boolean isDoiFound = workDataNoDoi.getExtIds().stream()
|
||||||
|
.filter(e -> e.getType()!=null)
|
||||||
|
.anyMatch(e -> e.getType().equals("doi"));
|
||||||
|
if (!isDoiFound) {
|
||||||
|
String jsonData = JsonHelper.createOidWork(workDataNoDoi);
|
||||||
|
Log.debug("oid: " + workDataNoDoi.getOid() + " data: " + jsonData);
|
||||||
|
|
||||||
|
final Text key = new Text(workDataNoDoi.getOid());
|
||||||
|
final Text value = new Text(jsonData);
|
||||||
|
|
||||||
|
try {
|
||||||
|
writer.append(key, value);
|
||||||
|
} catch (IOException e) {
|
||||||
|
Log.debug("Writing to sequence file: " + e.getMessage());
|
||||||
|
Log.debug(e);
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
noDoiFound += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
} else {
|
||||||
|
Log.warn("Data not retrievable [" + entry.getName() + "] " + buffer.toString());
|
||||||
|
xmlParserErrorFound += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (Exception e) {
|
||||||
|
Log
|
||||||
|
.warn(
|
||||||
|
"Parsing work from tar archive and xml work: " + filename + " " + e.getMessage());
|
||||||
|
Log.warn(e);
|
||||||
|
}
|
||||||
|
|
||||||
|
if ((counter % XML_WORKS_PARSED_COUNTER_LOG_INTERVAL) == 0) {
|
||||||
|
Log.info("Current xml works parsed: " + counter);
|
||||||
|
}
|
||||||
|
|||||||
|
|
||||||
|
if ((MAX_XML_WORKS_PARSED > -1) && (counter > MAX_XML_WORKS_PARSED)) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (IOException e) {
|
||||||
|
Log.warn("Parsing work from gzip archive: " + e.getMessage());
|
||||||
|
Log.warn(e);
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
Log.info("Activities parse completed");
|
||||||
|
Log.info("Total XML works parsed: " + counter);
|
||||||
|
Log.info("Total no doi work found: " + noDoiFound);
|
||||||
|
Log.info("Error from Orcid found: " + errorFromOrcidFound);
|
||||||
|
Log.info("Error parsing xml work found: " + xmlParserErrorFound);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,52 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.doiboost.orcidnodoi;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
import eu.dnetlib.doiboost.orcid.OrcidDSManager;
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
|
import org.apache.hadoop.fs.Path;
|
||||||
|
import org.mortbay.log.Log;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
public class GenOrcidAuthorWork extends OrcidDSManager {
|
||||||
|
|
||||||
|
private String activitiesFileNameTarGz;
|
||||||
|
private String outputWorksPath;
|
||||||
|
private String workingPath;
|
||||||
|
|
||||||
|
public static void main(String[] args) throws IOException, Exception {
|
||||||
|
GenOrcidAuthorWork genOrcidAuthorWork = new GenOrcidAuthorWork();
|
||||||
|
genOrcidAuthorWork.loadArgs(args);
|
||||||
|
genOrcidAuthorWork.generateAuthorsDOIsData();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void generateAuthorsDOIsData() throws Exception {
|
||||||
|
Configuration conf = initConfigurationObject();
|
||||||
|
FileSystem fs = initFileSystemObject(conf);
|
||||||
|
String tarGzUri = hdfsServerUri.concat(workingPath).concat(activitiesFileNameTarGz);
|
||||||
|
Path outputPath = new Path(hdfsServerUri.concat(workingPath).concat(outputWorksPath));
|
||||||
|
ActivitiesDumpReader.parseGzActivities(conf, tarGzUri, outputPath);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void loadArgs(String[] args) throws IOException, Exception {
|
||||||
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||||
|
IOUtils
|
||||||
|
.toString(
|
||||||
|
GenOrcidAuthorWork.class
|
||||||
|
.getResourceAsStream(
|
||||||
|
"/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works_parameters.json")));
|
||||||
|
parser.parseArgument(args);
|
||||||
|
|
||||||
|
hdfsServerUri = parser.get("hdfsServerUri");
|
||||||
|
Log.info("HDFS URI: " + hdfsServerUri);
|
||||||
|
workingPath = parser.get("workingPath");
|
||||||
|
Log.info("Working Path: " + workingPath);
|
||||||
|
activitiesFileNameTarGz = parser.get("activitiesFileNameTarGz");
|
||||||
|
Log.info("Activities File Name: " + activitiesFileNameTarGz);
|
||||||
|
outputWorksPath = parser.get("outputWorksPath");
|
||||||
|
Log.info("Output Author Work Data: " + outputWorksPath);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,119 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.doiboost.orcidnodoi;
|
||||||
|
|
||||||
|
import com.google.gson.Gson;
|
||||||
|
import com.google.gson.JsonElement;
|
||||||
|
import com.google.gson.JsonParser;
|
||||||
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
import eu.dnetlib.doiboost.orcid.model.AuthorData;
|
||||||
|
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
|
||||||
|
import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher;
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.hadoop.io.Text;
|
||||||
|
import org.apache.spark.SparkConf;
|
||||||
|
import org.apache.spark.api.java.JavaPairRDD;
|
||||||
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
|
import org.apache.spark.sql.Dataset;
|
||||||
|
import org.apache.spark.sql.Encoders;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
import scala.Tuple2;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.Objects;
|
||||||
|
import java.util.Optional;
|
||||||
|
|
||||||
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||||
|
|
||||||
|
public class SparkGenEnrichedOrcidWorks {
|
||||||
|
|
||||||
|
public static void main(String[] args) throws IOException, Exception {
|
||||||
|
Logger logger = LoggerFactory.getLogger(SparkGenEnrichedOrcidWorks.class);
|
||||||
|
logger.info("[ SparkGenerateDoiAuthorList STARTED]");
|
||||||
|
|
||||||
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||||
|
IOUtils
|
||||||
|
.toString(
|
||||||
|
SparkGenEnrichedOrcidWorks.class
|
||||||
|
.getResourceAsStream(
|
||||||
|
"/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works_parameters.json")));
|
||||||
|
parser.parseArgument(args);
|
||||||
|
Boolean isSparkSessionManaged = Optional
|
||||||
|
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||||
|
.map(Boolean::valueOf)
|
||||||
|
.orElse(Boolean.TRUE);
|
||||||
|
logger.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||||
|
final String workingPath = parser.get("workingPath");
|
||||||
|
logger.info("workingPath: ", workingPath);
|
||||||
|
final String outputEnrichedWorksPath = parser.get("outputEnrichedWorksPath");
|
||||||
|
logger.info("outputEnrichedWorksPath: ", outputEnrichedWorksPath);
|
||||||
|
final String outputWorksPath = parser.get("outputWorksPath");
|
||||||
|
logger.info("outputWorksPath: ", outputWorksPath);
|
||||||
|
|
||||||
|
SparkConf conf = new SparkConf();
|
||||||
|
runWithSparkSession(
|
||||||
|
conf,
|
||||||
|
isSparkSessionManaged,
|
||||||
|
spark -> {
|
||||||
|
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||||
|
|
||||||
|
JavaPairRDD<Text, Text> summariesRDD = sc
|
||||||
|
.sequenceFile(workingPath + "../orcid_summaries/output/authors.seq", Text.class, Text.class);
|
||||||
|
Dataset<AuthorData> summariesDataset = spark
|
||||||
|
.createDataset(
|
||||||
|
summariesRDD.map(seq -> loadAuthorFromJson(seq._1(), seq._2())).rdd(),
|
||||||
|
Encoders.bean(AuthorData.class));
|
||||||
|
|
||||||
|
JavaPairRDD<Text, Text> activitiesRDD = sc
|
||||||
|
.sequenceFile(workingPath + outputWorksPath + "works_X.seq" , Text.class, Text.class);
|
||||||
|
Dataset<WorkDataNoDoi> activitiesDataset = spark
|
||||||
|
.createDataset(
|
||||||
|
activitiesRDD.map(seq -> loadWorkFromJson(seq._1(), seq._2())).rdd(),
|
||||||
|
Encoders.bean(WorkDataNoDoi.class));
|
||||||
|
|
||||||
|
activitiesDataset
|
||||||
|
.joinWith(
|
||||||
|
summariesDataset,
|
||||||
|
activitiesDataset.col("oid").equalTo(summariesDataset.col("oid")), "inner")
|
||||||
|
.map(
|
||||||
|
(MapFunction<Tuple2<WorkDataNoDoi, AuthorData>, Tuple2<String, WorkDataNoDoi>>) value -> {
|
||||||
|
WorkDataNoDoi w = value._1;
|
||||||
|
AuthorData a = value._2;
|
||||||
|
AuthorMatcher.match(a, w.getContributors());
|
||||||
|
return new Tuple2<>(a.getOid(), w);
|
||||||
|
},
|
||||||
|
Encoders.tuple(Encoders.STRING(), Encoders.bean(WorkDataNoDoi.class)))
|
||||||
|
.filter(Objects::nonNull)
|
||||||
|
.toJavaRDD()
|
||||||
|
.saveAsTextFile(workingPath + outputEnrichedWorksPath);;
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
private static AuthorData loadAuthorFromJson(Text orcidId, Text json) {
|
||||||
|
AuthorData authorData = new AuthorData();
|
||||||
|
authorData.setOid(orcidId.toString());
|
||||||
|
JsonElement jElement = new JsonParser().parse(json.toString());
|
||||||
|
authorData.setName(getJsonValue(jElement, "name"));
|
||||||
|
authorData.setSurname(getJsonValue(jElement, "surname"));
|
||||||
|
authorData.setCreditName(getJsonValue(jElement, "creditname"));
|
||||||
|
return authorData;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static WorkDataNoDoi loadWorkFromJson(Text orcidId, Text json) {
|
||||||
|
WorkDataNoDoi workData = new Gson().fromJson(json.toString(), WorkDataNoDoi.class);
|
||||||
|
return workData;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String getJsonValue(JsonElement jElement, String property) {
|
||||||
|
if (jElement.getAsJsonObject().has(property)) {
|
||||||
|
JsonElement name = null;
|
||||||
|
name = jElement.getAsJsonObject().get(property);
|
||||||
|
if (name != null && !name.isJsonNull()) {
|
||||||
|
return name.getAsString();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
|
@ -1,5 +1,5 @@
|
||||||
|
|
||||||
package eu.dnetlib.doiboost.orcid.json;
|
package eu.dnetlib.doiboost.orcidnodoi.json;
|
||||||
|
|
||||||
import com.google.gson.JsonObject;
|
import com.google.gson.JsonObject;
|
||||||
|
|
|
@ -8,9 +8,9 @@ import eu.dnetlib.doiboost.orcid.model.AuthorData;
|
||||||
public class Contributor extends AuthorData implements Serializable {
|
public class Contributor extends AuthorData implements Serializable {
|
||||||
private String sequence;
|
private String sequence;
|
||||||
private String role;
|
private String role;
|
||||||
private boolean simpleMatch = false;
|
private transient boolean simpleMatch = false;
|
||||||
private Double score = 0.0;
|
private transient Double score = 0.0;
|
||||||
private boolean bestMatch = false;
|
private transient boolean bestMatch = false;
|
||||||
|
|
||||||
public String getSequence() {
|
public String getSequence() {
|
||||||
return sequence;
|
return sequence;
|
||||||
|
|
|
@ -97,5 +97,4 @@ public class WorkDataNoDoi implements Serializable {
|
||||||
public void setContributors(List<Contributor> contributors) {
|
public void setContributors(List<Contributor> contributors) {
|
||||||
this.contributors = contributors;
|
this.contributors = contributors;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,204 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.doiboost.orcidnodoi.similarity;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.text.Normalizer;
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
import org.apache.commons.text.similarity.JaroWinklerSimilarity;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import com.google.gson.Gson;
|
||||||
|
import com.google.gson.GsonBuilder;
|
||||||
|
import com.ximpleware.NavException;
|
||||||
|
import com.ximpleware.ParseException;
|
||||||
|
import com.ximpleware.XPathEvalException;
|
||||||
|
import com.ximpleware.XPathParseException;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.parser.utility.VtdException;
|
||||||
|
import eu.dnetlib.doiboost.orcid.model.AuthorData;
|
||||||
|
import eu.dnetlib.doiboost.orcidnodoi.model.Contributor;
|
||||||
|
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
|
||||||
|
|
||||||
|
public class AuthorMatcher {
|
||||||
|
|
||||||
|
private static final Logger logger = LoggerFactory.getLogger(AuthorMatcher.class);
|
||||||
|
private static final Double threshold = 0.8;
|
||||||
|
|
||||||
|
public static void match(AuthorData author, List<Contributor> contributors)
|
||||||
|
throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException {
|
||||||
|
|
||||||
|
int matchCounter = 0;
|
||||||
|
List<Integer> matchCounters = Arrays.asList(matchCounter);
|
||||||
|
Contributor contributor = null;
|
||||||
|
contributors.forEach(c -> {
|
||||||
|
if (normalize(c.getCreditName()).contains(normalize(author.getName())) ||
|
||||||
|
normalize(c.getCreditName()).contains(normalize(author.getSurname())) ||
|
||||||
|
((author.getOtherName() != null)
|
||||||
|
&& normalize(c.getCreditName()).contains(normalize(author.getOtherName())))) {
|
||||||
|
matchCounters.set(0, matchCounters.get(0) + 1);
|
||||||
|
c.setSimpleMatch(true);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
logger.info("match counter: " + Integer.toString(matchCounters.get(0)));
|
||||||
|
if (matchCounters.get(0) == 1) {
|
||||||
|
updateAuthorsSimpleMatch(contributors, author);
|
||||||
|
} else if (matchCounters.get(0) > 1) {
|
||||||
|
Optional<Contributor> optCon = contributors
|
||||||
|
.stream()
|
||||||
|
.filter(c -> c.isSimpleMatch())
|
||||||
|
.map(c -> {
|
||||||
|
c.setScore(bestMatch(author.getName(), author.getSurname(), c.getCreditName()));
|
||||||
|
logger.debug("nella map: " + c.getCreditName() + " score: " + c.getScore());
|
||||||
|
return c;
|
||||||
|
})
|
||||||
|
.filter(c -> c.getScore() >= threshold)
|
||||||
|
.max(Comparator.comparing(c -> c.getScore()));
|
||||||
|
Contributor bestMatchContributor = null;
|
||||||
|
if (optCon.isPresent()) {
|
||||||
|
bestMatchContributor = optCon.get();
|
||||||
|
bestMatchContributor.setBestMatch(true);
|
||||||
|
logger.info("best match: " + bestMatchContributor.getCreditName());
|
||||||
|
updateAuthorsSimilarityMatch(contributors, author);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
logger.info("UPDATED contributors: ");
|
||||||
|
contributors.forEach(c -> {
|
||||||
|
logger
|
||||||
|
.info(
|
||||||
|
c.getOid() + " - " + c.getCreditName() + " - " +
|
||||||
|
c.getName() + " - " + c.getSurname() + " - " +
|
||||||
|
c.getRole() + " - " + c.getSequence());
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
private static Double bestMatch(String authorSurname, String authorName, String contributor) {
|
||||||
|
logger.debug(authorSurname + " " + authorName + " vs " + contributor);
|
||||||
|
String[] contributorSplitted = contributor.split(" ");
|
||||||
|
if (contributorSplitted.length == 0) {
|
||||||
|
return 0.0;
|
||||||
|
}
|
||||||
|
final String contributorName = contributorSplitted[contributorSplitted.length - 1];
|
||||||
|
String contributorSurname = "";
|
||||||
|
if (contributorSplitted.length > 1) {
|
||||||
|
StringJoiner joiner = new StringJoiner(" ");
|
||||||
|
for (int i = 0; i < contributorSplitted.length - 1; i++) {
|
||||||
|
joiner.add(contributorSplitted[i]);
|
||||||
|
}
|
||||||
|
contributorSurname = joiner.toString();
|
||||||
|
}
|
||||||
|
logger
|
||||||
|
.debug(
|
||||||
|
"contributorName: " + contributorName +
|
||||||
|
" contributorSurname: " + contributorSurname);
|
||||||
|
String authorNameNrm = normalize(authorName);
|
||||||
|
String authorSurnameNrm = normalize(authorSurname);
|
||||||
|
String contributorNameNrm = normalize(contributorName);
|
||||||
|
String contributorSurnameNrm = normalize(contributorSurname);
|
||||||
|
Double sm1 = similarity(authorNameNrm, authorSurnameNrm, contributorNameNrm, contributorSurnameNrm);
|
||||||
|
Double sm2 = similarity(authorNameNrm, authorSurnameNrm, contributorSurnameNrm, contributorNameNrm);
|
||||||
|
if (sm1.compareTo(sm2) >= 0) {
|
||||||
|
return sm1;
|
||||||
|
}
|
||||||
|
return sm2;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static Double similarity(String nameA, String surnameA, String nameB, String surnameB) {
|
||||||
|
Double score = similarityJaroWinkler(nameA, surnameA, nameB, surnameB);
|
||||||
|
logger
|
||||||
|
.debug(nameA + ", " + surnameA + " <> " + nameB + ", " + surnameB + " score: " + Double.toString(score));
|
||||||
|
return score;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static Double similarityJaroWinkler(String nameA, String surnameA, String nameB, String surnameB) {
|
||||||
|
return new JaroWinklerSimilarity().apply(normalize(parse(nameA, surnameA)), normalize(parse(nameB, surnameB)));
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String normalize(final String s) {
|
||||||
|
return nfd(s)
|
||||||
|
.toLowerCase()
|
||||||
|
// do not compact the regexes in a single expression, would cause StackOverflowError
|
||||||
|
// in case
|
||||||
|
// of large input strings
|
||||||
|
.replaceAll("(\\W)+", " ")
|
||||||
|
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
|
||||||
|
.replaceAll("(\\p{Punct})+", " ")
|
||||||
|
.replaceAll("(\\d)+", " ")
|
||||||
|
.replaceAll("(\\n)+", " ")
|
||||||
|
.trim();
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String nfd(final String s) {
|
||||||
|
return Normalizer.normalize(s, Normalizer.Form.NFD);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String parse(String name, String surname) {
|
||||||
|
return surname + " " + name;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void updateAuthorsSimpleMatch(List<Contributor> contributors, AuthorData author) {
|
||||||
|
contributors.forEach(c -> {
|
||||||
|
if (c.isSimpleMatch()) {
|
||||||
|
logger.info("simple match on : " + c.getCreditName());
|
||||||
|
c.setName(author.getName());
|
||||||
|
c.setSurname(author.getSurname());
|
||||||
|
c.setOid(author.getOid());
|
||||||
|
}
|
||||||
|
});
|
||||||
|
updateRanks(contributors);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void updateAuthorsSimilarityMatch(List<Contributor> contributors, AuthorData author) {
|
||||||
|
logger.info("inside updateAuthorsSimilarityMatch ...");
|
||||||
|
contributors.forEach(c -> {
|
||||||
|
logger
|
||||||
|
.info(
|
||||||
|
c.getOid() + " - " + c.getCreditName() + " - " +
|
||||||
|
c.getName() + " - " + c.getSurname() + " - " +
|
||||||
|
c.getRole() + " - " + c.getSequence() + " - best: " + c.isBestMatch() + " - simpe: "
|
||||||
|
+ c.isSimpleMatch());
|
||||||
|
});
|
||||||
|
|
||||||
|
contributors
|
||||||
|
.stream()
|
||||||
|
.filter(c -> c.isBestMatch())
|
||||||
|
.forEach(c -> {
|
||||||
|
logger.info("similarity match on : " + c.getCreditName());
|
||||||
|
c.setName(author.getName());
|
||||||
|
c.setSurname(author.getSurname());
|
||||||
|
c.setOid(author.getOid());
|
||||||
|
});
|
||||||
|
updateRanks(contributors);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void updateRanks(List<Contributor> contributors) {
|
||||||
|
boolean seqFound = false;
|
||||||
|
if (contributors
|
||||||
|
.stream()
|
||||||
|
.filter(
|
||||||
|
c -> c.getRole() != null && c.getSequence() != null &&
|
||||||
|
c.getRole().equals("author") && (c.getSequence().equals("first") ||
|
||||||
|
c.getSequence().equals("additional")))
|
||||||
|
.count() > 0) {
|
||||||
|
seqFound = true;
|
||||||
|
logger.info("sequence data found");
|
||||||
|
}
|
||||||
|
if (!seqFound) {
|
||||||
|
List<Integer> seqIds = Arrays.asList(0);
|
||||||
|
contributors.forEach(c -> {
|
||||||
|
int currentSeq = seqIds.get(0) + 1;
|
||||||
|
seqIds.set(0, currentSeq);
|
||||||
|
c.setSequence(Integer.toString(seqIds.get(0)));
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String toJson(WorkDataNoDoi work) {
|
||||||
|
GsonBuilder builder = new GsonBuilder();
|
||||||
|
Gson gson = builder.create();
|
||||||
|
return gson.toJson(work);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,22 @@
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>oozie.action.sharelib.for.java</name>
|
||||||
|
<value>spark2</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||||
|
<value>true</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.launcher.mapreduce.map.java.opts</name>
|
||||||
|
<value>-Xmx4g</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>jobTracker</name>
|
||||||
|
<value>hadoop-rm3.garr-pa1.d4science.org:8032</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>nameNode</name>
|
||||||
|
<value>hdfs://hadoop-rm1.garr-pa1.d4science.org:8020</value>
|
||||||
|
</property>
|
||||||
|
</configuration>
|
|
@ -0,0 +1,524 @@
|
||||||
|
<workflow-app name="Gen Enriched Orcid Works" xmlns="uri:oozie:workflow:0.5">
|
||||||
|
<parameters>
|
||||||
|
<property>
|
||||||
|
<name>workingPath_activities</name>
|
||||||
|
<description>the working dir base path</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>shell_cmd_0</name>
|
||||||
|
<value>wget -O /tmp/ORCID_2019_activites_0.tar.gz https://orcid.figshare.com/ndownloader/files/18017660 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_0.tar.gz /data/orcid_activities/ORCID_2019_activites_0.tar.gz ; rm -f /tmp/ORCID_2019_activites_0.tar.gz
|
||||||
|
</value>
|
||||||
|
<description>the shell command that downloads and puts to hdfs orcid activity file 0</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>shell_cmd_1</name>
|
||||||
|
<value>wget -O /tmp/ORCID_2019_activites_1.tar.gz https://orcid.figshare.com/ndownloader/files/18017675 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_1.tar.gz /data/orcid_activities/ORCID_2019_activites_1.tar.gz ; rm -f /tmp/ORCID_2019_activites_1.tar.gz
|
||||||
|
</value>
|
||||||
|
<description>the shell command that downloads and puts to hdfs orcid activity file 1</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>shell_cmd_2</name>
|
||||||
|
<value>wget -O /tmp/ORCID_2019_activites_2.tar.gz https://orcid.figshare.com/ndownloader/files/18017717 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_2.tar.gz /data/orcid_activities/ORCID_2019_activites_2.tar.gz ; rm -f /tmp/ORCID_2019_activites_2.tar.gz
|
||||||
|
</value>
|
||||||
|
<description>the shell command that downloads and puts to hdfs orcid activity file 2</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>shell_cmd_3</name>
|
||||||
|
<value>wget -O /tmp/ORCID_2019_activites_3.tar.gz https://orcid.figshare.com/ndownloader/files/18017765 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_3.tar.gz /data/orcid_activities/ORCID_2019_activites_3.tar.gz ; rm -f /tmp/ORCID_2019_activites_3.tar.gz
|
||||||
|
</value>
|
||||||
|
<description>the shell command that downloads and puts to hdfs orcid activity file 3</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>shell_cmd_4</name>
|
||||||
|
<value>wget -O /tmp/ORCID_2019_activites_4.tar.gz https://orcid.figshare.com/ndownloader/files/18017831 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_4.tar.gz /data/orcid_activities/ORCID_2019_activites_4.tar.gz ; rm -f /tmp/ORCID_2019_activites_4.tar.gz
|
||||||
|
</value>
|
||||||
|
<description>the shell command that downloads and puts to hdfs orcid activity file 4</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>shell_cmd_5</name>
|
||||||
|
<value>wget -O /tmp/ORCID_2019_activites_5.tar.gz https://orcid.figshare.com/ndownloader/files/18017987 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_5.tar.gz /data/orcid_activities/ORCID_2019_activites_5.tar.gz ; rm -f /tmp/ORCID_2019_activites_5.tar.gz
|
||||||
|
</value>
|
||||||
|
<description>the shell command that downloads and puts to hdfs orcid activity file 5</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>shell_cmd_6</name>
|
||||||
|
<value>wget -O /tmp/ORCID_2019_activites_6.tar.gz https://orcid.figshare.com/ndownloader/files/18018053 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_6.tar.gz /data/orcid_activities/ORCID_2019_activites_6.tar.gz ; rm -f /tmp/ORCID_2019_activites_6.tar.gz
|
||||||
|
</value>
|
||||||
|
<description>the shell command that downloads and puts to hdfs orcid activity file 6</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>shell_cmd_7</name>
|
||||||
|
<value>wget -O /tmp/ORCID_2019_activites_7.tar.gz https://orcid.figshare.com/ndownloader/files/18018023 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_7.tar.gz /data/orcid_activities/ORCID_2019_activites_7.tar.gz ; rm -f /tmp/ORCID_2019_activites_7.tar.gz
|
||||||
|
</value>
|
||||||
|
<description>the shell command that downloads and puts to hdfs orcid activity file 7</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>shell_cmd_8</name>
|
||||||
|
<value>wget -O /tmp/ORCID_2019_activites_8.tar.gz https://orcid.figshare.com/ndownloader/files/18018248 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_8.tar.gz /data/orcid_activities/ORCID_2019_activites_8.tar.gz ; rm -f /tmp/ORCID_2019_activites_8.tar.gz
|
||||||
|
</value>
|
||||||
|
<description>the shell command that downloads and puts to hdfs orcid activity file 8</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>shell_cmd_9</name>
|
||||||
|
<value>wget -O /tmp/ORCID_2019_activites_9.tar.gz https://orcid.figshare.com/ndownloader/files/18018029 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_9.tar.gz /data/orcid_activities/ORCID_2019_activites_9.tar.gz ; rm -f /tmp/ORCID_2019_activites_9.tar.gz
|
||||||
|
</value>
|
||||||
|
<description>the shell command that downloads and puts to hdfs orcid activity file 9</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>shell_cmd_X</name>
|
||||||
|
<value>wget -O /tmp/ORCID_2019_activites_X.tar.gz https://orcid.figshare.com/ndownloader/files/18018182 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_X.tar.gz /data/orcid_activities/ORCID_2019_activites_X.tar.gz ; rm -f /tmp/ORCID_2019_activites_X.tar.gz
|
||||||
|
</value>
|
||||||
|
<description>the shell command that downloads and puts to hdfs orcid activity file X</description>
|
||||||
|
</property>
|
||||||
|
</parameters>
|
||||||
|
|
||||||
|
<start to="ResetWorkingPath"/>
|
||||||
|
|
||||||
|
|
||||||
|
<kill name="Kill">
|
||||||
|
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
|
</kill>
|
||||||
|
|
||||||
|
<action name="ResetWorkingPath">
|
||||||
|
<fs>
|
||||||
|
<delete path='${workingPath_activities}/no_doi_works/*'/>
|
||||||
|
<delete path='${workingPath_activities}/no_doi_enriched_works/*'/>
|
||||||
|
</fs>
|
||||||
|
<ok to="fork_gen_orcid_author_work"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<fork name = "fork_gen_orcid_author_work">
|
||||||
|
<path start = "check_exist_on_hdfs_activities_0"/>
|
||||||
|
<path start = "check_exist_on_hdfs_activities_1"/>
|
||||||
|
<path start = "check_exist_on_hdfs_activities_2"/>
|
||||||
|
<path start = "check_exist_on_hdfs_activities_3"/>
|
||||||
|
<path start = "check_exist_on_hdfs_activities_4"/>
|
||||||
|
<path start = "check_exist_on_hdfs_activities_5"/>
|
||||||
|
<path start = "check_exist_on_hdfs_activities_6"/>
|
||||||
|
<path start = "check_exist_on_hdfs_activities_7"/>
|
||||||
|
<path start = "check_exist_on_hdfs_activities_8"/>
|
||||||
|
<path start = "check_exist_on_hdfs_activities_9"/>
|
||||||
|
<path start = "check_exist_on_hdfs_activities_X"/>
|
||||||
|
</fork>
|
||||||
|
|
||||||
|
<decision name="check_exist_on_hdfs_activities_0">
|
||||||
|
<switch>
|
||||||
|
<case to="GenOrcidAuthorWork_0">
|
||||||
|
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_0.tar.gz'))}
|
||||||
|
</case>
|
||||||
|
<default to="Download_0" />
|
||||||
|
</switch>
|
||||||
|
</decision>
|
||||||
|
|
||||||
|
<action name="Download_0">
|
||||||
|
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<exec>bash</exec>
|
||||||
|
<argument>-c</argument>
|
||||||
|
<argument>${shell_cmd_0}</argument>
|
||||||
|
<capture-output/>
|
||||||
|
</shell>
|
||||||
|
<ok to="GenOrcidAuthorWork_0"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="GenOrcidAuthorWork_0">
|
||||||
|
<java>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||||
|
<arg>-w</arg><arg>${workingPath_activities}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>ORCID_2019_activites_0.tar.gz</arg>
|
||||||
|
<arg>-ow</arg><arg>no_doi_works/works_0.seq</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="join_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<decision name="check_exist_on_hdfs_activities_1">
|
||||||
|
<switch>
|
||||||
|
<case to="GenOrcidAuthorWork_1">
|
||||||
|
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_1.tar.gz'))}
|
||||||
|
</case>
|
||||||
|
<default to="Download_1" />
|
||||||
|
</switch>
|
||||||
|
</decision>
|
||||||
|
|
||||||
|
<action name="Download_1">
|
||||||
|
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<exec>bash</exec>
|
||||||
|
<argument>-c</argument>
|
||||||
|
<argument>${shell_cmd_1}</argument>
|
||||||
|
<capture-output/>
|
||||||
|
</shell>
|
||||||
|
<ok to="GenOrcidAuthorWork_1"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="GenOrcidAuthorWork_1">
|
||||||
|
<java>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||||
|
<arg>-w</arg><arg>${workingPath_activities}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>ORCID_2019_activites_1.tar.gz</arg>
|
||||||
|
<arg>-ow</arg><arg>no_doi_works/works_1.seq</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="join_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<decision name="check_exist_on_hdfs_activities_2">
|
||||||
|
<switch>
|
||||||
|
<case to="GenOrcidAuthorWork_2">
|
||||||
|
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_2.tar.gz'))}
|
||||||
|
</case>
|
||||||
|
<default to="Download_2" />
|
||||||
|
</switch>
|
||||||
|
</decision>
|
||||||
|
|
||||||
|
<action name="Download_2">
|
||||||
|
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<exec>bash</exec>
|
||||||
|
<argument>-c</argument>
|
||||||
|
<argument>${shell_cmd_2}</argument>
|
||||||
|
<capture-output/>
|
||||||
|
</shell>
|
||||||
|
<ok to="GenOrcidAuthorWork_2"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="GenOrcidAuthorWork_2">
|
||||||
|
<java>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||||
|
<arg>-w</arg><arg>${workingPath_activities}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>ORCID_2019_activites_2.tar.gz</arg>
|
||||||
|
<arg>-ow</arg><arg>no_doi_works/works_2.seq</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="join_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<decision name="check_exist_on_hdfs_activities_3">
|
||||||
|
<switch>
|
||||||
|
<case to="GenOrcidAuthorWork_3">
|
||||||
|
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_3.tar.gz'))}
|
||||||
|
</case>
|
||||||
|
<default to="Download_3" />
|
||||||
|
</switch>
|
||||||
|
</decision>
|
||||||
|
|
||||||
|
<action name="Download_3">
|
||||||
|
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<exec>bash</exec>
|
||||||
|
<argument>-c</argument>
|
||||||
|
<argument>${shell_cmd_3}</argument>
|
||||||
|
<capture-output/>
|
||||||
|
</shell>
|
||||||
|
<ok to="GenOrcidAuthorWork_3"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="GenOrcidAuthorWork_3">
|
||||||
|
<java>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||||
|
<arg>-w</arg><arg>${workingPath_activities}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>ORCID_2019_activites_3.tar.gz</arg>
|
||||||
|
<arg>-ow</arg><arg>no_doi_works/works_3.seq</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="join_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<decision name="check_exist_on_hdfs_activities_4">
|
||||||
|
<switch>
|
||||||
|
<case to="GenOrcidAuthorWork_4">
|
||||||
|
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_4.tar.gz'))}
|
||||||
|
</case>
|
||||||
|
<default to="Download_4" />
|
||||||
|
</switch>
|
||||||
|
</decision>
|
||||||
|
|
||||||
|
<action name="Download_4">
|
||||||
|
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<exec>bash</exec>
|
||||||
|
<argument>-c</argument>
|
||||||
|
<argument>${shell_cmd_4}</argument>
|
||||||
|
<capture-output/>
|
||||||
|
</shell>
|
||||||
|
<ok to="GenOrcidAuthorWork_4"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="GenOrcidAuthorWork_4">
|
||||||
|
<java>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||||
|
<arg>-w</arg><arg>${workingPath_activities}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>ORCID_2019_activites_4.tar.gz</arg>
|
||||||
|
<arg>-ow</arg><arg>no_doi_works/works_4.seq</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="join_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<decision name="check_exist_on_hdfs_activities_5">
|
||||||
|
<switch>
|
||||||
|
<case to="GenOrcidAuthorWork_5">
|
||||||
|
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_5.tar.gz'))}
|
||||||
|
</case>
|
||||||
|
<default to="Download_5" />
|
||||||
|
</switch>
|
||||||
|
</decision>
|
||||||
|
|
||||||
|
<action name="Download_5">
|
||||||
|
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<exec>bash</exec>
|
||||||
|
<argument>-c</argument>
|
||||||
|
<argument>${shell_cmd_5}</argument>
|
||||||
|
<capture-output/>
|
||||||
|
</shell>
|
||||||
|
<ok to="GenOrcidAuthorWork_5"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="GenOrcidAuthorWork_5">
|
||||||
|
<java>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||||
|
<arg>-w</arg><arg>${workingPath_activities}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>ORCID_2019_activites_5.tar.gz</arg>
|
||||||
|
<arg>-ow</arg><arg>no_doi_works/works_5.seq</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="join_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<decision name="check_exist_on_hdfs_activities_6">
|
||||||
|
<switch>
|
||||||
|
<case to="GenOrcidAuthorWork_6">
|
||||||
|
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_6.tar.gz'))}
|
||||||
|
</case>
|
||||||
|
<default to="Download_6" />
|
||||||
|
</switch>
|
||||||
|
</decision>
|
||||||
|
|
||||||
|
<action name="Download_6">
|
||||||
|
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<exec>bash</exec>
|
||||||
|
<argument>-c</argument>
|
||||||
|
<argument>${shell_cmd_6}</argument>
|
||||||
|
<capture-output/>
|
||||||
|
</shell>
|
||||||
|
<ok to="GenOrcidAuthorWork_6"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="GenOrcidAuthorWork_6">
|
||||||
|
<java>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||||
|
<arg>-w</arg><arg>${workingPath_activities}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>ORCID_2019_activites_6.tar.gz</arg>
|
||||||
|
<arg>-ow</arg><arg>no_doi_works/works_6.seq</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="join_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
|
||||||
|
<decision name="check_exist_on_hdfs_activities_7">
|
||||||
|
<switch>
|
||||||
|
<case to="GenOrcidAuthorWork_7">
|
||||||
|
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_7.tar.gz'))}
|
||||||
|
</case>
|
||||||
|
<default to="Download_7" />
|
||||||
|
</switch>
|
||||||
|
</decision>
|
||||||
|
|
||||||
|
<action name="Download_7">
|
||||||
|
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<exec>bash</exec>
|
||||||
|
<argument>-c</argument>
|
||||||
|
<argument>${shell_cmd_7}</argument>
|
||||||
|
<capture-output/>
|
||||||
|
</shell>
|
||||||
|
<ok to="GenOrcidAuthorWork_7"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="GenOrcidAuthorWork_7">
|
||||||
|
<java>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||||
|
<arg>-w</arg><arg>${workingPath_activities}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>ORCID_2019_activites_7.tar.gz</arg>
|
||||||
|
<arg>-ow</arg><arg>no_doi_works/works_7.seq</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="join_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<decision name="check_exist_on_hdfs_activities_8">
|
||||||
|
<switch>
|
||||||
|
<case to="GenOrcidAuthorWork_8">
|
||||||
|
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_8.tar.gz'))}
|
||||||
|
</case>
|
||||||
|
<default to="Download_8" />
|
||||||
|
</switch>
|
||||||
|
</decision>
|
||||||
|
|
||||||
|
<action name="Download_8">
|
||||||
|
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<exec>bash</exec>
|
||||||
|
<argument>-c</argument>
|
||||||
|
<argument>${shell_cmd_8}</argument>
|
||||||
|
<capture-output/>
|
||||||
|
</shell>
|
||||||
|
<ok to="GenOrcidAuthorWork_8"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="GenOrcidAuthorWork_8">
|
||||||
|
<java>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||||
|
<arg>-w</arg><arg>${workingPath_activities}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>ORCID_2019_activites_8.tar.gz</arg>
|
||||||
|
<arg>-ow</arg><arg>no_doi_works/works_8.seq</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="join_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<decision name="check_exist_on_hdfs_activities_9">
|
||||||
|
<switch>
|
||||||
|
<case to="GenOrcidAuthorWork_9">
|
||||||
|
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_9.tar.gz'))}
|
||||||
|
</case>
|
||||||
|
<default to="Download_9" />
|
||||||
|
</switch>
|
||||||
|
</decision>
|
||||||
|
|
||||||
|
<action name="Download_9">
|
||||||
|
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<exec>bash</exec>
|
||||||
|
<argument>-c</argument>
|
||||||
|
<argument>${shell_cmd_9}</argument>
|
||||||
|
<capture-output/>
|
||||||
|
</shell>
|
||||||
|
<ok to="GenOrcidAuthorWork_9"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="GenOrcidAuthorWork_9">
|
||||||
|
<java>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||||
|
<arg>-w</arg><arg>${workingPath_activities}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>ORCID_2019_activites_9.tar.gz</arg>
|
||||||
|
<arg>-ow</arg><arg>no_doi_works/works_9.seq</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="join_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<decision name="check_exist_on_hdfs_activities_X">
|
||||||
|
<switch>
|
||||||
|
<case to="GenOrcidAuthorWork_X">
|
||||||
|
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_X.tar.gz'))}
|
||||||
|
</case>
|
||||||
|
<default to="Download_X" />
|
||||||
|
</switch>
|
||||||
|
</decision>
|
||||||
|
|
||||||
|
<action name="Download_X">
|
||||||
|
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<exec>bash</exec>
|
||||||
|
<argument>-c</argument>
|
||||||
|
<argument>${shell_cmd_X}</argument>
|
||||||
|
<capture-output/>
|
||||||
|
</shell>
|
||||||
|
<ok to="GenOrcidAuthorWork_X"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="GenOrcidAuthorWork_X">
|
||||||
|
<java>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||||
|
<arg>-w</arg><arg>${workingPath_activities}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>ORCID_2019_activites_X.tar.gz</arg>
|
||||||
|
<arg>-ow</arg><arg>no_doi_works/works_X.seq</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="join_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<join name = "join_node" to = "Gen_Enriched_Orcid_Works"/>
|
||||||
|
|
||||||
|
<action name="Gen_Enriched_Orcid_Works">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<master>yarn</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>Gen_Enriched_Orcid_Works</name>
|
||||||
|
<class>eu.dnetlib.doiboost.orcidnodoi.SparkGenEnrichedOrcidWorks</class>
|
||||||
|
<jar>dhp-doiboost-1.2.3-SNAPSHOT.jar</jar>
|
||||||
|
<spark-opts>--num-executors 10 --conf spark.yarn.jars="hdfs://hadoop-rm1.garr-pa1.d4science.org:8020/user/oozie/share/lib/lib_20180405103059/spark2" --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory}
|
||||||
|
</spark-opts>
|
||||||
|
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||||
|
<arg>-ow</arg><arg>no_doi_works/</arg>
|
||||||
|
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
||||||
|
</spark>
|
||||||
|
<ok to="End"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<end name="End"/>
|
||||||
|
</workflow-app>
|
|
@ -0,0 +1,7 @@
|
||||||
|
[
|
||||||
|
{"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true},
|
||||||
|
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the default work path", "paramRequired": true},
|
||||||
|
{"paramName":"f", "paramLongName":"activitiesFileNameTarGz", "paramDescription": "the name of the activities orcid file", "paramRequired": true},
|
||||||
|
{"paramName":"ow", "paramLongName":"outputWorksPath", "paramDescription": "the relative folder of the sequencial file to write", "paramRequired": true},
|
||||||
|
{"paramName":"oew", "paramLongName":"outputEnrichedWorksPath", "paramDescription": "the relative folder of the sequencial file to write the data", "paramRequired": true}
|
||||||
|
]
|
|
@ -1,15 +1,12 @@
|
||||||
|
|
||||||
package eu.dnetlib.doiboost.orcidnodoi.xml;
|
package eu.dnetlib.doiboost.orcidnodoi.xml;
|
||||||
|
|
||||||
import com.ximpleware.NavException;
|
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
||||||
import com.ximpleware.ParseException;
|
|
||||||
import com.ximpleware.XPathEvalException;
|
import java.io.IOException;
|
||||||
import com.ximpleware.XPathParseException;
|
import java.text.Normalizer;
|
||||||
import eu.dnetlib.dhp.parser.utility.VtdException;
|
import java.util.*;
|
||||||
import eu.dnetlib.doiboost.orcid.model.AuthorData;
|
|
||||||
import eu.dnetlib.doiboost.orcidnodoi.model.Contributor;
|
|
||||||
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
|
|
||||||
import jdk.nashorn.internal.ir.annotations.Ignore;
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.commons.text.similarity.JaccardSimilarity;
|
import org.apache.commons.text.similarity.JaccardSimilarity;
|
||||||
import org.apache.commons.text.similarity.JaroWinklerSimilarity;
|
import org.apache.commons.text.similarity.JaroWinklerSimilarity;
|
||||||
|
@ -17,11 +14,20 @@ import org.junit.jupiter.api.Test;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import java.io.IOException;
|
import com.google.gson.Gson;
|
||||||
import java.text.Normalizer;
|
import com.google.gson.GsonBuilder;
|
||||||
import java.util.*;
|
import com.ximpleware.NavException;
|
||||||
|
import com.ximpleware.ParseException;
|
||||||
|
import com.ximpleware.XPathEvalException;
|
||||||
|
import com.ximpleware.XPathParseException;
|
||||||
|
|
||||||
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
import eu.dnetlib.dhp.parser.utility.VtdException;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||||
|
import eu.dnetlib.doiboost.orcid.model.AuthorData;
|
||||||
|
import eu.dnetlib.doiboost.orcidnodoi.model.Contributor;
|
||||||
|
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
|
||||||
|
import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher;
|
||||||
|
import jdk.nashorn.internal.ir.annotations.Ignore;
|
||||||
|
|
||||||
public class OrcidNoDoiTest {
|
public class OrcidNoDoiTest {
|
||||||
claudio.atzori
commented
Why are the tests in this class marked as Why are the tests in this class marked as `@Ignore`? I see you spent some effort implementing them, I didn't dive much into the details, but I see assertions properly defined so if they are valuable I suggest to keep the test as active.
|
|||||||
|
|
||||||
|
@ -33,100 +39,10 @@ public class OrcidNoDoiTest {
|
||||||
String nameB = "K";
|
String nameB = "K";
|
||||||
String surnameB = "Abdel-Dayem";
|
String surnameB = "Abdel-Dayem";
|
||||||
String orcidIdA = "0000-0003-2760-1191";
|
String orcidIdA = "0000-0003-2760-1191";
|
||||||
Double threshold = 0.8;
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@Ignore
|
@Ignore
|
||||||
private void similarityTest() throws Exception {
|
private void readPublicationFieldsTest()
|
||||||
logger.info("running testSimilarity ....");
|
|
||||||
logger
|
|
||||||
.info(
|
|
||||||
"JaroWinklerSimilarity: "
|
|
||||||
+ Double.toString(similarityJaroWinkler(nameA, surnameA, nameB, surnameB)));
|
|
||||||
logger
|
|
||||||
.info(
|
|
||||||
"JaccardSimilarity: " + Double.toString(similarityJaccard(nameA, surnameA, nameB, surnameB)));
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
@Ignore
|
|
||||||
private void bestMatchTest() throws Exception {
|
|
||||||
logger.info("running bestMatchTest ....");
|
|
||||||
String contributor = surnameB + ", " + nameB;
|
|
||||||
logger.info("score: " + Double.toString(bestMatch(surnameA, nameA, contributor)));
|
|
||||||
}
|
|
||||||
|
|
||||||
private static Double bestMatch(String authorSurname, String authorName, String contributor) {
|
|
||||||
logger.debug(authorSurname + " " + authorName + " vs " + contributor);
|
|
||||||
String[] contributorSplitted = contributor.split(" ");
|
|
||||||
if (contributorSplitted.length == 0) {
|
|
||||||
return 0.0;
|
|
||||||
}
|
|
||||||
final String contributorName = contributorSplitted[contributorSplitted.length - 1];
|
|
||||||
String contributorSurname = "";
|
|
||||||
if (contributorSplitted.length > 1) {
|
|
||||||
StringJoiner joiner = new StringJoiner(" ");
|
|
||||||
for (int i = 0; i < contributorSplitted.length - 1; i++) {
|
|
||||||
joiner.add(contributorSplitted[i]);
|
|
||||||
}
|
|
||||||
contributorSurname = joiner.toString();
|
|
||||||
}
|
|
||||||
logger
|
|
||||||
.debug(
|
|
||||||
"contributorName: " + contributorName +
|
|
||||||
" contributorSurname: " + contributorSurname);
|
|
||||||
String authorNameNrm = normalize(authorName);
|
|
||||||
String authorSurnameNrm = normalize(authorSurname);
|
|
||||||
String contributorNameNrm = normalize(contributorName);
|
|
||||||
String contributorSurnameNrm = normalize(contributorSurname);
|
|
||||||
Double sm1 = similarity(authorNameNrm, authorSurnameNrm, contributorNameNrm, contributorSurnameNrm);
|
|
||||||
Double sm2 = similarity(authorNameNrm, authorSurnameNrm, contributorSurnameNrm, contributorNameNrm);
|
|
||||||
if (sm1.compareTo(sm2) >= 0) {
|
|
||||||
return sm1;
|
|
||||||
}
|
|
||||||
return sm2;
|
|
||||||
}
|
|
||||||
|
|
||||||
private static Double similarity(String nameA, String surnameA, String nameB, String surnameB) {
|
|
||||||
Double score = similarityJaroWinkler(nameA, surnameA, nameB, surnameB);
|
|
||||||
logger
|
|
||||||
.debug(nameA + ", " + surnameA + " <> " + nameB + ", " + surnameB + " score: " + Double.toString(score));
|
|
||||||
return score;
|
|
||||||
}
|
|
||||||
|
|
||||||
private static Double similarityJaccard(String nameA, String surnameA, String nameB, String surnameB) {
|
|
||||||
return new JaccardSimilarity().apply(normalize(parse(nameA, surnameA)), normalize(parse(nameB, surnameB)));
|
|
||||||
}
|
|
||||||
|
|
||||||
private static Double similarityJaroWinkler(String nameA, String surnameA, String nameB, String surnameB) {
|
|
||||||
return new JaroWinklerSimilarity().apply(normalize(parse(nameA, surnameA)), normalize(parse(nameB, surnameB)));
|
|
||||||
}
|
|
||||||
|
|
||||||
private static String parse(String name, String surname) {
|
|
||||||
return surname + " " + name;
|
|
||||||
}
|
|
||||||
|
|
||||||
private static String normalize(final String s) {
|
|
||||||
return nfd(s)
|
|
||||||
.toLowerCase()
|
|
||||||
// do not compact the regexes in a single expression, would cause StackOverflowError
|
|
||||||
// in case
|
|
||||||
// of large input strings
|
|
||||||
.replaceAll("(\\W)+", " ")
|
|
||||||
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
|
|
||||||
.replaceAll("(\\p{Punct})+", " ")
|
|
||||||
.replaceAll("(\\d)+", " ")
|
|
||||||
.replaceAll("(\\n)+", " ")
|
|
||||||
.trim();
|
|
||||||
}
|
|
||||||
|
|
||||||
private static String nfd(final String s) {
|
|
||||||
return Normalizer.normalize(s, Normalizer.Form.NFD);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
@Ignore
|
|
||||||
public void readPublicationFieldsTest()
|
|
||||||
throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException {
|
throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException {
|
||||||
logger.info("running loadPublicationFieldsTest ....");
|
logger.info("running loadPublicationFieldsTest ....");
|
||||||
String xml = IOUtils
|
String xml = IOUtils
|
||||||
|
@ -178,78 +94,10 @@ public class OrcidNoDoiTest {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private void updateRanks(List<Contributor> contributors) {
|
|
||||||
boolean seqFound = false;
|
|
||||||
if (contributors
|
|
||||||
.stream()
|
|
||||||
.filter(
|
|
||||||
c -> c.getRole() != null && c.getSequence() != null &&
|
|
||||||
c.getRole().equals("author") && (c.getSequence().equals("first") ||
|
|
||||||
c.getSequence().equals("additional")))
|
|
||||||
.count() > 0) {
|
|
||||||
seqFound = true;
|
|
||||||
logger.info("sequence data found");
|
|
||||||
}
|
|
||||||
if (!seqFound) {
|
|
||||||
List<Integer> seqIds = Arrays.asList(0);
|
|
||||||
contributors.forEach(c -> {
|
|
||||||
int currentSeq = seqIds.get(0) + 1;
|
|
||||||
seqIds.set(0, currentSeq);
|
|
||||||
c.setSequence(Integer.toString(seqIds.get(0)));
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private void updateAuthorsSimpleMatch(List<Contributor> contributors, AuthorData author) {
|
|
||||||
contributors.forEach(c -> {
|
|
||||||
if (c.isSimpleMatch()) {
|
|
||||||
logger.info("simple match on : " + c.getCreditName());
|
|
||||||
c.setName(author.getName());
|
|
||||||
c.setSurname(author.getSurname());
|
|
||||||
c.setOid(author.getOid());
|
|
||||||
}
|
|
||||||
});
|
|
||||||
updateRanks(contributors);
|
|
||||||
}
|
|
||||||
|
|
||||||
private void updateAuthorsSimilarityMatch(List<Contributor> contributors, AuthorData author) {
|
|
||||||
logger.info("inside updateAuthorsSimilarityMatch ...");
|
|
||||||
contributors.forEach(c -> {
|
|
||||||
logger
|
|
||||||
.info(
|
|
||||||
c.getOid() + " - " + c.getCreditName() + " - " +
|
|
||||||
c.getName() + " - " + c.getSurname() + " - " +
|
|
||||||
c.getRole() + " - " + c.getSequence() + " - best: " + c.isBestMatch() + " - simpe: "
|
|
||||||
+ c.isSimpleMatch());
|
|
||||||
});
|
|
||||||
|
|
||||||
contributors
|
|
||||||
.stream()
|
|
||||||
.filter(c -> c.isBestMatch())
|
|
||||||
.forEach(c -> {
|
|
||||||
logger.info("similarity match on : " + c.getCreditName());
|
|
||||||
c.setName(author.getName());
|
|
||||||
c.setSurname(author.getSurname());
|
|
||||||
c.setOid(author.getOid());
|
|
||||||
});
|
|
||||||
updateRanks(contributors);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@Ignore
|
public void authorMatchTest() throws Exception {
|
||||||
public void authorSimilarityMatchTest() throws Exception {
|
|
||||||
logger.info("running authorSimilarityMatchTest ....");
|
|
||||||
authorMatchTest("activity_work_0000-0003-2760-1191-similarity.xml");
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
private void authorSimpleMatchTest() throws Exception {
|
|
||||||
logger.info("running authorSimpleMatchTest ....");
|
logger.info("running authorSimpleMatchTest ....");
|
||||||
authorMatchTest("activity_work_0000-0003-2760-1191.xml");
|
String orcidWork = "activity_work_0000-0003-2760-1191-similarity.xml";
|
||||||
}
|
|
||||||
|
|
||||||
private void authorMatchTest(String orcidWork)
|
|
||||||
throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException {
|
|
||||||
AuthorData author = new AuthorData();
|
AuthorData author = new AuthorData();
|
||||||
author.setName(nameA);
|
author.setName(nameA);
|
||||||
author.setSurname(surnameA);
|
author.setSurname(surnameA);
|
||||||
|
@ -272,55 +120,9 @@ public class OrcidNoDoiTest {
|
||||||
logger.error("parsing xml", e);
|
logger.error("parsing xml", e);
|
||||||
}
|
}
|
||||||
assertNotNull(workData);
|
assertNotNull(workData);
|
||||||
int matchCounter = 0;
|
AuthorMatcher.match(author, workData.getContributors());
|
||||||
List<Integer> matchCounters = Arrays.asList(matchCounter);
|
GsonBuilder builder = new GsonBuilder();
|
||||||
Contributor contributor = null;
|
Gson gson = builder.create();
|
||||||
workData.getContributors().forEach(c -> {
|
logger.info(gson.toJson(workData));
|
||||||
if (normalize(c.getCreditName()).contains(normalize(author.getName())) ||
|
|
||||||
normalize(c.getCreditName()).contains(normalize(author.getSurname())) ||
|
|
||||||
((author.getOtherName() != null)
|
|
||||||
&& normalize(c.getCreditName()).contains(normalize(author.getOtherName())))) {
|
|
||||||
matchCounters.set(0, matchCounters.get(0) + 1);
|
|
||||||
c.setSimpleMatch(true);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
logger.info("match counter: " + Integer.toString(matchCounters.get(0)));
|
|
||||||
if (matchCounters.get(0) == 1) {
|
|
||||||
updateAuthorsSimpleMatch(workData.getContributors(), author);
|
|
||||||
} else if (matchCounters.get(0) > 1) {
|
|
||||||
Optional<Contributor> optCon = workData
|
|
||||||
.getContributors()
|
|
||||||
.stream()
|
|
||||||
.filter(c -> c.isSimpleMatch())
|
|
||||||
.map(c -> {
|
|
||||||
c.setScore(bestMatch(nameA, surnameA, c.getCreditName()));
|
|
||||||
logger.debug("nella map: " + c.getCreditName() + " score: " + c.getScore());
|
|
||||||
return c;
|
|
||||||
})
|
|
||||||
.filter(c -> c.getScore() >= threshold)
|
|
||||||
.max(Comparator.comparing(c -> c.getScore()));
|
|
||||||
Contributor bestMatchContributor = null;
|
|
||||||
if (optCon.isPresent()) {
|
|
||||||
bestMatchContributor = optCon.get();
|
|
||||||
bestMatchContributor.setBestMatch(true);
|
|
||||||
logger.info("best match: " + bestMatchContributor.getCreditName());
|
|
||||||
updateAuthorsSimilarityMatch(workData.getContributors(), author);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
logger.info("UPDATED contributors: ");
|
|
||||||
workData.getContributors().forEach(c -> {
|
|
||||||
logger
|
|
||||||
.info(
|
|
||||||
c.getOid() + " - " + c.getCreditName() + " - " +
|
|
||||||
c.getName() + " - " + c.getSurname() + " - " +
|
|
||||||
c.getRole() + " - " + c.getSequence());
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
|
||||||
// orcid_RDD = sc.textFile(ORCID_DUMP_PATH)
|
|
||||||
// no_doi_works_RDD = orcid_RDD.map(orcid_map).filter(lambda x:x is not None).map(lambda x: json.dumps(x)).saveAsTextFile(path=ORCID_OPENAIRE_PATH,compressionCodecClass="org.apache.hadoop.io.compress.GzipCodec")
|
|
||||||
//
|
|
Loading…
Reference in New Issue
What is the reason for not handling nor let propagate this exception? I imagine that a malformed entry in the tar file could cause it, but in that case we should interrupt the procedure and deepen the analysis to spot the error. In this way the error would likely be unnoticed, but causing a drop in the number of output records.