forked from D-Net/dnet-hadoop
added workflow to generate seq(orcidId,work) and seq(orcidId,enrichedWork)
This commit is contained in:
parent
fcbb4c1489
commit
d6498278ed
|
@ -19,7 +19,7 @@ import org.apache.hadoop.io.compress.CompressionCodec;
|
||||||
import org.apache.hadoop.io.compress.CompressionCodecFactory;
|
import org.apache.hadoop.io.compress.CompressionCodecFactory;
|
||||||
import org.mortbay.log.Log;
|
import org.mortbay.log.Log;
|
||||||
|
|
||||||
import eu.dnetlib.doiboost.orcid.json.JsonWriter;
|
import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter;
|
||||||
import eu.dnetlib.doiboost.orcid.model.WorkData;
|
import eu.dnetlib.doiboost.orcid.model.WorkData;
|
||||||
import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser;
|
import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser;
|
||||||
|
|
||||||
|
|
|
@ -19,7 +19,7 @@ import org.apache.hadoop.io.compress.CompressionCodec;
|
||||||
import org.apache.hadoop.io.compress.CompressionCodecFactory;
|
import org.apache.hadoop.io.compress.CompressionCodecFactory;
|
||||||
import org.mortbay.log.Log;
|
import org.mortbay.log.Log;
|
||||||
|
|
||||||
import eu.dnetlib.doiboost.orcid.json.JsonWriter;
|
import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter;
|
||||||
import eu.dnetlib.doiboost.orcid.model.AuthorData;
|
import eu.dnetlib.doiboost.orcid.model.AuthorData;
|
||||||
import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser;
|
import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser;
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,16 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.doiboost.orcid.json;
|
||||||
|
|
||||||
|
import com.google.gson.Gson;
|
||||||
|
import com.google.gson.JsonObject;
|
||||||
|
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
|
||||||
|
|
||||||
|
public class JsonHelper {
|
||||||
|
|
||||||
|
public static String createOidWork(WorkDataNoDoi workData) {
|
||||||
|
JsonObject oidWork = new JsonObject();
|
||||||
|
oidWork.addProperty("oid", workData.getOid());
|
||||||
|
oidWork.addProperty("work", new Gson().toJson(workData));
|
||||||
|
return oidWork.toString();
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,149 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.doiboost.orcidnodoi;
|
||||||
|
|
||||||
|
import eu.dnetlib.doiboost.orcid.json.JsonHelper;
|
||||||
|
import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter;
|
||||||
|
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
|
||||||
|
import eu.dnetlib.doiboost.orcidnodoi.xml.XMLRecordParserNoDoi;
|
||||||
|
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
|
||||||
|
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
|
||||||
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
|
import org.apache.hadoop.fs.Path;
|
||||||
|
import org.apache.hadoop.io.IOUtils;
|
||||||
|
import org.apache.hadoop.io.SequenceFile;
|
||||||
|
import org.apache.hadoop.io.Text;
|
||||||
|
import org.apache.hadoop.io.compress.CompressionCodec;
|
||||||
|
import org.apache.hadoop.io.compress.CompressionCodecFactory;
|
||||||
|
import org.mortbay.log.Log;
|
||||||
|
|
||||||
|
import java.io.BufferedReader;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.io.InputStreamReader;
|
||||||
|
import java.net.URI;
|
||||||
|
|
||||||
|
public class ActivitiesDumpReader {
|
||||||
|
|
||||||
|
private static final int MAX_XML_WORKS_PARSED = -1;
|
||||||
|
private static final int XML_WORKS_PARSED_COUNTER_LOG_INTERVAL = 100000;
|
||||||
|
|
||||||
|
public static void parseGzActivities(Configuration conf, String inputUri, Path outputPath)
|
||||||
|
throws Exception {
|
||||||
|
String uri = inputUri;
|
||||||
|
FileSystem fs = FileSystem.get(URI.create(uri), conf);
|
||||||
|
Path inputPath = new Path(uri);
|
||||||
|
CompressionCodecFactory factory = new CompressionCodecFactory(conf);
|
||||||
|
CompressionCodec codec = factory.getCodec(inputPath);
|
||||||
|
if (codec == null) {
|
||||||
|
System.err.println("No codec found for " + uri);
|
||||||
|
System.exit(1);
|
||||||
|
}
|
||||||
|
CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());
|
||||||
|
InputStream gzipInputStream = null;
|
||||||
|
try {
|
||||||
|
gzipInputStream = codec.createInputStream(fs.open(inputPath));
|
||||||
|
parseTarActivities(fs, conf, gzipInputStream, outputPath);
|
||||||
|
|
||||||
|
} finally {
|
||||||
|
Log.debug("Closing gzip stream");
|
||||||
|
IOUtils.closeStream(gzipInputStream);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void parseTarActivities(
|
||||||
|
FileSystem fs, Configuration conf, InputStream gzipInputStream, Path outputPath) {
|
||||||
|
int counter = 0;
|
||||||
|
int noDoiFound = 0;
|
||||||
|
int errorFromOrcidFound = 0;
|
||||||
|
int xmlParserErrorFound = 0;
|
||||||
|
try (TarArchiveInputStream tais = new TarArchiveInputStream(gzipInputStream)) {
|
||||||
|
TarArchiveEntry entry = null;
|
||||||
|
|
||||||
|
try (SequenceFile.Writer writer = SequenceFile
|
||||||
|
.createWriter(
|
||||||
|
conf,
|
||||||
|
SequenceFile.Writer.file(outputPath),
|
||||||
|
SequenceFile.Writer.keyClass(Text.class),
|
||||||
|
SequenceFile.Writer.valueClass(Text.class))) {
|
||||||
|
while ((entry = tais.getNextTarEntry()) != null) {
|
||||||
|
String filename = entry.getName();
|
||||||
|
|
||||||
|
try {
|
||||||
|
if (entry.isDirectory() || !filename.contains("works")) {
|
||||||
|
|
||||||
|
} else {
|
||||||
|
Log.debug("XML work entry name: " + entry.getName());
|
||||||
|
counter++;
|
||||||
|
BufferedReader br = new BufferedReader(new InputStreamReader(tais)); // Read directly from
|
||||||
|
// tarInput
|
||||||
|
String line;
|
||||||
|
StringBuffer buffer = new StringBuffer();
|
||||||
|
while ((line = br.readLine()) != null) {
|
||||||
|
buffer.append(line);
|
||||||
|
}
|
||||||
|
WorkDataNoDoi workDataNoDoi = XMLRecordParserNoDoi.VTDParseWorkData(buffer.toString().getBytes());
|
||||||
|
if (workDataNoDoi != null) {
|
||||||
|
if (workDataNoDoi.getErrorCode() != null) {
|
||||||
|
errorFromOrcidFound += 1;
|
||||||
|
Log
|
||||||
|
.debug(
|
||||||
|
"error from Orcid with code "
|
||||||
|
+ workDataNoDoi.getErrorCode()
|
||||||
|
+ " for entry "
|
||||||
|
+ entry.getName());
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
boolean isDoiFound = workDataNoDoi.getExtIds().stream()
|
||||||
|
.filter(e -> e.getType()!=null)
|
||||||
|
.anyMatch(e -> e.getType().equals("doi"));
|
||||||
|
if (!isDoiFound) {
|
||||||
|
String jsonData = JsonHelper.createOidWork(workDataNoDoi);
|
||||||
|
Log.debug("oid: " + workDataNoDoi.getOid() + " data: " + jsonData);
|
||||||
|
|
||||||
|
final Text key = new Text(workDataNoDoi.getOid());
|
||||||
|
final Text value = new Text(jsonData);
|
||||||
|
|
||||||
|
try {
|
||||||
|
writer.append(key, value);
|
||||||
|
} catch (IOException e) {
|
||||||
|
Log.debug("Writing to sequence file: " + e.getMessage());
|
||||||
|
Log.debug(e);
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
noDoiFound += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
} else {
|
||||||
|
Log.warn("Data not retrievable [" + entry.getName() + "] " + buffer.toString());
|
||||||
|
xmlParserErrorFound += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (Exception e) {
|
||||||
|
Log
|
||||||
|
.warn(
|
||||||
|
"Parsing work from tar archive and xml work: " + filename + " " + e.getMessage());
|
||||||
|
Log.warn(e);
|
||||||
|
}
|
||||||
|
|
||||||
|
if ((counter % XML_WORKS_PARSED_COUNTER_LOG_INTERVAL) == 0) {
|
||||||
|
Log.info("Current xml works parsed: " + counter);
|
||||||
|
}
|
||||||
|
|
||||||
|
if ((MAX_XML_WORKS_PARSED > -1) && (counter > MAX_XML_WORKS_PARSED)) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (IOException e) {
|
||||||
|
Log.warn("Parsing work from gzip archive: " + e.getMessage());
|
||||||
|
Log.warn(e);
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
Log.info("Activities parse completed");
|
||||||
|
Log.info("Total XML works parsed: " + counter);
|
||||||
|
Log.info("Total no doi work found: " + noDoiFound);
|
||||||
|
Log.info("Error from Orcid found: " + errorFromOrcidFound);
|
||||||
|
Log.info("Error parsing xml work found: " + xmlParserErrorFound);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,52 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.doiboost.orcidnodoi;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
import eu.dnetlib.doiboost.orcid.OrcidDSManager;
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
|
import org.apache.hadoop.fs.Path;
|
||||||
|
import org.mortbay.log.Log;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
public class GenOrcidAuthorWork extends OrcidDSManager {
|
||||||
|
|
||||||
|
private String activitiesFileNameTarGz;
|
||||||
|
private String outputWorksPath;
|
||||||
|
private String workingPath;
|
||||||
|
|
||||||
|
public static void main(String[] args) throws IOException, Exception {
|
||||||
|
GenOrcidAuthorWork genOrcidAuthorWork = new GenOrcidAuthorWork();
|
||||||
|
genOrcidAuthorWork.loadArgs(args);
|
||||||
|
genOrcidAuthorWork.generateAuthorsDOIsData();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void generateAuthorsDOIsData() throws Exception {
|
||||||
|
Configuration conf = initConfigurationObject();
|
||||||
|
FileSystem fs = initFileSystemObject(conf);
|
||||||
|
String tarGzUri = hdfsServerUri.concat(workingPath).concat(activitiesFileNameTarGz);
|
||||||
|
Path outputPath = new Path(hdfsServerUri.concat(workingPath).concat(outputWorksPath));
|
||||||
|
ActivitiesDumpReader.parseGzActivities(conf, tarGzUri, outputPath);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void loadArgs(String[] args) throws IOException, Exception {
|
||||||
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||||
|
IOUtils
|
||||||
|
.toString(
|
||||||
|
GenOrcidAuthorWork.class
|
||||||
|
.getResourceAsStream(
|
||||||
|
"/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works_parameters.json")));
|
||||||
|
parser.parseArgument(args);
|
||||||
|
|
||||||
|
hdfsServerUri = parser.get("hdfsServerUri");
|
||||||
|
Log.info("HDFS URI: " + hdfsServerUri);
|
||||||
|
workingPath = parser.get("workingPath");
|
||||||
|
Log.info("Working Path: " + workingPath);
|
||||||
|
activitiesFileNameTarGz = parser.get("activitiesFileNameTarGz");
|
||||||
|
Log.info("Activities File Name: " + activitiesFileNameTarGz);
|
||||||
|
outputWorksPath = parser.get("outputWorksPath");
|
||||||
|
Log.info("Output Author Work Data: " + outputWorksPath);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,119 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.doiboost.orcidnodoi;
|
||||||
|
|
||||||
|
import com.google.gson.Gson;
|
||||||
|
import com.google.gson.JsonElement;
|
||||||
|
import com.google.gson.JsonParser;
|
||||||
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
import eu.dnetlib.doiboost.orcid.model.AuthorData;
|
||||||
|
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
|
||||||
|
import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher;
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.hadoop.io.Text;
|
||||||
|
import org.apache.spark.SparkConf;
|
||||||
|
import org.apache.spark.api.java.JavaPairRDD;
|
||||||
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
|
import org.apache.spark.sql.Dataset;
|
||||||
|
import org.apache.spark.sql.Encoders;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
import scala.Tuple2;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.Objects;
|
||||||
|
import java.util.Optional;
|
||||||
|
|
||||||
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||||
|
|
||||||
|
public class SparkGenEnrichedOrcidWorks {
|
||||||
|
|
||||||
|
public static void main(String[] args) throws IOException, Exception {
|
||||||
|
Logger logger = LoggerFactory.getLogger(SparkGenEnrichedOrcidWorks.class);
|
||||||
|
logger.info("[ SparkGenerateDoiAuthorList STARTED]");
|
||||||
|
|
||||||
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||||
|
IOUtils
|
||||||
|
.toString(
|
||||||
|
SparkGenEnrichedOrcidWorks.class
|
||||||
|
.getResourceAsStream(
|
||||||
|
"/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works_parameters.json")));
|
||||||
|
parser.parseArgument(args);
|
||||||
|
Boolean isSparkSessionManaged = Optional
|
||||||
|
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||||
|
.map(Boolean::valueOf)
|
||||||
|
.orElse(Boolean.TRUE);
|
||||||
|
logger.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||||
|
final String workingPath = parser.get("workingPath");
|
||||||
|
logger.info("workingPath: ", workingPath);
|
||||||
|
final String outputEnrichedWorksPath = parser.get("outputEnrichedWorksPath");
|
||||||
|
logger.info("outputEnrichedWorksPath: ", outputEnrichedWorksPath);
|
||||||
|
final String outputWorksPath = parser.get("outputWorksPath");
|
||||||
|
logger.info("outputWorksPath: ", outputWorksPath);
|
||||||
|
|
||||||
|
SparkConf conf = new SparkConf();
|
||||||
|
runWithSparkSession(
|
||||||
|
conf,
|
||||||
|
isSparkSessionManaged,
|
||||||
|
spark -> {
|
||||||
|
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||||
|
|
||||||
|
JavaPairRDD<Text, Text> summariesRDD = sc
|
||||||
|
.sequenceFile(workingPath + "../orcid_summaries/output/authors.seq", Text.class, Text.class);
|
||||||
|
Dataset<AuthorData> summariesDataset = spark
|
||||||
|
.createDataset(
|
||||||
|
summariesRDD.map(seq -> loadAuthorFromJson(seq._1(), seq._2())).rdd(),
|
||||||
|
Encoders.bean(AuthorData.class));
|
||||||
|
|
||||||
|
JavaPairRDD<Text, Text> activitiesRDD = sc
|
||||||
|
.sequenceFile(workingPath + outputWorksPath + "works_X.seq" , Text.class, Text.class);
|
||||||
|
Dataset<WorkDataNoDoi> activitiesDataset = spark
|
||||||
|
.createDataset(
|
||||||
|
activitiesRDD.map(seq -> loadWorkFromJson(seq._1(), seq._2())).rdd(),
|
||||||
|
Encoders.bean(WorkDataNoDoi.class));
|
||||||
|
|
||||||
|
activitiesDataset
|
||||||
|
.joinWith(
|
||||||
|
summariesDataset,
|
||||||
|
activitiesDataset.col("oid").equalTo(summariesDataset.col("oid")), "inner")
|
||||||
|
.map(
|
||||||
|
(MapFunction<Tuple2<WorkDataNoDoi, AuthorData>, Tuple2<String, WorkDataNoDoi>>) value -> {
|
||||||
|
WorkDataNoDoi w = value._1;
|
||||||
|
AuthorData a = value._2;
|
||||||
|
AuthorMatcher.match(a, w.getContributors());
|
||||||
|
return new Tuple2<>(a.getOid(), w);
|
||||||
|
},
|
||||||
|
Encoders.tuple(Encoders.STRING(), Encoders.bean(WorkDataNoDoi.class)))
|
||||||
|
.filter(Objects::nonNull)
|
||||||
|
.toJavaRDD()
|
||||||
|
.saveAsTextFile(workingPath + outputEnrichedWorksPath);;
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
private static AuthorData loadAuthorFromJson(Text orcidId, Text json) {
|
||||||
|
AuthorData authorData = new AuthorData();
|
||||||
|
authorData.setOid(orcidId.toString());
|
||||||
|
JsonElement jElement = new JsonParser().parse(json.toString());
|
||||||
|
authorData.setName(getJsonValue(jElement, "name"));
|
||||||
|
authorData.setSurname(getJsonValue(jElement, "surname"));
|
||||||
|
authorData.setCreditName(getJsonValue(jElement, "creditname"));
|
||||||
|
return authorData;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static WorkDataNoDoi loadWorkFromJson(Text orcidId, Text json) {
|
||||||
|
WorkDataNoDoi workData = new Gson().fromJson(json.toString(), WorkDataNoDoi.class);
|
||||||
|
return workData;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String getJsonValue(JsonElement jElement, String property) {
|
||||||
|
if (jElement.getAsJsonObject().has(property)) {
|
||||||
|
JsonElement name = null;
|
||||||
|
name = jElement.getAsJsonObject().get(property);
|
||||||
|
if (name != null && !name.isJsonNull()) {
|
||||||
|
return name.getAsString();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
|
@ -1,5 +1,5 @@
|
||||||
|
|
||||||
package eu.dnetlib.doiboost.orcid.json;
|
package eu.dnetlib.doiboost.orcidnodoi.json;
|
||||||
|
|
||||||
import com.google.gson.JsonObject;
|
import com.google.gson.JsonObject;
|
||||||
|
|
|
@ -8,9 +8,9 @@ import eu.dnetlib.doiboost.orcid.model.AuthorData;
|
||||||
public class Contributor extends AuthorData implements Serializable {
|
public class Contributor extends AuthorData implements Serializable {
|
||||||
private String sequence;
|
private String sequence;
|
||||||
private String role;
|
private String role;
|
||||||
private boolean simpleMatch = false;
|
private transient boolean simpleMatch = false;
|
||||||
private Double score = 0.0;
|
private transient Double score = 0.0;
|
||||||
private boolean bestMatch = false;
|
private transient boolean bestMatch = false;
|
||||||
|
|
||||||
public String getSequence() {
|
public String getSequence() {
|
||||||
return sequence;
|
return sequence;
|
||||||
|
|
|
@ -97,5 +97,4 @@ public class WorkDataNoDoi implements Serializable {
|
||||||
public void setContributors(List<Contributor> contributors) {
|
public void setContributors(List<Contributor> contributors) {
|
||||||
this.contributors = contributors;
|
this.contributors = contributors;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,204 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.doiboost.orcidnodoi.similarity;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.text.Normalizer;
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
import org.apache.commons.text.similarity.JaroWinklerSimilarity;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import com.google.gson.Gson;
|
||||||
|
import com.google.gson.GsonBuilder;
|
||||||
|
import com.ximpleware.NavException;
|
||||||
|
import com.ximpleware.ParseException;
|
||||||
|
import com.ximpleware.XPathEvalException;
|
||||||
|
import com.ximpleware.XPathParseException;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.parser.utility.VtdException;
|
||||||
|
import eu.dnetlib.doiboost.orcid.model.AuthorData;
|
||||||
|
import eu.dnetlib.doiboost.orcidnodoi.model.Contributor;
|
||||||
|
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
|
||||||
|
|
||||||
|
public class AuthorMatcher {
|
||||||
|
|
||||||
|
private static final Logger logger = LoggerFactory.getLogger(AuthorMatcher.class);
|
||||||
|
private static final Double threshold = 0.8;
|
||||||
|
|
||||||
|
public static void match(AuthorData author, List<Contributor> contributors)
|
||||||
|
throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException {
|
||||||
|
|
||||||
|
int matchCounter = 0;
|
||||||
|
List<Integer> matchCounters = Arrays.asList(matchCounter);
|
||||||
|
Contributor contributor = null;
|
||||||
|
contributors.forEach(c -> {
|
||||||
|
if (normalize(c.getCreditName()).contains(normalize(author.getName())) ||
|
||||||
|
normalize(c.getCreditName()).contains(normalize(author.getSurname())) ||
|
||||||
|
((author.getOtherName() != null)
|
||||||
|
&& normalize(c.getCreditName()).contains(normalize(author.getOtherName())))) {
|
||||||
|
matchCounters.set(0, matchCounters.get(0) + 1);
|
||||||
|
c.setSimpleMatch(true);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
logger.info("match counter: " + Integer.toString(matchCounters.get(0)));
|
||||||
|
if (matchCounters.get(0) == 1) {
|
||||||
|
updateAuthorsSimpleMatch(contributors, author);
|
||||||
|
} else if (matchCounters.get(0) > 1) {
|
||||||
|
Optional<Contributor> optCon = contributors
|
||||||
|
.stream()
|
||||||
|
.filter(c -> c.isSimpleMatch())
|
||||||
|
.map(c -> {
|
||||||
|
c.setScore(bestMatch(author.getName(), author.getSurname(), c.getCreditName()));
|
||||||
|
logger.debug("nella map: " + c.getCreditName() + " score: " + c.getScore());
|
||||||
|
return c;
|
||||||
|
})
|
||||||
|
.filter(c -> c.getScore() >= threshold)
|
||||||
|
.max(Comparator.comparing(c -> c.getScore()));
|
||||||
|
Contributor bestMatchContributor = null;
|
||||||
|
if (optCon.isPresent()) {
|
||||||
|
bestMatchContributor = optCon.get();
|
||||||
|
bestMatchContributor.setBestMatch(true);
|
||||||
|
logger.info("best match: " + bestMatchContributor.getCreditName());
|
||||||
|
updateAuthorsSimilarityMatch(contributors, author);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
logger.info("UPDATED contributors: ");
|
||||||
|
contributors.forEach(c -> {
|
||||||
|
logger
|
||||||
|
.info(
|
||||||
|
c.getOid() + " - " + c.getCreditName() + " - " +
|
||||||
|
c.getName() + " - " + c.getSurname() + " - " +
|
||||||
|
c.getRole() + " - " + c.getSequence());
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
private static Double bestMatch(String authorSurname, String authorName, String contributor) {
|
||||||
|
logger.debug(authorSurname + " " + authorName + " vs " + contributor);
|
||||||
|
String[] contributorSplitted = contributor.split(" ");
|
||||||
|
if (contributorSplitted.length == 0) {
|
||||||
|
return 0.0;
|
||||||
|
}
|
||||||
|
final String contributorName = contributorSplitted[contributorSplitted.length - 1];
|
||||||
|
String contributorSurname = "";
|
||||||
|
if (contributorSplitted.length > 1) {
|
||||||
|
StringJoiner joiner = new StringJoiner(" ");
|
||||||
|
for (int i = 0; i < contributorSplitted.length - 1; i++) {
|
||||||
|
joiner.add(contributorSplitted[i]);
|
||||||
|
}
|
||||||
|
contributorSurname = joiner.toString();
|
||||||
|
}
|
||||||
|
logger
|
||||||
|
.debug(
|
||||||
|
"contributorName: " + contributorName +
|
||||||
|
" contributorSurname: " + contributorSurname);
|
||||||
|
String authorNameNrm = normalize(authorName);
|
||||||
|
String authorSurnameNrm = normalize(authorSurname);
|
||||||
|
String contributorNameNrm = normalize(contributorName);
|
||||||
|
String contributorSurnameNrm = normalize(contributorSurname);
|
||||||
|
Double sm1 = similarity(authorNameNrm, authorSurnameNrm, contributorNameNrm, contributorSurnameNrm);
|
||||||
|
Double sm2 = similarity(authorNameNrm, authorSurnameNrm, contributorSurnameNrm, contributorNameNrm);
|
||||||
|
if (sm1.compareTo(sm2) >= 0) {
|
||||||
|
return sm1;
|
||||||
|
}
|
||||||
|
return sm2;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static Double similarity(String nameA, String surnameA, String nameB, String surnameB) {
|
||||||
|
Double score = similarityJaroWinkler(nameA, surnameA, nameB, surnameB);
|
||||||
|
logger
|
||||||
|
.debug(nameA + ", " + surnameA + " <> " + nameB + ", " + surnameB + " score: " + Double.toString(score));
|
||||||
|
return score;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static Double similarityJaroWinkler(String nameA, String surnameA, String nameB, String surnameB) {
|
||||||
|
return new JaroWinklerSimilarity().apply(normalize(parse(nameA, surnameA)), normalize(parse(nameB, surnameB)));
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String normalize(final String s) {
|
||||||
|
return nfd(s)
|
||||||
|
.toLowerCase()
|
||||||
|
// do not compact the regexes in a single expression, would cause StackOverflowError
|
||||||
|
// in case
|
||||||
|
// of large input strings
|
||||||
|
.replaceAll("(\\W)+", " ")
|
||||||
|
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
|
||||||
|
.replaceAll("(\\p{Punct})+", " ")
|
||||||
|
.replaceAll("(\\d)+", " ")
|
||||||
|
.replaceAll("(\\n)+", " ")
|
||||||
|
.trim();
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String nfd(final String s) {
|
||||||
|
return Normalizer.normalize(s, Normalizer.Form.NFD);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String parse(String name, String surname) {
|
||||||
|
return surname + " " + name;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void updateAuthorsSimpleMatch(List<Contributor> contributors, AuthorData author) {
|
||||||
|
contributors.forEach(c -> {
|
||||||
|
if (c.isSimpleMatch()) {
|
||||||
|
logger.info("simple match on : " + c.getCreditName());
|
||||||
|
c.setName(author.getName());
|
||||||
|
c.setSurname(author.getSurname());
|
||||||
|
c.setOid(author.getOid());
|
||||||
|
}
|
||||||
|
});
|
||||||
|
updateRanks(contributors);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void updateAuthorsSimilarityMatch(List<Contributor> contributors, AuthorData author) {
|
||||||
|
logger.info("inside updateAuthorsSimilarityMatch ...");
|
||||||
|
contributors.forEach(c -> {
|
||||||
|
logger
|
||||||
|
.info(
|
||||||
|
c.getOid() + " - " + c.getCreditName() + " - " +
|
||||||
|
c.getName() + " - " + c.getSurname() + " - " +
|
||||||
|
c.getRole() + " - " + c.getSequence() + " - best: " + c.isBestMatch() + " - simpe: "
|
||||||
|
+ c.isSimpleMatch());
|
||||||
|
});
|
||||||
|
|
||||||
|
contributors
|
||||||
|
.stream()
|
||||||
|
.filter(c -> c.isBestMatch())
|
||||||
|
.forEach(c -> {
|
||||||
|
logger.info("similarity match on : " + c.getCreditName());
|
||||||
|
c.setName(author.getName());
|
||||||
|
c.setSurname(author.getSurname());
|
||||||
|
c.setOid(author.getOid());
|
||||||
|
});
|
||||||
|
updateRanks(contributors);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void updateRanks(List<Contributor> contributors) {
|
||||||
|
boolean seqFound = false;
|
||||||
|
if (contributors
|
||||||
|
.stream()
|
||||||
|
.filter(
|
||||||
|
c -> c.getRole() != null && c.getSequence() != null &&
|
||||||
|
c.getRole().equals("author") && (c.getSequence().equals("first") ||
|
||||||
|
c.getSequence().equals("additional")))
|
||||||
|
.count() > 0) {
|
||||||
|
seqFound = true;
|
||||||
|
logger.info("sequence data found");
|
||||||
|
}
|
||||||
|
if (!seqFound) {
|
||||||
|
List<Integer> seqIds = Arrays.asList(0);
|
||||||
|
contributors.forEach(c -> {
|
||||||
|
int currentSeq = seqIds.get(0) + 1;
|
||||||
|
seqIds.set(0, currentSeq);
|
||||||
|
c.setSequence(Integer.toString(seqIds.get(0)));
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String toJson(WorkDataNoDoi work) {
|
||||||
|
GsonBuilder builder = new GsonBuilder();
|
||||||
|
Gson gson = builder.create();
|
||||||
|
return gson.toJson(work);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,22 @@
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>oozie.action.sharelib.for.java</name>
|
||||||
|
<value>spark2</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||||
|
<value>true</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.launcher.mapreduce.map.java.opts</name>
|
||||||
|
<value>-Xmx4g</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>jobTracker</name>
|
||||||
|
<value>hadoop-rm3.garr-pa1.d4science.org:8032</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>nameNode</name>
|
||||||
|
<value>hdfs://hadoop-rm1.garr-pa1.d4science.org:8020</value>
|
||||||
|
</property>
|
||||||
|
</configuration>
|
|
@ -0,0 +1,524 @@
|
||||||
|
<workflow-app name="Gen Enriched Orcid Works" xmlns="uri:oozie:workflow:0.5">
|
||||||
|
<parameters>
|
||||||
|
<property>
|
||||||
|
<name>workingPath_activities</name>
|
||||||
|
<description>the working dir base path</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>shell_cmd_0</name>
|
||||||
|
<value>wget -O /tmp/ORCID_2019_activites_0.tar.gz https://orcid.figshare.com/ndownloader/files/18017660 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_0.tar.gz /data/orcid_activities/ORCID_2019_activites_0.tar.gz ; rm -f /tmp/ORCID_2019_activites_0.tar.gz
|
||||||
|
</value>
|
||||||
|
<description>the shell command that downloads and puts to hdfs orcid activity file 0</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>shell_cmd_1</name>
|
||||||
|
<value>wget -O /tmp/ORCID_2019_activites_1.tar.gz https://orcid.figshare.com/ndownloader/files/18017675 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_1.tar.gz /data/orcid_activities/ORCID_2019_activites_1.tar.gz ; rm -f /tmp/ORCID_2019_activites_1.tar.gz
|
||||||
|
</value>
|
||||||
|
<description>the shell command that downloads and puts to hdfs orcid activity file 1</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>shell_cmd_2</name>
|
||||||
|
<value>wget -O /tmp/ORCID_2019_activites_2.tar.gz https://orcid.figshare.com/ndownloader/files/18017717 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_2.tar.gz /data/orcid_activities/ORCID_2019_activites_2.tar.gz ; rm -f /tmp/ORCID_2019_activites_2.tar.gz
|
||||||
|
</value>
|
||||||
|
<description>the shell command that downloads and puts to hdfs orcid activity file 2</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>shell_cmd_3</name>
|
||||||
|
<value>wget -O /tmp/ORCID_2019_activites_3.tar.gz https://orcid.figshare.com/ndownloader/files/18017765 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_3.tar.gz /data/orcid_activities/ORCID_2019_activites_3.tar.gz ; rm -f /tmp/ORCID_2019_activites_3.tar.gz
|
||||||
|
</value>
|
||||||
|
<description>the shell command that downloads and puts to hdfs orcid activity file 3</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>shell_cmd_4</name>
|
||||||
|
<value>wget -O /tmp/ORCID_2019_activites_4.tar.gz https://orcid.figshare.com/ndownloader/files/18017831 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_4.tar.gz /data/orcid_activities/ORCID_2019_activites_4.tar.gz ; rm -f /tmp/ORCID_2019_activites_4.tar.gz
|
||||||
|
</value>
|
||||||
|
<description>the shell command that downloads and puts to hdfs orcid activity file 4</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>shell_cmd_5</name>
|
||||||
|
<value>wget -O /tmp/ORCID_2019_activites_5.tar.gz https://orcid.figshare.com/ndownloader/files/18017987 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_5.tar.gz /data/orcid_activities/ORCID_2019_activites_5.tar.gz ; rm -f /tmp/ORCID_2019_activites_5.tar.gz
|
||||||
|
</value>
|
||||||
|
<description>the shell command that downloads and puts to hdfs orcid activity file 5</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>shell_cmd_6</name>
|
||||||
|
<value>wget -O /tmp/ORCID_2019_activites_6.tar.gz https://orcid.figshare.com/ndownloader/files/18018053 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_6.tar.gz /data/orcid_activities/ORCID_2019_activites_6.tar.gz ; rm -f /tmp/ORCID_2019_activites_6.tar.gz
|
||||||
|
</value>
|
||||||
|
<description>the shell command that downloads and puts to hdfs orcid activity file 6</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>shell_cmd_7</name>
|
||||||
|
<value>wget -O /tmp/ORCID_2019_activites_7.tar.gz https://orcid.figshare.com/ndownloader/files/18018023 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_7.tar.gz /data/orcid_activities/ORCID_2019_activites_7.tar.gz ; rm -f /tmp/ORCID_2019_activites_7.tar.gz
|
||||||
|
</value>
|
||||||
|
<description>the shell command that downloads and puts to hdfs orcid activity file 7</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>shell_cmd_8</name>
|
||||||
|
<value>wget -O /tmp/ORCID_2019_activites_8.tar.gz https://orcid.figshare.com/ndownloader/files/18018248 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_8.tar.gz /data/orcid_activities/ORCID_2019_activites_8.tar.gz ; rm -f /tmp/ORCID_2019_activites_8.tar.gz
|
||||||
|
</value>
|
||||||
|
<description>the shell command that downloads and puts to hdfs orcid activity file 8</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>shell_cmd_9</name>
|
||||||
|
<value>wget -O /tmp/ORCID_2019_activites_9.tar.gz https://orcid.figshare.com/ndownloader/files/18018029 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_9.tar.gz /data/orcid_activities/ORCID_2019_activites_9.tar.gz ; rm -f /tmp/ORCID_2019_activites_9.tar.gz
|
||||||
|
</value>
|
||||||
|
<description>the shell command that downloads and puts to hdfs orcid activity file 9</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>shell_cmd_X</name>
|
||||||
|
<value>wget -O /tmp/ORCID_2019_activites_X.tar.gz https://orcid.figshare.com/ndownloader/files/18018182 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_X.tar.gz /data/orcid_activities/ORCID_2019_activites_X.tar.gz ; rm -f /tmp/ORCID_2019_activites_X.tar.gz
|
||||||
|
</value>
|
||||||
|
<description>the shell command that downloads and puts to hdfs orcid activity file X</description>
|
||||||
|
</property>
|
||||||
|
</parameters>
|
||||||
|
|
||||||
|
<start to="ResetWorkingPath"/>
|
||||||
|
|
||||||
|
|
||||||
|
<kill name="Kill">
|
||||||
|
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
|
</kill>
|
||||||
|
|
||||||
|
<action name="ResetWorkingPath">
|
||||||
|
<fs>
|
||||||
|
<delete path='${workingPath_activities}/no_doi_works/*'/>
|
||||||
|
<delete path='${workingPath_activities}/no_doi_enriched_works/*'/>
|
||||||
|
</fs>
|
||||||
|
<ok to="fork_gen_orcid_author_work"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<fork name = "fork_gen_orcid_author_work">
|
||||||
|
<path start = "check_exist_on_hdfs_activities_0"/>
|
||||||
|
<path start = "check_exist_on_hdfs_activities_1"/>
|
||||||
|
<path start = "check_exist_on_hdfs_activities_2"/>
|
||||||
|
<path start = "check_exist_on_hdfs_activities_3"/>
|
||||||
|
<path start = "check_exist_on_hdfs_activities_4"/>
|
||||||
|
<path start = "check_exist_on_hdfs_activities_5"/>
|
||||||
|
<path start = "check_exist_on_hdfs_activities_6"/>
|
||||||
|
<path start = "check_exist_on_hdfs_activities_7"/>
|
||||||
|
<path start = "check_exist_on_hdfs_activities_8"/>
|
||||||
|
<path start = "check_exist_on_hdfs_activities_9"/>
|
||||||
|
<path start = "check_exist_on_hdfs_activities_X"/>
|
||||||
|
</fork>
|
||||||
|
|
||||||
|
<decision name="check_exist_on_hdfs_activities_0">
|
||||||
|
<switch>
|
||||||
|
<case to="GenOrcidAuthorWork_0">
|
||||||
|
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_0.tar.gz'))}
|
||||||
|
</case>
|
||||||
|
<default to="Download_0" />
|
||||||
|
</switch>
|
||||||
|
</decision>
|
||||||
|
|
||||||
|
<action name="Download_0">
|
||||||
|
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<exec>bash</exec>
|
||||||
|
<argument>-c</argument>
|
||||||
|
<argument>${shell_cmd_0}</argument>
|
||||||
|
<capture-output/>
|
||||||
|
</shell>
|
||||||
|
<ok to="GenOrcidAuthorWork_0"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="GenOrcidAuthorWork_0">
|
||||||
|
<java>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||||
|
<arg>-w</arg><arg>${workingPath_activities}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>ORCID_2019_activites_0.tar.gz</arg>
|
||||||
|
<arg>-ow</arg><arg>no_doi_works/works_0.seq</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="join_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<decision name="check_exist_on_hdfs_activities_1">
|
||||||
|
<switch>
|
||||||
|
<case to="GenOrcidAuthorWork_1">
|
||||||
|
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_1.tar.gz'))}
|
||||||
|
</case>
|
||||||
|
<default to="Download_1" />
|
||||||
|
</switch>
|
||||||
|
</decision>
|
||||||
|
|
||||||
|
<action name="Download_1">
|
||||||
|
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<exec>bash</exec>
|
||||||
|
<argument>-c</argument>
|
||||||
|
<argument>${shell_cmd_1}</argument>
|
||||||
|
<capture-output/>
|
||||||
|
</shell>
|
||||||
|
<ok to="GenOrcidAuthorWork_1"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="GenOrcidAuthorWork_1">
|
||||||
|
<java>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||||
|
<arg>-w</arg><arg>${workingPath_activities}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>ORCID_2019_activites_1.tar.gz</arg>
|
||||||
|
<arg>-ow</arg><arg>no_doi_works/works_1.seq</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="join_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<decision name="check_exist_on_hdfs_activities_2">
|
||||||
|
<switch>
|
||||||
|
<case to="GenOrcidAuthorWork_2">
|
||||||
|
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_2.tar.gz'))}
|
||||||
|
</case>
|
||||||
|
<default to="Download_2" />
|
||||||
|
</switch>
|
||||||
|
</decision>
|
||||||
|
|
||||||
|
<action name="Download_2">
|
||||||
|
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<exec>bash</exec>
|
||||||
|
<argument>-c</argument>
|
||||||
|
<argument>${shell_cmd_2}</argument>
|
||||||
|
<capture-output/>
|
||||||
|
</shell>
|
||||||
|
<ok to="GenOrcidAuthorWork_2"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="GenOrcidAuthorWork_2">
|
||||||
|
<java>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||||
|
<arg>-w</arg><arg>${workingPath_activities}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>ORCID_2019_activites_2.tar.gz</arg>
|
||||||
|
<arg>-ow</arg><arg>no_doi_works/works_2.seq</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="join_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<decision name="check_exist_on_hdfs_activities_3">
|
||||||
|
<switch>
|
||||||
|
<case to="GenOrcidAuthorWork_3">
|
||||||
|
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_3.tar.gz'))}
|
||||||
|
</case>
|
||||||
|
<default to="Download_3" />
|
||||||
|
</switch>
|
||||||
|
</decision>
|
||||||
|
|
||||||
|
<action name="Download_3">
|
||||||
|
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<exec>bash</exec>
|
||||||
|
<argument>-c</argument>
|
||||||
|
<argument>${shell_cmd_3}</argument>
|
||||||
|
<capture-output/>
|
||||||
|
</shell>
|
||||||
|
<ok to="GenOrcidAuthorWork_3"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="GenOrcidAuthorWork_3">
|
||||||
|
<java>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||||
|
<arg>-w</arg><arg>${workingPath_activities}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>ORCID_2019_activites_3.tar.gz</arg>
|
||||||
|
<arg>-ow</arg><arg>no_doi_works/works_3.seq</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="join_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<decision name="check_exist_on_hdfs_activities_4">
|
||||||
|
<switch>
|
||||||
|
<case to="GenOrcidAuthorWork_4">
|
||||||
|
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_4.tar.gz'))}
|
||||||
|
</case>
|
||||||
|
<default to="Download_4" />
|
||||||
|
</switch>
|
||||||
|
</decision>
|
||||||
|
|
||||||
|
<action name="Download_4">
|
||||||
|
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<exec>bash</exec>
|
||||||
|
<argument>-c</argument>
|
||||||
|
<argument>${shell_cmd_4}</argument>
|
||||||
|
<capture-output/>
|
||||||
|
</shell>
|
||||||
|
<ok to="GenOrcidAuthorWork_4"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="GenOrcidAuthorWork_4">
|
||||||
|
<java>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||||
|
<arg>-w</arg><arg>${workingPath_activities}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>ORCID_2019_activites_4.tar.gz</arg>
|
||||||
|
<arg>-ow</arg><arg>no_doi_works/works_4.seq</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="join_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<decision name="check_exist_on_hdfs_activities_5">
|
||||||
|
<switch>
|
||||||
|
<case to="GenOrcidAuthorWork_5">
|
||||||
|
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_5.tar.gz'))}
|
||||||
|
</case>
|
||||||
|
<default to="Download_5" />
|
||||||
|
</switch>
|
||||||
|
</decision>
|
||||||
|
|
||||||
|
<action name="Download_5">
|
||||||
|
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<exec>bash</exec>
|
||||||
|
<argument>-c</argument>
|
||||||
|
<argument>${shell_cmd_5}</argument>
|
||||||
|
<capture-output/>
|
||||||
|
</shell>
|
||||||
|
<ok to="GenOrcidAuthorWork_5"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="GenOrcidAuthorWork_5">
|
||||||
|
<java>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||||
|
<arg>-w</arg><arg>${workingPath_activities}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>ORCID_2019_activites_5.tar.gz</arg>
|
||||||
|
<arg>-ow</arg><arg>no_doi_works/works_5.seq</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="join_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<decision name="check_exist_on_hdfs_activities_6">
|
||||||
|
<switch>
|
||||||
|
<case to="GenOrcidAuthorWork_6">
|
||||||
|
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_6.tar.gz'))}
|
||||||
|
</case>
|
||||||
|
<default to="Download_6" />
|
||||||
|
</switch>
|
||||||
|
</decision>
|
||||||
|
|
||||||
|
<action name="Download_6">
|
||||||
|
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<exec>bash</exec>
|
||||||
|
<argument>-c</argument>
|
||||||
|
<argument>${shell_cmd_6}</argument>
|
||||||
|
<capture-output/>
|
||||||
|
</shell>
|
||||||
|
<ok to="GenOrcidAuthorWork_6"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="GenOrcidAuthorWork_6">
|
||||||
|
<java>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||||
|
<arg>-w</arg><arg>${workingPath_activities}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>ORCID_2019_activites_6.tar.gz</arg>
|
||||||
|
<arg>-ow</arg><arg>no_doi_works/works_6.seq</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="join_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
|
||||||
|
<decision name="check_exist_on_hdfs_activities_7">
|
||||||
|
<switch>
|
||||||
|
<case to="GenOrcidAuthorWork_7">
|
||||||
|
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_7.tar.gz'))}
|
||||||
|
</case>
|
||||||
|
<default to="Download_7" />
|
||||||
|
</switch>
|
||||||
|
</decision>
|
||||||
|
|
||||||
|
<action name="Download_7">
|
||||||
|
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<exec>bash</exec>
|
||||||
|
<argument>-c</argument>
|
||||||
|
<argument>${shell_cmd_7}</argument>
|
||||||
|
<capture-output/>
|
||||||
|
</shell>
|
||||||
|
<ok to="GenOrcidAuthorWork_7"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="GenOrcidAuthorWork_7">
|
||||||
|
<java>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||||
|
<arg>-w</arg><arg>${workingPath_activities}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>ORCID_2019_activites_7.tar.gz</arg>
|
||||||
|
<arg>-ow</arg><arg>no_doi_works/works_7.seq</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="join_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<decision name="check_exist_on_hdfs_activities_8">
|
||||||
|
<switch>
|
||||||
|
<case to="GenOrcidAuthorWork_8">
|
||||||
|
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_8.tar.gz'))}
|
||||||
|
</case>
|
||||||
|
<default to="Download_8" />
|
||||||
|
</switch>
|
||||||
|
</decision>
|
||||||
|
|
||||||
|
<action name="Download_8">
|
||||||
|
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<exec>bash</exec>
|
||||||
|
<argument>-c</argument>
|
||||||
|
<argument>${shell_cmd_8}</argument>
|
||||||
|
<capture-output/>
|
||||||
|
</shell>
|
||||||
|
<ok to="GenOrcidAuthorWork_8"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="GenOrcidAuthorWork_8">
|
||||||
|
<java>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||||
|
<arg>-w</arg><arg>${workingPath_activities}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>ORCID_2019_activites_8.tar.gz</arg>
|
||||||
|
<arg>-ow</arg><arg>no_doi_works/works_8.seq</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="join_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<decision name="check_exist_on_hdfs_activities_9">
|
||||||
|
<switch>
|
||||||
|
<case to="GenOrcidAuthorWork_9">
|
||||||
|
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_9.tar.gz'))}
|
||||||
|
</case>
|
||||||
|
<default to="Download_9" />
|
||||||
|
</switch>
|
||||||
|
</decision>
|
||||||
|
|
||||||
|
<action name="Download_9">
|
||||||
|
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<exec>bash</exec>
|
||||||
|
<argument>-c</argument>
|
||||||
|
<argument>${shell_cmd_9}</argument>
|
||||||
|
<capture-output/>
|
||||||
|
</shell>
|
||||||
|
<ok to="GenOrcidAuthorWork_9"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="GenOrcidAuthorWork_9">
|
||||||
|
<java>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||||
|
<arg>-w</arg><arg>${workingPath_activities}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>ORCID_2019_activites_9.tar.gz</arg>
|
||||||
|
<arg>-ow</arg><arg>no_doi_works/works_9.seq</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="join_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<decision name="check_exist_on_hdfs_activities_X">
|
||||||
|
<switch>
|
||||||
|
<case to="GenOrcidAuthorWork_X">
|
||||||
|
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_X.tar.gz'))}
|
||||||
|
</case>
|
||||||
|
<default to="Download_X" />
|
||||||
|
</switch>
|
||||||
|
</decision>
|
||||||
|
|
||||||
|
<action name="Download_X">
|
||||||
|
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<exec>bash</exec>
|
||||||
|
<argument>-c</argument>
|
||||||
|
<argument>${shell_cmd_X}</argument>
|
||||||
|
<capture-output/>
|
||||||
|
</shell>
|
||||||
|
<ok to="GenOrcidAuthorWork_X"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="GenOrcidAuthorWork_X">
|
||||||
|
<java>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||||
|
<arg>-w</arg><arg>${workingPath_activities}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>ORCID_2019_activites_X.tar.gz</arg>
|
||||||
|
<arg>-ow</arg><arg>no_doi_works/works_X.seq</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="join_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<join name = "join_node" to = "Gen_Enriched_Orcid_Works"/>
|
||||||
|
|
||||||
|
<action name="Gen_Enriched_Orcid_Works">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<master>yarn</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>Gen_Enriched_Orcid_Works</name>
|
||||||
|
<class>eu.dnetlib.doiboost.orcidnodoi.SparkGenEnrichedOrcidWorks</class>
|
||||||
|
<jar>dhp-doiboost-1.2.3-SNAPSHOT.jar</jar>
|
||||||
|
<spark-opts>--num-executors 10 --conf spark.yarn.jars="hdfs://hadoop-rm1.garr-pa1.d4science.org:8020/user/oozie/share/lib/lib_20180405103059/spark2" --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory}
|
||||||
|
</spark-opts>
|
||||||
|
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||||
|
<arg>-ow</arg><arg>no_doi_works/</arg>
|
||||||
|
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
||||||
|
</spark>
|
||||||
|
<ok to="End"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<end name="End"/>
|
||||||
|
</workflow-app>
|
|
@ -0,0 +1,7 @@
|
||||||
|
[
|
||||||
|
{"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true},
|
||||||
|
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the default work path", "paramRequired": true},
|
||||||
|
{"paramName":"f", "paramLongName":"activitiesFileNameTarGz", "paramDescription": "the name of the activities orcid file", "paramRequired": true},
|
||||||
|
{"paramName":"ow", "paramLongName":"outputWorksPath", "paramDescription": "the relative folder of the sequencial file to write", "paramRequired": true},
|
||||||
|
{"paramName":"oew", "paramLongName":"outputEnrichedWorksPath", "paramDescription": "the relative folder of the sequencial file to write the data", "paramRequired": true}
|
||||||
|
]
|
|
@ -1,15 +1,12 @@
|
||||||
|
|
||||||
package eu.dnetlib.doiboost.orcidnodoi.xml;
|
package eu.dnetlib.doiboost.orcidnodoi.xml;
|
||||||
|
|
||||||
import com.ximpleware.NavException;
|
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
||||||
import com.ximpleware.ParseException;
|
|
||||||
import com.ximpleware.XPathEvalException;
|
import java.io.IOException;
|
||||||
import com.ximpleware.XPathParseException;
|
import java.text.Normalizer;
|
||||||
import eu.dnetlib.dhp.parser.utility.VtdException;
|
import java.util.*;
|
||||||
import eu.dnetlib.doiboost.orcid.model.AuthorData;
|
|
||||||
import eu.dnetlib.doiboost.orcidnodoi.model.Contributor;
|
|
||||||
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
|
|
||||||
import jdk.nashorn.internal.ir.annotations.Ignore;
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.commons.text.similarity.JaccardSimilarity;
|
import org.apache.commons.text.similarity.JaccardSimilarity;
|
||||||
import org.apache.commons.text.similarity.JaroWinklerSimilarity;
|
import org.apache.commons.text.similarity.JaroWinklerSimilarity;
|
||||||
|
@ -17,11 +14,20 @@ import org.junit.jupiter.api.Test;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import java.io.IOException;
|
import com.google.gson.Gson;
|
||||||
import java.text.Normalizer;
|
import com.google.gson.GsonBuilder;
|
||||||
import java.util.*;
|
import com.ximpleware.NavException;
|
||||||
|
import com.ximpleware.ParseException;
|
||||||
|
import com.ximpleware.XPathEvalException;
|
||||||
|
import com.ximpleware.XPathParseException;
|
||||||
|
|
||||||
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
import eu.dnetlib.dhp.parser.utility.VtdException;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||||
|
import eu.dnetlib.doiboost.orcid.model.AuthorData;
|
||||||
|
import eu.dnetlib.doiboost.orcidnodoi.model.Contributor;
|
||||||
|
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
|
||||||
|
import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher;
|
||||||
|
import jdk.nashorn.internal.ir.annotations.Ignore;
|
||||||
|
|
||||||
public class OrcidNoDoiTest {
|
public class OrcidNoDoiTest {
|
||||||
|
|
||||||
|
@ -33,100 +39,10 @@ public class OrcidNoDoiTest {
|
||||||
String nameB = "K";
|
String nameB = "K";
|
||||||
String surnameB = "Abdel-Dayem";
|
String surnameB = "Abdel-Dayem";
|
||||||
String orcidIdA = "0000-0003-2760-1191";
|
String orcidIdA = "0000-0003-2760-1191";
|
||||||
Double threshold = 0.8;
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@Ignore
|
@Ignore
|
||||||
private void similarityTest() throws Exception {
|
private void readPublicationFieldsTest()
|
||||||
logger.info("running testSimilarity ....");
|
|
||||||
logger
|
|
||||||
.info(
|
|
||||||
"JaroWinklerSimilarity: "
|
|
||||||
+ Double.toString(similarityJaroWinkler(nameA, surnameA, nameB, surnameB)));
|
|
||||||
logger
|
|
||||||
.info(
|
|
||||||
"JaccardSimilarity: " + Double.toString(similarityJaccard(nameA, surnameA, nameB, surnameB)));
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
@Ignore
|
|
||||||
private void bestMatchTest() throws Exception {
|
|
||||||
logger.info("running bestMatchTest ....");
|
|
||||||
String contributor = surnameB + ", " + nameB;
|
|
||||||
logger.info("score: " + Double.toString(bestMatch(surnameA, nameA, contributor)));
|
|
||||||
}
|
|
||||||
|
|
||||||
private static Double bestMatch(String authorSurname, String authorName, String contributor) {
|
|
||||||
logger.debug(authorSurname + " " + authorName + " vs " + contributor);
|
|
||||||
String[] contributorSplitted = contributor.split(" ");
|
|
||||||
if (contributorSplitted.length == 0) {
|
|
||||||
return 0.0;
|
|
||||||
}
|
|
||||||
final String contributorName = contributorSplitted[contributorSplitted.length - 1];
|
|
||||||
String contributorSurname = "";
|
|
||||||
if (contributorSplitted.length > 1) {
|
|
||||||
StringJoiner joiner = new StringJoiner(" ");
|
|
||||||
for (int i = 0; i < contributorSplitted.length - 1; i++) {
|
|
||||||
joiner.add(contributorSplitted[i]);
|
|
||||||
}
|
|
||||||
contributorSurname = joiner.toString();
|
|
||||||
}
|
|
||||||
logger
|
|
||||||
.debug(
|
|
||||||
"contributorName: " + contributorName +
|
|
||||||
" contributorSurname: " + contributorSurname);
|
|
||||||
String authorNameNrm = normalize(authorName);
|
|
||||||
String authorSurnameNrm = normalize(authorSurname);
|
|
||||||
String contributorNameNrm = normalize(contributorName);
|
|
||||||
String contributorSurnameNrm = normalize(contributorSurname);
|
|
||||||
Double sm1 = similarity(authorNameNrm, authorSurnameNrm, contributorNameNrm, contributorSurnameNrm);
|
|
||||||
Double sm2 = similarity(authorNameNrm, authorSurnameNrm, contributorSurnameNrm, contributorNameNrm);
|
|
||||||
if (sm1.compareTo(sm2) >= 0) {
|
|
||||||
return sm1;
|
|
||||||
}
|
|
||||||
return sm2;
|
|
||||||
}
|
|
||||||
|
|
||||||
private static Double similarity(String nameA, String surnameA, String nameB, String surnameB) {
|
|
||||||
Double score = similarityJaroWinkler(nameA, surnameA, nameB, surnameB);
|
|
||||||
logger
|
|
||||||
.debug(nameA + ", " + surnameA + " <> " + nameB + ", " + surnameB + " score: " + Double.toString(score));
|
|
||||||
return score;
|
|
||||||
}
|
|
||||||
|
|
||||||
private static Double similarityJaccard(String nameA, String surnameA, String nameB, String surnameB) {
|
|
||||||
return new JaccardSimilarity().apply(normalize(parse(nameA, surnameA)), normalize(parse(nameB, surnameB)));
|
|
||||||
}
|
|
||||||
|
|
||||||
private static Double similarityJaroWinkler(String nameA, String surnameA, String nameB, String surnameB) {
|
|
||||||
return new JaroWinklerSimilarity().apply(normalize(parse(nameA, surnameA)), normalize(parse(nameB, surnameB)));
|
|
||||||
}
|
|
||||||
|
|
||||||
private static String parse(String name, String surname) {
|
|
||||||
return surname + " " + name;
|
|
||||||
}
|
|
||||||
|
|
||||||
private static String normalize(final String s) {
|
|
||||||
return nfd(s)
|
|
||||||
.toLowerCase()
|
|
||||||
// do not compact the regexes in a single expression, would cause StackOverflowError
|
|
||||||
// in case
|
|
||||||
// of large input strings
|
|
||||||
.replaceAll("(\\W)+", " ")
|
|
||||||
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
|
|
||||||
.replaceAll("(\\p{Punct})+", " ")
|
|
||||||
.replaceAll("(\\d)+", " ")
|
|
||||||
.replaceAll("(\\n)+", " ")
|
|
||||||
.trim();
|
|
||||||
}
|
|
||||||
|
|
||||||
private static String nfd(final String s) {
|
|
||||||
return Normalizer.normalize(s, Normalizer.Form.NFD);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
@Ignore
|
|
||||||
public void readPublicationFieldsTest()
|
|
||||||
throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException {
|
throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException {
|
||||||
logger.info("running loadPublicationFieldsTest ....");
|
logger.info("running loadPublicationFieldsTest ....");
|
||||||
String xml = IOUtils
|
String xml = IOUtils
|
||||||
|
@ -178,78 +94,10 @@ public class OrcidNoDoiTest {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private void updateRanks(List<Contributor> contributors) {
|
|
||||||
boolean seqFound = false;
|
|
||||||
if (contributors
|
|
||||||
.stream()
|
|
||||||
.filter(
|
|
||||||
c -> c.getRole() != null && c.getSequence() != null &&
|
|
||||||
c.getRole().equals("author") && (c.getSequence().equals("first") ||
|
|
||||||
c.getSequence().equals("additional")))
|
|
||||||
.count() > 0) {
|
|
||||||
seqFound = true;
|
|
||||||
logger.info("sequence data found");
|
|
||||||
}
|
|
||||||
if (!seqFound) {
|
|
||||||
List<Integer> seqIds = Arrays.asList(0);
|
|
||||||
contributors.forEach(c -> {
|
|
||||||
int currentSeq = seqIds.get(0) + 1;
|
|
||||||
seqIds.set(0, currentSeq);
|
|
||||||
c.setSequence(Integer.toString(seqIds.get(0)));
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private void updateAuthorsSimpleMatch(List<Contributor> contributors, AuthorData author) {
|
|
||||||
contributors.forEach(c -> {
|
|
||||||
if (c.isSimpleMatch()) {
|
|
||||||
logger.info("simple match on : " + c.getCreditName());
|
|
||||||
c.setName(author.getName());
|
|
||||||
c.setSurname(author.getSurname());
|
|
||||||
c.setOid(author.getOid());
|
|
||||||
}
|
|
||||||
});
|
|
||||||
updateRanks(contributors);
|
|
||||||
}
|
|
||||||
|
|
||||||
private void updateAuthorsSimilarityMatch(List<Contributor> contributors, AuthorData author) {
|
|
||||||
logger.info("inside updateAuthorsSimilarityMatch ...");
|
|
||||||
contributors.forEach(c -> {
|
|
||||||
logger
|
|
||||||
.info(
|
|
||||||
c.getOid() + " - " + c.getCreditName() + " - " +
|
|
||||||
c.getName() + " - " + c.getSurname() + " - " +
|
|
||||||
c.getRole() + " - " + c.getSequence() + " - best: " + c.isBestMatch() + " - simpe: "
|
|
||||||
+ c.isSimpleMatch());
|
|
||||||
});
|
|
||||||
|
|
||||||
contributors
|
|
||||||
.stream()
|
|
||||||
.filter(c -> c.isBestMatch())
|
|
||||||
.forEach(c -> {
|
|
||||||
logger.info("similarity match on : " + c.getCreditName());
|
|
||||||
c.setName(author.getName());
|
|
||||||
c.setSurname(author.getSurname());
|
|
||||||
c.setOid(author.getOid());
|
|
||||||
});
|
|
||||||
updateRanks(contributors);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@Ignore
|
public void authorMatchTest() throws Exception {
|
||||||
public void authorSimilarityMatchTest() throws Exception {
|
|
||||||
logger.info("running authorSimilarityMatchTest ....");
|
|
||||||
authorMatchTest("activity_work_0000-0003-2760-1191-similarity.xml");
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
private void authorSimpleMatchTest() throws Exception {
|
|
||||||
logger.info("running authorSimpleMatchTest ....");
|
logger.info("running authorSimpleMatchTest ....");
|
||||||
authorMatchTest("activity_work_0000-0003-2760-1191.xml");
|
String orcidWork = "activity_work_0000-0003-2760-1191-similarity.xml";
|
||||||
}
|
|
||||||
|
|
||||||
private void authorMatchTest(String orcidWork)
|
|
||||||
throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException {
|
|
||||||
AuthorData author = new AuthorData();
|
AuthorData author = new AuthorData();
|
||||||
author.setName(nameA);
|
author.setName(nameA);
|
||||||
author.setSurname(surnameA);
|
author.setSurname(surnameA);
|
||||||
|
@ -272,55 +120,9 @@ public class OrcidNoDoiTest {
|
||||||
logger.error("parsing xml", e);
|
logger.error("parsing xml", e);
|
||||||
}
|
}
|
||||||
assertNotNull(workData);
|
assertNotNull(workData);
|
||||||
int matchCounter = 0;
|
AuthorMatcher.match(author, workData.getContributors());
|
||||||
List<Integer> matchCounters = Arrays.asList(matchCounter);
|
GsonBuilder builder = new GsonBuilder();
|
||||||
Contributor contributor = null;
|
Gson gson = builder.create();
|
||||||
workData.getContributors().forEach(c -> {
|
logger.info(gson.toJson(workData));
|
||||||
if (normalize(c.getCreditName()).contains(normalize(author.getName())) ||
|
|
||||||
normalize(c.getCreditName()).contains(normalize(author.getSurname())) ||
|
|
||||||
((author.getOtherName() != null)
|
|
||||||
&& normalize(c.getCreditName()).contains(normalize(author.getOtherName())))) {
|
|
||||||
matchCounters.set(0, matchCounters.get(0) + 1);
|
|
||||||
c.setSimpleMatch(true);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
logger.info("match counter: " + Integer.toString(matchCounters.get(0)));
|
|
||||||
if (matchCounters.get(0) == 1) {
|
|
||||||
updateAuthorsSimpleMatch(workData.getContributors(), author);
|
|
||||||
} else if (matchCounters.get(0) > 1) {
|
|
||||||
Optional<Contributor> optCon = workData
|
|
||||||
.getContributors()
|
|
||||||
.stream()
|
|
||||||
.filter(c -> c.isSimpleMatch())
|
|
||||||
.map(c -> {
|
|
||||||
c.setScore(bestMatch(nameA, surnameA, c.getCreditName()));
|
|
||||||
logger.debug("nella map: " + c.getCreditName() + " score: " + c.getScore());
|
|
||||||
return c;
|
|
||||||
})
|
|
||||||
.filter(c -> c.getScore() >= threshold)
|
|
||||||
.max(Comparator.comparing(c -> c.getScore()));
|
|
||||||
Contributor bestMatchContributor = null;
|
|
||||||
if (optCon.isPresent()) {
|
|
||||||
bestMatchContributor = optCon.get();
|
|
||||||
bestMatchContributor.setBestMatch(true);
|
|
||||||
logger.info("best match: " + bestMatchContributor.getCreditName());
|
|
||||||
updateAuthorsSimilarityMatch(workData.getContributors(), author);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
logger.info("UPDATED contributors: ");
|
|
||||||
workData.getContributors().forEach(c -> {
|
|
||||||
logger
|
|
||||||
.info(
|
|
||||||
c.getOid() + " - " + c.getCreditName() + " - " +
|
|
||||||
c.getName() + " - " + c.getSurname() + " - " +
|
|
||||||
c.getRole() + " - " + c.getSequence());
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
|
||||||
// orcid_RDD = sc.textFile(ORCID_DUMP_PATH)
|
|
||||||
// no_doi_works_RDD = orcid_RDD.map(orcid_map).filter(lambda x:x is not None).map(lambda x: json.dumps(x)).saveAsTextFile(path=ORCID_OPENAIRE_PATH,compressionCodecClass="org.apache.hadoop.io.compress.GzipCodec")
|
|
||||||
//
|
|
Loading…
Reference in New Issue