forked from D-Net/dnet-hadoop
added workflow to generate seq(orcidId,work) and seq(orcidId,enrichedWork)
This commit is contained in:
parent
fcbb4c1489
commit
d6498278ed
|
@ -19,7 +19,7 @@ import org.apache.hadoop.io.compress.CompressionCodec;
|
|||
import org.apache.hadoop.io.compress.CompressionCodecFactory;
|
||||
import org.mortbay.log.Log;
|
||||
|
||||
import eu.dnetlib.doiboost.orcid.json.JsonWriter;
|
||||
import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter;
|
||||
import eu.dnetlib.doiboost.orcid.model.WorkData;
|
||||
import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser;
|
||||
|
||||
|
|
|
@ -19,7 +19,7 @@ import org.apache.hadoop.io.compress.CompressionCodec;
|
|||
import org.apache.hadoop.io.compress.CompressionCodecFactory;
|
||||
import org.mortbay.log.Log;
|
||||
|
||||
import eu.dnetlib.doiboost.orcid.json.JsonWriter;
|
||||
import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter;
|
||||
import eu.dnetlib.doiboost.orcid.model.AuthorData;
|
||||
import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser;
|
||||
|
||||
|
|
|
@ -0,0 +1,16 @@
|
|||
|
||||
package eu.dnetlib.doiboost.orcid.json;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
import com.google.gson.JsonObject;
|
||||
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
|
||||
|
||||
public class JsonHelper {
|
||||
|
||||
public static String createOidWork(WorkDataNoDoi workData) {
|
||||
JsonObject oidWork = new JsonObject();
|
||||
oidWork.addProperty("oid", workData.getOid());
|
||||
oidWork.addProperty("work", new Gson().toJson(workData));
|
||||
return oidWork.toString();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,149 @@
|
|||
|
||||
package eu.dnetlib.doiboost.orcidnodoi;
|
||||
|
||||
import eu.dnetlib.doiboost.orcid.json.JsonHelper;
|
||||
import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter;
|
||||
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
|
||||
import eu.dnetlib.doiboost.orcidnodoi.xml.XMLRecordParserNoDoi;
|
||||
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
|
||||
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.io.IOUtils;
|
||||
import org.apache.hadoop.io.SequenceFile;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.io.compress.CompressionCodec;
|
||||
import org.apache.hadoop.io.compress.CompressionCodecFactory;
|
||||
import org.mortbay.log.Log;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.net.URI;
|
||||
|
||||
public class ActivitiesDumpReader {
|
||||
|
||||
private static final int MAX_XML_WORKS_PARSED = -1;
|
||||
private static final int XML_WORKS_PARSED_COUNTER_LOG_INTERVAL = 100000;
|
||||
|
||||
public static void parseGzActivities(Configuration conf, String inputUri, Path outputPath)
|
||||
throws Exception {
|
||||
String uri = inputUri;
|
||||
FileSystem fs = FileSystem.get(URI.create(uri), conf);
|
||||
Path inputPath = new Path(uri);
|
||||
CompressionCodecFactory factory = new CompressionCodecFactory(conf);
|
||||
CompressionCodec codec = factory.getCodec(inputPath);
|
||||
if (codec == null) {
|
||||
System.err.println("No codec found for " + uri);
|
||||
System.exit(1);
|
||||
}
|
||||
CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());
|
||||
InputStream gzipInputStream = null;
|
||||
try {
|
||||
gzipInputStream = codec.createInputStream(fs.open(inputPath));
|
||||
parseTarActivities(fs, conf, gzipInputStream, outputPath);
|
||||
|
||||
} finally {
|
||||
Log.debug("Closing gzip stream");
|
||||
IOUtils.closeStream(gzipInputStream);
|
||||
}
|
||||
}
|
||||
|
||||
private static void parseTarActivities(
|
||||
FileSystem fs, Configuration conf, InputStream gzipInputStream, Path outputPath) {
|
||||
int counter = 0;
|
||||
int noDoiFound = 0;
|
||||
int errorFromOrcidFound = 0;
|
||||
int xmlParserErrorFound = 0;
|
||||
try (TarArchiveInputStream tais = new TarArchiveInputStream(gzipInputStream)) {
|
||||
TarArchiveEntry entry = null;
|
||||
|
||||
try (SequenceFile.Writer writer = SequenceFile
|
||||
.createWriter(
|
||||
conf,
|
||||
SequenceFile.Writer.file(outputPath),
|
||||
SequenceFile.Writer.keyClass(Text.class),
|
||||
SequenceFile.Writer.valueClass(Text.class))) {
|
||||
while ((entry = tais.getNextTarEntry()) != null) {
|
||||
String filename = entry.getName();
|
||||
|
||||
try {
|
||||
if (entry.isDirectory() || !filename.contains("works")) {
|
||||
|
||||
} else {
|
||||
Log.debug("XML work entry name: " + entry.getName());
|
||||
counter++;
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(tais)); // Read directly from
|
||||
// tarInput
|
||||
String line;
|
||||
StringBuffer buffer = new StringBuffer();
|
||||
while ((line = br.readLine()) != null) {
|
||||
buffer.append(line);
|
||||
}
|
||||
WorkDataNoDoi workDataNoDoi = XMLRecordParserNoDoi.VTDParseWorkData(buffer.toString().getBytes());
|
||||
if (workDataNoDoi != null) {
|
||||
if (workDataNoDoi.getErrorCode() != null) {
|
||||
errorFromOrcidFound += 1;
|
||||
Log
|
||||
.debug(
|
||||
"error from Orcid with code "
|
||||
+ workDataNoDoi.getErrorCode()
|
||||
+ " for entry "
|
||||
+ entry.getName());
|
||||
continue;
|
||||
}
|
||||
boolean isDoiFound = workDataNoDoi.getExtIds().stream()
|
||||
.filter(e -> e.getType()!=null)
|
||||
.anyMatch(e -> e.getType().equals("doi"));
|
||||
if (!isDoiFound) {
|
||||
String jsonData = JsonHelper.createOidWork(workDataNoDoi);
|
||||
Log.debug("oid: " + workDataNoDoi.getOid() + " data: " + jsonData);
|
||||
|
||||
final Text key = new Text(workDataNoDoi.getOid());
|
||||
final Text value = new Text(jsonData);
|
||||
|
||||
try {
|
||||
writer.append(key, value);
|
||||
} catch (IOException e) {
|
||||
Log.debug("Writing to sequence file: " + e.getMessage());
|
||||
Log.debug(e);
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
noDoiFound += 1;
|
||||
}
|
||||
|
||||
} else {
|
||||
Log.warn("Data not retrievable [" + entry.getName() + "] " + buffer.toString());
|
||||
xmlParserErrorFound += 1;
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
Log
|
||||
.warn(
|
||||
"Parsing work from tar archive and xml work: " + filename + " " + e.getMessage());
|
||||
Log.warn(e);
|
||||
}
|
||||
|
||||
if ((counter % XML_WORKS_PARSED_COUNTER_LOG_INTERVAL) == 0) {
|
||||
Log.info("Current xml works parsed: " + counter);
|
||||
}
|
||||
|
||||
if ((MAX_XML_WORKS_PARSED > -1) && (counter > MAX_XML_WORKS_PARSED)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (IOException e) {
|
||||
Log.warn("Parsing work from gzip archive: " + e.getMessage());
|
||||
Log.warn(e);
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
Log.info("Activities parse completed");
|
||||
Log.info("Total XML works parsed: " + counter);
|
||||
Log.info("Total no doi work found: " + noDoiFound);
|
||||
Log.info("Error from Orcid found: " + errorFromOrcidFound);
|
||||
Log.info("Error parsing xml work found: " + xmlParserErrorFound);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,52 @@
|
|||
|
||||
package eu.dnetlib.doiboost.orcidnodoi;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.doiboost.orcid.OrcidDSManager;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.mortbay.log.Log;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
public class GenOrcidAuthorWork extends OrcidDSManager {
|
||||
|
||||
private String activitiesFileNameTarGz;
|
||||
private String outputWorksPath;
|
||||
private String workingPath;
|
||||
|
||||
public static void main(String[] args) throws IOException, Exception {
|
||||
GenOrcidAuthorWork genOrcidAuthorWork = new GenOrcidAuthorWork();
|
||||
genOrcidAuthorWork.loadArgs(args);
|
||||
genOrcidAuthorWork.generateAuthorsDOIsData();
|
||||
}
|
||||
|
||||
public void generateAuthorsDOIsData() throws Exception {
|
||||
Configuration conf = initConfigurationObject();
|
||||
FileSystem fs = initFileSystemObject(conf);
|
||||
String tarGzUri = hdfsServerUri.concat(workingPath).concat(activitiesFileNameTarGz);
|
||||
Path outputPath = new Path(hdfsServerUri.concat(workingPath).concat(outputWorksPath));
|
||||
ActivitiesDumpReader.parseGzActivities(conf, tarGzUri, outputPath);
|
||||
}
|
||||
|
||||
private void loadArgs(String[] args) throws IOException, Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
GenOrcidAuthorWork.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works_parameters.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
hdfsServerUri = parser.get("hdfsServerUri");
|
||||
Log.info("HDFS URI: " + hdfsServerUri);
|
||||
workingPath = parser.get("workingPath");
|
||||
Log.info("Working Path: " + workingPath);
|
||||
activitiesFileNameTarGz = parser.get("activitiesFileNameTarGz");
|
||||
Log.info("Activities File Name: " + activitiesFileNameTarGz);
|
||||
outputWorksPath = parser.get("outputWorksPath");
|
||||
Log.info("Output Author Work Data: " + outputWorksPath);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,119 @@
|
|||
|
||||
package eu.dnetlib.doiboost.orcidnodoi;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
import com.google.gson.JsonElement;
|
||||
import com.google.gson.JsonParser;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.doiboost.orcid.model.AuthorData;
|
||||
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
|
||||
import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaPairRDD;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import scala.Tuple2;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
public class SparkGenEnrichedOrcidWorks {
|
||||
|
||||
public static void main(String[] args) throws IOException, Exception {
|
||||
Logger logger = LoggerFactory.getLogger(SparkGenEnrichedOrcidWorks.class);
|
||||
logger.info("[ SparkGenerateDoiAuthorList STARTED]");
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
SparkGenEnrichedOrcidWorks.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works_parameters.json")));
|
||||
parser.parseArgument(args);
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
logger.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
final String workingPath = parser.get("workingPath");
|
||||
logger.info("workingPath: ", workingPath);
|
||||
final String outputEnrichedWorksPath = parser.get("outputEnrichedWorksPath");
|
||||
logger.info("outputEnrichedWorksPath: ", outputEnrichedWorksPath);
|
||||
final String outputWorksPath = parser.get("outputWorksPath");
|
||||
logger.info("outputWorksPath: ", outputWorksPath);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
JavaPairRDD<Text, Text> summariesRDD = sc
|
||||
.sequenceFile(workingPath + "../orcid_summaries/output/authors.seq", Text.class, Text.class);
|
||||
Dataset<AuthorData> summariesDataset = spark
|
||||
.createDataset(
|
||||
summariesRDD.map(seq -> loadAuthorFromJson(seq._1(), seq._2())).rdd(),
|
||||
Encoders.bean(AuthorData.class));
|
||||
|
||||
JavaPairRDD<Text, Text> activitiesRDD = sc
|
||||
.sequenceFile(workingPath + outputWorksPath + "works_X.seq" , Text.class, Text.class);
|
||||
Dataset<WorkDataNoDoi> activitiesDataset = spark
|
||||
.createDataset(
|
||||
activitiesRDD.map(seq -> loadWorkFromJson(seq._1(), seq._2())).rdd(),
|
||||
Encoders.bean(WorkDataNoDoi.class));
|
||||
|
||||
activitiesDataset
|
||||
.joinWith(
|
||||
summariesDataset,
|
||||
activitiesDataset.col("oid").equalTo(summariesDataset.col("oid")), "inner")
|
||||
.map(
|
||||
(MapFunction<Tuple2<WorkDataNoDoi, AuthorData>, Tuple2<String, WorkDataNoDoi>>) value -> {
|
||||
WorkDataNoDoi w = value._1;
|
||||
AuthorData a = value._2;
|
||||
AuthorMatcher.match(a, w.getContributors());
|
||||
return new Tuple2<>(a.getOid(), w);
|
||||
},
|
||||
Encoders.tuple(Encoders.STRING(), Encoders.bean(WorkDataNoDoi.class)))
|
||||
.filter(Objects::nonNull)
|
||||
.toJavaRDD()
|
||||
.saveAsTextFile(workingPath + outputEnrichedWorksPath);;
|
||||
});
|
||||
}
|
||||
|
||||
private static AuthorData loadAuthorFromJson(Text orcidId, Text json) {
|
||||
AuthorData authorData = new AuthorData();
|
||||
authorData.setOid(orcidId.toString());
|
||||
JsonElement jElement = new JsonParser().parse(json.toString());
|
||||
authorData.setName(getJsonValue(jElement, "name"));
|
||||
authorData.setSurname(getJsonValue(jElement, "surname"));
|
||||
authorData.setCreditName(getJsonValue(jElement, "creditname"));
|
||||
return authorData;
|
||||
}
|
||||
|
||||
private static WorkDataNoDoi loadWorkFromJson(Text orcidId, Text json) {
|
||||
WorkDataNoDoi workData = new Gson().fromJson(json.toString(), WorkDataNoDoi.class);
|
||||
return workData;
|
||||
}
|
||||
|
||||
private static String getJsonValue(JsonElement jElement, String property) {
|
||||
if (jElement.getAsJsonObject().has(property)) {
|
||||
JsonElement name = null;
|
||||
name = jElement.getAsJsonObject().get(property);
|
||||
if (name != null && !name.isJsonNull()) {
|
||||
return name.getAsString();
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
|
@ -1,5 +1,5 @@
|
|||
|
||||
package eu.dnetlib.doiboost.orcid.json;
|
||||
package eu.dnetlib.doiboost.orcidnodoi.json;
|
||||
|
||||
import com.google.gson.JsonObject;
|
||||
|
|
@ -8,9 +8,9 @@ import eu.dnetlib.doiboost.orcid.model.AuthorData;
|
|||
public class Contributor extends AuthorData implements Serializable {
|
||||
private String sequence;
|
||||
private String role;
|
||||
private boolean simpleMatch = false;
|
||||
private Double score = 0.0;
|
||||
private boolean bestMatch = false;
|
||||
private transient boolean simpleMatch = false;
|
||||
private transient Double score = 0.0;
|
||||
private transient boolean bestMatch = false;
|
||||
|
||||
public String getSequence() {
|
||||
return sequence;
|
||||
|
|
|
@ -97,5 +97,4 @@ public class WorkDataNoDoi implements Serializable {
|
|||
public void setContributors(List<Contributor> contributors) {
|
||||
this.contributors = contributors;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,204 @@
|
|||
|
||||
package eu.dnetlib.doiboost.orcidnodoi.similarity;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.text.Normalizer;
|
||||
import java.util.*;
|
||||
|
||||
import org.apache.commons.text.similarity.JaroWinklerSimilarity;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
import com.google.gson.GsonBuilder;
|
||||
import com.ximpleware.NavException;
|
||||
import com.ximpleware.ParseException;
|
||||
import com.ximpleware.XPathEvalException;
|
||||
import com.ximpleware.XPathParseException;
|
||||
|
||||
import eu.dnetlib.dhp.parser.utility.VtdException;
|
||||
import eu.dnetlib.doiboost.orcid.model.AuthorData;
|
||||
import eu.dnetlib.doiboost.orcidnodoi.model.Contributor;
|
||||
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
|
||||
|
||||
public class AuthorMatcher {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(AuthorMatcher.class);
|
||||
private static final Double threshold = 0.8;
|
||||
|
||||
public static void match(AuthorData author, List<Contributor> contributors)
|
||||
throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException {
|
||||
|
||||
int matchCounter = 0;
|
||||
List<Integer> matchCounters = Arrays.asList(matchCounter);
|
||||
Contributor contributor = null;
|
||||
contributors.forEach(c -> {
|
||||
if (normalize(c.getCreditName()).contains(normalize(author.getName())) ||
|
||||
normalize(c.getCreditName()).contains(normalize(author.getSurname())) ||
|
||||
((author.getOtherName() != null)
|
||||
&& normalize(c.getCreditName()).contains(normalize(author.getOtherName())))) {
|
||||
matchCounters.set(0, matchCounters.get(0) + 1);
|
||||
c.setSimpleMatch(true);
|
||||
}
|
||||
});
|
||||
logger.info("match counter: " + Integer.toString(matchCounters.get(0)));
|
||||
if (matchCounters.get(0) == 1) {
|
||||
updateAuthorsSimpleMatch(contributors, author);
|
||||
} else if (matchCounters.get(0) > 1) {
|
||||
Optional<Contributor> optCon = contributors
|
||||
.stream()
|
||||
.filter(c -> c.isSimpleMatch())
|
||||
.map(c -> {
|
||||
c.setScore(bestMatch(author.getName(), author.getSurname(), c.getCreditName()));
|
||||
logger.debug("nella map: " + c.getCreditName() + " score: " + c.getScore());
|
||||
return c;
|
||||
})
|
||||
.filter(c -> c.getScore() >= threshold)
|
||||
.max(Comparator.comparing(c -> c.getScore()));
|
||||
Contributor bestMatchContributor = null;
|
||||
if (optCon.isPresent()) {
|
||||
bestMatchContributor = optCon.get();
|
||||
bestMatchContributor.setBestMatch(true);
|
||||
logger.info("best match: " + bestMatchContributor.getCreditName());
|
||||
updateAuthorsSimilarityMatch(contributors, author);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
logger.info("UPDATED contributors: ");
|
||||
contributors.forEach(c -> {
|
||||
logger
|
||||
.info(
|
||||
c.getOid() + " - " + c.getCreditName() + " - " +
|
||||
c.getName() + " - " + c.getSurname() + " - " +
|
||||
c.getRole() + " - " + c.getSequence());
|
||||
});
|
||||
}
|
||||
|
||||
private static Double bestMatch(String authorSurname, String authorName, String contributor) {
|
||||
logger.debug(authorSurname + " " + authorName + " vs " + contributor);
|
||||
String[] contributorSplitted = contributor.split(" ");
|
||||
if (contributorSplitted.length == 0) {
|
||||
return 0.0;
|
||||
}
|
||||
final String contributorName = contributorSplitted[contributorSplitted.length - 1];
|
||||
String contributorSurname = "";
|
||||
if (contributorSplitted.length > 1) {
|
||||
StringJoiner joiner = new StringJoiner(" ");
|
||||
for (int i = 0; i < contributorSplitted.length - 1; i++) {
|
||||
joiner.add(contributorSplitted[i]);
|
||||
}
|
||||
contributorSurname = joiner.toString();
|
||||
}
|
||||
logger
|
||||
.debug(
|
||||
"contributorName: " + contributorName +
|
||||
" contributorSurname: " + contributorSurname);
|
||||
String authorNameNrm = normalize(authorName);
|
||||
String authorSurnameNrm = normalize(authorSurname);
|
||||
String contributorNameNrm = normalize(contributorName);
|
||||
String contributorSurnameNrm = normalize(contributorSurname);
|
||||
Double sm1 = similarity(authorNameNrm, authorSurnameNrm, contributorNameNrm, contributorSurnameNrm);
|
||||
Double sm2 = similarity(authorNameNrm, authorSurnameNrm, contributorSurnameNrm, contributorNameNrm);
|
||||
if (sm1.compareTo(sm2) >= 0) {
|
||||
return sm1;
|
||||
}
|
||||
return sm2;
|
||||
}
|
||||
|
||||
private static Double similarity(String nameA, String surnameA, String nameB, String surnameB) {
|
||||
Double score = similarityJaroWinkler(nameA, surnameA, nameB, surnameB);
|
||||
logger
|
||||
.debug(nameA + ", " + surnameA + " <> " + nameB + ", " + surnameB + " score: " + Double.toString(score));
|
||||
return score;
|
||||
}
|
||||
|
||||
private static Double similarityJaroWinkler(String nameA, String surnameA, String nameB, String surnameB) {
|
||||
return new JaroWinklerSimilarity().apply(normalize(parse(nameA, surnameA)), normalize(parse(nameB, surnameB)));
|
||||
}
|
||||
|
||||
private static String normalize(final String s) {
|
||||
return nfd(s)
|
||||
.toLowerCase()
|
||||
// do not compact the regexes in a single expression, would cause StackOverflowError
|
||||
// in case
|
||||
// of large input strings
|
||||
.replaceAll("(\\W)+", " ")
|
||||
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
|
||||
.replaceAll("(\\p{Punct})+", " ")
|
||||
.replaceAll("(\\d)+", " ")
|
||||
.replaceAll("(\\n)+", " ")
|
||||
.trim();
|
||||
}
|
||||
|
||||
private static String nfd(final String s) {
|
||||
return Normalizer.normalize(s, Normalizer.Form.NFD);
|
||||
}
|
||||
|
||||
private static String parse(String name, String surname) {
|
||||
return surname + " " + name;
|
||||
}
|
||||
|
||||
private static void updateAuthorsSimpleMatch(List<Contributor> contributors, AuthorData author) {
|
||||
contributors.forEach(c -> {
|
||||
if (c.isSimpleMatch()) {
|
||||
logger.info("simple match on : " + c.getCreditName());
|
||||
c.setName(author.getName());
|
||||
c.setSurname(author.getSurname());
|
||||
c.setOid(author.getOid());
|
||||
}
|
||||
});
|
||||
updateRanks(contributors);
|
||||
}
|
||||
|
||||
private static void updateAuthorsSimilarityMatch(List<Contributor> contributors, AuthorData author) {
|
||||
logger.info("inside updateAuthorsSimilarityMatch ...");
|
||||
contributors.forEach(c -> {
|
||||
logger
|
||||
.info(
|
||||
c.getOid() + " - " + c.getCreditName() + " - " +
|
||||
c.getName() + " - " + c.getSurname() + " - " +
|
||||
c.getRole() + " - " + c.getSequence() + " - best: " + c.isBestMatch() + " - simpe: "
|
||||
+ c.isSimpleMatch());
|
||||
});
|
||||
|
||||
contributors
|
||||
.stream()
|
||||
.filter(c -> c.isBestMatch())
|
||||
.forEach(c -> {
|
||||
logger.info("similarity match on : " + c.getCreditName());
|
||||
c.setName(author.getName());
|
||||
c.setSurname(author.getSurname());
|
||||
c.setOid(author.getOid());
|
||||
});
|
||||
updateRanks(contributors);
|
||||
}
|
||||
|
||||
private static void updateRanks(List<Contributor> contributors) {
|
||||
boolean seqFound = false;
|
||||
if (contributors
|
||||
.stream()
|
||||
.filter(
|
||||
c -> c.getRole() != null && c.getSequence() != null &&
|
||||
c.getRole().equals("author") && (c.getSequence().equals("first") ||
|
||||
c.getSequence().equals("additional")))
|
||||
.count() > 0) {
|
||||
seqFound = true;
|
||||
logger.info("sequence data found");
|
||||
}
|
||||
if (!seqFound) {
|
||||
List<Integer> seqIds = Arrays.asList(0);
|
||||
contributors.forEach(c -> {
|
||||
int currentSeq = seqIds.get(0) + 1;
|
||||
seqIds.set(0, currentSeq);
|
||||
c.setSequence(Integer.toString(seqIds.get(0)));
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
private static String toJson(WorkDataNoDoi work) {
|
||||
GsonBuilder builder = new GsonBuilder();
|
||||
Gson gson = builder.create();
|
||||
return gson.toJson(work);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,22 @@
|
|||
<configuration>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.java</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.map.java.opts</name>
|
||||
<value>-Xmx4g</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>hadoop-rm3.garr-pa1.d4science.org:8032</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://hadoop-rm1.garr-pa1.d4science.org:8020</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -0,0 +1,524 @@
|
|||
<workflow-app name="Gen Enriched Orcid Works" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>workingPath_activities</name>
|
||||
<description>the working dir base path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>shell_cmd_0</name>
|
||||
<value>wget -O /tmp/ORCID_2019_activites_0.tar.gz https://orcid.figshare.com/ndownloader/files/18017660 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_0.tar.gz /data/orcid_activities/ORCID_2019_activites_0.tar.gz ; rm -f /tmp/ORCID_2019_activites_0.tar.gz
|
||||
</value>
|
||||
<description>the shell command that downloads and puts to hdfs orcid activity file 0</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>shell_cmd_1</name>
|
||||
<value>wget -O /tmp/ORCID_2019_activites_1.tar.gz https://orcid.figshare.com/ndownloader/files/18017675 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_1.tar.gz /data/orcid_activities/ORCID_2019_activites_1.tar.gz ; rm -f /tmp/ORCID_2019_activites_1.tar.gz
|
||||
</value>
|
||||
<description>the shell command that downloads and puts to hdfs orcid activity file 1</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>shell_cmd_2</name>
|
||||
<value>wget -O /tmp/ORCID_2019_activites_2.tar.gz https://orcid.figshare.com/ndownloader/files/18017717 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_2.tar.gz /data/orcid_activities/ORCID_2019_activites_2.tar.gz ; rm -f /tmp/ORCID_2019_activites_2.tar.gz
|
||||
</value>
|
||||
<description>the shell command that downloads and puts to hdfs orcid activity file 2</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>shell_cmd_3</name>
|
||||
<value>wget -O /tmp/ORCID_2019_activites_3.tar.gz https://orcid.figshare.com/ndownloader/files/18017765 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_3.tar.gz /data/orcid_activities/ORCID_2019_activites_3.tar.gz ; rm -f /tmp/ORCID_2019_activites_3.tar.gz
|
||||
</value>
|
||||
<description>the shell command that downloads and puts to hdfs orcid activity file 3</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>shell_cmd_4</name>
|
||||
<value>wget -O /tmp/ORCID_2019_activites_4.tar.gz https://orcid.figshare.com/ndownloader/files/18017831 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_4.tar.gz /data/orcid_activities/ORCID_2019_activites_4.tar.gz ; rm -f /tmp/ORCID_2019_activites_4.tar.gz
|
||||
</value>
|
||||
<description>the shell command that downloads and puts to hdfs orcid activity file 4</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>shell_cmd_5</name>
|
||||
<value>wget -O /tmp/ORCID_2019_activites_5.tar.gz https://orcid.figshare.com/ndownloader/files/18017987 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_5.tar.gz /data/orcid_activities/ORCID_2019_activites_5.tar.gz ; rm -f /tmp/ORCID_2019_activites_5.tar.gz
|
||||
</value>
|
||||
<description>the shell command that downloads and puts to hdfs orcid activity file 5</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>shell_cmd_6</name>
|
||||
<value>wget -O /tmp/ORCID_2019_activites_6.tar.gz https://orcid.figshare.com/ndownloader/files/18018053 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_6.tar.gz /data/orcid_activities/ORCID_2019_activites_6.tar.gz ; rm -f /tmp/ORCID_2019_activites_6.tar.gz
|
||||
</value>
|
||||
<description>the shell command that downloads and puts to hdfs orcid activity file 6</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>shell_cmd_7</name>
|
||||
<value>wget -O /tmp/ORCID_2019_activites_7.tar.gz https://orcid.figshare.com/ndownloader/files/18018023 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_7.tar.gz /data/orcid_activities/ORCID_2019_activites_7.tar.gz ; rm -f /tmp/ORCID_2019_activites_7.tar.gz
|
||||
</value>
|
||||
<description>the shell command that downloads and puts to hdfs orcid activity file 7</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>shell_cmd_8</name>
|
||||
<value>wget -O /tmp/ORCID_2019_activites_8.tar.gz https://orcid.figshare.com/ndownloader/files/18018248 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_8.tar.gz /data/orcid_activities/ORCID_2019_activites_8.tar.gz ; rm -f /tmp/ORCID_2019_activites_8.tar.gz
|
||||
</value>
|
||||
<description>the shell command that downloads and puts to hdfs orcid activity file 8</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>shell_cmd_9</name>
|
||||
<value>wget -O /tmp/ORCID_2019_activites_9.tar.gz https://orcid.figshare.com/ndownloader/files/18018029 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_9.tar.gz /data/orcid_activities/ORCID_2019_activites_9.tar.gz ; rm -f /tmp/ORCID_2019_activites_9.tar.gz
|
||||
</value>
|
||||
<description>the shell command that downloads and puts to hdfs orcid activity file 9</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>shell_cmd_X</name>
|
||||
<value>wget -O /tmp/ORCID_2019_activites_X.tar.gz https://orcid.figshare.com/ndownloader/files/18018182 ; hdfs dfs -copyFromLocal /tmp/ORCID_2019_activites_X.tar.gz /data/orcid_activities/ORCID_2019_activites_X.tar.gz ; rm -f /tmp/ORCID_2019_activites_X.tar.gz
|
||||
</value>
|
||||
<description>the shell command that downloads and puts to hdfs orcid activity file X</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="ResetWorkingPath"/>
|
||||
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="ResetWorkingPath">
|
||||
<fs>
|
||||
<delete path='${workingPath_activities}/no_doi_works/*'/>
|
||||
<delete path='${workingPath_activities}/no_doi_enriched_works/*'/>
|
||||
</fs>
|
||||
<ok to="fork_gen_orcid_author_work"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<fork name = "fork_gen_orcid_author_work">
|
||||
<path start = "check_exist_on_hdfs_activities_0"/>
|
||||
<path start = "check_exist_on_hdfs_activities_1"/>
|
||||
<path start = "check_exist_on_hdfs_activities_2"/>
|
||||
<path start = "check_exist_on_hdfs_activities_3"/>
|
||||
<path start = "check_exist_on_hdfs_activities_4"/>
|
||||
<path start = "check_exist_on_hdfs_activities_5"/>
|
||||
<path start = "check_exist_on_hdfs_activities_6"/>
|
||||
<path start = "check_exist_on_hdfs_activities_7"/>
|
||||
<path start = "check_exist_on_hdfs_activities_8"/>
|
||||
<path start = "check_exist_on_hdfs_activities_9"/>
|
||||
<path start = "check_exist_on_hdfs_activities_X"/>
|
||||
</fork>
|
||||
|
||||
<decision name="check_exist_on_hdfs_activities_0">
|
||||
<switch>
|
||||
<case to="GenOrcidAuthorWork_0">
|
||||
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_0.tar.gz'))}
|
||||
</case>
|
||||
<default to="Download_0" />
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<action name="Download_0">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>bash</exec>
|
||||
<argument>-c</argument>
|
||||
<argument>${shell_cmd_0}</argument>
|
||||
<capture-output/>
|
||||
</shell>
|
||||
<ok to="GenOrcidAuthorWork_0"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="GenOrcidAuthorWork_0">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||
<arg>-w</arg><arg>${workingPath_activities}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2019_activites_0.tar.gz</arg>
|
||||
<arg>-ow</arg><arg>no_doi_works/works_0.seq</arg>
|
||||
</java>
|
||||
<ok to="join_node"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<decision name="check_exist_on_hdfs_activities_1">
|
||||
<switch>
|
||||
<case to="GenOrcidAuthorWork_1">
|
||||
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_1.tar.gz'))}
|
||||
</case>
|
||||
<default to="Download_1" />
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<action name="Download_1">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>bash</exec>
|
||||
<argument>-c</argument>
|
||||
<argument>${shell_cmd_1}</argument>
|
||||
<capture-output/>
|
||||
</shell>
|
||||
<ok to="GenOrcidAuthorWork_1"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="GenOrcidAuthorWork_1">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||
<arg>-w</arg><arg>${workingPath_activities}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2019_activites_1.tar.gz</arg>
|
||||
<arg>-ow</arg><arg>no_doi_works/works_1.seq</arg>
|
||||
</java>
|
||||
<ok to="join_node"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<decision name="check_exist_on_hdfs_activities_2">
|
||||
<switch>
|
||||
<case to="GenOrcidAuthorWork_2">
|
||||
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_2.tar.gz'))}
|
||||
</case>
|
||||
<default to="Download_2" />
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<action name="Download_2">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>bash</exec>
|
||||
<argument>-c</argument>
|
||||
<argument>${shell_cmd_2}</argument>
|
||||
<capture-output/>
|
||||
</shell>
|
||||
<ok to="GenOrcidAuthorWork_2"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="GenOrcidAuthorWork_2">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||
<arg>-w</arg><arg>${workingPath_activities}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2019_activites_2.tar.gz</arg>
|
||||
<arg>-ow</arg><arg>no_doi_works/works_2.seq</arg>
|
||||
</java>
|
||||
<ok to="join_node"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<decision name="check_exist_on_hdfs_activities_3">
|
||||
<switch>
|
||||
<case to="GenOrcidAuthorWork_3">
|
||||
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_3.tar.gz'))}
|
||||
</case>
|
||||
<default to="Download_3" />
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<action name="Download_3">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>bash</exec>
|
||||
<argument>-c</argument>
|
||||
<argument>${shell_cmd_3}</argument>
|
||||
<capture-output/>
|
||||
</shell>
|
||||
<ok to="GenOrcidAuthorWork_3"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="GenOrcidAuthorWork_3">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||
<arg>-w</arg><arg>${workingPath_activities}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2019_activites_3.tar.gz</arg>
|
||||
<arg>-ow</arg><arg>no_doi_works/works_3.seq</arg>
|
||||
</java>
|
||||
<ok to="join_node"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<decision name="check_exist_on_hdfs_activities_4">
|
||||
<switch>
|
||||
<case to="GenOrcidAuthorWork_4">
|
||||
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_4.tar.gz'))}
|
||||
</case>
|
||||
<default to="Download_4" />
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<action name="Download_4">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>bash</exec>
|
||||
<argument>-c</argument>
|
||||
<argument>${shell_cmd_4}</argument>
|
||||
<capture-output/>
|
||||
</shell>
|
||||
<ok to="GenOrcidAuthorWork_4"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="GenOrcidAuthorWork_4">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||
<arg>-w</arg><arg>${workingPath_activities}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2019_activites_4.tar.gz</arg>
|
||||
<arg>-ow</arg><arg>no_doi_works/works_4.seq</arg>
|
||||
</java>
|
||||
<ok to="join_node"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<decision name="check_exist_on_hdfs_activities_5">
|
||||
<switch>
|
||||
<case to="GenOrcidAuthorWork_5">
|
||||
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_5.tar.gz'))}
|
||||
</case>
|
||||
<default to="Download_5" />
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<action name="Download_5">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>bash</exec>
|
||||
<argument>-c</argument>
|
||||
<argument>${shell_cmd_5}</argument>
|
||||
<capture-output/>
|
||||
</shell>
|
||||
<ok to="GenOrcidAuthorWork_5"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="GenOrcidAuthorWork_5">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||
<arg>-w</arg><arg>${workingPath_activities}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2019_activites_5.tar.gz</arg>
|
||||
<arg>-ow</arg><arg>no_doi_works/works_5.seq</arg>
|
||||
</java>
|
||||
<ok to="join_node"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<decision name="check_exist_on_hdfs_activities_6">
|
||||
<switch>
|
||||
<case to="GenOrcidAuthorWork_6">
|
||||
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_6.tar.gz'))}
|
||||
</case>
|
||||
<default to="Download_6" />
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<action name="Download_6">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>bash</exec>
|
||||
<argument>-c</argument>
|
||||
<argument>${shell_cmd_6}</argument>
|
||||
<capture-output/>
|
||||
</shell>
|
||||
<ok to="GenOrcidAuthorWork_6"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="GenOrcidAuthorWork_6">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||
<arg>-w</arg><arg>${workingPath_activities}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2019_activites_6.tar.gz</arg>
|
||||
<arg>-ow</arg><arg>no_doi_works/works_6.seq</arg>
|
||||
</java>
|
||||
<ok to="join_node"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<decision name="check_exist_on_hdfs_activities_7">
|
||||
<switch>
|
||||
<case to="GenOrcidAuthorWork_7">
|
||||
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_7.tar.gz'))}
|
||||
</case>
|
||||
<default to="Download_7" />
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<action name="Download_7">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>bash</exec>
|
||||
<argument>-c</argument>
|
||||
<argument>${shell_cmd_7}</argument>
|
||||
<capture-output/>
|
||||
</shell>
|
||||
<ok to="GenOrcidAuthorWork_7"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="GenOrcidAuthorWork_7">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||
<arg>-w</arg><arg>${workingPath_activities}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2019_activites_7.tar.gz</arg>
|
||||
<arg>-ow</arg><arg>no_doi_works/works_7.seq</arg>
|
||||
</java>
|
||||
<ok to="join_node"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<decision name="check_exist_on_hdfs_activities_8">
|
||||
<switch>
|
||||
<case to="GenOrcidAuthorWork_8">
|
||||
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_8.tar.gz'))}
|
||||
</case>
|
||||
<default to="Download_8" />
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<action name="Download_8">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>bash</exec>
|
||||
<argument>-c</argument>
|
||||
<argument>${shell_cmd_8}</argument>
|
||||
<capture-output/>
|
||||
</shell>
|
||||
<ok to="GenOrcidAuthorWork_8"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="GenOrcidAuthorWork_8">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||
<arg>-w</arg><arg>${workingPath_activities}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2019_activites_8.tar.gz</arg>
|
||||
<arg>-ow</arg><arg>no_doi_works/works_8.seq</arg>
|
||||
</java>
|
||||
<ok to="join_node"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<decision name="check_exist_on_hdfs_activities_9">
|
||||
<switch>
|
||||
<case to="GenOrcidAuthorWork_9">
|
||||
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_9.tar.gz'))}
|
||||
</case>
|
||||
<default to="Download_9" />
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<action name="Download_9">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>bash</exec>
|
||||
<argument>-c</argument>
|
||||
<argument>${shell_cmd_9}</argument>
|
||||
<capture-output/>
|
||||
</shell>
|
||||
<ok to="GenOrcidAuthorWork_9"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="GenOrcidAuthorWork_9">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||
<arg>-w</arg><arg>${workingPath_activities}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2019_activites_9.tar.gz</arg>
|
||||
<arg>-ow</arg><arg>no_doi_works/works_9.seq</arg>
|
||||
</java>
|
||||
<ok to="join_node"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<decision name="check_exist_on_hdfs_activities_X">
|
||||
<switch>
|
||||
<case to="GenOrcidAuthorWork_X">
|
||||
${fs:exists(concat(workingPath_activities,'/ORCID_2019_activites_X.tar.gz'))}
|
||||
</case>
|
||||
<default to="Download_X" />
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<action name="Download_X">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>bash</exec>
|
||||
<argument>-c</argument>
|
||||
<argument>${shell_cmd_X}</argument>
|
||||
<capture-output/>
|
||||
</shell>
|
||||
<ok to="GenOrcidAuthorWork_X"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="GenOrcidAuthorWork_X">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||
<arg>-w</arg><arg>${workingPath_activities}/</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-f</arg><arg>ORCID_2019_activites_X.tar.gz</arg>
|
||||
<arg>-ow</arg><arg>no_doi_works/works_X.seq</arg>
|
||||
</java>
|
||||
<ok to="join_node"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<join name = "join_node" to = "Gen_Enriched_Orcid_Works"/>
|
||||
|
||||
<action name="Gen_Enriched_Orcid_Works">
    <spark xmlns="uri:oozie:spark-action:0.2">
        <job-tracker>${jobTracker}</job-tracker>
        <name-node>${nameNode}</name-node>
        <master>yarn</master>
        <mode>cluster</mode>
        <name>Gen_Enriched_Orcid_Works</name>
        <class>eu.dnetlib.doiboost.orcidnodoi.SparkGenEnrichedOrcidWorks</class>
        <jar>dhp-doiboost-1.2.3-SNAPSHOT.jar</jar>
        <!-- NOTE(review): sparkExecutorMemory, sparkExecutorCores and sparkDriverMemory are not
             declared in this workflow's parameters; presumably supplied by the job properties. -->
        <spark-opts>--num-executors 10 --conf spark.yarn.jars="hdfs://hadoop-rm1.garr-pa1.d4science.org:8020/user/oozie/share/lib/lib_20180405103059/spark2" --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory}
        </spark-opts>
        <!-- The working path must be the one the GenOrcidAuthorWork_* actions write under, so that
             no_doi_works/ below resolves to the sequence files they produced. It is the only
             working-path parameter this workflow declares. -->
        <arg>-w</arg><arg>${workingPath_activities}/</arg>
        <arg>-ow</arg><arg>no_doi_works/</arg>
        <arg>-oew</arg><arg>no_doi_enriched_works/</arg>
    </spark>
    <ok to="End"/>
    <error to="Kill"/>
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -0,0 +1,7 @@
|
|||
[
|
||||
{"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true},
|
||||
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the default work path", "paramRequired": true},
|
||||
{"paramName":"f", "paramLongName":"activitiesFileNameTarGz", "paramDescription": "the name of the activities orcid file", "paramRequired": true},
|
||||
{"paramName":"ow", "paramLongName":"outputWorksPath", "paramDescription": "the relative folder of the sequential file to write", "paramRequired": true},
|
||||
{"paramName":"oew", "paramLongName":"outputEnrichedWorksPath", "paramDescription": "the relative folder of the sequential file to write the data", "paramRequired": true}
|
||||
]
|
|
@ -1,15 +1,12 @@
|
|||
|
||||
package eu.dnetlib.doiboost.orcidnodoi.xml;
|
||||
|
||||
import com.ximpleware.NavException;
|
||||
import com.ximpleware.ParseException;
|
||||
import com.ximpleware.XPathEvalException;
|
||||
import com.ximpleware.XPathParseException;
|
||||
import eu.dnetlib.dhp.parser.utility.VtdException;
|
||||
import eu.dnetlib.doiboost.orcid.model.AuthorData;
|
||||
import eu.dnetlib.doiboost.orcidnodoi.model.Contributor;
|
||||
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
|
||||
import jdk.nashorn.internal.ir.annotations.Ignore;
|
||||
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.text.Normalizer;
|
||||
import java.util.*;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.text.similarity.JaccardSimilarity;
|
||||
import org.apache.commons.text.similarity.JaroWinklerSimilarity;
|
||||
|
@ -17,11 +14,20 @@ import org.junit.jupiter.api.Test;
|
|||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.text.Normalizer;
|
||||
import java.util.*;
|
||||
import com.google.gson.Gson;
|
||||
import com.google.gson.GsonBuilder;
|
||||
import com.ximpleware.NavException;
|
||||
import com.ximpleware.ParseException;
|
||||
import com.ximpleware.XPathEvalException;
|
||||
import com.ximpleware.XPathParseException;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
||||
import eu.dnetlib.dhp.parser.utility.VtdException;
|
||||
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||
import eu.dnetlib.doiboost.orcid.model.AuthorData;
|
||||
import eu.dnetlib.doiboost.orcidnodoi.model.Contributor;
|
||||
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
|
||||
import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher;
|
||||
import jdk.nashorn.internal.ir.annotations.Ignore;
|
||||
|
||||
public class OrcidNoDoiTest {
|
||||
|
||||
|
@ -33,100 +39,10 @@ public class OrcidNoDoiTest {
|
|||
String nameB = "K";
|
||||
String surnameB = "Abdel-Dayem";
|
||||
String orcidIdA = "0000-0003-2760-1191";
|
||||
Double threshold = 0.8;
|
||||
|
||||
@Test
|
||||
@Ignore
|
||||
private void similarityTest() throws Exception {
|
||||
logger.info("running testSimilarity ....");
|
||||
logger
|
||||
.info(
|
||||
"JaroWinklerSimilarity: "
|
||||
+ Double.toString(similarityJaroWinkler(nameA, surnameA, nameB, surnameB)));
|
||||
logger
|
||||
.info(
|
||||
"JaccardSimilarity: " + Double.toString(similarityJaccard(nameA, surnameA, nameB, surnameB)));
|
||||
}
|
||||
|
||||
@Test
|
||||
@Ignore
|
||||
private void bestMatchTest() throws Exception {
|
||||
logger.info("running bestMatchTest ....");
|
||||
String contributor = surnameB + ", " + nameB;
|
||||
logger.info("score: " + Double.toString(bestMatch(surnameA, nameA, contributor)));
|
||||
}
|
||||
|
||||
private static Double bestMatch(String authorSurname, String authorName, String contributor) {
|
||||
logger.debug(authorSurname + " " + authorName + " vs " + contributor);
|
||||
String[] contributorSplitted = contributor.split(" ");
|
||||
if (contributorSplitted.length == 0) {
|
||||
return 0.0;
|
||||
}
|
||||
final String contributorName = contributorSplitted[contributorSplitted.length - 1];
|
||||
String contributorSurname = "";
|
||||
if (contributorSplitted.length > 1) {
|
||||
StringJoiner joiner = new StringJoiner(" ");
|
||||
for (int i = 0; i < contributorSplitted.length - 1; i++) {
|
||||
joiner.add(contributorSplitted[i]);
|
||||
}
|
||||
contributorSurname = joiner.toString();
|
||||
}
|
||||
logger
|
||||
.debug(
|
||||
"contributorName: " + contributorName +
|
||||
" contributorSurname: " + contributorSurname);
|
||||
String authorNameNrm = normalize(authorName);
|
||||
String authorSurnameNrm = normalize(authorSurname);
|
||||
String contributorNameNrm = normalize(contributorName);
|
||||
String contributorSurnameNrm = normalize(contributorSurname);
|
||||
Double sm1 = similarity(authorNameNrm, authorSurnameNrm, contributorNameNrm, contributorSurnameNrm);
|
||||
Double sm2 = similarity(authorNameNrm, authorSurnameNrm, contributorSurnameNrm, contributorNameNrm);
|
||||
if (sm1.compareTo(sm2) >= 0) {
|
||||
return sm1;
|
||||
}
|
||||
return sm2;
|
||||
}
|
||||
|
||||
private static Double similarity(String nameA, String surnameA, String nameB, String surnameB) {
|
||||
Double score = similarityJaroWinkler(nameA, surnameA, nameB, surnameB);
|
||||
logger
|
||||
.debug(nameA + ", " + surnameA + " <> " + nameB + ", " + surnameB + " score: " + Double.toString(score));
|
||||
return score;
|
||||
}
|
||||
|
||||
private static Double similarityJaccard(String nameA, String surnameA, String nameB, String surnameB) {
|
||||
return new JaccardSimilarity().apply(normalize(parse(nameA, surnameA)), normalize(parse(nameB, surnameB)));
|
||||
}
|
||||
|
||||
private static Double similarityJaroWinkler(String nameA, String surnameA, String nameB, String surnameB) {
|
||||
return new JaroWinklerSimilarity().apply(normalize(parse(nameA, surnameA)), normalize(parse(nameB, surnameB)));
|
||||
}
|
||||
|
||||
private static String parse(String name, String surname) {
|
||||
return surname + " " + name;
|
||||
}
|
||||
|
||||
private static String normalize(final String s) {
|
||||
return nfd(s)
|
||||
.toLowerCase()
|
||||
// do not compact the regexes in a single expression, would cause StackOverflowError
|
||||
// in case
|
||||
// of large input strings
|
||||
.replaceAll("(\\W)+", " ")
|
||||
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
|
||||
.replaceAll("(\\p{Punct})+", " ")
|
||||
.replaceAll("(\\d)+", " ")
|
||||
.replaceAll("(\\n)+", " ")
|
||||
.trim();
|
||||
}
|
||||
|
||||
private static String nfd(final String s) {
|
||||
return Normalizer.normalize(s, Normalizer.Form.NFD);
|
||||
}
|
||||
|
||||
@Test
|
||||
@Ignore
|
||||
public void readPublicationFieldsTest()
|
||||
private void readPublicationFieldsTest()
|
||||
throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException {
|
||||
logger.info("running loadPublicationFieldsTest ....");
|
||||
String xml = IOUtils
|
||||
|
@ -178,78 +94,10 @@ public class OrcidNoDoiTest {
|
|||
|
||||
}
|
||||
|
||||
private void updateRanks(List<Contributor> contributors) {
|
||||
boolean seqFound = false;
|
||||
if (contributors
|
||||
.stream()
|
||||
.filter(
|
||||
c -> c.getRole() != null && c.getSequence() != null &&
|
||||
c.getRole().equals("author") && (c.getSequence().equals("first") ||
|
||||
c.getSequence().equals("additional")))
|
||||
.count() > 0) {
|
||||
seqFound = true;
|
||||
logger.info("sequence data found");
|
||||
}
|
||||
if (!seqFound) {
|
||||
List<Integer> seqIds = Arrays.asList(0);
|
||||
contributors.forEach(c -> {
|
||||
int currentSeq = seqIds.get(0) + 1;
|
||||
seqIds.set(0, currentSeq);
|
||||
c.setSequence(Integer.toString(seqIds.get(0)));
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
private void updateAuthorsSimpleMatch(List<Contributor> contributors, AuthorData author) {
|
||||
contributors.forEach(c -> {
|
||||
if (c.isSimpleMatch()) {
|
||||
logger.info("simple match on : " + c.getCreditName());
|
||||
c.setName(author.getName());
|
||||
c.setSurname(author.getSurname());
|
||||
c.setOid(author.getOid());
|
||||
}
|
||||
});
|
||||
updateRanks(contributors);
|
||||
}
|
||||
|
||||
private void updateAuthorsSimilarityMatch(List<Contributor> contributors, AuthorData author) {
|
||||
logger.info("inside updateAuthorsSimilarityMatch ...");
|
||||
contributors.forEach(c -> {
|
||||
logger
|
||||
.info(
|
||||
c.getOid() + " - " + c.getCreditName() + " - " +
|
||||
c.getName() + " - " + c.getSurname() + " - " +
|
||||
c.getRole() + " - " + c.getSequence() + " - best: " + c.isBestMatch() + " - simpe: "
|
||||
+ c.isSimpleMatch());
|
||||
});
|
||||
|
||||
contributors
|
||||
.stream()
|
||||
.filter(c -> c.isBestMatch())
|
||||
.forEach(c -> {
|
||||
logger.info("similarity match on : " + c.getCreditName());
|
||||
c.setName(author.getName());
|
||||
c.setSurname(author.getSurname());
|
||||
c.setOid(author.getOid());
|
||||
});
|
||||
updateRanks(contributors);
|
||||
}
|
||||
|
||||
@Test
|
||||
@Ignore
|
||||
public void authorSimilarityMatchTest() throws Exception {
|
||||
logger.info("running authorSimilarityMatchTest ....");
|
||||
authorMatchTest("activity_work_0000-0003-2760-1191-similarity.xml");
|
||||
}
|
||||
|
||||
@Test
|
||||
private void authorSimpleMatchTest() throws Exception {
|
||||
public void authorMatchTest() throws Exception {
|
||||
logger.info("running authorSimpleMatchTest ....");
|
||||
authorMatchTest("activity_work_0000-0003-2760-1191.xml");
|
||||
}
|
||||
|
||||
private void authorMatchTest(String orcidWork)
|
||||
throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException {
|
||||
String orcidWork = "activity_work_0000-0003-2760-1191-similarity.xml";
|
||||
AuthorData author = new AuthorData();
|
||||
author.setName(nameA);
|
||||
author.setSurname(surnameA);
|
||||
|
@ -272,55 +120,9 @@ public class OrcidNoDoiTest {
|
|||
logger.error("parsing xml", e);
|
||||
}
|
||||
assertNotNull(workData);
|
||||
int matchCounter = 0;
|
||||
List<Integer> matchCounters = Arrays.asList(matchCounter);
|
||||
Contributor contributor = null;
|
||||
workData.getContributors().forEach(c -> {
|
||||
if (normalize(c.getCreditName()).contains(normalize(author.getName())) ||
|
||||
normalize(c.getCreditName()).contains(normalize(author.getSurname())) ||
|
||||
((author.getOtherName() != null)
|
||||
&& normalize(c.getCreditName()).contains(normalize(author.getOtherName())))) {
|
||||
matchCounters.set(0, matchCounters.get(0) + 1);
|
||||
c.setSimpleMatch(true);
|
||||
}
|
||||
});
|
||||
logger.info("match counter: " + Integer.toString(matchCounters.get(0)));
|
||||
if (matchCounters.get(0) == 1) {
|
||||
updateAuthorsSimpleMatch(workData.getContributors(), author);
|
||||
} else if (matchCounters.get(0) > 1) {
|
||||
Optional<Contributor> optCon = workData
|
||||
.getContributors()
|
||||
.stream()
|
||||
.filter(c -> c.isSimpleMatch())
|
||||
.map(c -> {
|
||||
c.setScore(bestMatch(nameA, surnameA, c.getCreditName()));
|
||||
logger.debug("nella map: " + c.getCreditName() + " score: " + c.getScore());
|
||||
return c;
|
||||
})
|
||||
.filter(c -> c.getScore() >= threshold)
|
||||
.max(Comparator.comparing(c -> c.getScore()));
|
||||
Contributor bestMatchContributor = null;
|
||||
if (optCon.isPresent()) {
|
||||
bestMatchContributor = optCon.get();
|
||||
bestMatchContributor.setBestMatch(true);
|
||||
logger.info("best match: " + bestMatchContributor.getCreditName());
|
||||
updateAuthorsSimilarityMatch(workData.getContributors(), author);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
logger.info("UPDATED contributors: ");
|
||||
workData.getContributors().forEach(c -> {
|
||||
logger
|
||||
.info(
|
||||
c.getOid() + " - " + c.getCreditName() + " - " +
|
||||
c.getName() + " - " + c.getSurname() + " - " +
|
||||
c.getRole() + " - " + c.getSequence());
|
||||
});
|
||||
AuthorMatcher.match(author, workData.getContributors());
|
||||
GsonBuilder builder = new GsonBuilder();
|
||||
Gson gson = builder.create();
|
||||
logger.info(gson.toJson(workData));
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// orcid_RDD = sc.textFile(ORCID_DUMP_PATH)
|
||||
// no_doi_works_RDD = orcid_RDD.map(orcid_map).filter(lambda x:x is not None).map(lambda x: json.dumps(x)).saveAsTextFile(path=ORCID_OPENAIRE_PATH,compressionCodecClass="org.apache.hadoop.io.compress.GzipCodec")
|
||||
//
|
Loading…
Reference in New Issue