resolved conflicts

pull/73/head
Claudio Atzori 3 years ago
commit 12e2f930c8

@ -0,0 +1,30 @@
package eu.dnetlib.dhp.common;
import java.util.Map;
import com.google.common.collect.Maps;
public class Constants {
public static final Map<String, String> accessRightsCoarMap = Maps.newHashMap();
public static final Map<String, String> coarCodeLabelMap = Maps.newHashMap();
public static String COAR_ACCESS_RIGHT_SCHEMA = "http://vocabularies.coar-repositories.org/documentation/access_rights/";
static {
accessRightsCoarMap.put("OPEN", "c_abf2");
accessRightsCoarMap.put("RESTRICTED", "c_16ec");
accessRightsCoarMap.put("OPEN SOURCE", "c_abf2");
accessRightsCoarMap.put("CLOSED", "c_14cb");
accessRightsCoarMap.put("EMBARGO", "c_f1cf");
}
static {
coarCodeLabelMap.put("c_abf2", "OPEN");
coarCodeLabelMap.put("c_16ec", "RESTRICTED");
coarCodeLabelMap.put("c_14cb", "CLOSED");
coarCodeLabelMap.put("c_f1cf", "EMBARGO");
}
}

@ -0,0 +1,412 @@
package eu.dnetlib.dhp.common;
import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.dump.oaf.*;
import eu.dnetlib.dhp.schema.dump.oaf.community.CommunityInstance;
import eu.dnetlib.dhp.schema.dump.oaf.community.CommunityResult;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.Journal;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class GraphResultMapper implements Serializable {
public static <E extends eu.dnetlib.dhp.schema.oaf.OafEntity> Result map(
E in) {
CommunityResult out = new CommunityResult();
eu.dnetlib.dhp.schema.oaf.Result input = (eu.dnetlib.dhp.schema.oaf.Result) in;
Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> ort = Optional.ofNullable(input.getResulttype());
if (ort.isPresent()) {
switch (ort.get().getClassid()) {
case "publication":
Optional<Journal> journal = Optional
.ofNullable(((eu.dnetlib.dhp.schema.oaf.Publication) input).getJournal());
if (journal.isPresent()) {
Journal j = journal.get();
Container c = new Container();
c.setConferencedate(j.getConferencedate());
c.setConferenceplace(j.getConferenceplace());
c.setEdition(j.getEdition());
c.setEp(j.getEp());
c.setIss(j.getIss());
c.setIssnLinking(j.getIssnLinking());
c.setIssnOnline(j.getIssnOnline());
c.setIssnPrinted(j.getIssnPrinted());
c.setName(j.getName());
c.setSp(j.getSp());
c.setVol(j.getVol());
out.setContainer(c);
out.setType(ModelConstants.PUBLICATION_DEFAULT_RESULTTYPE.getClassname());
}
break;
case "dataset":
eu.dnetlib.dhp.schema.oaf.Dataset id = (eu.dnetlib.dhp.schema.oaf.Dataset) input;
Optional.ofNullable(id.getSize()).ifPresent(v -> out.setSize(v.getValue()));
Optional.ofNullable(id.getVersion()).ifPresent(v -> out.setVersion(v.getValue()));
out
.setGeolocation(
Optional
.ofNullable(id.getGeolocation())
.map(
igl -> igl
.stream()
.filter(Objects::nonNull)
.map(gli -> {
GeoLocation gl = new GeoLocation();
gl.setBox(gli.getBox());
gl.setPlace(gli.getPlace());
gl.setPoint(gli.getPoint());
return gl;
})
.collect(Collectors.toList()))
.orElse(null));
out.setType(ModelConstants.DATASET_DEFAULT_RESULTTYPE.getClassname());
break;
case "software":
eu.dnetlib.dhp.schema.oaf.Software is = (eu.dnetlib.dhp.schema.oaf.Software) input;
Optional
.ofNullable(is.getCodeRepositoryUrl())
.ifPresent(value -> out.setCodeRepositoryUrl(value.getValue()));
Optional
.ofNullable(is.getDocumentationUrl())
.ifPresent(
value -> out
.setDocumentationUrl(
value
.stream()
.map(v -> v.getValue())
.collect(Collectors.toList())));
Optional
.ofNullable(is.getProgrammingLanguage())
.ifPresent(value -> out.setProgrammingLanguage(value.getClassid()));
out.setType(ModelConstants.SOFTWARE_DEFAULT_RESULTTYPE.getClassname());
break;
case "other":
eu.dnetlib.dhp.schema.oaf.OtherResearchProduct ir = (eu.dnetlib.dhp.schema.oaf.OtherResearchProduct) input;
out
.setContactgroup(
Optional
.ofNullable(ir.getContactgroup())
.map(value -> value.stream().map(cg -> cg.getValue()).collect(Collectors.toList()))
.orElse(null));
out
.setContactperson(
Optional
.ofNullable(ir.getContactperson())
.map(value -> value.stream().map(cp -> cp.getValue()).collect(Collectors.toList()))
.orElse(null));
out
.setTool(
Optional
.ofNullable(ir.getTool())
.map(value -> value.stream().map(t -> t.getValue()).collect(Collectors.toList()))
.orElse(null));
out.setType(ModelConstants.ORP_DEFAULT_RESULTTYPE.getClassname());
break;
}
Optional
.ofNullable(input.getAuthor())
.ifPresent(ats -> out.setAuthor(ats.stream().map(at -> getAuthor(at)).collect(Collectors.toList())));
// I do not map Access Right UNKNOWN or OTHER
Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> oar = Optional.ofNullable(input.getBestaccessright());
if (oar.isPresent()) {
if (Constants.accessRightsCoarMap.containsKey(oar.get().getClassid())) {
String code = Constants.accessRightsCoarMap.get(oar.get().getClassid());
out
.setBestaccessright(
AccessRight
.newInstance(
code,
Constants.coarCodeLabelMap.get(code),
Constants.COAR_ACCESS_RIGHT_SCHEMA));
}
}
final List<String> contributorList = new ArrayList<>();
Optional
.ofNullable(input.getContributor())
.ifPresent(value -> value.stream().forEach(c -> contributorList.add(c.getValue())));
out.setContributor(contributorList);
Optional
.ofNullable(input.getCountry())
.ifPresent(
value -> out
.setCountry(
value
.stream()
.map(
c -> {
if (c.getClassid().equals((ModelConstants.UNKNOWN))) {
return null;
}
Country country = new Country();
country.setCode(c.getClassid());
country.setLabel(c.getClassname());
Optional
.ofNullable(c.getDataInfo())
.ifPresent(
provenance -> country
.setProvenance(
Provenance
.newInstance(
provenance
.getProvenanceaction()
.getClassname(),
c.getDataInfo().getTrust())));
return country;
})
.filter(Objects::nonNull)
.collect(Collectors.toList())));
final List<String> coverageList = new ArrayList<>();
Optional
.ofNullable(input.getCoverage())
.ifPresent(value -> value.stream().forEach(c -> coverageList.add(c.getValue())));
out.setCoverage(coverageList);
out.setDateofcollection(input.getDateofcollection());
final List<String> descriptionList = new ArrayList<>();
Optional
.ofNullable(input.getDescription())
.ifPresent(value -> value.forEach(d -> descriptionList.add(d.getValue())));
out.setDescription(descriptionList);
Optional<Field<String>> oStr = Optional.ofNullable(input.getEmbargoenddate());
if (oStr.isPresent()) {
out.setEmbargoenddate(oStr.get().getValue());
}
final List<String> formatList = new ArrayList<>();
Optional
.ofNullable(input.getFormat())
.ifPresent(value -> value.stream().forEach(f -> formatList.add(f.getValue())));
out.setFormat(formatList);
out.setId(input.getId());
out.setOriginalId(input.getOriginalId());
Optional<List<eu.dnetlib.dhp.schema.oaf.Instance>> oInst = Optional
.ofNullable(input.getInstance());
if (oInst.isPresent()) {
out
.setInstance(
oInst.get().stream().map(i -> getInstance(i)).collect(Collectors.toList()));
}
Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> oL = Optional.ofNullable(input.getLanguage());
if (oL.isPresent()) {
eu.dnetlib.dhp.schema.oaf.Qualifier language = oL.get();
out.setLanguage(Qualifier.newInstance(language.getClassid(), language.getClassname()));
}
Optional<Long> oLong = Optional.ofNullable(input.getLastupdatetimestamp());
if (oLong.isPresent()) {
out.setLastupdatetimestamp(oLong.get());
}
Optional<List<StructuredProperty>> otitle = Optional.ofNullable(input.getTitle());
if (otitle.isPresent()) {
List<StructuredProperty> iTitle = otitle
.get()
.stream()
.filter(t -> t.getQualifier().getClassid().equalsIgnoreCase("main title"))
.collect(Collectors.toList());
if (iTitle.size() > 0) {
out.setMaintitle(iTitle.get(0).getValue());
}
iTitle = otitle
.get()
.stream()
.filter(t -> t.getQualifier().getClassid().equalsIgnoreCase("subtitle"))
.collect(Collectors.toList());
if (iTitle.size() > 0) {
out.setSubtitle(iTitle.get(0).getValue());
}
}
List<ControlledField> pids = new ArrayList<>();
Optional
.ofNullable(input.getPid())
.ifPresent(
value -> value
.stream()
.forEach(
p -> pids
.add(
ControlledField
.newInstance(p.getQualifier().getClassid(), p.getValue()))));
out.setPid(pids);
oStr = Optional.ofNullable(input.getDateofacceptance());
if (oStr.isPresent()) {
out.setPublicationdate(oStr.get().getValue());
}
oStr = Optional.ofNullable(input.getPublisher());
if (oStr.isPresent()) {
out.setPublisher(oStr.get().getValue());
}
List<String> sourceList = new ArrayList<>();
Optional
.ofNullable(input.getSource())
.ifPresent(value -> value.stream().forEach(s -> sourceList.add(s.getValue())));
// out.setSource(input.getSource().stream().map(s -> s.getValue()).collect(Collectors.toList()));
List<Subject> subjectList = new ArrayList<>();
Optional
.ofNullable(input.getSubject())
.ifPresent(
value -> value
.forEach(s -> subjectList.add(getSubject(s))));
out.setSubjects(subjectList);
out.setType(input.getResulttype().getClassid());
}
out
.setCollectedfrom(
input
.getCollectedfrom()
.stream()
.map(cf -> KeyValue.newInstance(cf.getKey(), cf.getValue()))
.collect(Collectors.toList()));
return out;
}
private static CommunityInstance getInstance(eu.dnetlib.dhp.schema.oaf.Instance i) {
CommunityInstance instance = new CommunityInstance();
setCommonValue(i, instance);
instance
.setCollectedfrom(
KeyValue
.newInstance(i.getCollectedfrom().getKey(), i.getCollectedfrom().getValue()));
instance
.setHostedby(
KeyValue.newInstance(i.getHostedby().getKey(), i.getHostedby().getValue()));
return instance;
}
private static <I extends Instance> void setCommonValue(eu.dnetlib.dhp.schema.oaf.Instance i, I instance) {
Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> opAr = Optional
.ofNullable(i.getAccessright());
if (opAr.isPresent()) {
if (Constants.accessRightsCoarMap.containsKey(opAr.get().getClassid())) {
String code = Constants.accessRightsCoarMap.get(opAr.get().getClassid());
instance
.setAccessright(
AccessRight
.newInstance(
code,
Constants.coarCodeLabelMap.get(code),
Constants.COAR_ACCESS_RIGHT_SCHEMA));
}
}
Optional
.ofNullable(i.getLicense())
.ifPresent(value -> instance.setLicense(value.getValue()));
Optional
.ofNullable(i.getDateofacceptance())
.ifPresent(value -> instance.setPublicationdate(value.getValue()));
Optional
.ofNullable(i.getRefereed())
.ifPresent(value -> instance.setRefereed(value.getClassname()));
Optional
.ofNullable(i.getInstancetype())
.ifPresent(value -> instance.setType(value.getClassname()));
Optional.ofNullable(i.getUrl()).ifPresent(value -> instance.setUrl(value));
}
private static Subject getSubject(StructuredProperty s) {
Subject subject = new Subject();
subject.setSubject(ControlledField.newInstance(s.getQualifier().getClassid(), s.getValue()));
Optional<DataInfo> di = Optional.ofNullable(s.getDataInfo());
if (di.isPresent()) {
Provenance p = new Provenance();
p.setProvenance(di.get().getProvenanceaction().getClassname());
p.setTrust(di.get().getTrust());
subject.setProvenance(p);
}
return subject;
}
private static Author getAuthor(eu.dnetlib.dhp.schema.oaf.Author oa) {
Author a = new Author();
a.setFullname(oa.getFullname());
a.setName(oa.getName());
a.setSurname(oa.getSurname());
a.setRank(oa.getRank());
Optional<List<StructuredProperty>> oPids = Optional
.ofNullable(oa.getPid());
if (oPids.isPresent()) {
Pid pid = getOrcid(oPids.get());
if (pid != null) {
a.setPid(pid);
}
}
return a;
}
private static Pid getOrcid(List<StructuredProperty> p) {
for (StructuredProperty pid : p) {
if (pid.getQualifier().getClassid().equals(ModelConstants.ORCID)) {
Optional<DataInfo> di = Optional.ofNullable(pid.getDataInfo());
if (di.isPresent()) {
return Pid
.newInstance(
ControlledField
.newInstance(
pid.getQualifier().getClassid(),
pid.getValue()),
Provenance
.newInstance(
di.get().getProvenanceaction().getClassname(),
di.get().getTrust()));
} else {
return Pid
.newInstance(
ControlledField
.newInstance(
pid.getQualifier().getClassid(),
pid.getValue())
);
}
}
}
return null;
}
}

@ -90,9 +90,6 @@ public class MakeTarArchive implements Serializable {
String p_string = p.toString();
if (!p_string.endsWith("_SUCCESS")) {
String name = p_string.substring(p_string.lastIndexOf("/") + 1);
if (name.trim().equalsIgnoreCase("communities_infrastructures")) {
name = "communities_infrastructures.json";
}
TarArchiveEntry entry = new TarArchiveEntry(dir_name + "/" + name);
entry.setSize(fileStatus.getLen());
current_size += fileStatus.getLen();

@ -6,7 +6,7 @@
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp</artifactId>
<version>1.2.4-SNAPSHOT</version>
<relativePath>../</relativePath>
<relativePath>../pom.xml</relativePath>
</parent>
<artifactId>dhp-schemas</artifactId>

@ -1,6 +1,7 @@
package eu.dnetlib.dhp.schema.oaf;
import java.io.Serializable;
import java.util.List;
import com.google.common.base.Objects;
@ -8,7 +9,7 @@ import com.google.common.base.Objects;
/**
* Represent a measure, must be further described by a system available resource providing name and descriptions.
*/
public class Measure {
public class Measure implements Serializable {
/**
* Unique measure identifier.
@ -16,7 +17,7 @@ public class Measure {
private String id;
/**
* List of units associated with this measure. KeyValue provides a pair to store the laber (key) and the value, plus
* List of units associated with this measure. KeyValue provides a pair to store the label (key) and the value, plus
* common provenance information.
*/
private List<KeyValue> unit;

@ -2,8 +2,12 @@
package eu.dnetlib.dhp.schema.oaf;
import java.io.Serializable;
import java.util.Collection;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.stream.Collectors;
import java.util.stream.Stream;
public abstract class Oaf implements Serializable {
@ -40,9 +44,36 @@ public abstract class Oaf implements Serializable {
this.lastupdatetimestamp = lastupdatetimestamp;
}
public void mergeOAFDataInfo(Oaf e) {
if (e.getDataInfo() != null && compareTrust(this, e) < 0)
dataInfo = e.getDataInfo();
public void mergeFrom(Oaf o) {
if (Objects.isNull(o)) {
return;
}
setCollectedfrom(
Stream
.concat(
Optional
.ofNullable(getCollectedfrom())
.map(Collection::stream)
.orElse(Stream.empty()),
Optional
.ofNullable(o.getCollectedfrom())
.map(Collection::stream)
.orElse(Stream.empty()))
.distinct() // relies on KeyValue.equals
.collect(Collectors.toList()));
mergeOAFDataInfo(o);
setLastupdatetimestamp(
Math
.max(
Optional.ofNullable(getLastupdatetimestamp()).orElse(0L),
Optional.ofNullable(o.getLastupdatetimestamp()).orElse(0L)));
}
public void mergeOAFDataInfo(Oaf o) {
if (o.getDataInfo() != null && compareTrust(this, o) < 0)
dataInfo = o.getDataInfo();
}
protected String extractTrust(Oaf e) {
@ -62,7 +93,7 @@ public abstract class Oaf implements Serializable {
if (o == null || getClass() != o.getClass())
return false;
Oaf oaf = (Oaf) o;
return Objects.equals(dataInfo, oaf.dataInfo)
return Objects.equals(getDataInfo(), oaf.getDataInfo())
&& Objects.equals(lastupdatetimestamp, oaf.lastupdatetimestamp);
}

@ -78,14 +78,10 @@ public abstract class OafEntity extends Oaf implements Serializable {
}
public void mergeFrom(OafEntity e) {
if (e == null)
return;
super.mergeFrom(e);
originalId = mergeLists(originalId, e.getOriginalId());
collectedfrom = mergeLists(collectedfrom, e.getCollectedfrom());
pid = mergeLists(pid, e.getPid());
if (e.getDateofcollection() != null && compareTrust(this, e) < 0)

@ -130,19 +130,7 @@ public class Relation extends Oaf {
Objects.equals(getSubRelType(), r.getSubRelType()), "subRelType(s) must be equal");
checkArgument(Objects.equals(getRelClass(), r.getRelClass()), "relClass(es) must be equal");
setCollectedfrom(
Stream
.concat(
Optional
.ofNullable(getCollectedfrom())
.map(Collection::stream)
.orElse(Stream.empty()),
Optional
.ofNullable(r.getCollectedfrom())
.map(Collection::stream)
.orElse(Stream.empty()))
.distinct() // relies on KeyValue.equals
.collect(Collectors.toList()));
super.mergeFrom(r);
}
@Override

@ -1,7 +1,14 @@
package eu.dnetlib.doiboost.orcid.model;
package eu.dnetlib.dhp.schema.orcid;
import java.io.Serializable;
import java.util.List;
import com.google.common.collect.Lists;
/**
* This class models the data that are retrieved from orcid publication
*/
public class AuthorData implements Serializable {
@ -10,6 +17,7 @@ public class AuthorData implements Serializable {
private String surname;
private String creditName;
private String errorCode;
private List<String> otherNames;
public String getErrorCode() {
return errorCode;
@ -50,4 +58,15 @@ public class AuthorData implements Serializable {
public void setOid(String oid) {
this.oid = oid;
}
public List<String> getOtherNames() {
return otherNames;
}
public void setOtherNames(List<String> otherNames) {
if (this.otherNames == null) {
this.otherNames = Lists.newArrayList();
}
this.otherNames = otherNames;
}
}

@ -0,0 +1,25 @@
package eu.dnetlib.dhp.schema.orcid;
import java.util.List;
public class OrcidDOI {
private String doi;
private List<AuthorData> authors;
public String getDoi() {
return doi;
}
public void setDoi(String doi) {
this.doi = doi;
}
public List<AuthorData> getAuthors() {
return authors;
}
public void setAuthors(List<AuthorData> authors) {
this.authors = authors;
}
}

@ -0,0 +1,28 @@
package eu.dnetlib.dhp.actionmanager.bipfinder;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
/**
* Class that maps the model of the bipFinder! input data.
* Only needed for deserialization purposes
*/
public class BipDeserialize extends HashMap<String, List<Score>> implements Serializable {
public BipDeserialize() {
super();
}
public List<Score> get(String key) {
if (super.get(key) == null) {
return new ArrayList<>();
}
return super.get(key);
}
}

@ -0,0 +1,30 @@
package eu.dnetlib.dhp.actionmanager.bipfinder;
import java.io.Serializable;
import java.util.List;
/**
* Rewriting of the bipFinder input data by extracting the identifier of the result (doi)
*/
public class BipScore implements Serializable {
private String id; //doi
private List<Score> scoreList; //unit as given in the inputfile
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public List<Score> getScoreList() {
return scoreList;
}
public void setScoreList(List<Score> scoreList) {
this.scoreList = scoreList;
}
}

@ -0,0 +1,85 @@
package eu.dnetlib.dhp.actionmanager.bipfinder;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.schema.oaf.Result;
/**
* Just collects all the atomic actions produced for the different results and saves them in
* outputpath for the ActionSet
*/
public class CollectAndSave implements Serializable {
private static final Logger log = LoggerFactory.getLogger(CollectAndSave.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static <I extends Result> void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
CollectAndSave.class
.getResourceAsStream(
"/eu/dnetlib/dhp/actionmanager/bipfinder/input_actionset_parameter.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("inputPath");
log.info("inputPath {}: ", inputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath {}: ", outputPath);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
removeOutputDir(spark, outputPath);
collectAndSave(spark, inputPath, outputPath);
});
}
private static void collectAndSave(SparkSession spark, String inputPath, String outputPath) {
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
sc
.sequenceFile(inputPath + "/publication", Text.class, Text.class)
.union(sc.sequenceFile(inputPath + "/dataset", Text.class, Text.class))
.union(sc.sequenceFile(inputPath + "/otherresearchproduct", Text.class, Text.class))
.union(sc.sequenceFile(inputPath + "/software", Text.class, Text.class))
.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
;
}
private static void removeOutputDir(SparkSession spark, String path) {
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
}
}

@ -0,0 +1,26 @@
package eu.dnetlib.dhp.actionmanager.bipfinder;
import java.io.Serializable;
public class KeyValue implements Serializable {
private String key;
private String value;
public String getKey() {
return key;
}
public void setKey(String key) {
this.key = key;
}
public String getValue() {
return value;
}
public void setValue(String value) {
this.value = value;
}
}

@ -0,0 +1,28 @@
package eu.dnetlib.dhp.actionmanager.bipfinder;
import java.io.Serializable;
/**
* Subset of the information of the generic results that are needed to create the atomic action
*/
public class PreparedResult implements Serializable {
private String id; // openaire id
private String value; // doi
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public String getValue() {
return value;
}
public void setValue(String value) {
this.value = value;
}
}

@ -0,0 +1,30 @@
package eu.dnetlib.dhp.actionmanager.bipfinder;
import java.io.Serializable;
import java.util.List;
/**
* represents the score in the input file
*/
public class Score implements Serializable {
private String id;
private List<KeyValue> unit;
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public List<KeyValue> getUnit() {
return unit;
}
public void setUnit(List<KeyValue> unit) {
this.unit = unit;
}
}

@ -0,0 +1,200 @@
package eu.dnetlib.dhp.actionmanager.bipfinder;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import scala.Tuple2;
/**
* created the Atomic Action for each tipe of results
*/
public class SparkAtomicActionScoreJob implements Serializable {
private static String DOI = "doi";
private static final Logger log = LoggerFactory.getLogger(SparkAtomicActionScoreJob.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static <I extends Result> void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
SparkAtomicActionScoreJob.class
.getResourceAsStream(
"/eu/dnetlib/dhp/actionmanager/bipfinder/input_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("inputPath");
log.info("inputPath {}: ", inputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath {}: ", outputPath);
final String bipScorePath = parser.get("bipScorePath");
log.info("bipScorePath: {}", bipScorePath);
final String resultClassName = parser.get("resultTableName");
log.info("resultTableName: {}", resultClassName);
Class<I> inputClazz = (Class<I>) Class.forName(resultClassName);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
removeOutputDir(spark, outputPath);
prepareResults(spark, inputPath, outputPath, bipScorePath, inputClazz);
});
}
private static <I extends Result> void prepareResults(SparkSession spark, String inputPath, String outputPath,
String bipScorePath, Class<I> inputClazz) {
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
JavaRDD<BipDeserialize> bipDeserializeJavaRDD = sc
.textFile(bipScorePath)
.map(item -> OBJECT_MAPPER.readValue(item, BipDeserialize.class));
Dataset<BipScore> bipScores = spark
.createDataset(bipDeserializeJavaRDD.flatMap(entry -> entry.keySet().stream().map(key -> {
BipScore bs = new BipScore();
bs.setId(key);
bs.setScoreList(entry.get(key));
return bs;
}).collect(Collectors.toList()).iterator()).rdd(), Encoders.bean(BipScore.class));
System.out.println(bipScores.count());
Dataset<I> results = readPath(spark, inputPath, inputClazz);
results.createOrReplaceTempView("result");
Dataset<PreparedResult> preparedResult = spark
.sql(
"select pIde.value value, id " +
"from result " +
"lateral view explode (pid) p as pIde " +
"where dataInfo.deletedbyinference = false and pIde.qualifier.classid = '" + DOI + "'")
.as(Encoders.bean(PreparedResult.class));
bipScores
.joinWith(
preparedResult, bipScores.col("id").equalTo(preparedResult.col("value")),
"inner")
.map((MapFunction<Tuple2<BipScore, PreparedResult>, BipScore>) value -> {
BipScore ret = value._1();
ret.setId(value._2().getId());
return ret;
}, Encoders.bean(BipScore.class))
.groupByKey((MapFunction<BipScore, String>) value -> value.getId(), Encoders.STRING())
.mapGroups((MapGroupsFunction<String, BipScore, I>) (k, it) -> {
Result ret = inputClazz.newInstance();
BipScore first = it.next();
ret.setId(first.getId());
ret.setMeasures(getMeasure(first));
it.forEachRemaining(value -> ret.getMeasures().addAll(getMeasure(value)));
return (I) ret;
}, Encoders.bean(inputClazz))
.toJavaRDD()
.map(p -> new AtomicAction(inputClazz, p))
.mapToPair(
aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
new Text(OBJECT_MAPPER.writeValueAsString(aa))))
.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
}
private static List<Measure> getMeasure(BipScore value) {
return value
.getScoreList()
.stream()
.map(score -> {
Measure m = new Measure();
m.setId(score.getId());
m
.setUnit(
score
.getUnit()
.stream()
.map(unit -> {
KeyValue kv = new KeyValue();
kv.setValue(unit.getValue());
kv.setKey(unit.getKey());
kv.setDataInfo(getDataInfo());
return kv;
})
.collect(Collectors.toList()));
return m;
})
.collect(Collectors.toList());
}
private static DataInfo getDataInfo() {
DataInfo di = new DataInfo();
di.setInferred(false);
di.setInvisible(false);
di.setDeletedbyinference(false);
di.setTrust("");
Qualifier qualifier = new Qualifier();
qualifier.setClassid("sysimport:actionset");
qualifier.setClassname("Harvested");
qualifier.setSchemename("dnet:provenanceActions");
qualifier.setSchemeid("dnet:provenanceActions");
di.setProvenanceaction(qualifier);
return di;
}
private static void removeOutputDir(SparkSession spark, String path) {
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
}
public static <R> Dataset<R> readPath(
SparkSession spark, String inputPath, Class<R> clazz) {
return spark
.read()
.textFile(inputPath)
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
}
}

@ -0,0 +1,20 @@
[
{
"paramName": "issm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "when true will stop SparkSession after job execution",
"paramRequired": false
},
{
"paramName": "ip",
"paramLongName": "inputPath",
"paramDescription": "the URL from where to get the programme file",
"paramRequired": true
},
{
"paramName": "o",
"paramLongName": "outputPath",
"paramDescription": "the path of the new ActionSet",
"paramRequired": true
}
]

@ -0,0 +1,32 @@
[
{
"paramName": "issm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "when true will stop SparkSession after job execution",
"paramRequired": false
},
{
"paramName": "ip",
"paramLongName": "inputPath",
"paramDescription": "the URL from where to get the programme file",
"paramRequired": true
},
{
"paramName": "o",
"paramLongName": "outputPath",
"paramDescription": "the path of the new ActionSet",
"paramRequired": true
},
{
"paramName": "rtn",
"paramLongName": "resultTableName",
"paramDescription": "the path of the new ActionSet",
"paramRequired": true
},
{
"paramName": "bsp",
"paramLongName": "bipScorePath",
"paramDescription": "the path of the new ActionSet",
"paramRequired": true
}
]

@ -1,11 +1,11 @@
<configuration>
<property>
<name>jobTracker</name>
<value>hadoop-rm3.garr-pa1.d4science.org:8032</value>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://hadoop-rm1.garr-pa1.d4science.org:8020</value>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
@ -15,28 +15,44 @@
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
<property>
<name>hive_metastore_uris</name>
<value>thrift://hadoop-edge2.garr-pa1.d4science.org:9083</value>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<value>http://hadoop-edge1.garr-pa1.d4science.org:18089/</value>
<value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
<property>
<name>sparkExecutorNumber</name>
<value>4</value>
</property>
<property>
<name>spark2EventLogDir</name>
<value>/user/spark/spark2ApplicationHistory</value>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>"com.cloudera.spark.lineage.NavigatorAppListener"</value>
<name>sparkDriverMemory</name>
<value>15G</value>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value>
<name>sparkExecutorMemory</name>
<value>6G</value>
</property>
<property>
<name>sparkExecutorCores</name>
<value>1</value>
</property>
</configuration>

@ -0,0 +1,171 @@
<workflow-app name="BipFinderScore" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>inputPath</name>
<description>the input path of the resources to be extended</description>
</property>
<property>
<name>bipScorePath</name>
<description>the path where to find the bipFinder scores</description>
</property>
<property>
<name>outputPath</name>
<description>the path where to store the actionset</description>
</property>
</parameters>
<start to="deleteoutputpath"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="deleteoutputpath">
<fs>
<delete path='${outputPath}'/>
<mkdir path='${outputPath}'/>
<delete path='${workingDir}'/>
<mkdir path='${workingDir}'/>
</fs>
<ok to="atomicactions"/>
<error to="Kill"/>
</action>
<fork name="atomicactions">
<path start="atomicactions_publication"/>
<path start="atomicactions_dataset"/>
<path start="atomicactions_orp"/>
<path start="atomicactions_software"/>
</fork>
<action name="atomicactions_publication">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Produces the atomic action with the bip finder scores for publications</name>
<class>eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--inputPath</arg><arg>${inputPath}/publication</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--outputPath</arg><arg>${workingDir}/publication</arg>
<arg>--bipScorePath</arg><arg>${bipScorePath}</arg>
</spark>
<ok to="join_aa"/>
<error to="Kill"/>
</action>
<action name="atomicactions_dataset">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Produces the atomic action with the bip finder scores for datasets</name>
<class>eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--inputPath</arg><arg>${inputPath}/dataset</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--outputPath</arg><arg>${workingDir}/dataset</arg>
<arg>--bipScorePath</arg><arg>${bipScorePath}</arg>
</spark>
<ok to="join_aa"/>
<error to="Kill"/>
</action>
<action name="atomicactions_orp">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Produces the atomic action with the bip finder scores for orp</name>
<class>eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--inputPath</arg><arg>${inputPath}/otherresearchproduct</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--outputPath</arg><arg>${workingDir}/otherresearchproduct</arg>
<arg>--bipScorePath</arg><arg>${bipScorePath}</arg>
</spark>
<ok to="join_aa"/>
<error to="Kill"/>
</action>
<action name="atomicactions_software">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Produces the atomic action with the bip finder scores for software</name>
<class>eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--inputPath</arg><arg>${inputPath}/software</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--outputPath</arg><arg>${workingDir}/software</arg>
<arg>--bipScorePath</arg><arg>${bipScorePath}</arg>
</spark>
<ok to="join_aa"/>
<error to="Kill"/>
</action>
<join name="join_aa" to="collectandsave"/>
<action name="collectandsave">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>saves all the aa produced for the several types of results in the as output path</name>
<class>eu.dnetlib.dhp.actionmanager.bipfinder.CollectAndSave</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--inputPath</arg><arg>${workingDir}</arg>
<arg>--outputPath</arg><arg>${outputPath}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

@ -0,0 +1,331 @@
package eu.dnetlib.dhp.actionmanager.bipfinder;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.ForeachFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import scala.Tuple2;
public class SparkAtomicActionScoreJobTest {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static SparkSession spark;
private static Path workingDir;
private static final Logger log = LoggerFactory
.getLogger(SparkAtomicActionScoreJobTest.class);
@BeforeAll
public static void beforeAll() throws IOException {
workingDir = Files
.createTempDirectory(SparkAtomicActionScoreJobTest.class.getSimpleName());
log.info("using work dir {}", workingDir);
SparkConf conf = new SparkConf();
conf.setAppName(SparkAtomicActionScoreJobTest.class.getSimpleName());
conf.setMaster("local[*]");
conf.set("spark.driver.host", "localhost");
conf.set("hive.metastore.local", "true");
conf.set("spark.ui.enabled", "false");
conf.set("spark.sql.warehouse.dir", workingDir.toString());
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
spark = SparkSession
.builder()
.appName(SparkAtomicActionScoreJobTest.class.getSimpleName())
.config(conf)
.getOrCreate();
}
@AfterAll
public static void afterAll() throws IOException {
FileUtils.deleteDirectory(workingDir.toFile());
spark.stop();
}
@Test
public void matchOne() throws Exception {
String bipScoresPath = getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/bipfinder/bip_scores.json")
.getPath();
String inputPath = getClass()
.getResource(
"/eu/dnetlib/dhp/actionmanager/bipfinder/publication.json")
.getPath();
SparkAtomicActionScoreJob
.main(
new String[] {
"-isSparkSessionManaged",
Boolean.FALSE.toString(),
"-inputPath",
inputPath,
"-bipScorePath",
bipScoresPath,
"-resultTableName",
"eu.dnetlib.dhp.schema.oaf.Publication",
"-outputPath",
workingDir.toString() + "/actionSet"
});
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
JavaRDD<Publication> tmp = sc
.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
.map(aa -> ((Publication) aa.getPayload()));
Assertions.assertTrue(tmp.count() == 1);
Dataset<Publication> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(Publication.class));
verificationDataset.createOrReplaceTempView("publication");
Dataset<Row> execVerification = spark
.sql(
"Select p.id oaid, mes.id, mUnit.value from publication p " +
"lateral view explode(measures) m as mes " +
"lateral view explode(mes.unit) u as mUnit ");
Assertions.assertEquals(2, execVerification.count());
Assertions
.assertEquals(
"50|355e65625b88::ffa5bad14f4adc0c9a15c00efbbccddb",
execVerification.select("oaid").collectAsList().get(0).getString(0));
Assertions
.assertEquals(
"1.47565045883e-08",
execVerification.filter("id = 'influence'").select("value").collectAsList().get(0).getString(0));
Assertions
.assertEquals(
"0.227515392",
execVerification.filter("id = 'popularity'").select("value").collectAsList().get(0).getString(0));
}
@Test
public void matchOneWithTwo() throws Exception {
String bipScoresPath = getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/bipfinder/bip_scores.json")
.getPath();
String inputPath = getClass()
.getResource(
"/eu/dnetlib/dhp/actionmanager/bipfinder/publication_2.json")
.getPath();
SparkAtomicActionScoreJob
.main(
new String[] {
"-isSparkSessionManaged",
Boolean.FALSE.toString(),
"-inputPath",
inputPath,
"-bipScorePath",
bipScoresPath,
"-resultTableName",
"eu.dnetlib.dhp.schema.oaf.Publication",
"-outputPath",
workingDir.toString() + "/actionSet"
});
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
JavaRDD<Publication> tmp = sc
.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
.map(aa -> ((Publication) aa.getPayload()));
Assertions.assertTrue(tmp.count() == 1);
Dataset<Publication> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(Publication.class));
verificationDataset.createOrReplaceTempView("publication");
Dataset<Row> execVerification = spark
.sql(
"Select p.id oaid, mes.id, mUnit.value from publication p " +
"lateral view explode(measures) m as mes " +
"lateral view explode(mes.unit) u as mUnit ");
Assertions.assertEquals(4, execVerification.count());
Assertions
.assertEquals(
"50|355e65625b88::ffa5bad14f4adc0c9a15c00efbbccddb",
execVerification.select("oaid").collectAsList().get(0).getString(0));
Assertions
.assertEquals(
2,
execVerification.filter("id = 'influence'").count());
Assertions
.assertEquals(
2,
execVerification.filter("id = 'popularity'").count());
List<Row> tmp_ds = execVerification.filter("id = 'influence'").select("value").collectAsList();
String tmp_influence = tmp_ds.get(0).getString(0);
Assertions
.assertTrue(
"1.47565045883e-08".equals(tmp_influence) ||
"1.98956540239e-08".equals(tmp_influence));
tmp_influence = tmp_ds.get(1).getString(0);
Assertions
.assertTrue(
"1.47565045883e-08".equals(tmp_influence) ||
"1.98956540239e-08".equals(tmp_influence));
Assertions.assertTrue(!tmp_ds.get(0).getString(0).equals(tmp_ds.get(1).getString(0)));
}
@Test
public void matchTwo() throws Exception {
String bipScoresPath = getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/bipfinder/bip_scores.json")
.getPath();
String inputPath = getClass()
.getResource(
"/eu/dnetlib/dhp/actionmanager/bipfinder/publication_3.json")
.getPath();
SparkAtomicActionScoreJob
.main(
new String[] {
"-isSparkSessionManaged",
Boolean.FALSE.toString(),
"-inputPath",
inputPath,
"-bipScorePath",
bipScoresPath,
"-resultTableName",
"eu.dnetlib.dhp.schema.oaf.Publication",
"-outputPath",
workingDir.toString() + "/actionSet"
});
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
JavaRDD<Publication> tmp = sc
.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
.map(aa -> ((Publication) aa.getPayload()));
Assertions.assertTrue(tmp.count() == 2);
Dataset<Publication> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(Publication.class));
verificationDataset.createOrReplaceTempView("publication");
Dataset<Row> execVerification = spark
.sql(
"Select p.id oaid, mes.id, mUnit.value from publication p " +
"lateral view explode(measures) m as mes " +
"lateral view explode(mes.unit) u as mUnit ");
Assertions.assertEquals(4, execVerification.count());
Assertions
.assertEquals(
2,
execVerification.filter("oaid = '50|355e65625b88::ffa5bad14f4adc0c9a15c00efbbccddb'").count());
Assertions
.assertEquals(
2,
execVerification.filter("oaid = '50|acm_________::faed5b7a1bd8f51118d13ed29cfaee09'").count());
Assertions
.assertEquals(
2,
execVerification.filter("id = 'influence'").count());
Assertions
.assertEquals(
2,
execVerification.filter("id = 'popularity'").count());
Assertions
.assertEquals(
"1.47565045883e-08",
execVerification
.filter(
"oaid = '50|355e65625b88::ffa5bad14f4adc0c9a15c00efbbccddb' " +
"and id = 'influence'")
.select("value")
.collectAsList()
.get(0)
.getString(0));
Assertions
.assertEquals(
"1.98956540239e-08",
execVerification
.filter(
"oaid = '50|acm_________::faed5b7a1bd8f51118d13ed29cfaee09' " +
"and id = 'influence'")
.select("value")
.collectAsList()
.get(0)
.getString(0));
Assertions
.assertEquals(
"0.282046161584",
execVerification
.filter(
"oaid = '50|acm_________::faed5b7a1bd8f51118d13ed29cfaee09' " +
"and id = 'popularity'")
.select("value")
.collectAsList()
.get(0)
.getString(0));
Assertions
.assertEquals(
"0.227515392",
execVerification
.filter(
"oaid = '50|355e65625b88::ffa5bad14f4adc0c9a15c00efbbccddb' " +
"and id = 'popularity'")
.select("value")
.collectAsList()
.get(0)
.getString(0));
}
}

@ -4,8 +4,13 @@ package eu.dnetlib.dhp.broker.oa;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
@ -13,6 +18,8 @@ import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.slf4j.Logger;
@ -29,7 +36,7 @@ import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;
public class PartitionEventsByDsIdJob {
private static final Logger log = LoggerFactory.getLogger(PartitionEventsByDsIdJob.class);
private static final String OPENDOAR_NSPREFIX = "opendoar____::";
private static final String OPENDOAR_NSPREFIX = "10|opendoar____::";
public static void main(final String[] args) throws Exception {
@ -37,7 +44,7 @@ public class PartitionEventsByDsIdJob {
IOUtils
.toString(
PartitionEventsByDsIdJob.class
.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json")));
.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/od_partitions_params.json")));
parser.parseArgument(args);
final Boolean isSparkSessionManaged = Optional
@ -54,14 +61,32 @@ public class PartitionEventsByDsIdJob {
final String partitionPath = parser.get("workingPath") + "/eventsByOpendoarId";
log.info("partitionPath: {}", partitionPath);
final String opendoarIds = parser.get("opendoarIds");
log.info("opendoarIds: {}", opendoarIds);
final Set<String> validOpendoarIds = new HashSet<>();
if (!opendoarIds.trim().equals("-")) {
validOpendoarIds
.addAll(
Arrays
.stream(opendoarIds.split(","))
.map(String::trim)
.filter(StringUtils::isNotBlank)
.map(s -> OPENDOAR_NSPREFIX + DigestUtils.md5Hex(s))
.collect(Collectors.toSet()));
}
log.info("validOpendoarIds: {}", validOpendoarIds);
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
ClusterUtils
.readPath(spark, eventsPath, Event.class)
.filter(e -> StringUtils.isNotBlank(e.getMap().getTargetDatasourceId()))
.filter(e -> e.getMap().getTargetDatasourceId().contains(OPENDOAR_NSPREFIX))
.limit(10000)
.map(e -> messageFromNotification(e), Encoders.bean(ShortEventMessageWithGroupId.class))
.filter((FilterFunction<Event>) e -> StringUtils.isNotBlank(e.getMap().getTargetDatasourceId()))
.filter((FilterFunction<Event>) e -> e.getMap().getTargetDatasourceId().startsWith(OPENDOAR_NSPREFIX))
.filter((FilterFunction<Event>) e -> validOpendoarIds.contains(e.getMap().getTargetDatasourceId()))
.map(
(MapFunction<Event, ShortEventMessageWithGroupId>) e -> messageFromNotification(e),
Encoders.bean(ShortEventMessageWithGroupId.class))
.coalesce(1)
.write()
.partitionBy("group")

@ -5,12 +5,16 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
public class EnrichMissingAbstract extends UpdateMatcher<String> {
private static final int MIN_LENGTH = 200;
public EnrichMissingAbstract() {
super(1,
s -> Topic.ENRICH_MISSING_ABSTRACT,
@ -21,10 +25,15 @@ public class EnrichMissingAbstract extends UpdateMatcher<String> {
@Override
protected List<String> findDifferences(final OaBrokerMainEntity source, final OaBrokerMainEntity target) {
if (isMissing(target.getAbstracts()) && !isMissing(source.getAbstracts())) {
return Arrays.asList(source.getAbstracts().get(0));
} else {
return new ArrayList<>();
return source
.getAbstracts()
.stream()
.filter(s -> StringUtils.normalizeSpace(s).length() >= MIN_LENGTH)
.map(Arrays::asList)
.findFirst()
.orElse(new ArrayList<>());
}
return new ArrayList<>();
}
}

@ -0,0 +1,14 @@
[
{
"paramName": "o",
"paramLongName": "workingPath",
"paramDescription": "the path where the temporary data will be stored",
"paramRequired": true
},
{
"paramName": "list",
"paramLongName": "opendoarIds",
"paramDescription": "the opendoar IDs whitelist (comma separated)",
"paramRequired": true
}
]

@ -1,60 +1,13 @@
<workflow-app name="create broker events - partial" xmlns="uri:oozie:workflow:0.5">
<workflow-app name="partitionEventsByOpendoarIds" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>graphInputPath</name>
<description>the path where the graph is stored</description>
<name>opendoarIds</name>
<description>the opendoar IDs whitelist (comma separated)</description>
</property>
<property>
<name>workingPath</name>
<description>the path where the the generated data will be stored</description>
</property>
<property>
<name>datasourceIdWhitelist</name>
<value>-</value>
<description>a white list (comma separeted, - for empty list) of datasource ids</description>
</property>
<property>
<name>datasourceTypeWhitelist</name>
<value>-</value>
<description>a white list (comma separeted, - for empty list) of datasource types</description>
</property>
<property>
<name>datasourceIdBlacklist</name>
<value>-</value>
<description>a black list (comma separeted, - for empty list) of datasource ids</description>
</property>
<property>
<name>esEventIndexName</name>
<description>the elasticsearch index name for events</description>
</property>
<property>
<name>esNotificationsIndexName</name>
<description>the elasticsearch index name for notifications</description>
</property>
<property>
<name>esIndexHost</name>
<description>the elasticsearch host</description>
</property>
<property>
<name>maxIndexedEventsForDsAndTopic</name>
<description>the max number of events for each couple (ds/topic)</description>
</property>
<property>
<name>brokerApiBaseUrl</name>
<description>the url of the broker service api</description>
</property>
<property>
<name>brokerDbUrl</name>
<description>the url of the broker database</description>
</property>
<property>
<name>brokerDbUser</name>
<description>the user of the broker database</description>
</property>
<property>
<name>brokerDbPassword</name>
<description>the password of the broker database</description>
</property>
<property>
<name>sparkDriverMemory</name>
@ -111,13 +64,13 @@
</configuration>
</global>
<start to="partition"/>
<start to="opendoarPartition"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="partition">
<action name="opendoarPartition">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
@ -134,8 +87,8 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--opendoarIds</arg><arg>${opendoarIds}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>

@ -14,7 +14,7 @@
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>4.0.1</version>
<version>${net.alchim31.maven.version}</version>
<executions>
<execution>
<id>scala-compile-first</id>
@ -51,7 +51,6 @@
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.3.4</version>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
@ -84,6 +83,11 @@
<artifactId>spark-sql_2.11</artifactId>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-text</artifactId>
</dependency>
</dependencies>

@ -62,7 +62,7 @@ object SparkGenerateDoiBoost {
val orcidPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/orcidPublication").as[Publication].map(p => (p.getId, p))
fj.joinWith(orcidPublication, fj("_1").equalTo(orcidPublication("_1")), "left").map(applyMerge).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/secondJoin")
logger.info("Phase 3) Join Result with MAG")
logger.info("Phase 4) Join Result with MAG")
val sj: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/secondJoin").as[Publication].map(p => (p.getId, p))
val magPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/magPublication").as[Publication].map(p => (p.getId, p))

@ -15,7 +15,7 @@ import scala.collection.JavaConverters._
import scala.collection.mutable
import scala.util.matching.Regex
case class CrossrefDT(doi: String, json:String) {}
case class CrossrefDT(doi: String, json:String, timestamp: Long) {}
case class mappingAffiliation(name: String) {}
@ -206,7 +206,7 @@ case object Crossref2Oaf {
a.setSurname(family)
a.setFullname(s"$given $family")
if (StringUtils.isNotBlank(orcid))
a.setPid(List(createSP(orcid, ORCID, PID_TYPES)).asJava)
a.setPid(List(createSP(orcid, ORCID, PID_TYPES, generateDataInfo())).asJava)
a
}
@ -254,7 +254,7 @@ case object Crossref2Oaf {
def snsfRule(award:String): String = {
var tmp1 = StringUtils.substringAfter(award,"_")
val tmp1 = StringUtils.substringAfter(award,"_")
val tmp2 = StringUtils.substringBefore(tmp1,"/")
logger.debug(s"From $award to $tmp2")
tmp2
@ -271,18 +271,20 @@ case object Crossref2Oaf {
}
def generateRelation(sourceId:String, targetId:String, nsPrefix:String) :Relation = {
def generateRelation(sourceId:String, targetId:String, relClass:String) :Relation = {
val r = new Relation
r.setSource(sourceId)
r.setTarget(s"40|$nsPrefix::$targetId")
r.setTarget(targetId)
r.setRelType("resultProject")
r.setRelClass("isProducedBy")
r.setRelClass(relClass)
r.setSubRelType("outcome")
r.setCollectedfrom(List(cf).asJava)
r.setDataInfo(di)
r.setLastupdatetimestamp(ts)
r
}
@ -290,12 +292,18 @@ case object Crossref2Oaf {
if (funder.award.isDefined && funder.award.get.nonEmpty)
funder.award.get.map(extractField).filter(a => a!= null && a.nonEmpty).foreach(
award => {
val targetId = DHPUtils.md5(award)
queue += generateRelation(sourceId, targetId, nsPrefix)
val targetId = getProjectId(nsPrefix, DHPUtils.md5(award))
queue += generateRelation(sourceId, targetId , "isProducedBy")
queue += generateRelation(targetId , sourceId, "produces")
}
)
}
def getProjectId (nsPrefix:String, targetId:String):String = {
s"40|$nsPrefix::$targetId"
}
if (funders != null)
funders.foreach(funder => {
if (funder.DOI.isDefined && funder.DOI.get.nonEmpty) {
@ -316,22 +324,33 @@ case object Crossref2Oaf {
case "10.13039/501100002341" => generateSimpleRelationFromAward(funder, "aka_________", a => a)
case "10.13039/501100001602" => generateSimpleRelationFromAward(funder, "aka_________", a => a.replace("SFI", ""))
case "10.13039/501100000923" => generateSimpleRelationFromAward(funder, "arc_________", a => a)
case "10.13039/501100000038"=> queue += generateRelation(sourceId,"1e5e62235d094afd01cd56e65112fc63", "nserc_______" )
case "10.13039/501100000155"=> queue += generateRelation(sourceId,"1e5e62235d094afd01cd56e65112fc63", "sshrc_______" )
case "10.13039/501100000024"=> queue += generateRelation(sourceId,"1e5e62235d094afd01cd56e65112fc63", "cihr________" )
case "10.13039/501100000038"=> val targetId = getProjectId("nserc_______" , "1e5e62235d094afd01cd56e65112fc63")
queue += generateRelation(sourceId, targetId, "isProducedBy" )
queue += generateRelation(targetId, sourceId, "produces" )
case "10.13039/501100000155"=> val targetId = getProjectId("sshrc_______" , "1e5e62235d094afd01cd56e65112fc63")
queue += generateRelation(sourceId,targetId, "isProducedBy" )
queue += generateRelation(targetId,sourceId, "produces" )
case "10.13039/501100000024"=> val targetId = getProjectId("cihr________" , "1e5e62235d094afd01cd56e65112fc63")
queue += generateRelation(sourceId,targetId, "isProducedBy" )
queue += generateRelation(targetId,sourceId, "produces" )
case "10.13039/501100002848" => generateSimpleRelationFromAward(funder, "conicytf____", a => a)
case "10.13039/501100003448" => generateSimpleRelationFromAward(funder, "gsrt________", extractECAward)
case "10.13039/501100010198" => generateSimpleRelationFromAward(funder, "sgov________", a=>a)
case "10.13039/501100004564" => generateSimpleRelationFromAward(funder, "mestd_______", extractECAward)
case "10.13039/501100003407" => generateSimpleRelationFromAward(funder, "miur________", a=>a)
queue += generateRelation(sourceId,"1e5e62235d094afd01cd56e65112fc63", "miur________" )
val targetId = getProjectId("miur________" , "1e5e62235d094afd01cd56e65112fc63")
queue += generateRelation(sourceId,targetId, "isProducedBy" )
queue += generateRelation(targetId,sourceId, "produces" )
case "10.13039/501100006588" |
"10.13039/501100004488" => generateSimpleRelationFromAward(funder, "irb_hr______", a=>a.replaceAll("Project No.", "").replaceAll("HRZZ-","") )
case "10.13039/501100006769"=> generateSimpleRelationFromAward(funder, "rsf_________", a=>a)
case "10.13039/501100001711"=> generateSimpleRelationFromAward(funder, "snsf________", snsfRule)
case "10.13039/501100004410"=> generateSimpleRelationFromAward(funder, "tubitakf____", a =>a)
case "10.10.13039/100004440"=> generateSimpleRelationFromAward(funder, "wt__________", a =>a)
case "10.13039/100004440"=> queue += generateRelation(sourceId,"1e5e62235d094afd01cd56e65112fc63", "wt__________" )
case "10.13039/100004440"=> val targetId = getProjectId("wt__________" , "1e5e62235d094afd01cd56e65112fc63")
queue += generateRelation(sourceId,targetId, "isProducedBy" )
queue += generateRelation(targetId,sourceId, "produces" )
case _ => logger.debug("no match for "+funder.DOI.get )
@ -347,7 +366,9 @@ case object Crossref2Oaf {
case "The French National Research Agency (ANR)" |
"The French National Research Agency" => generateSimpleRelationFromAward(funder, "anr_________", a => a)
case "CONICYT, Programa de Formación de Capital Humano Avanzado" => generateSimpleRelationFromAward(funder, "conicytf____", extractECAward)
case "Wellcome Trust Masters Fellowship" => queue += generateRelation(sourceId,"1e5e62235d094afd01cd56e65112fc63", "wt__________" )
case "Wellcome Trust Masters Fellowship" => val targetId = getProjectId("wt__________", "1e5e62235d094afd01cd56e65112fc63")
queue += generateRelation(sourceId, targetId, "isProducedBy" )
queue += generateRelation(targetId, sourceId, "produces" )
case _ => logger.debug("no match for "+funder.name )
}

@ -2,6 +2,7 @@ package eu.dnetlib.doiboost.crossref
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import org.apache.commons.io.IOUtils
import org.apache.hadoop.io.{IntWritable, Text}
import org.apache.spark.SparkConf
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
@ -12,21 +13,23 @@ import org.slf4j.{Logger, LoggerFactory}
object CrossrefDataset {
val logger: Logger = LoggerFactory.getLogger(SparkMapDumpIntoOAF.getClass)
def extractTimestamp(input:String): Long = {
def to_item(input:String):CrossrefDT = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(input)
(json\"indexed"\"timestamp").extractOrElse[Long](0)
val ts:Long = (json \ "indexed" \ "timestamp").extract[Long]
val doi:String = (json \ "DOI").extract[String]
CrossrefDT(doi, input, ts)
}
def main(args: Array[String]): Unit = {
val logger: Logger = LoggerFactory.getLogger(SparkMapDumpIntoOAF.getClass)
val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(CrossrefDataset.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/crossref_to_dataset_params.json")))
parser.parseArgument(args)
@ -49,9 +52,8 @@ object CrossrefDataset {
if (a == null)
return b
val tb = extractTimestamp(b.json)
val ta = extractTimestamp(a.json)
if(ta >tb) {
if(a.timestamp >b.timestamp) {
return a
}
b
@ -63,9 +65,7 @@ object CrossrefDataset {
if (a == null)
return b
val tb = extractTimestamp(b.json)
val ta = extractTimestamp(a.json)
if(ta >tb) {
if(a.timestamp >b.timestamp) {
return a
}
b
@ -78,15 +78,21 @@ object CrossrefDataset {
override def finish(reduction: CrossrefDT): CrossrefDT = reduction
}
val sourcePath:String = parser.get("sourcePath")
val targetPath:String = parser.get("targetPath")
val workingPath:String = parser.get("workingPath")
val main_ds:Dataset[CrossrefDT] = spark.read.load(s"$workingPath/crossref_ds").as[CrossrefDT]
val ds:Dataset[CrossrefDT] = spark.read.load(sourcePath).as[CrossrefDT]
val update =
spark.createDataset(spark.sparkContext.sequenceFile(s"$workingPath/index_update", classOf[IntWritable], classOf[Text])
.map(i =>CrossrefImporter.decompressBlob(i._2.toString))
.map(i =>to_item(i)))
ds.groupByKey(_.doi)
main_ds.union(update).groupByKey(_.doi)
.agg(crossrefAggregator.toColumn)
.map(s=>s._2)
.write.mode(SaveMode.Overwrite).save(targetPath)
.write.mode(SaveMode.Overwrite).save(s"$workingPath/crossref_ds_updated")
}

@ -29,69 +29,26 @@ object SparkMapDumpIntoOAF {
.appName(SparkMapDumpIntoOAF.getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
implicit val mapEncoderRelatons: Encoder[Relation] = Encoders.kryo[Relation]
implicit val mapEncoderDatasets: Encoder[oaf.Dataset] = Encoders.kryo[OafDataset]
val sc = spark.sparkContext
val targetPath = parser.get("targetPath")
import spark.implicits._
spark.read.load(parser.get("sourcePath")).as[CrossrefDT]
.flatMap(k => Crossref2Oaf.convert(k.json))
.filter(o => o != null)
.write.mode(SaveMode.Overwrite).save(s"$targetPath/mixObject")
sc.sequenceFile(parser.get("sourcePath"), classOf[IntWritable], classOf[Text])
.map(k => k._2.toString).map(CrossrefImporter.decompressBlob)
.flatMap(k => Crossref2Oaf.convert(k)).saveAsObjectFile(s"${targetPath}/mixObject")
val inputRDD = sc.objectFile[Oaf](s"${targetPath}/mixObject").filter(p=> p!= null)
val distinctPubs:RDD[Publication] = inputRDD.filter(k => k != null && k.isInstanceOf[Publication])
.map(k => k.asInstanceOf[Publication]).map { p: Publication => Tuple2(p.getId, p) }.reduceByKey { case (p1: Publication, p2: Publication) =>
var r = if (p1 == null) p2 else p1
if (p1 != null && p2 != null) {
if (p1.getLastupdatetimestamp != null && p2.getLastupdatetimestamp != null) {
if (p1.getLastupdatetimestamp < p2.getLastupdatetimestamp)
r = p2
else
r = p1
} else {
r = if (p1.getLastupdatetimestamp == null) p2 else p1
}
}
r
}.map(_._2)
val pubs:Dataset[Publication] = spark.createDataset(distinctPubs)
pubs.write.mode(SaveMode.Overwrite).save(s"${targetPath}/publication")
val distincDatasets:RDD[OafDataset] = inputRDD.filter(k => k != null && k.isInstanceOf[OafDataset])
.map(k => k.asInstanceOf[OafDataset]).map(p => Tuple2(p.getId, p)).reduceByKey { case (p1: OafDataset, p2: OafDataset) =>
var r = if (p1 == null) p2 else p1
if (p1 != null && p2 != null) {
if (p1.getLastupdatetimestamp != null && p2.getLastupdatetimestamp != null) {
if (p1.getLastupdatetimestamp < p2.getLastupdatetimestamp)
r = p2
else
r = p1
} else {
r = if (p1.getLastupdatetimestamp == null) p2 else p1
}
}
r
}.map(_._2)
spark.createDataset(distincDatasets).write.mode(SaveMode.Overwrite).save(s"${targetPath}/dataset")
val distinctRels =inputRDD.filter(k => k != null && k.isInstanceOf[Relation])
.map(k => k.asInstanceOf[Relation]).map(r=> (s"${r.getSource}::${r.getTarget}",r))
.reduceByKey { case (p1: Relation, p2: Relation) =>
if (p1 == null) p2 else p1
}.map(_._2)
val rels: Dataset[Relation] = spark.createDataset(distinctRels)
rels.write.mode(SaveMode.Overwrite).save(s"${targetPath}/relations")
val ds:Dataset[Oaf] = spark.read.load(s"$targetPath/mixObject").as[Oaf]
ds.filter(o => o.isInstanceOf[Publication]).map(o => o.asInstanceOf[Publication]).write.mode(SaveMode.Overwrite).save(s"$targetPath/crossrefPublication")
ds.filter(o => o.isInstanceOf[Relation]).map(o => o.asInstanceOf[Relation]).write.mode(SaveMode.Overwrite).save(s"$targetPath/crossrefRelation")
ds.filter(o => o.isInstanceOf[OafDataset]).map(o => o.asInstanceOf[OafDataset]).write.mode(SaveMode.Overwrite).save(s"$targetPath/crossrefDataset")
}

@ -21,15 +21,17 @@ object SparkImportMagIntoDataset {
val stream = Map(
"Affiliations" -> Tuple2("mag/Affiliations.txt", Seq("AffiliationId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "GridId:string", "OfficialPage:string", "WikiPage:string", "PaperCount:long", "CitationCount:long", "Latitude:float?", "Longitude:float?", "CreatedDate:DateTime")),
"Authors" -> Tuple2("mag/Authors.txt", Seq("AuthorId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "LastKnownAffiliationId:long?", "PaperCount:long", "CitationCount:long", "CreatedDate:DateTime")),
"ConferenceInstances" -> Tuple2("mag/ConferenceInstances.txt", Seq("ConferenceInstanceId:long", "NormalizedName:string", "DisplayName:string", "ConferenceSeriesId:long", "Location:string", "OfficialUrl:string", "StartDate:DateTime?", "EndDate:DateTime?", "AbstractRegistrationDate:DateTime?", "SubmissionDeadlineDate:DateTime?", "NotificationDueDate:DateTime?", "FinalVersionDueDate:DateTime?", "PaperCount:long", "CitationCount:long", "Latitude:float?", "Longitude:float?", "CreatedDate:DateTime")),
"Affiliations" -> Tuple2("mag/Affiliations.txt", Seq("AffiliationId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "GridId:string", "OfficialPage:string", "WikiPage:string", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "Iso3166Code:string", "Latitude:float?", "Longitude:float?", "CreatedDate:DateTime")),
"AuthorExtendedAttributes" -> Tuple2("mag/AuthorExtendedAttributes.txt", Seq("AuthorId:long", "AttributeType:int", "AttributeValue:string")),
"Authors" -> Tuple2("mag/Authors.txt", Seq("AuthorId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "LastKnownAffiliationId:long?", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")),
"ConferenceInstances" -> Tuple2("mag/ConferenceInstances.txt", Seq("ConferenceInstanceId:long", "NormalizedName:string", "DisplayName:string", "ConferenceSeriesId:long", "Location:string", "OfficialUrl:string", "StartDate:DateTime?", "EndDate:DateTime?", "AbstractRegistrationDate:DateTime?", "SubmissionDeadlineDate:DateTime?", "NotificationDueDate:DateTime?", "FinalVersionDueDate:DateTime?", "PaperCount:long", "PaperFamilyCount:long" ,"CitationCount:long", "Latitude:float?", "Longitude:float?", "CreatedDate:DateTime")),
"ConferenceSeries" -> Tuple2("mag/ConferenceSeries.txt", Seq("ConferenceSeriesId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "PaperCount:long", "CitationCount:long", "CreatedDate:DateTime")),
"EntityRelatedEntities" -> Tuple2("advanced/EntityRelatedEntities.txt", Seq("EntityId:long", "EntityType:string", "RelatedEntityId:long", "RelatedEntityType:string", "RelatedType:int", "Score:float")),
"FieldOfStudyChildren" -> Tuple2("advanced/FieldOfStudyChildren.txt", Seq("FieldOfStudyId:long", "ChildFieldOfStudyId:long")),
"FieldOfStudyExtendedAttributes" -> Tuple2("advanced/FieldOfStudyExtendedAttributes.txt", Seq("FieldOfStudyId:long", "AttributeType:int", "AttributeValue:string")),
"FieldsOfStudy" -> Tuple2("advanced/FieldsOfStudy.txt", Seq("FieldOfStudyId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "MainType:string", "Level:int", "PaperCount:long", "CitationCount:long", "CreatedDate:DateTime")),
"Journals" -> Tuple2("mag/Journals.txt", Seq("JournalId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "Issn:string", "Publisher:string", "Webpage:string", "PaperCount:long", "CitationCount:long", "CreatedDate:DateTime")),
// ['FieldOfStudyId:long', 'Rank:uint', 'NormalizedName:string', 'DisplayName:string', 'MainType:string', 'Level:int', 'PaperCount:long', 'PaperFamilyCount:long', 'CitationCount:long', 'CreatedDate:DateTime']
"FieldsOfStudy" -> Tuple2("advanced/FieldsOfStudy.txt", Seq("FieldOfStudyId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "MainType:string", "Level:int", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")),
"Journals" -> Tuple2("mag/Journals.txt", Seq("JournalId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "Issn:string", "Publisher:string", "Webpage:string", "PaperCount:long", "PaperFamilyCount:long" ,"CitationCount:long", "CreatedDate:DateTime")),
"PaperAbstractsInvertedIndex" -> Tuple2("nlp/PaperAbstractsInvertedIndex.txt.*", Seq("PaperId:long", "IndexedAbstract:string")),
"PaperAuthorAffiliations" -> Tuple2("mag/PaperAuthorAffiliations.txt", Seq("PaperId:long", "AuthorId:long", "AffiliationId:long?", "AuthorSequenceNumber:uint", "OriginalAuthor:string", "OriginalAffiliation:string")),
"PaperCitationContexts" -> Tuple2("nlp/PaperCitationContexts.txt", Seq("PaperId:long", "PaperReferenceId:long", "CitationContext:string")),
@ -39,7 +41,7 @@ object SparkImportMagIntoDataset {
"PaperReferences" -> Tuple2("mag/PaperReferences.txt", Seq("PaperId:long", "PaperReferenceId:long")),
"PaperResources" -> Tuple2("mag/PaperResources.txt", Seq("PaperId:long", "ResourceType:int", "ResourceUrl:string", "SourceUrl:string", "RelationshipType:int")),
"PaperUrls" -> Tuple2("mag/PaperUrls.txt", Seq("PaperId:long", "SourceType:int?", "SourceUrl:string", "LanguageCode:string")),
"Papers" -> Tuple2("mag/Papers.txt", Seq("PaperId:long", "Rank:uint", "Doi:string", "DocType:string", "PaperTitle:string", "OriginalTitle:string", "BookTitle:string", "Year:int?", "Date:DateTime?", "Publisher:string", "JournalId:long?", "ConferenceSeriesId:long?", "ConferenceInstanceId:long?", "Volume:string", "Issue:string", "FirstPage:string", "LastPage:string", "ReferenceCount:long", "CitationCount:long", "EstimatedCitation:long", "OriginalVenue:string", "FamilyId:long?", "CreatedDate:DateTime")),
"Papers" -> Tuple2("mag/Papers.txt", Seq("PaperId:long", "Rank:uint", "Doi:string", "DocType:string", "PaperTitle:string", "OriginalTitle:string", "BookTitle:string", "Year:int?", "Date:DateTime?", "OnlineDate:DateTime?", "Publisher:string", "JournalId:long?", "ConferenceSeriesId:long?", "ConferenceInstanceId:long?", "Volume:string", "Issue:string", "FirstPage:string", "LastPage:string", "ReferenceCount:long", "CitationCount:long", "EstimatedCitation:long", "OriginalVenue:string", "FamilyId:long?", "FamilyRank:uint?", "CreatedDate:DateTime")),
"RelatedFieldOfStudy" -> Tuple2("advanced/RelatedFieldOfStudy.txt", Seq("FieldOfStudyId1:long", "Type1:string", "FieldOfStudyId2:long", "Type2:string", "Rank:float"))
)

@ -26,12 +26,15 @@ object SparkPreProcessMAG {
.master(parser.get("master")).getOrCreate()
val sourcePath = parser.get("sourcePath")
val workingPath = parser.get("workingPath")
val targetPath = parser.get("targetPath")
import spark.implicits._
implicit val mapEncoderPubs: Encoder[Publication] = org.apache.spark.sql.Encoders.kryo[Publication]
implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPubs)
logger.info("Phase 1) make uninque DOI in Papers:")
val d: Dataset[MagPapers] = spark.read.load(s"${parser.get("sourcePath")}/Papers").as[MagPapers]
val d: Dataset[MagPapers] = spark.read.load(s"$sourcePath/Papers").as[MagPapers]
// Filtering Papers with DOI, and since for the same DOI we have multiple version of item with different PapersId we get the last one
val result: RDD[MagPapers] = d.where(col("Doi").isNotNull)
@ -41,11 +44,12 @@ object SparkPreProcessMAG {
.map(_._2)
val distinctPaper: Dataset[MagPapers] = spark.createDataset(result)
distinctPaper.write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/Papers_distinct")
distinctPaper.write.mode(SaveMode.Overwrite).save(s"$workingPath/Papers_distinct")
logger.info("Phase 0) Enrich Publication with description")
val pa = spark.read.load(s"${parser.get("sourcePath")}/PaperAbstractsInvertedIndex").as[MagPaperAbstract]
pa.map(ConversionUtil.transformPaperAbstract).write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/PaperAbstract")
val pa = spark.read.load(s"$sourcePath/PaperAbstractsInvertedIndex").as[MagPaperAbstract]
pa.map(ConversionUtil.transformPaperAbstract).write.mode(SaveMode.Overwrite).save(s"$workingPath/PaperAbstract")
logger.info("Phase 3) Group Author by PaperId")
val authors = spark.read.load(s"$sourcePath/Authors").as[MagAuthor]
@ -64,24 +68,24 @@ object SparkPreProcessMAG {
} else
mpa
}).groupBy("PaperId").agg(collect_list(struct($"author", $"affiliation")).as("authors"))
.write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/merge_step_1_paper_authors")
.write.mode(SaveMode.Overwrite).save(s"$workingPath/merge_step_1_paper_authors")
logger.info("Phase 4) create First Version of publication Entity with Paper Journal and Authors")
val journals = spark.read.load(s"$sourcePath/Journals").as[MagJournal]
val papers = spark.read.load((s"${parser.get("targetPath")}/Papers_distinct")).as[MagPapers]
val papers = spark.read.load((s"$workingPath/Papers_distinct")).as[MagPapers]
val paperWithAuthors = spark.read.load(s"${parser.get("targetPath")}/merge_step_1_paper_authors").as[MagPaperWithAuthorList]
val paperWithAuthors = spark.read.load(s"$workingPath/merge_step_1_paper_authors").as[MagPaperWithAuthorList]
val firstJoin = papers.joinWith(journals, papers("JournalId").equalTo(journals("JournalId")), "left")
firstJoin.joinWith(paperWithAuthors, firstJoin("_1.PaperId").equalTo(paperWithAuthors("PaperId")), "left")
.map { a => ConversionUtil.createOAFFromJournalAuthorPaper(a) }
.write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/merge_step_2")
.write.mode(SaveMode.Overwrite).save(s"$workingPath/merge_step_2")
var magPubs: Dataset[(String, Publication)] =
spark.read.load(s"${parser.get("targetPath")}/merge_step_2").as[Publication]
spark.read.load(s"$workingPath/merge_step_2").as[Publication]
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)]
@ -95,10 +99,10 @@ object SparkPreProcessMAG {
.map(item => ConversionUtil.updatePubsWithConferenceInfo(item))
.write
.mode(SaveMode.Overwrite)
.save(s"${parser.get("targetPath")}/merge_step_2_conference")
.save(s"$workingPath/merge_step_2_conference")
magPubs= spark.read.load(s"${parser.get("targetPath")}/merge_step_2_conference").as[Publication]
magPubs= spark.read.load(s"$workingPath/merge_step_2_conference").as[Publication]
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)]
val paperUrlDataset = spark.read.load(s"$sourcePath/PaperUrls").as[MagPaperUrl].groupBy("PaperId").agg(collect_list(struct("sourceUrl")).as("instances")).as[MagUrl]
@ -108,27 +112,27 @@ object SparkPreProcessMAG {
magPubs.joinWith(paperUrlDataset, col("_1").equalTo(paperUrlDataset("PaperId")), "left")
.map { a: ((String, Publication), MagUrl) => ConversionUtil.addInstances((a._1._2, a._2)) }
.write.mode(SaveMode.Overwrite)
.save(s"${parser.get("targetPath")}/merge_step_3")
.save(s"$workingPath/merge_step_3")
// logger.info("Phase 6) Enrich Publication with description")
// val pa = spark.read.load(s"${parser.get("sourcePath")}/PaperAbstractsInvertedIndex").as[MagPaperAbstract]
// pa.map(ConversionUtil.transformPaperAbstract).write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/PaperAbstract")
val paperAbstract = spark.read.load((s"${parser.get("targetPath")}/PaperAbstract")).as[MagPaperAbstract]
val paperAbstract = spark.read.load((s"$workingPath/PaperAbstract")).as[MagPaperAbstract]
magPubs = spark.read.load(s"${parser.get("targetPath")}/merge_step_3").as[Publication]
magPubs = spark.read.load(s"$workingPath/merge_step_3").as[Publication]
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)]
magPubs.joinWith(paperAbstract, col("_1").equalTo(paperAbstract("PaperId")), "left")
.map(item => ConversionUtil.updatePubsWithDescription(item)
).write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/merge_step_4")
).write.mode(SaveMode.Overwrite).save(s"$workingPath/merge_step_4")
logger.info("Phase 7) Enrich Publication with FieldOfStudy")
magPubs = spark.read.load(s"${parser.get("targetPath")}/merge_step_4").as[Publication]
magPubs = spark.read.load(s"$workingPath/merge_step_4").as[Publication]
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)]
val fos = spark.read.load(s"$sourcePath/FieldsOfStudy").select($"FieldOfStudyId".alias("fos"), $"DisplayName", $"MainType")
@ -144,14 +148,14 @@ object SparkPreProcessMAG {
.equalTo(paperField("PaperId")), "left")
.map(item => ConversionUtil.updatePubsWithSubject(item))
.write.mode(SaveMode.Overwrite)
.save(s"${parser.get("targetPath")}/mag_publication")
.save(s"$workingPath/mag_publication")
val s:RDD[Publication] = spark.read.load(s"${parser.get("targetPath")}/mag_publication").as[Publication]
val s:RDD[Publication] = spark.read.load(s"$workingPath/mag_publication").as[Publication]
.map(p=>Tuple2(p.getId, p)).rdd.reduceByKey((a:Publication, b:Publication) => ConversionUtil.mergePublication(a,b))
.map(_._2)
spark.createDataset(s).as[Publication].write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/mag_publication_u")
spark.createDataset(s).as[Publication].write.mode(SaveMode.Overwrite).save(s"$targetPath/magPublication")
}
}

@ -17,11 +17,12 @@ import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.GzipCodec;
import org.mortbay.log.Log;
import eu.dnetlib.doiboost.orcid.json.JsonWriter;
import eu.dnetlib.doiboost.orcid.model.WorkData;
import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser;
import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter;
public class ActivitiesDecompressor {
@ -143,4 +144,64 @@ public class ActivitiesDecompressor {
Log.info("Error from Orcid found: " + errorFromOrcidFound);
Log.info("Error parsing xml work found: " + xmlParserErrorFound);
}
public static void extractXML(Configuration conf, String inputUri, Path outputPath)
throws Exception {
String uri = inputUri;
FileSystem fs = FileSystem.get(URI.create(uri), conf);
Path inputPath = new Path(uri);
CompressionCodecFactory factory = new CompressionCodecFactory(conf);
CompressionCodec codec = factory.getCodec(inputPath);
if (codec == null) {
System.err.println("No codec found for " + uri);
System.exit(1);
}
CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());
InputStream gzipInputStream = null;
try {
gzipInputStream = codec.createInputStream(fs.open(inputPath));
int counter = 0;
try (TarArchiveInputStream tais = new TarArchiveInputStream(gzipInputStream)) {
TarArchiveEntry entry = null;
try (SequenceFile.Writer writer = SequenceFile
.createWriter(
conf,
SequenceFile.Writer.file(outputPath),
SequenceFile.Writer.keyClass(Text.class),
SequenceFile.Writer.valueClass(Text.class),
SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new GzipCodec()))) {
while ((entry = tais.getNextTarEntry()) != null) {
String filename = entry.getName();
if (entry.isDirectory() || !filename.contains("works")) {
} else {
counter++;
BufferedReader br = new BufferedReader(new InputStreamReader(tais));
String line;
StringBuffer buffer = new StringBuffer();
while ((line = br.readLine()) != null) {
buffer.append(line);
}
String xml = buffer.toString();
String[] filenameParts = filename.split("/");
final Text key = new Text(
XMLRecordParser
.retrieveOrcidIdFromActivity(
xml.getBytes(), filenameParts[filenameParts.length - 1]));
final Text value = new Text(xml);
writer.append(key, value);
if ((counter % 100000) == 0) {
Log.info("Current xml works extracted: " + counter);
}
}
}
}
}
Log.info("Activities extraction completed");
Log.info("Total XML works parsed: " + counter);
} finally {
Log.debug("Closing gzip stream");
IOUtils.closeStream(gzipInputStream);
}
}
}

@ -0,0 +1,54 @@
package eu.dnetlib.doiboost.orcid;
import java.io.IOException;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.mortbay.log.Log;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork;
public class ExtractXMLActivitiesData extends OrcidDSManager {
private String outputWorksPath;
private String activitiesFileNameTarGz;
public static void main(String[] args) throws IOException, Exception {
ExtractXMLActivitiesData extractXMLActivitiesData = new ExtractXMLActivitiesData();
extractXMLActivitiesData.loadArgs(args);
extractXMLActivitiesData.extractWorks();
}
private void loadArgs(String[] args) throws IOException, Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
GenOrcidAuthorWork.class
.getResourceAsStream(
"/eu/dnetlib/dhp/doiboost/gen_orcid_works-no-doi_from_activities.json")));
parser.parseArgument(args);
hdfsServerUri = parser.get("hdfsServerUri");
Log.info("HDFS URI: " + hdfsServerUri);
workingPath = parser.get("workingPath");
Log.info("Working Path: " + workingPath);
activitiesFileNameTarGz = parser.get("activitiesFileNameTarGz");
Log.info("Activities File Name: " + activitiesFileNameTarGz);
outputWorksPath = parser.get("outputWorksPath");
Log.info("Output Author Work Data: " + outputWorksPath);
}
private void extractWorks() throws Exception {
Configuration conf = initConfigurationObject();
FileSystem fs = initFileSystemObject(conf);
String tarGzUri = hdfsServerUri.concat(workingPath).concat(activitiesFileNameTarGz);
Path outputPath = new Path(
hdfsServerUri
.concat(workingPath)
.concat(outputWorksPath));
ActivitiesDecompressor.extractXML(conf, tarGzUri, outputPath);
}
}

@ -0,0 +1,56 @@
package eu.dnetlib.doiboost.orcid;
import java.io.IOException;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.mortbay.log.Log;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork;
public class ExtractXMLSummariesData extends OrcidDSManager {
private String outputAuthorsPath;
private String summariesFileNameTarGz;
public static void main(String[] args) throws IOException, Exception {
ExtractXMLSummariesData extractXMLSummariesData = new ExtractXMLSummariesData();
extractXMLSummariesData.loadArgs(args);
extractXMLSummariesData.extractAuthors();
}
private void loadArgs(String[] args) throws IOException, Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
GenOrcidAuthorWork.class
.getResourceAsStream(
"/eu/dnetlib/dhp/doiboost/gen_orcid_authors_from_summaries.json")));
parser.parseArgument(args);
hdfsServerUri = parser.get("hdfsServerUri");
Log.info("HDFS URI: " + hdfsServerUri);
workingPath = parser.get("workingPath");
Log.info("Working Path: " + workingPath);
summariesFileNameTarGz = parser.get("summariesFileNameTarGz");
Log.info("Summaries File Name: " + summariesFileNameTarGz);
outputAuthorsPath = parser.get("outputAuthorsPath");
Log.info("Output Authors Data: " + outputAuthorsPath);
}
public void extractAuthors() throws Exception {
Configuration conf = initConfigurationObject();
FileSystem fs = initFileSystemObject(conf);
String tarGzUri = hdfsServerUri.concat(workingPath).concat(summariesFileNameTarGz);
Path outputPath = new Path(
hdfsServerUri
.concat(workingPath)
.concat(outputAuthorsPath)
.concat("xml_authors.seq"));
SummariesDecompressor.extractXML(conf, tarGzUri, outputPath);
}
}

@ -25,8 +25,8 @@ public class OrcidAuthorsDOIsDataGen extends OrcidDSManager {
public void generateAuthorsDOIsData() throws Exception {
Configuration conf = initConfigurationObject();
FileSystem fs = initFileSystemObject(conf);
String tarGzUri = hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(activitiesFileNameTarGz);
Path outputPath = new Path(hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(outputAuthorsDOIsPath));
String tarGzUri = hdfsServerUri.concat(workingPath).concat(activitiesFileNameTarGz);
Path outputPath = new Path(hdfsServerUri.concat(workingPath).concat(outputAuthorsDOIsPath));
ActivitiesDecompressor.parseGzActivities(conf, tarGzUri, outputPath);
}
@ -41,8 +41,8 @@ public class OrcidAuthorsDOIsDataGen extends OrcidDSManager {
hdfsServerUri = parser.get("hdfsServerUri");
Log.info("HDFS URI: " + hdfsServerUri);
hdfsOrcidDefaultPath = parser.get("hdfsOrcidDefaultPath");
Log.info("Default Path: " + hdfsOrcidDefaultPath);
workingPath = parser.get("workingPath");
Log.info("Default Path: " + workingPath);
activitiesFileNameTarGz = parser.get("activitiesFileNameTarGz");
Log.info("Activities File Name: " + activitiesFileNameTarGz);
outputAuthorsDOIsPath = parser.get("outputAuthorsDOIsPath");

@ -15,7 +15,7 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser;
public class OrcidDSManager {
protected String hdfsServerUri;
protected String hdfsOrcidDefaultPath;
protected String workingPath;
private String summariesFileNameTarGz;
private String outputAuthorsPath;
@ -28,10 +28,10 @@ public class OrcidDSManager {
public void generateAuthors() throws Exception {
Configuration conf = initConfigurationObject();
FileSystem fs = initFileSystemObject(conf);
String tarGzUri = hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(summariesFileNameTarGz);
String tarGzUri = hdfsServerUri.concat(workingPath).concat(summariesFileNameTarGz);
Path outputPath = new Path(
hdfsServerUri
.concat(hdfsOrcidDefaultPath)
.concat(workingPath)
.concat(outputAuthorsPath)
.concat("authors.seq"));
SummariesDecompressor.parseGzSummaries(conf, tarGzUri, outputPath);
@ -41,22 +41,18 @@ public class OrcidDSManager {
// ====== Init HDFS File System Object
Configuration conf = new Configuration();
// Set FileSystem URI
conf.set("fs.defaultFS", hdfsServerUri.concat(hdfsOrcidDefaultPath));
conf.set("fs.defaultFS", hdfsServerUri.concat(workingPath));
// Because of Maven
conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
return conf;
}
protected FileSystem initFileSystemObject(Configuration conf) {
protected FileSystem initFileSystemObject(Configuration conf) throws IOException {
// Get the filesystem - HDFS
// if there is an exception, it will be propagate
FileSystem fs = null;
try {
fs = FileSystem.get(URI.create(hdfsServerUri.concat(hdfsOrcidDefaultPath)), conf);
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
fs = FileSystem.get(URI.create(hdfsServerUri.concat(workingPath)), conf);
return fs;
}
@ -66,13 +62,13 @@ public class OrcidDSManager {
.toString(
OrcidDSManager.class
.getResourceAsStream(
"/eu/dnetlib/dhp/doiboost/create_orcid_authors_data.json")));
"/eu/dnetlib/dhp/doiboost/gen_orcid_authors_from_summaries.json")));
parser.parseArgument(args);
hdfsServerUri = parser.get("hdfsServerUri");
Log.info("HDFS URI: " + hdfsServerUri);
hdfsOrcidDefaultPath = parser.get("hdfsOrcidDefaultPath");
Log.info("Default Path: " + hdfsOrcidDefaultPath);
workingPath = parser.get("workingPath");
Log.info("Working Path: " + workingPath);
summariesFileNameTarGz = parser.get("summariesFileNameTarGz");
Log.info("Summaries File Name: " + summariesFileNameTarGz);
outputAuthorsPath = parser.get("outputAuthorsPath");

@ -1,14 +1,15 @@
package eu.dnetlib.doiboost.orcid;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.*;
import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.Date;
import java.util.List;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
@ -16,6 +17,7 @@ import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
@ -27,10 +29,10 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser;
public class OrcidDownloader extends OrcidDSManager {
static final int REQ_LIMIT = 24;
// static final int REQ_MAX_TEST = 100;
static final int RECORD_PARSED_COUNTER_LOG_INTERVAL = 10000;
static final int REQ_MAX_TEST = -1;
static final int RECORD_PARSED_COUNTER_LOG_INTERVAL = 500;
static final String DATE_FORMAT = "yyyy-MM-dd HH:mm:ss";
static final String lastUpdate = "2019-09-30 00:00:00";
static final String lastUpdate = "2020-09-29 00:00:00";
private String lambdaFileName;
private String outputPath;
private String token;
@ -41,7 +43,7 @@ public class OrcidDownloader extends OrcidDSManager {
orcidDownloader.parseLambdaFile();
}
private String downloadRecord(String orcidId) {
private String downloadRecord(String orcidId) throws IOException {
try (CloseableHttpClient client = HttpClients.createDefault()) {
HttpGet httpGet = new HttpGet("https://api.orcid.org/v3.0/" + orcidId + "/record");
httpGet.addHeader("Accept", "application/vnd.orcid+xml");
@ -49,17 +51,23 @@ public class OrcidDownloader extends OrcidDSManager {
CloseableHttpResponse response = client.execute(httpGet);
if (response.getStatusLine().getStatusCode() != 200) {
Log
.warn(
.info(
"Downloading " + orcidId + " status code: " + response.getStatusLine().getStatusCode());
return new String("");
}
return IOUtils.toString(response.getEntity().getContent());
} catch (Throwable e) {
Log.warn("Downloading " + orcidId, e.getMessage());
// return IOUtils.toString(response.getEntity().getContent());
return xmlStreamToString(response.getEntity().getContent());
}
}
private String xmlStreamToString(InputStream xmlStream) throws IOException {
BufferedReader br = new BufferedReader(new InputStreamReader(xmlStream));
String line;
StringBuffer buffer = new StringBuffer();
while ((line = br.readLine()) != null) {
buffer.append(line);
}
return new String("");
return buffer.toString();
}
public void parseLambdaFile() throws Exception {
@ -69,97 +77,94 @@ public class OrcidDownloader extends OrcidDSManager {
long startDownload = 0;
Configuration conf = initConfigurationObject();
FileSystem fs = initFileSystemObject(conf);
String lambdaFileUri = hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(lambdaFileName);
String lambdaFileUri = hdfsServerUri.concat(workingPath).concat(lambdaFileName);
Path hdfsreadpath = new Path(lambdaFileUri);
FSDataInputStream lambdaFileStream = fs.open(hdfsreadpath);
Path hdfsoutputPath = new Path(
hdfsServerUri
.concat(hdfsOrcidDefaultPath)
.concat(workingPath)
.concat(outputPath)
.concat("orcid_records.seq"));
try (SequenceFile.Writer writer = SequenceFile
.createWriter(
conf,
SequenceFile.Writer.file(hdfsoutputPath),
SequenceFile.Writer.keyClass(Text.class),
SequenceFile.Writer.valueClass(Text.class))) {
try (BufferedReader br = new BufferedReader(new InputStreamReader(lambdaFileStream))) {
String line;
int nReqTmp = 0;
.concat("updated_xml_authors.seq"));
try (TarArchiveInputStream tais = new TarArchiveInputStream(
new GzipCompressorInputStream(lambdaFileStream))) {
TarArchiveEntry entry = null;
StringBuilder sb = new StringBuilder();
try (SequenceFile.Writer writer = SequenceFile
.createWriter(
conf,
SequenceFile.Writer.file(hdfsoutputPath),
SequenceFile.Writer.keyClass(Text.class),
SequenceFile.Writer.valueClass(Text.class),
SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new GzipCodec()))) {
startDownload = System.currentTimeMillis();
long startReqTmp = System.currentTimeMillis();
while ((line = br.readLine()) != null) {
parsedRecordsCounter++;
// skip headers line
if (parsedRecordsCounter == 1) {
continue;
}
String[] values = line.split(",");
List<String> recordInfo = Arrays.asList(values);
String orcidId = recordInfo.get(0);
if (isModified(orcidId, recordInfo.get(3))) {
String record = downloadRecord(orcidId);
downloadedRecordsCounter++;
if (!record.isEmpty()) {
String compressRecord = ArgumentApplicationParser.compressArgument(record);
final Text key = new Text(recordInfo.get(0));
final Text value = new Text(compressRecord);
try {
while ((entry = tais.getNextTarEntry()) != null) {
BufferedReader br = new BufferedReader(new InputStreamReader(tais)); // Read directly from tarInput
String line;
while ((line = br.readLine()) != null) {
String[] values = line.split(",");
List<String> recordInfo = Arrays.asList(values);
int nReqTmp = 0;
long startReqTmp = System.currentTimeMillis();
// skip headers line
if (parsedRecordsCounter == 0) {
parsedRecordsCounter++;
continue;
}
parsedRecordsCounter++;
String orcidId = recordInfo.get(0);
if (isModified(orcidId, recordInfo.get(3))) {
String record = downloadRecord(orcidId);
downloadedRecordsCounter++;
if (!record.isEmpty()) {
// String compressRecord = ArgumentApplicationParser.compressArgument(record);
final Text key = new Text(recordInfo.get(0));
final Text value = new Text(record);
writer.append(key, value);
savedRecordsCounter++;
} catch (IOException e) {
Log.warn("Writing to sequence file: " + e.getMessage());
Log.warn(e);
throw new RuntimeException(e);
}
} else {
break;
}
}
long endReq = System.currentTimeMillis();
nReqTmp++;
if (nReqTmp == REQ_LIMIT) {
long reqSessionDuration = endReq - startReqTmp;
if (reqSessionDuration <= 1000) {
long endReq = System.currentTimeMillis();
nReqTmp++;
if (nReqTmp == REQ_LIMIT) {
long reqSessionDuration = endReq - startReqTmp;
if (reqSessionDuration <= 1000) {
Log
.info(
"\nreqSessionDuration: "
+ reqSessionDuration
+ " nReqTmp: "
+ nReqTmp
+ " wait ....");
Thread.sleep(1000 - reqSessionDuration);
} else {
nReqTmp = 0;
startReqTmp = System.currentTimeMillis();
}
}
if ((parsedRecordsCounter % RECORD_PARSED_COUNTER_LOG_INTERVAL) == 0) {
Log
.warn(
"\nreqSessionDuration: "
+ reqSessionDuration
+ " nReqTmp: "
+ nReqTmp
+ " wait ....");
Thread.sleep(1000 - reqSessionDuration);
} else {
nReqTmp = 0;
startReqTmp = System.currentTimeMillis();
.info(
"Current parsed: "
+ parsedRecordsCounter
+ " downloaded: "
+ downloadedRecordsCounter
+ " saved: "
+ savedRecordsCounter);
if (REQ_MAX_TEST != -1 && parsedRecordsCounter > REQ_MAX_TEST) {
break;
}
}
}
// if (parsedRecordsCounter > REQ_MAX_TEST) {
// break;
// }
if ((parsedRecordsCounter % RECORD_PARSED_COUNTER_LOG_INTERVAL) == 0) {
Log
.info(
"Current parsed: "
+ parsedRecordsCounter
+ " downloaded: "
+ downloadedRecordsCounter
+ " saved: "
+ savedRecordsCounter);
// if (parsedRecordsCounter > REQ_MAX_TEST) {
// break;
// }
}
long endDownload = System.currentTimeMillis();
long downloadTime = endDownload - startDownload;
Log.info("Download time: " + ((downloadTime / 1000) / 60) + " minutes");
}
long endDownload = System.currentTimeMillis();
long downloadTime = endDownload - startDownload;
Log.info("Download time: " + ((downloadTime / 1000) / 60) + " minutes");
}
}
lambdaFileStream.close();
Log.info("Download started at: " + new Date(startDownload).toString());
Log.info("Download ended at: " + new Date(System.currentTimeMillis()).toString());
Log.info("Parsed Records Counter: " + parsedRecordsCounter);
Log.info("Downloaded Records Counter: " + downloadedRecordsCounter);
Log.info("Saved Records Counter: " + savedRecordsCounter);
@ -176,8 +181,8 @@ public class OrcidDownloader extends OrcidDSManager {
hdfsServerUri = parser.get("hdfsServerUri");
Log.info("HDFS URI: " + hdfsServerUri);
hdfsOrcidDefaultPath = parser.get("hdfsOrcidDefaultPath");
Log.info("Default Path: " + hdfsOrcidDefaultPath);
workingPath = parser.get("workingPath");
Log.info("Default Path: " + workingPath);
lambdaFileName = parser.get("lambdaFileName");
Log.info("Lambda File Name: " + lambdaFileName);
outputPath = parser.get("outputPath");
@ -185,7 +190,7 @@ public class OrcidDownloader extends OrcidDSManager {
token = parser.get("token");
}
private boolean isModified(String orcidId, String modifiedDate) {
public boolean isModified(String orcidId, String modifiedDate) {
Date modifiedDateDt = null;
Date lastUpdateDt = null;
try {
@ -195,7 +200,7 @@ public class OrcidDownloader extends OrcidDSManager {
modifiedDateDt = new SimpleDateFormat(DATE_FORMAT).parse(modifiedDate);
lastUpdateDt = new SimpleDateFormat(DATE_FORMAT).parse(lastUpdate);
} catch (Exception e) {
Log.warn("[" + orcidId + "] Parsing date: ", e.getMessage());
Log.info("[" + orcidId + "] Parsing date: ", e.getMessage());
return true;
}
return modifiedDateDt.after(lastUpdateDt);

@ -1,21 +1,72 @@
package eu.dnetlib.doiboost.orcid
import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper}
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.oa.merge.AuthorMerger
import eu.dnetlib.dhp.schema.oaf.Publication
import eu.dnetlib.dhp.schema.orcid.OrcidDOI
import eu.dnetlib.doiboost.mag.ConversionUtil
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
object SparkConvertORCIDToOAF {
val logger: Logger = LoggerFactory.getLogger(SparkConvertORCIDToOAF.getClass)
def getPublicationAggregator(): Aggregator[(String, Publication), Publication, Publication] = new Aggregator[(String, Publication), Publication, Publication]{
override def zero: Publication = new Publication()
override def reduce(b: Publication, a: (String, Publication)): Publication = {
b.mergeFrom(a._2)
b.setAuthor(AuthorMerger.mergeAuthor(a._2.getAuthor, b.getAuthor))
if (b.getId == null)
b.setId(a._2.getId)
b
}
override def merge(wx: Publication, wy: Publication): Publication = {
wx.mergeFrom(wy)
wx.setAuthor(AuthorMerger.mergeAuthor(wy.getAuthor, wx.getAuthor))
if(wx.getId == null && wy.getId.nonEmpty)
wx.setId(wy.getId)
wx
}
override def finish(reduction: Publication): Publication = reduction
override def bufferEncoder: Encoder[Publication] =
Encoders.kryo(classOf[Publication])
override def outputEncoder: Encoder[Publication] =
Encoders.kryo(classOf[Publication])
}
def run(spark:SparkSession,sourcePath:String, targetPath:String):Unit = {
implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
implicit val mapOrcid: Encoder[OrcidDOI] = Encoders.kryo[OrcidDOI]
implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPubs)
val mapper = new ObjectMapper()
mapper.getDeserializationConfig.withFeatures(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES)
val dataset:Dataset[OrcidDOI] = spark.createDataset(spark.sparkContext.textFile(sourcePath).map(s => mapper.readValue(s,classOf[OrcidDOI])))
logger.info("Converting ORCID to OAF")
dataset.map(o => ORCIDToOAF.convertTOOAF(o)).filter(p=>p!=null)
.map(d => (d.getId, d))
.groupByKey(_._1)(Encoders.STRING)
.agg(getPublicationAggregator().toColumn)
.map(p => p._2)
.write.mode(SaveMode.Overwrite).save(targetPath)
}
def main(args: Array[String]): Unit = {
val logger: Logger = LoggerFactory.getLogger(SparkConvertORCIDToOAF.getClass)
val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkConvertORCIDToOAF.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/convert_map_to_oaf_params.json")))
parser.parseArgument(args)
@ -26,19 +77,12 @@ object SparkConvertORCIDToOAF {
.appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPubs)
import spark.implicits._
val sourcePath = parser.get("sourcePath")
val targetPath = parser.get("targetPath")
val dataset:Dataset[ORCIDElement] = spark.read.json(sourcePath).as[ORCIDElement]
logger.info("Converting ORCID to OAF")
val d:RDD[Publication] = dataset.map(o => ORCIDToOAF.convertTOOAF(o)).filter(p=>p!=null).map(p=>(p.getId,p)).rdd.reduceByKey(ConversionUtil.mergePublication)
.map(_._2)
val sourcePath = parser.get("sourcePath")
val targetPath = parser.get("targetPath")
run(spark, sourcePath, targetPath)
spark.createDataset(d).as[Publication].write.mode(SaveMode.Overwrite).save(targetPath)
}
}

@ -0,0 +1,188 @@
package eu.dnetlib.doiboost.orcid;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.util.LongAccumulator;
import org.mortbay.log.Log;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.doiboost.orcid.model.DownloadedRecordData;
import scala.Tuple2;
public class SparkDownloadOrcidAuthors {
static Logger logger = LoggerFactory.getLogger(SparkDownloadOrcidAuthors.class);
static final String DATE_FORMAT = "yyyy-MM-dd HH:mm:ss";
static final String lastUpdate = "2020-09-29 00:00:00";
public static void main(String[] args) throws IOException, Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
SparkDownloadOrcidAuthors.class
.getResourceAsStream(
"/eu/dnetlib/dhp/doiboost/download_orcid_data.json")));
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
logger.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String workingPath = parser.get("workingPath");
logger.info("workingPath: ", workingPath);
final String outputPath = parser.get("outputPath");
logger.info("outputPath: ", outputPath);
final String token = parser.get("token");
final String lambdaFileName = parser.get("lambdaFileName");
logger.info("lambdaFileName: ", lambdaFileName);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
LongAccumulator parsedRecordsAcc = spark.sparkContext().longAccumulator("parsed_records");
LongAccumulator modifiedRecordsAcc = spark.sparkContext().longAccumulator("to_download_records");
LongAccumulator downloadedRecordsAcc = spark.sparkContext().longAccumulator("downloaded_records");
LongAccumulator errorHTTP403Acc = spark.sparkContext().longAccumulator("error_HTTP_403");
LongAccumulator errorHTTP409Acc = spark.sparkContext().longAccumulator("error_HTTP_409");
LongAccumulator errorHTTP503Acc = spark.sparkContext().longAccumulator("error_HTTP_503");
LongAccumulator errorHTTP525Acc = spark.sparkContext().longAccumulator("error_HTTP_525");
LongAccumulator errorHTTPGenericAcc = spark.sparkContext().longAccumulator("error_HTTP_Generic");
logger.info("Retrieving data from lamda sequence file");
JavaPairRDD<Text, Text> lamdaFileRDD = sc
.sequenceFile(workingPath + lambdaFileName, Text.class, Text.class);
logger.info("Data retrieved: " + lamdaFileRDD.count());
Function<Tuple2<Text, Text>, Boolean> isModifiedAfterFilter = data -> {
String orcidId = data._1().toString();
String lastModifiedDate = data._2().toString();
parsedRecordsAcc.add(1);
if (isModified(orcidId, lastModifiedDate)) {
modifiedRecordsAcc.add(1);
return true;
}
return false;
};
Function<Tuple2<Text, Text>, Tuple2<String, String>> downloadRecordFunction = data -> {
String orcidId = data._1().toString();
String lastModifiedDate = data._2().toString();
final DownloadedRecordData downloaded = new DownloadedRecordData();
downloaded.setOrcidId(orcidId);
downloaded.setLastModifiedDate(lastModifiedDate);
try (CloseableHttpClient client = HttpClients.createDefault()) {
HttpGet httpGet = new HttpGet("https://api.orcid.org/v3.0/" + orcidId + "/record");
httpGet.addHeader("Accept", "application/vnd.orcid+xml");
httpGet.addHeader("Authorization", String.format("Bearer %s", token));
long startReq = System.currentTimeMillis();
CloseableHttpResponse response = client.execute(httpGet);
long endReq = System.currentTimeMillis();
long reqTime = endReq - startReq;
if (reqTime < 1000) {
Thread.sleep(1000 - reqTime);
}
int statusCode = response.getStatusLine().getStatusCode();
downloaded.setStatusCode(statusCode);
if (statusCode != 200) {
switch (statusCode) {
case 403:
errorHTTP403Acc.add(1);
case 409:
errorHTTP409Acc.add(1);
case 503:
errorHTTP503Acc.add(1);
throw new RuntimeException("Orcid request rate limit reached (HTTP 503)");
case 525:
errorHTTP525Acc.add(1);
default:
errorHTTPGenericAcc.add(1);
logger
.info(
"Downloading " + orcidId + " status code: "
+ response.getStatusLine().getStatusCode());
}
return downloaded.toTuple2();
}
downloadedRecordsAcc.add(1);
downloaded
.setCompressedData(
ArgumentApplicationParser
.compressArgument(IOUtils.toString(response.getEntity().getContent())));
} catch (Throwable e) {
logger.info("Downloading " + orcidId, e.getMessage());
downloaded.setErrorMessage(e.getMessage());
return downloaded.toTuple2();
}
return downloaded.toTuple2();
};
sc.hadoopConfiguration().set("mapreduce.output.fileoutputformat.compress", "true");
logger.info("Start execution ...");
JavaPairRDD<Text, Text> authorsModifiedRDD = lamdaFileRDD.filter(isModifiedAfterFilter);
logger.info("Authors modified count: " + authorsModifiedRDD.count());
logger.info("Start downloading ...");
authorsModifiedRDD
.repartition(10)
.map(downloadRecordFunction)
.mapToPair(t -> new Tuple2(new Text(t._1()), new Text(t._2())))
.saveAsNewAPIHadoopFile(
workingPath.concat(outputPath),
Text.class,
Text.class,
SequenceFileOutputFormat.class,
sc.hadoopConfiguration());
logger.info("parsedRecordsAcc: " + parsedRecordsAcc.value().toString());
logger.info("modifiedRecordsAcc: " + modifiedRecordsAcc.value().toString());
logger.info("downloadedRecordsAcc: " + downloadedRecordsAcc.value().toString());
logger.info("errorHTTP403Acc: " + errorHTTP403Acc.value().toString());
logger.info("errorHTTP409Acc: " + errorHTTP409Acc.value().toString());
logger.info("errorHTTP503Acc: " + errorHTTP503Acc.value().toString());
logger.info("errorHTTP525Acc: " + errorHTTP525Acc.value().toString());
logger.info("errorHTTPGenericAcc: " + errorHTTPGenericAcc.value().toString());
});
}
private static boolean isModified(String orcidId, String modifiedDate) {
Date modifiedDateDt = null;
Date lastUpdateDt = null;
try {
if (modifiedDate.length() != 19) {
modifiedDate = modifiedDate.substring(0, 19);
}
modifiedDateDt = new SimpleDateFormat(DATE_FORMAT).parse(modifiedDate);
lastUpdateDt = new SimpleDateFormat(DATE_FORMAT).parse(lastUpdate);
} catch (Exception e) {
logger.info("[" + orcidId + "] Parsing date: ", e.getMessage());
return true;
}
return modifiedDateDt.after(lastUpdateDt);
}
}

@ -0,0 +1,99 @@
package eu.dnetlib.doiboost.orcid;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.Arrays;
import java.util.List;
import java.util.Optional;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.SparkConf;
import org.mortbay.log.Log;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
public class SparkGenLastModifiedSeq {
private static String hdfsServerUri;
private static String workingPath;
private static String outputPath;
private static String lambdaFileName;
public static void main(String[] args) throws IOException, Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
SparkGenLastModifiedSeq.class
.getResourceAsStream(
"/eu/dnetlib/dhp/doiboost/download_orcid_data.json")));
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
hdfsServerUri = parser.get("hdfsServerUri");
workingPath = parser.get("workingPath");
outputPath = parser.get("outputPath");
lambdaFileName = parser.get("lambdaFileName");
String lambdaFileUri = hdfsServerUri.concat(workingPath).concat(lambdaFileName);
SparkConf sparkConf = new SparkConf();
runWithSparkSession(
sparkConf,
isSparkSessionManaged,
spark -> {
int rowsNum = 0;
Path output = new Path(
hdfsServerUri
.concat(workingPath)
.concat(outputPath));
Path hdfsreadpath = new Path(lambdaFileUri);
Configuration conf = new Configuration();
conf.set("fs.defaultFS", hdfsServerUri.concat(workingPath));
conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
FileSystem fs = FileSystem.get(URI.create(hdfsServerUri.concat(workingPath)), conf);
FSDataInputStream lambdaFileStream = fs.open(hdfsreadpath);
try (TarArchiveInputStream tais = new TarArchiveInputStream(
new GzipCompressorInputStream(lambdaFileStream))) {
TarArchiveEntry entry = null;
try (SequenceFile.Writer writer = SequenceFile
.createWriter(
conf,
SequenceFile.Writer.file(output),
SequenceFile.Writer.keyClass(Text.class),
SequenceFile.Writer.valueClass(Text.class),
SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new GzipCodec()))) {
while ((entry = tais.getNextTarEntry()) != null) {
BufferedReader br = new BufferedReader(new InputStreamReader(tais));
String line;
while ((line = br.readLine()) != null) {
String[] values = line.split(",");
List<String> recordInfo = Arrays.asList(values);
String orcidId = recordInfo.get(0);
final Text key = new Text(orcidId);
final Text value = new Text(recordInfo.get(3));
writer.append(key, value);
rowsNum++;
}
}
}
}
Log.info("Saved rows from lamda csv tar file: " + rowsNum);
});
}
}

@ -13,9 +13,6 @@ import java.util.stream.Stream;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
@ -33,7 +30,7 @@ import com.google.gson.JsonElement;
import com.google.gson.JsonParser;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.doiboost.orcid.model.AuthorData;
import eu.dnetlib.dhp.schema.orcid.AuthorData;
import eu.dnetlib.doiboost.orcid.model.WorkData;
import scala.Tuple2;

@ -1,165 +0,0 @@
package eu.dnetlib.doiboost.orcid;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.util.LongAccumulator;
import org.mortbay.log.Log;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.doiboost.orcid.model.DownloadedRecordData;
import scala.Tuple2;
public class SparkOrcidGenerateAuthors {
static final String DATE_FORMAT = "yyyy-MM-dd HH:mm:ss";
static final String lastUpdate = "2019-09-30 00:00:00";
public static void main(String[] args) throws IOException, Exception {
Logger logger = LoggerFactory.getLogger(SparkOrcidGenerateAuthors.class);
logger.info("[ SparkOrcidGenerateAuthors STARTED]");
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
SparkOrcidGenerateAuthors.class
.getResourceAsStream(
"/eu/dnetlib/dhp/doiboost/gen_orcid_authors_parameters.json")));
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
logger.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String workingPath = parser.get("workingPath");
logger.info("workingPath: ", workingPath);
final String outputAuthorsPath = parser.get("outputAuthorsPath");
logger.info("outputAuthorsPath: ", outputAuthorsPath);
final String token = parser.get("token");
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
LongAccumulator parsedRecordsAcc = sc.sc().longAccumulator("parsedRecords");
LongAccumulator modifiedRecordsAcc = sc.sc().longAccumulator("modifiedRecords");
LongAccumulator downloadedRecordsAcc = sc.sc().longAccumulator("downloadedRecords");
LongAccumulator alreadyDownloadedRecords = sc.sc().longAccumulator("alreadyDownloadedRecords");
JavaRDD<String> lamdaFileRDD = sc.textFile(workingPath + "lamdafiles");
JavaRDD<String> downloadedRDD = sc.textFile(workingPath + "downloaded");
Function<String, String> getOrcidIdFunction = line -> {
try {
String[] values = line.split(",");
return values[0].substring(1);
} catch (Exception e) {
return new String("");
}
};
List<String> downloadedRecords = downloadedRDD.map(getOrcidIdFunction).collect();
Function<String, Boolean> isModifiedAfterFilter = line -> {
String[] values = line.split(",");
String orcidId = values[0];
parsedRecordsAcc.add(1);
if (isModified(orcidId, values[3])) {
modifiedRecordsAcc.add(1);
return true;
}
return false;
};
Function<String, Boolean> isNotDownloadedFilter = line -> {
String[] values = line.split(",");
String orcidId = values[0];
if (downloadedRecords.contains(orcidId)) {
alreadyDownloadedRecords.add(1);
return false;
}
return true;
};
Function<String, Tuple2<String, String>> downloadRecordFunction = line -> {
String[] values = line.split(",");
String orcidId = values[0];
String modifiedDate = values[3];
return downloadRecord(orcidId, modifiedDate, token, downloadedRecordsAcc);
};
lamdaFileRDD
.filter(isModifiedAfterFilter)
.filter(isNotDownloadedFilter)
.map(downloadRecordFunction)
.rdd()
.saveAsTextFile(workingPath.concat(outputAuthorsPath));
});
}
private static boolean isModified(String orcidId, String modifiedDate) {
Date modifiedDateDt = null;
Date lastUpdateDt = null;
try {
if (modifiedDate.length() != 19) {
modifiedDate = modifiedDate.substring(0, 19);
}
modifiedDateDt = new SimpleDateFormat(DATE_FORMAT).parse(modifiedDate);
lastUpdateDt = new SimpleDateFormat(DATE_FORMAT).parse(lastUpdate);
} catch (Exception e) {
Log.warn("[" + orcidId + "] Parsing date: ", e.getMessage());
return true;
}
return modifiedDateDt.after(lastUpdateDt);
}
private static Tuple2<String, String> downloadRecord(String orcidId, String modifiedDate, String token,
LongAccumulator downloadedRecordsAcc) {
final DownloadedRecordData data = new DownloadedRecordData();
data.setOrcidId(orcidId);
data.setModifiedDate(modifiedDate);
try (CloseableHttpClient client = HttpClients.createDefault()) {
HttpGet httpGet = new HttpGet("https://api.orcid.org/v3.0/" + orcidId + "/record");
httpGet.addHeader("Accept", "application/vnd.orcid+xml");
httpGet.addHeader("Authorization", String.format("Bearer %s", token));
CloseableHttpResponse response = client.execute(httpGet);
int statusCode = response.getStatusLine().getStatusCode();
data.setStatusCode(statusCode);
if (statusCode != 200) {
Log
.warn(
"Downloading " + orcidId + " status code: " + response.getStatusLine().getStatusCode());
return data.toTuple2();
}
downloadedRecordsAcc.add(1);
data
.setCompressedData(
ArgumentApplicationParser.compressArgument(IOUtils.toString(response.getEntity().getContent())));
} catch (Throwable e) {
Log.warn("Downloading " + orcidId, e.getMessage());
data.setErrorMessage(e.getMessage());
return data.toTuple2();
}
return data.toTuple2();
}
}

@ -1,50 +0,0 @@
package eu.dnetlib.doiboost.orcid;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.IOException;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
public class SparkPartitionLambdaFile {
public static void main(String[] args) throws IOException, Exception {
Logger logger = LoggerFactory.getLogger(SparkOrcidGenerateAuthors.class);
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
SparkOrcidGenerateAuthors.class
.getResourceAsStream(
"/eu/dnetlib/dhp/doiboost/gen_orcid_authors_parameters.json")));
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
final String workingPath = parser.get("workingPath");
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<String> lamdaFileRDD = sc.textFile(workingPath + "last_modified.csv");
lamdaFileRDD
.repartition(20)
.saveAsTextFile(workingPath.concat("lamdafiles"));
});
}
}

@ -17,11 +17,12 @@ import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.GzipCodec;
import org.mortbay.log.Log;
import eu.dnetlib.doiboost.orcid.json.JsonWriter;
import eu.dnetlib.doiboost.orcid.model.AuthorData;
import eu.dnetlib.dhp.schema.orcid.AuthorData;
import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser;
import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter;
public class SummariesDecompressor {
@ -56,6 +57,7 @@ public class SummariesDecompressor {
int nameFound = 0;
int surnameFound = 0;
int creditNameFound = 0;
int otherNamesFound = 0;
int errorFromOrcidFound = 0;
int xmlParserErrorFound = 0;
try (TarArchiveInputStream tais = new TarArchiveInputStream(gzipInputStream)) {
@ -117,6 +119,9 @@ public class SummariesDecompressor {
if (authorData.getCreditName() != null) {
creditNameFound += 1;
}
if (authorData.getOtherNames() != null && authorData.getOtherNames().size() > 1) {
otherNamesFound += authorData.getOtherNames().size();
}
} else {
Log.warn("Data not retrievable [" + entry.getName() + "] " + buffer.toString());
@ -152,7 +157,71 @@ public class SummariesDecompressor {
Log.info("Name found: " + nameFound);
Log.info("Surname found: " + surnameFound);
Log.info("Credit name found: " + creditNameFound);
Log.info("Other names found: " + otherNamesFound);
Log.info("Error from Orcid found: " + errorFromOrcidFound);
Log.info("Error parsing xml record found: " + xmlParserErrorFound);
}
public static void extractXML(Configuration conf, String inputUri, Path outputPath)
throws Exception {
String uri = inputUri;
FileSystem fs = FileSystem.get(URI.create(uri), conf);
Path inputPath = new Path(uri);
CompressionCodecFactory factory = new CompressionCodecFactory(conf);
CompressionCodec codec = factory.getCodec(inputPath);
if (codec == null) {
System.err.println("No codec found for " + uri);
System.exit(1);
}
CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());
InputStream gzipInputStream = null;
try {
gzipInputStream = codec.createInputStream(fs.open(inputPath));
int counter = 0;
try (TarArchiveInputStream tais = new TarArchiveInputStream(gzipInputStream)) {
TarArchiveEntry entry = null;
CompressionCodec Codec = new GzipCodec();
org.apache.hadoop.io.SequenceFile.Writer.Option optCom = SequenceFile.Writer
.compression(SequenceFile.CompressionType.RECORD, Codec);
try (SequenceFile.Writer writer = SequenceFile
.createWriter(
conf,
SequenceFile.Writer.file(outputPath),
SequenceFile.Writer.keyClass(Text.class),
SequenceFile.Writer.valueClass(Text.class), optCom)) {
while ((entry = tais.getNextTarEntry()) != null) {
String filename = entry.getName();
if (entry.isDirectory()) {
Log.debug("Directory entry name: " + entry.getName());
} else {
Log.debug("XML record entry name: " + entry.getName());
counter++;
BufferedReader br = new BufferedReader(new InputStreamReader(tais));
String line;
StringBuffer buffer = new StringBuffer();
while ((line = br.readLine()) != null) {
buffer.append(line);
}
String xml = buffer.toString();
final Text key = new Text(
XMLRecordParser
.retrieveOrcidIdFromSummary(
xml.getBytes(), filename.split("/")[2].substring(0, 19)));
final Text value = new Text(xml);
writer.append(key, value);
}
if ((counter % 100000) == 0) {
Log.info("Current xml records extracted: " + counter);
}
}
}
}
Log.info("Summaries extract completed");
Log.info("Total XML records parsed: " + counter);
} finally {
Log.debug("Closing gzip stream");
IOUtils.closeStream(gzipInputStream);
}
}
}

@ -0,0 +1,13 @@
package eu.dnetlib.doiboost.orcid.json;
import com.google.gson.Gson;
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
public class JsonHelper {
public static String createOidWork(WorkDataNoDoi workData) {
return new Gson().toJson(workData);
}
}

@ -1,28 +0,0 @@
package eu.dnetlib.doiboost.orcid.json;
import com.google.gson.JsonObject;
import eu.dnetlib.doiboost.orcid.model.AuthorData;
import eu.dnetlib.doiboost.orcid.model.WorkData;
public class JsonWriter {
public static String create(AuthorData authorData) {
JsonObject author = new JsonObject();
author.addProperty("oid", authorData.getOid());
author.addProperty("name", authorData.getName());
author.addProperty("surname", authorData.getSurname());
if (authorData.getCreditName() != null) {
author.addProperty("creditname", authorData.getCreditName());
}
return author.toString();
}
public static String create(WorkData workData) {
JsonObject work = new JsonObject();
work.addProperty("oid", workData.getOid());
work.addProperty("doi", workData.getDoi());
return work.toString();
}
}

@ -3,8 +3,6 @@ package eu.dnetlib.doiboost.orcid.model;
import java.io.Serializable;
import org.apache.hadoop.io.Text;
import com.google.gson.JsonObject;
import scala.Tuple2;
@ -12,7 +10,7 @@ import scala.Tuple2;
public class DownloadedRecordData implements Serializable {
private String orcidId;
private String modifiedDate;
private String lastModifiedDate;
private String statusCode;
private String compressedData;
private String errorMessage;
@ -20,7 +18,7 @@ public class DownloadedRecordData implements Serializable {
public Tuple2<String, String> toTuple2() {
JsonObject data = new JsonObject();
data.addProperty("statusCode", getStatusCode());
data.addProperty("modifiedDate", getModifiedDate());
data.addProperty("lastModifiedDate", getLastModifiedDate());
if (getCompressedData() != null) {
data.addProperty("compressedData", getCompressedData());
}
@ -66,11 +64,11 @@ public class DownloadedRecordData implements Serializable {
this.compressedData = compressedData;
}
public String getModifiedDate() {
return modifiedDate;
public String getLastModifiedDate() {
return lastModifiedDate;
}
public void setModifiedDate(String modifiedDate) {
this.modifiedDate = modifiedDate;
public void setLastModifiedDate(String lastModifiedDate) {
this.lastModifiedDate = lastModifiedDate;
}
}

@ -4,6 +4,8 @@ package eu.dnetlib.doiboost.orcid.xml;
import java.util.Arrays;
import java.util.List;
import org.mortbay.log.Log;
import com.ximpleware.AutoPilot;
import com.ximpleware.EOFException;
import com.ximpleware.EncodingException;
@ -14,7 +16,7 @@ import com.ximpleware.VTDNav;
import eu.dnetlib.dhp.parser.utility.VtdException;
import eu.dnetlib.dhp.parser.utility.VtdUtilityParser;
import eu.dnetlib.doiboost.orcid.model.AuthorData;
import eu.dnetlib.dhp.schema.orcid.AuthorData;
import eu.dnetlib.doiboost.orcid.model.WorkData;
public class XMLRecordParser {
@ -81,6 +83,12 @@ public class XMLRecordParser {
if (!creditNames.isEmpty()) {
authorData.setCreditName(creditNames.get(0));
}
final List<String> otherNames = VtdUtilityParser.getTextValue(ap, vn, "//other-name:content");
if (!otherNames.isEmpty()) {
authorData.setOtherNames(otherNames);
}
return authorData;
}
@ -120,4 +128,33 @@ public class XMLRecordParser {
}
return workData;
}
public static String retrieveOrcidIdFromSummary(byte[] bytes, String defaultValue)
throws VtdException, ParseException {
return retrieveOrcidId(bytes, defaultValue, NS_RECORD, NS_RECORD_URL, "//record:record", "path").substring(1);
}
public static String retrieveOrcidIdFromActivity(byte[] bytes, String defaultValue)
throws VtdException, ParseException {
return retrieveOrcidId(bytes, defaultValue, NS_WORK, NS_WORK_URL, "//work:work", "put-code");
}
private static String retrieveOrcidId(byte[] bytes, String defaultValue, String ns, String nsUrl, String xpath,
String idAttributeName)
throws VtdException, ParseException {
final VTDGen vg = new VTDGen();
vg.setDoc(bytes);
vg.parse(true);
final VTDNav vn = vg.getNav();
final AutoPilot ap = new AutoPilot(vn);
ap.declareXPathNameSpace(ns, nsUrl);
List<VtdUtilityParser.Node> recordNodes = VtdUtilityParser
.getTextValuesWithAttributes(
ap, vn, xpath, Arrays.asList(idAttributeName));
if (!recordNodes.isEmpty()) {
return (recordNodes.get(0).getAttributes().get(idAttributeName));
}
Log.info("id not found - default: " + defaultValue);
return defaultValue;
}
}

@ -0,0 +1,154 @@
package eu.dnetlib.doiboost.orcidnodoi;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URI;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.mortbay.log.Log;
import eu.dnetlib.doiboost.orcid.json.JsonHelper;
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
import eu.dnetlib.doiboost.orcidnodoi.xml.XMLRecordParserNoDoi;
/**
* This class write on hdfs one sequence file, the key is an orcid identifier and the
* value is an orcid publication in json format
*/
public class ActivitiesDumpReader {
private static final int MAX_XML_WORKS_PARSED = -1;
private static final int XML_WORKS_PARSED_COUNTER_LOG_INTERVAL = 100000;
public static void parseGzActivities(Configuration conf, String inputUri, Path outputPath)
throws Exception {
String uri = inputUri;
FileSystem fs = FileSystem.get(URI.create(uri), conf);
Path inputPath = new Path(uri);
CompressionCodecFactory factory = new CompressionCodecFactory(conf);
CompressionCodec codec = factory.getCodec(inputPath);
if (codec == null) {
System.err.println("No codec found for " + uri);
System.exit(1);
}
CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());
InputStream gzipInputStream = null;
try {
gzipInputStream = codec.createInputStream(fs.open(inputPath));
parseTarActivities(fs, conf, gzipInputStream, outputPath);
} finally {
Log.debug("Closing gzip stream");
IOUtils.closeStream(gzipInputStream);
}
}
private static void parseTarActivities(
FileSystem fs, Configuration conf, InputStream gzipInputStream, Path outputPath) {
int counter = 0;
int noDoiFound = 0;
int errorFromOrcidFound = 0;
int xmlParserErrorFound = 0;
try (TarArchiveInputStream tais = new TarArchiveInputStream(gzipInputStream)) {
TarArchiveEntry entry = null;
try (SequenceFile.Writer writer = SequenceFile
.createWriter(
conf,
SequenceFile.Writer.file(outputPath),
SequenceFile.Writer.keyClass(Text.class),
SequenceFile.Writer.valueClass(Text.class))) {
while ((entry = tais.getNextTarEntry()) != null) {
String filename = entry.getName();
StringBuffer buffer = new StringBuffer();
try {
if (entry.isDirectory() || !filename.contains("works")) {
} else {
Log.debug("XML work entry name: " + entry.getName());
counter++;
BufferedReader br = new BufferedReader(new InputStreamReader(tais)); // Read directly from
// tarInput
String line;
buffer = new StringBuffer();
while ((line = br.readLine()) != null) {
buffer.append(line);
}
WorkDataNoDoi workDataNoDoi = XMLRecordParserNoDoi
.VTDParseWorkData(buffer.toString().getBytes());
if (workDataNoDoi != null) {
if (workDataNoDoi.getErrorCode() != null) {
errorFromOrcidFound += 1;
Log
.debug(
"error from Orcid with code "
+ workDataNoDoi.getErrorCode()
+ " for entry "
+ entry.getName());
continue;
}
boolean isDoiFound = workDataNoDoi
.getExtIds()
.stream()
.filter(e -> e.getType() != null)
.anyMatch(e -> e.getType().equals("doi"));
if (!isDoiFound) {
String jsonData = JsonHelper.createOidWork(workDataNoDoi);
Log.debug("oid: " + workDataNoDoi.getOid() + " data: " + jsonData);
final Text key = new Text(workDataNoDoi.getOid());
final Text value = new Text(jsonData);
try {
writer.append(key, value);
} catch (IOException e) {
Log.debug("Writing to sequence file: " + e.getMessage());
Log.debug(e);
throw new RuntimeException(e);
}
noDoiFound += 1;
}
} else {
Log.warn("Data not retrievable [" + entry.getName() + "] " + buffer.toString());
xmlParserErrorFound += 1;
}
}
} catch (Exception e) {
throw new Exception(filename, e);
}
if ((counter % XML_WORKS_PARSED_COUNTER_LOG_INTERVAL) == 0) {
Log.info("Current xml works parsed: " + counter);
}
if ((MAX_XML_WORKS_PARSED > -1) && (counter > MAX_XML_WORKS_PARSED)) {
break;
}
}
}
} catch (Exception e) {
Log.warn("Parsing work from gzip archive: " + e.getMessage());
Log.warn(e);
throw new RuntimeException(e);
}
Log.info("Activities parse completed");
Log.info("Total XML works parsed: " + counter);
Log.info("Total no doi work found: " + noDoiFound);
Log.info("Error from Orcid found: " + errorFromOrcidFound);
Log.info("Error parsing xml work found: " + xmlParserErrorFound);
}
}

@ -0,0 +1,57 @@
package eu.dnetlib.doiboost.orcidnodoi;
import java.io.IOException;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.mortbay.log.Log;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.doiboost.orcid.OrcidDSManager;
/**
* This job generates one sequence file, the key is an orcid identifier and the
* value is an orcid publication in json format
*/
public class GenOrcidAuthorWork extends OrcidDSManager {
private String activitiesFileNameTarGz;
private String outputWorksPath;
public static void main(String[] args) throws IOException, Exception {
GenOrcidAuthorWork genOrcidAuthorWork = new GenOrcidAuthorWork();
genOrcidAuthorWork.loadArgs(args);
genOrcidAuthorWork.generateAuthorsDOIsData();
}
public void generateAuthorsDOIsData() throws Exception {
Configuration conf = initConfigurationObject();
FileSystem fs = initFileSystemObject(conf);
String tarGzUri = hdfsServerUri.concat(workingPath).concat(activitiesFileNameTarGz);
Path outputPath = new Path(hdfsServerUri.concat(workingPath).concat(outputWorksPath));
ActivitiesDumpReader.parseGzActivities(conf, tarGzUri, outputPath);
}
private void loadArgs(String[] args) throws IOException, Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
GenOrcidAuthorWork.class
.getResourceAsStream(
"/eu/dnetlib/dhp/doiboost/gen_orcid_works-no-doi_from_activities.json")));
parser.parseArgument(args);
hdfsServerUri = parser.get("hdfsServerUri");
Log.info("HDFS URI: " + hdfsServerUri);
workingPath = parser.get("workingPath");
Log.info("Working Path: " + workingPath);
activitiesFileNameTarGz = parser.get("activitiesFileNameTarGz");
Log.info("Activities File Name: " + activitiesFileNameTarGz);
outputWorksPath = parser.get("outputWorksPath");
Log.info("Output Author Work Data: " + outputWorksPath);
}
}

@ -0,0 +1,180 @@
package eu.dnetlib.doiboost.orcidnodoi;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.IOException;
import java.util.Objects;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.util.LongAccumulator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.gson.Gson;
import com.google.gson.JsonElement;
import com.google.gson.JsonParser;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.orcid.AuthorData;
import eu.dnetlib.doiboost.orcid.json.JsonHelper;
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
import eu.dnetlib.doiboost.orcidnodoi.oaf.PublicationToOaf;
import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher;
import scala.Tuple2;
/**
* This spark job generates one parquet file, containing orcid publications dataset
*/
public class SparkGenEnrichedOrcidWorks {
static Logger logger = LoggerFactory.getLogger(SparkGenEnrichedOrcidWorks.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static void main(String[] args) throws IOException, Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
SparkGenEnrichedOrcidWorks.class
.getResourceAsStream(
"/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works_parameters.json")));
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
final String workingPath = parser.get("workingPath");
final String outputEnrichedWorksPath = parser.get("outputEnrichedWorksPath");
final String outputWorksPath = parser.get("outputWorksPath");
final String hdfsServerUri = parser.get("hdfsServerUri");
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaPairRDD<Text, Text> summariesRDD = sc
.sequenceFile(workingPath + "authors/authors.seq", Text.class, Text.class);
Dataset<AuthorData> summariesDataset = spark
.createDataset(
summariesRDD.map(seq -> loadAuthorFromJson(seq._1(), seq._2())).rdd(),
Encoders.bean(AuthorData.class));
logger.info("Authors data loaded: " + summariesDataset.count());
JavaPairRDD<Text, Text> activitiesRDD = sc
.sequenceFile(workingPath + outputWorksPath + "*.seq", Text.class, Text.class);
Dataset<WorkDataNoDoi> activitiesDataset = spark
.createDataset(
activitiesRDD.map(seq -> loadWorkFromJson(seq._1(), seq._2())).rdd(),
Encoders.bean(WorkDataNoDoi.class));
logger.info("Works data loaded: " + activitiesDataset.count());
JavaRDD<Tuple2<String, String>> enrichedWorksRDD = activitiesDataset
.joinWith(
summariesDataset,
activitiesDataset.col("oid").equalTo(summariesDataset.col("oid")), "inner")
.map(
(MapFunction<Tuple2<WorkDataNoDoi, AuthorData>, Tuple2<String, String>>) value -> {
WorkDataNoDoi w = value._1;
AuthorData a = value._2;
AuthorMatcher.match(a, w.getContributors());
return new Tuple2<>(a.getOid(), JsonHelper.createOidWork(w));
},
Encoders.tuple(Encoders.STRING(), Encoders.STRING()))
.filter(Objects::nonNull)
.toJavaRDD();
logger.info("Enriched works RDD ready.");
final LongAccumulator parsedPublications = spark.sparkContext().longAccumulator("parsedPublications");
final LongAccumulator enrichedPublications = spark
.sparkContext()
.longAccumulator("enrichedPublications");
final LongAccumulator errorsGeneric = spark.sparkContext().longAccumulator("errorsGeneric");
final LongAccumulator errorsInvalidTitle = spark.sparkContext().longAccumulator("errorsInvalidTitle");
final LongAccumulator errorsNotFoundAuthors = spark
.sparkContext()
.longAccumulator("errorsNotFoundAuthors");
final LongAccumulator errorsInvalidType = spark.sparkContext().longAccumulator("errorsInvalidType");
final PublicationToOaf publicationToOaf = new PublicationToOaf(
parsedPublications,
enrichedPublications,
errorsGeneric,
errorsInvalidTitle,
errorsNotFoundAuthors,
errorsInvalidType);
JavaRDD<Publication> oafPublicationRDD = enrichedWorksRDD
.map(
e -> {
return (Publication) publicationToOaf
.generatePublicationActionsFromJson(e._2());
})
.filter(p -> p != null);
sc.hadoopConfiguration().set("mapreduce.output.fileoutputformat.compress", "true");
oafPublicationRDD
.mapToPair(
p -> new Tuple2<>(p.getClass().toString(),
OBJECT_MAPPER.writeValueAsString(new AtomicAction<>(Publication.class, (Publication) p))))
.mapToPair(t -> new Tuple2(new Text(t._1()), new Text(t._2())))
.saveAsNewAPIHadoopFile(
workingPath.concat(outputEnrichedWorksPath),
Text.class,
Text.class,
SequenceFileOutputFormat.class,
sc.hadoopConfiguration());
logger.info("parsedPublications: " + parsedPublications.value().toString());
logger.info("enrichedPublications: " + enrichedPublications.value().toString());
logger.info("errorsGeneric: " + errorsGeneric.value().toString());
logger.info("errorsInvalidTitle: " + errorsInvalidTitle.value().toString());
logger.info("errorsNotFoundAuthors: " + errorsNotFoundAuthors.value().toString());
logger.info("errorsInvalidType: " + errorsInvalidType.value().toString());
});
}
private static AuthorData loadAuthorFromJson(Text orcidId, Text json) {
AuthorData authorData = new AuthorData();
authorData.setOid(orcidId.toString());
JsonElement jElement = new JsonParser().parse(json.toString());
authorData.setName(getJsonValue(jElement, "name"));
authorData.setSurname(getJsonValue(jElement, "surname"));
authorData.setCreditName(getJsonValue(jElement, "creditname"));
return authorData;
}
private static WorkDataNoDoi loadWorkFromJson(Text orcidId, Text json) {
WorkDataNoDoi workData = new Gson().fromJson(json.toString(), WorkDataNoDoi.class);
return workData;
}
private static String getJsonValue(JsonElement jElement, String property) {
if (jElement.getAsJsonObject().has(property)) {
JsonElement name = null;
name = jElement.getAsJsonObject().get(property);
if (name != null && !name.isJsonNull()) {
return name.getAsString();
}
}
return new String("");
}
}

@ -0,0 +1,31 @@
package eu.dnetlib.doiboost.orcidnodoi.json;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.gson.JsonObject;
import eu.dnetlib.dhp.schema.orcid.AuthorData;
import eu.dnetlib.doiboost.orcid.model.WorkData;
/**
* This class converts an object to json and viceversa
*/
public class JsonWriter {
public static final com.fasterxml.jackson.databind.ObjectMapper OBJECT_MAPPER = new ObjectMapper()
.setSerializationInclusion(JsonInclude.Include.NON_NULL);;
public static String create(AuthorData authorData) throws JsonProcessingException {
return OBJECT_MAPPER.writeValueAsString(authorData);
}
public static String create(WorkData workData) {
JsonObject work = new JsonObject();
work.addProperty("oid", workData.getOid());
work.addProperty("doi", workData.getDoi());
return work.toString();
}
}

@ -0,0 +1,58 @@
package eu.dnetlib.doiboost.orcidnodoi.model;
import java.io.Serializable;
import eu.dnetlib.dhp.schema.orcid.AuthorData;
/**
* This class models the data related to a contributor, that are retrieved from an orcid publication
*/
public class Contributor extends AuthorData implements Serializable {
private String sequence;
private String role;
private transient boolean simpleMatch = false;
private transient Double score = 0.0;
private transient boolean bestMatch = false;
public String getSequence() {
return sequence;
}
public void setSequence(String sequence) {
this.sequence = sequence;
}
public String getRole() {
return role;
}
public void setRole(String role) {
this.role = role;
}
public boolean isSimpleMatch() {
return simpleMatch;
}
public void setSimpleMatch(boolean simpleMatch) {
this.simpleMatch = simpleMatch;
}
public Double getScore() {
return score;
}
public void setScore(Double score) {
this.score = score;
}
public boolean isBestMatch() {
return bestMatch;
}
public void setBestMatch(boolean bestMatch) {
this.bestMatch = bestMatch;
}
}

@ -0,0 +1,36 @@
package eu.dnetlib.doiboost.orcidnodoi.model;
/**
* This class models the data related to external id, that are retrieved from an orcid publication
*/
public class ExternalId {
private String type;
private String value;
private String relationShip;
public String getType() {
return type;
}
public void setType(String type) {
this.type = type;
}
public String getValue() {
return value;
}
public void setValue(String value) {
this.value = value;
}
public String getRelationShip() {
return relationShip;
}
public void setRelationShip(String relationShip) {
this.relationShip = relationShip;
}
}

@ -0,0 +1,36 @@
package eu.dnetlib.doiboost.orcidnodoi.model;
/**
* This class models the data related to a publication date, that are retrieved from an orcid publication
*/
public class PublicationDate {
private String year;
private String month;
private String day;
public String getYear() {
return year;
}
public void setYear(String year) {
this.year = year;
}
public String getMonth() {
return month;
}
public void setMonth(String month) {
this.month = month;
}
public String getDay() {
return day;
}
public void setDay(String day) {
this.day = day;
}
}

@ -0,0 +1,104 @@
package eu.dnetlib.doiboost.orcidnodoi.model;
import java.io.Serializable;
import java.util.List;
/**
* This class models the data that are retrieved from orcid publication
*/
public class WorkDataNoDoi implements Serializable {
private String oid;
private String id;
private String sourceName;
private String type;
private List<String> titles;
private List<String> urls;
List<ExternalId> extIds;
List<PublicationDate> publicationDates;
List<Contributor> contributors;
public String getOid() {
return oid;
}
public void setOid(String oid) {
this.oid = oid;
}
public String getErrorCode() {
return errorCode;
}
public void setErrorCode(String errorCode) {
this.errorCode = errorCode;
}
private String errorCode;
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public List<String> getTitles() {
return titles;
}
public void setTitles(List<String> titles) {
this.titles = titles;
}
public String getSourceName() {
return sourceName;
}
public void setSourceName(String sourceName) {
this.sourceName = sourceName;
}
public String getType() {
return type;
}
public void setType(String type) {
this.type = type;
}
public List<String> getUrls() {
return urls;
}
public void setUrls(List<String> urls) {
this.urls = urls;
}
public List<ExternalId> getExtIds() {
return extIds;
}
public void setExtIds(List<ExternalId> extIds) {
this.extIds = extIds;
}
public List<PublicationDate> getPublicationDates() {
return publicationDates;
}
public void setPublicationDates(List<PublicationDate> publicationDates) {
this.publicationDates = publicationDates;
}
public List<Contributor> getContributors() {
return contributors;
}
public void setContributors(List<Contributor> contributors) {
this.contributors = contributors;
}
}

@ -0,0 +1,543 @@
package eu.dnetlib.doiboost.orcidnodoi.oaf;
import static eu.dnetlib.doiboost.orcidnodoi.util.DumpToActionsUtility.*;
import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.util.LongAccumulator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.gson.*;
import eu.dnetlib.dhp.common.PacePerson;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.utils.DHPUtils;
import eu.dnetlib.doiboost.orcidnodoi.util.DumpToActionsUtility;
import eu.dnetlib.doiboost.orcidnodoi.util.Pair;
/**
* This class converts an orcid publication from json format to oaf
*/
public class PublicationToOaf implements Serializable {
static Logger logger = LoggerFactory.getLogger(PublicationToOaf.class);
public static final String ORCID = "ORCID";
public final static String orcidPREFIX = "orcid_______";
public static final String OPENAIRE_PREFIX = "openaire____";
public static final String SEPARATOR = "::";
private final LongAccumulator parsedPublications;
private final LongAccumulator enrichedPublications;
private final LongAccumulator errorsGeneric;
private final LongAccumulator errorsInvalidTitle;
private final LongAccumulator errorsNotFoundAuthors;
private final LongAccumulator errorsInvalidType;
public PublicationToOaf(
LongAccumulator parsedPublications,
LongAccumulator enrichedPublications,
LongAccumulator errorsGeneric,
LongAccumulator errorsInvalidTitle,
LongAccumulator errorsNotFoundAuthors,
LongAccumulator errorsInvalidType) {
this.parsedPublications = parsedPublications;
this.enrichedPublications = enrichedPublications;
this.errorsGeneric = errorsGeneric;
this.errorsInvalidTitle = errorsInvalidTitle;
this.errorsNotFoundAuthors = errorsNotFoundAuthors;
this.errorsInvalidType = errorsInvalidType;
}
public PublicationToOaf() {
this.parsedPublications = null;
this.enrichedPublications = null;
this.errorsGeneric = null;
this.errorsInvalidTitle = null;
this.errorsNotFoundAuthors = null;
this.errorsInvalidType = null;
}
private static Map<String, Pair<String, String>> datasources = new HashMap<String, Pair<String, String>>() {
{
put(ORCID.toLowerCase(), new Pair<>(ORCID, OPENAIRE_PREFIX + SEPARATOR + "orcid"));
}
};
// json external id will be mapped to oaf:pid/@classid Map to oaf:pid/@classname
private static Map<String, Pair<String, String>> externalIds = new HashMap<String, Pair<String, String>>() {
{
put("ark".toLowerCase(), new Pair<>("ark", "ark"));
put("arxiv".toLowerCase(), new Pair<>("arxiv", "arXiv"));
put("pmc".toLowerCase(), new Pair<>("pmc", "pmc"));
put("pmid".toLowerCase(), new Pair<>("pmid", "pmid"));
put("source-work-id".toLowerCase(), new Pair<>("orcidworkid", "orcidworkid"));
put("urn".toLowerCase(), new Pair<>("urn", "urn"));
}
};
static Map<String, Map<String, String>> typologiesMapping;
static {
try {
final String tt = IOUtils
.toString(
PublicationToOaf.class
.getResourceAsStream(
"/eu/dnetlib/dhp/doiboost/orcidnodoi/mappings/typologies.json"));
typologiesMapping = new Gson().fromJson(tt, Map.class);
} catch (Exception e) {
throw new RuntimeException("loading typologies", e);
}
}
public static final String PID_TYPES = "dnet:pid_types";
public Oaf generatePublicationActionsFromJson(final String json) {
try {
if (parsedPublications != null) {
parsedPublications.add(1);
}
JsonElement jElement = new JsonParser().parse(json);
JsonObject jObject = jElement.getAsJsonObject();
return generatePublicationActionsFromDump(jObject);
} catch (Throwable t) {
logger.error("creating publication: " + t.getMessage());
if (errorsGeneric != null) {
errorsGeneric.add(1);
}
return null;
}
}
public Oaf generatePublicationActionsFromDump(final JsonObject rootElement) {
if (!isValid(rootElement)) {
return null;
}
Publication publication = new Publication();
final DataInfo dataInfo = new DataInfo();
dataInfo.setDeletedbyinference(false);
dataInfo.setInferred(false);
dataInfo.setTrust("0.9");
dataInfo
.setProvenanceaction(
mapQualifier(
"sysimport:actionset:orcidworks-no-doi",
"sysimport:actionset:orcidworks-no-doi",
"dnet:provenanceActions",
"dnet:provenanceActions"));
publication.setDataInfo(dataInfo);
publication.setLastupdatetimestamp(new Date().getTime());
publication.setDateofcollection("2020-10-14");
publication.setDateoftransformation(DumpToActionsUtility.now_ISO8601());
// Adding external ids
externalIds
.keySet()
.stream()
.forEach(jsonExtId -> {
final String classid = externalIds.get(jsonExtId.toLowerCase()).getValue();
final String classname = externalIds.get(jsonExtId.toLowerCase()).getKey();
final String extId = getStringValue(rootElement, jsonExtId);
if (StringUtils.isNotBlank(extId)) {
publication
.getExternalReference()
.add(
convertExtRef(extId, classid, classname, "dnet:pid_types", "dnet:pid_types"));
}
});
// Adding source
final String source = getStringValue(rootElement, "sourceName");
if (StringUtils.isNotBlank(source)) {
Field<String> sourceField = mapStringField(source, null);
if (sourceField == null) {
publication.setSource(null);
} else {
publication.setSource(Arrays.asList(sourceField));
}
}
// Adding titles
final List<String> titles = createRepeatedField(rootElement, "titles");
if (titles == null || titles.isEmpty()) {
if (errorsInvalidTitle != null) {
errorsInvalidTitle.add(1);
}
return null;
}
Qualifier q = mapQualifier("main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title");
publication
.setTitle(
titles
.stream()
.map(t -> {
return mapStructuredProperty(t, q, null);
})
.filter(s -> s != null)
.collect(Collectors.toList()));
// Adding identifier
final String id = getStringValue(rootElement, "id");
String sourceId = null;
if (id != null) {
publication.setOriginalId(Arrays.asList(id));
sourceId = String.format("50|%s" + SEPARATOR + "%s", orcidPREFIX, DHPUtils.md5(id.toLowerCase()));
} else {
String mergedTitle = titles.stream().map(Object::toString).collect(Collectors.joining(","));
sourceId = String.format("50|%s" + SEPARATOR + "%s", orcidPREFIX, DHPUtils.md5(mergedTitle.toLowerCase()));
}
publication.setId(sourceId);
// Adding relevant date
settingRelevantDate(rootElement, publication, "publication_date", "issued", true);
// Adding collectedfrom
publication.setCollectedfrom(Arrays.asList(createCollectedFrom()));
// Adding type
final String type = getStringValue(rootElement, "type");
String cobjValue = "";
if (StringUtils.isNotBlank(type)) {
publication.setResourcetype(mapQualifier(type, type, "dnet:dataCite_resource", "dnet:dataCite_resource"));
final String typeValue = typologiesMapping.get(type).get("value");
cobjValue = typologiesMapping.get(type).get("cobj");
final Instance instance = new Instance();
// Adding hostedby
instance.setHostedby(createHostedBy());
// Adding url
final List<String> urls = createRepeatedField(rootElement, "urls");
if (urls != null && !urls.isEmpty()) {
instance.setUrl(urls);
} else {
dataInfo.setInvisible(true);
}
final String pubDate = getPublicationDate(rootElement, "publicationDates");
if (StringUtils.isNotBlank(pubDate)) {
instance.setDateofacceptance(mapStringField(pubDate, null));
}
instance.setCollectedfrom(createCollectedFrom());
// Adding accessright
instance.setAccessright(mapQualifier("UNKNOWN", "UNKNOWN", "dnet:access_modes", "dnet:access_modes"));
// Adding type
instance
.setInstancetype(
mapQualifier(cobjValue, typeValue, "dnet:publication_resource", "dnet:publication_resource"));
publication.setInstance(Arrays.asList(instance));
} else {
if (errorsInvalidType != null) {
errorsInvalidType.add(1);
}
return null;
}
// Adding authors
final List<Author> authors = createAuthors(rootElement);
if (authors != null && authors.size() > 0) {
publication.setAuthor(authors);
} else {
if (errorsNotFoundAuthors != null) {
errorsNotFoundAuthors.add(1);
}
return null;
}
String classValue = getDefaultResulttype(cobjValue);
publication
.setResulttype(mapQualifier(classValue, classValue, "dnet:result_typologies", "dnet:result_typologies"));
if (enrichedPublications != null) {
enrichedPublications.add(1);
}
return publication;
}
public List<Author> createAuthors(final JsonObject root) {
final String authorsJSONFieldName = "contributors";
if (root.has(authorsJSONFieldName) && root.get(authorsJSONFieldName).isJsonArray()) {
final List<Author> authors = new ArrayList<>();
final JsonArray jsonAuthors = root.getAsJsonArray(authorsJSONFieldName);
int firstCounter = 0;
int defaultCounter = 0;
int rank = 1;
int currentRank = 0;
for (final JsonElement item : jsonAuthors) {
final JsonObject jsonAuthor = item.getAsJsonObject();
final Author author = new Author();
if (item.isJsonObject()) {
final String creditname = getStringValue(jsonAuthor, "creditName");
final String surname = getStringValue(jsonAuthor, "surname");
final String name = getStringValue(jsonAuthor, "name");
final String oid = getStringValue(jsonAuthor, "oid");
final String seq = getStringValue(jsonAuthor, "sequence");
if (StringUtils.isNotBlank(seq)) {
if (seq.equals("first")) {
firstCounter += 1;
rank = firstCounter;
} else if (seq.equals("additional")) {
rank = currentRank + 1;
} else {
defaultCounter += 1;
rank = defaultCounter;
}
}
if (StringUtils.isNotBlank(oid)) {
author.setPid(Arrays.asList(mapAuthorId(oid)));
author.setFullname(name + " " + surname);
if (StringUtils.isNotBlank(name)) {
author.setName(name);
}
if (StringUtils.isNotBlank(surname)) {
author.setSurname(surname);
}
} else {
PacePerson p = new PacePerson(creditname, false);
if (p.isAccurate()) {
author.setName(p.getNormalisedFirstName());
author.setSurname(p.getNormalisedSurname());
author.setFullname(p.getNormalisedFullname());
} else {
author.setFullname(creditname);
}
}
}
author.setRank(rank);
authors.add(author);
currentRank = rank;
}
return authors;
}
return null;
}
private List<String> createRepeatedField(final JsonObject rootElement, final String fieldName) {
if (!rootElement.has(fieldName)) {
return null;
}
if (rootElement.has(fieldName) && rootElement.get(fieldName).isJsonNull()) {
return null;
}
if (rootElement.get(fieldName).isJsonArray()) {
if (!isValidJsonArray(rootElement, fieldName)) {
return null;
}
return getArrayValues(rootElement, fieldName);
} else {
String field = getStringValue(rootElement, fieldName);
return Arrays.asList(cleanField(field));
}
}
private String cleanField(String value) {
if (value != null && !value.isEmpty() && value.charAt(0) == '"' && value.charAt(value.length() - 1) == '"') {
value = value.substring(1, value.length() - 1);
}
return value;
}
private void settingRelevantDate(final JsonObject rootElement,
final Publication publication,
final String jsonKey,
final String dictionaryKey,
final boolean addToDateOfAcceptance) {
final String pubDate = getPublicationDate(rootElement, "publication_date");
if (StringUtils.isNotBlank(pubDate)) {
if (addToDateOfAcceptance) {
publication.setDateofacceptance(mapStringField(pubDate, null));
}
Qualifier q = mapQualifier(dictionaryKey, dictionaryKey, "dnet:dataCite_date", "dnet:dataCite_date");
publication
.setRelevantdate(
Arrays
.asList(pubDate)
.stream()
.map(r -> {
return mapStructuredProperty(r, q, null);
})
.filter(s -> s != null)
.collect(Collectors.toList()));
}
}
private String getPublicationDate(final JsonObject rootElement,
final String jsonKey) {
JsonObject pubDateJson = null;
try {
pubDateJson = rootElement.getAsJsonObject(jsonKey);
} catch (Exception e) {
return null;
}
if (pubDateJson == null) {
return null;
}
final String year = getStringValue(pubDateJson, "year");
final String month = getStringValue(pubDateJson, "month");
final String day = getStringValue(pubDateJson, "day");
if (StringUtils.isBlank(year)) {
return null;
}
String pubDate = "".concat(year);
if (StringUtils.isNotBlank(month)) {
pubDate = pubDate.concat("-" + month);
if (StringUtils.isNotBlank(day)) {
pubDate = pubDate.concat("-" + day);
} else {
pubDate += "-01";
}
} else {
pubDate += "-01-01";
}
if (isValidDate(pubDate)) {
return pubDate;
}
return null;
}
protected boolean isValid(final JsonObject rootElement/* , final Reporter context */) {
final String type = getStringValue(rootElement, "type");
if (!typologiesMapping.containsKey(type)) {
logger.error("unknowntype_" + type);
if (errorsInvalidType != null) {
errorsInvalidType.add(1);
}
return false;
}
if (!isValidJsonArray(rootElement, "titles")) {
if (errorsInvalidTitle != null) {
errorsInvalidTitle.add(1);
}
return false;
}
return true;
}
private boolean isValidJsonArray(final JsonObject rootElement, final String fieldName) {
if (!rootElement.has(fieldName)) {
return false;
}
final JsonElement jsonElement = rootElement.get(fieldName);
if (jsonElement.isJsonNull()) {
return false;
}
if (jsonElement.isJsonArray()) {
final JsonArray jsonArray = jsonElement.getAsJsonArray();
if (jsonArray.isJsonNull()) {
return false;
}
if (jsonArray.get(0).isJsonNull()) {
return false;
}
}
return true;
}
private Qualifier mapQualifier(String classId, String className, String schemeId, String schemeName) {
final Qualifier qualifier = new Qualifier();
qualifier.setClassid(classId);
qualifier.setClassname(className);
qualifier.setSchemeid(schemeId);
qualifier.setSchemename(schemeName);
return qualifier;
}
private ExternalReference convertExtRef(String extId, String classId, String className, String schemeId,
String schemeName) {
ExternalReference ex = new ExternalReference();
ex.setRefidentifier(extId);
ex.setQualifier(mapQualifier(classId, className, schemeId, schemeName));
return ex;
}
private StructuredProperty mapStructuredProperty(String value, Qualifier qualifier, DataInfo dataInfo) {
if (value == null | StringUtils.isBlank(value)) {
return null;
}
final StructuredProperty structuredProperty = new StructuredProperty();
structuredProperty.setValue(value);
structuredProperty.setQualifier(qualifier);
structuredProperty.setDataInfo(dataInfo);
return structuredProperty;
}
private Field<String> mapStringField(String value, DataInfo dataInfo) {
if (value == null || StringUtils.isBlank(value)) {
return null;
}
final Field<String> stringField = new Field<>();
stringField.setValue(value);
stringField.setDataInfo(dataInfo);
return stringField;
}
private KeyValue createCollectedFrom() {
KeyValue cf = new KeyValue();
cf.setValue(ORCID);
cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "806360c771262b4d6770e7cdf04b5c5a");
return cf;
}
private KeyValue createHostedBy() {
KeyValue hb = new KeyValue();
hb.setValue("Unknown Repository");
hb.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c");
return hb;
}
private StructuredProperty mapAuthorId(String orcidId) {
final StructuredProperty sp = new StructuredProperty();
sp.setValue(orcidId);
final Qualifier q = new Qualifier();
q.setClassid(ORCID.toLowerCase());
q.setClassname(ORCID.toLowerCase());
q.setSchemeid(ModelConstants.DNET_PID_TYPES);
q.setSchemename(ModelConstants.DNET_PID_TYPES);
sp.setQualifier(q);
final DataInfo dataInfo = new DataInfo();
dataInfo.setDeletedbyinference(false);
dataInfo.setInferred(false);
dataInfo.setTrust("0.9");
dataInfo
.setProvenanceaction(
mapQualifier(
"sysimport:crosswalk:entityregistry",
"Harvested",
"dnet:provenanceActions",
"dnet:provenanceActions"));
sp.setDataInfo(dataInfo);
return sp;
}
}

@ -0,0 +1,217 @@
package eu.dnetlib.doiboost.orcidnodoi.similarity;
import java.io.IOException;
import java.text.Normalizer;
import java.util.*;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.text.similarity.JaroWinklerSimilarity;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.ximpleware.NavException;
import com.ximpleware.ParseException;
import com.ximpleware.XPathEvalException;
import com.ximpleware.XPathParseException;
import eu.dnetlib.dhp.parser.utility.VtdException;
import eu.dnetlib.dhp.schema.orcid.AuthorData;
import eu.dnetlib.doiboost.orcidnodoi.model.Contributor;
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
/**
* This class is used for searching from a list of publication contributors a
* specific author making a similarity check on both name and surname of the
* author with the credit name of each contributor of the list; as soon as
* the match is found (if exist) author informations are used to enrich the
* matched contribuotr inside contributors list
*/
public class AuthorMatcher {
private static final Logger logger = LoggerFactory.getLogger(AuthorMatcher.class);
public static final Double threshold = 0.8;
public static void match(AuthorData author, List<Contributor> contributors)
throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException {
int matchCounter = 0;
List<Integer> matchCounters = Arrays.asList(matchCounter);
Contributor contributor = null;
contributors
.stream()
.filter(c -> !StringUtils.isBlank(c.getCreditName()))
.forEach(c -> {
if (simpleMatch(c.getCreditName(), author.getName()) ||
simpleMatch(c.getCreditName(), author.getSurname()) ||
simpleMatchOnOtherNames(c.getCreditName(), author.getOtherNames())) {
matchCounters.set(0, matchCounters.get(0) + 1);
c.setSimpleMatch(true);
}
});
if (matchCounters.get(0) == 1) {
updateAuthorsSimpleMatch(contributors, author);
} else if (matchCounters.get(0) == 0) {
Optional<Contributor> optCon = contributors
.stream()
.filter(c -> !StringUtils.isBlank(c.getCreditName()))
.map(c -> {
c.setScore(bestMatch(author.getName(), author.getSurname(), c.getCreditName()));
return c;
})
.filter(c -> c.getScore() >= threshold)
.max(Comparator.comparing(c -> c.getScore()));
Contributor bestMatchContributor = null;
if (optCon.isPresent()) {
bestMatchContributor = optCon.get();
bestMatchContributor.setBestMatch(true);
updateAuthorsSimilarityMatch(contributors, author);
}
} else if (matchCounters.get(0) > 1) {
Optional<Contributor> optCon = contributors
.stream()
.filter(c -> c.isSimpleMatch())
.filter(c -> !StringUtils.isBlank(c.getCreditName()))
.map(c -> {
c.setScore(bestMatch(author.getName(), author.getSurname(), c.getCreditName()));
return c;
})
.filter(c -> c.getScore() >= threshold)
.max(Comparator.comparing(c -> c.getScore()));
Contributor bestMatchContributor = null;
if (optCon.isPresent()) {
bestMatchContributor = optCon.get();
bestMatchContributor.setBestMatch(true);
updateAuthorsSimilarityMatch(contributors, author);
}
}
}
public static boolean simpleMatchOnOtherNames(String name, List<String> otherNames) {
if (otherNames == null || (otherNames != null && otherNames.isEmpty())) {
return false;
}
return otherNames.stream().filter(o -> simpleMatch(name, o)).count() > 0;
}
public static boolean simpleMatch(String name, String searchValue) {
if (searchValue == null) {
return false;
}
return normalize(name).contains(normalize(searchValue));
}
public static Double bestMatch(String authorSurname, String authorName, String contributor) {
String[] contributorSplitted = contributor.split(" ");
if (contributorSplitted.length == 0) {
return 0.0;
}
final String contributorName = contributorSplitted[contributorSplitted.length - 1];
String contributorSurname = "";
if (contributorSplitted.length > 1) {
StringJoiner joiner = new StringJoiner(" ");
for (int i = 0; i < contributorSplitted.length - 1; i++) {
joiner.add(contributorSplitted[i]);
}
contributorSurname = joiner.toString();
}
String authorNameNrm = normalize(authorName);
String authorSurnameNrm = normalize(authorSurname);
String contributorNameNrm = normalize(contributorName);
String contributorSurnameNrm = normalize(contributorSurname);
Double sm1 = similarity(authorNameNrm, authorSurnameNrm, contributorNameNrm, contributorSurnameNrm);
Double sm2 = similarity(authorNameNrm, authorSurnameNrm, contributorSurnameNrm, contributorNameNrm);
if (sm1.compareTo(sm2) >= 0) {
return sm1;
}
return sm2;
}
public static Double similarity(String nameA, String surnameA, String nameB, String surnameB) {
Double score = similarityJaroWinkler(nameA, surnameA, nameB, surnameB);
return score;
}
private static Double similarityJaroWinkler(String nameA, String surnameA, String nameB, String surnameB) {
return new JaroWinklerSimilarity().apply(normalize(parse(nameA, surnameA)), normalize(parse(nameB, surnameB)));
}
public static String normalize(final String s) {
if (s == null) {
return new String("");
}
return nfd(s)
.toLowerCase()
// do not compact the regexes in a single expression, would cause StackOverflowError
// in case
// of large input strings
.replaceAll("(\\W)+", " ")
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
.replaceAll("(\\p{Punct})+", " ")
.replaceAll("(\\d)+", " ")
.replaceAll("(\\n)+", " ")
.trim();
}
private static String nfd(final String s) {
return Normalizer.normalize(s, Normalizer.Form.NFD);
}
private static String parse(String name, String surname) {
return surname + " " + name;
}
public static void updateAuthorsSimpleMatch(List<Contributor> contributors, AuthorData author) {
contributors.forEach(c -> {
if (c.isSimpleMatch()) {
c.setName(author.getName());
c.setSurname(author.getSurname());
c.setOid(author.getOid());
}
});
updateRanks(contributors);
}
public static void updateAuthorsSimilarityMatch(List<Contributor> contributors, AuthorData author) {
contributors
.stream()
.filter(c -> c.isBestMatch())
.forEach(c -> {
c.setName(author.getName());
c.setSurname(author.getSurname());
c.setOid(author.getOid());
});
updateRanks(contributors);
}
private static void updateRanks(List<Contributor> contributors) {
boolean seqFound = false;
if (contributors
.stream()
.filter(
c -> c.getRole() != null && c.getSequence() != null &&
c.getRole().equals("author") && (c.getSequence().equals("first") ||
c.getSequence().equals("additional")))
.count() > 0) {
seqFound = true;
}
if (!seqFound) {
List<Integer> seqIds = Arrays.asList(0);
contributors.forEach(c -> {
int currentSeq = seqIds.get(0) + 1;
seqIds.set(0, currentSeq);
c.setSequence(Integer.toString(seqIds.get(0)));
});
}
}
private static String toJson(WorkDataNoDoi work) {
GsonBuilder builder = new GsonBuilder();
Gson gson = builder.create();
return gson.toJson(work);
}
}

@ -0,0 +1,113 @@
package eu.dnetlib.doiboost.orcidnodoi.util;
import java.text.SimpleDateFormat;
import java.util.*;
import org.apache.commons.lang3.StringUtils;
import com.google.gson.JsonArray;
import com.google.gson.JsonObject;
/**
* Utility class
*/
public class DumpToActionsUtility {
private static final SimpleDateFormat ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US);
public static String getStringValue(final JsonObject root, final String key) {
if (root.has(key) && !root.get(key).isJsonNull())
return root.get(key).getAsString();
return new String("");
}
public static List<String> getArrayValues(final JsonObject root, final String key) {
if (root.has(key) && root.get(key).isJsonArray()) {
final JsonArray asJsonArray = root.get(key).getAsJsonArray();
final List<String> result = new ArrayList<>();
asJsonArray.forEach(it -> {
if (StringUtils.isNotBlank(it.getAsString())) {
result.add(it.getAsString());
}
});
return result;
}
return new ArrayList<>();
}
public static List<JsonObject> getArrayObjects(final JsonObject root, final String key) {
if (root.has(key) && root.get(key).isJsonArray()) {
final JsonArray asJsonArray = root.get(key).getAsJsonArray();
final List<JsonObject> result = new ArrayList<>();
asJsonArray.forEach(it -> {
if (it.getAsJsonObject() != null) {
result.add(it.getAsJsonObject());
}
});
return result;
}
return new ArrayList<>();
}
public static boolean isValidDate(final String date) {
return date.matches("\\d{4}-\\d{2}-\\d{2}");
}
public static String now_ISO8601() { // NOPMD
String result;
synchronized (ISO8601FORMAT) {
result = ISO8601FORMAT.format(new Date());
}
// convert YYYYMMDDTHH:mm:ss+HH00 into YYYYMMDDTHH:mm:ss+HH:00
// - note the added colon for the Timezone
return result.substring(0, result.length() - 2) + ":" + result.substring(result.length() - 2);
}
public static String getDefaultResulttype(final String cobjcategory) {
switch (cobjcategory) {
case "0029":
return "software";
case "0021":
case "0024":
case "0025":
case "0030":
return "dataset";
case "0000":
case "0010":
case "0018":
case "0020":
case "0022":
case "0023":
case "0026":
case "0027":
case "0028":
case "0037":
return "other";
case "0001":
case "0002":
case "0004":
case "0005":
case "0006":
case "0007":
case "0008":
case "0009":
case "0011":
case "0012":
case "0013":
case "0014":
case "0015":
case "0016":
case "0017":
case "0019":
case "0031":
case "0032":
return "publication";
default:
return "publication";
}
}
}

@ -0,0 +1,32 @@
package eu.dnetlib.doiboost.orcidnodoi.util;
public class Pair<K, V> {
private K k;
private V v;
public Pair(K k, V v) {
this.k = k;
this.v = v;
}
public K getKey() {
return k;
}
public V getValue() {
return v;
}
@Override
public boolean equals(Object obj) {
if (obj instanceof Pair<?, ?>) {
Pair<?, ?> tmp = (Pair<?, ?>) obj;
return k.equals(tmp.getKey()) && v.equals(tmp.getValue());
} else
return false;
}
}

@ -0,0 +1,217 @@
package eu.dnetlib.doiboost.orcidnodoi.xml;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.ximpleware.*;
import eu.dnetlib.dhp.parser.utility.VtdException;
import eu.dnetlib.dhp.parser.utility.VtdUtilityParser;
import eu.dnetlib.doiboost.orcidnodoi.model.Contributor;
import eu.dnetlib.doiboost.orcidnodoi.model.ExternalId;
import eu.dnetlib.doiboost.orcidnodoi.model.PublicationDate;
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
/**
* This class is used for parsing xml data with vtd parser
*/
public class XMLRecordParserNoDoi {
private static final Logger logger = LoggerFactory.getLogger(XMLRecordParserNoDoi.class);
private static final String NS_COMMON_URL = "http://www.orcid.org/ns/common";
private static final String NS_COMMON = "common";
private static final String NS_PERSON_URL = "http://www.orcid.org/ns/person";
private static final String NS_PERSON = "person";
private static final String NS_DETAILS_URL = "http://www.orcid.org/ns/personal-details";
private static final String NS_DETAILS = "personal-details";
private static final String NS_OTHER_URL = "http://www.orcid.org/ns/other-name";
private static final String NS_OTHER = "other-name";
private static final String NS_RECORD_URL = "http://www.orcid.org/ns/record";
private static final String NS_RECORD = "record";
private static final String NS_ERROR_URL = "http://www.orcid.org/ns/error";
private static final String NS_WORK = "work";
private static final String NS_WORK_URL = "http://www.orcid.org/ns/work";
private static final String NS_ERROR = "error";
public static WorkDataNoDoi VTDParseWorkData(byte[] bytes)
throws VtdException, EncodingException, EOFException, EntityException, ParseException, XPathParseException,
NavException, XPathEvalException {
final VTDGen vg = new VTDGen();
vg.setDoc(bytes);
vg.parse(true);
final VTDNav vn = vg.getNav();
final AutoPilot ap = new AutoPilot(vn);
ap.declareXPathNameSpace(NS_COMMON, NS_COMMON_URL);
ap.declareXPathNameSpace(NS_WORK, NS_WORK_URL);
ap.declareXPathNameSpace(NS_ERROR, NS_ERROR_URL);
WorkDataNoDoi workData = new WorkDataNoDoi();
final List<String> errors = VtdUtilityParser.getTextValue(ap, vn, "//error:response-code");
if (!errors.isEmpty()) {
workData.setErrorCode(errors.get(0));
return workData;
}
List<VtdUtilityParser.Node> workNodes = VtdUtilityParser
.getTextValuesWithAttributes(ap, vn, "//work:work", Arrays.asList("path", "put-code"));
if (!workNodes.isEmpty()) {
final String oid = (workNodes.get(0).getAttributes().get("path")).split("/")[1];
workData.setOid(oid);
final String id = (workNodes.get(0).getAttributes().get("put-code"));
workData.setId(id);
} else {
return null;
}
final List<String> titles = VtdUtilityParser
.getTextValue(
ap, vn, "//common:title");
if (!titles.isEmpty()) {
workData.setTitles(titles);
}
final List<String> sourceNames = VtdUtilityParser
.getTextValue(
ap, vn, "//common:source-name");
if (!sourceNames.isEmpty()) {
workData.setSourceName(sourceNames.get(0));
}
final List<String> types = VtdUtilityParser
.getTextValue(
ap, vn, "//work:type");
if (!types.isEmpty()) {
workData.setType(types.get(0));
}
final List<String> urls = VtdUtilityParser
.getTextValue(
ap, vn, "//common:url");
if (!urls.isEmpty()) {
workData.setUrls(urls);
}
workData.setPublicationDates(getPublicationDates(vg, vn, ap));
workData.setExtIds(getExternalIds(vg, vn, ap));
workData.setContributors(getContributors(vg, vn, ap));
return workData;
}
private static List<PublicationDate> getPublicationDates(VTDGen vg, VTDNav vn, AutoPilot ap)
throws XPathParseException, NavException, XPathEvalException {
List<PublicationDate> publicationDates = new ArrayList<PublicationDate>();
int yearIndex = 0;
ap.selectXPath("//common:publication-date/common:year");
while (ap.evalXPath() != -1) {
PublicationDate publicationDate = new PublicationDate();
int t = vn.getText();
if (t >= 0) {
publicationDate.setYear(vn.toNormalizedString(t));
publicationDates.add(yearIndex, publicationDate);
yearIndex++;
}
}
int monthIndex = 0;
ap.selectXPath("//common:publication-date/common:month");
while (ap.evalXPath() != -1) {
int t = vn.getText();
if (t >= 0) {
publicationDates.get(monthIndex).setMonth(vn.toNormalizedString(t));
monthIndex++;
}
}
int dayIndex = 0;
ap.selectXPath("//common:publication-date/common:day");
while (ap.evalXPath() != -1) {
int t = vn.getText();
if (t >= 0) {
publicationDates.get(dayIndex).setDay(vn.toNormalizedString(t));
dayIndex++;
}
}
return publicationDates;
}
private static List<ExternalId> getExternalIds(VTDGen vg, VTDNav vn, AutoPilot ap)
throws XPathParseException, NavException, XPathEvalException {
List<ExternalId> extIds = new ArrayList<ExternalId>();
int typeIndex = 0;
ap.selectXPath("//common:external-id/common:external-id-type");
while (ap.evalXPath() != -1) {
ExternalId extId = new ExternalId();
int t = vn.getText();
if (t >= 0) {
extId.setType(vn.toNormalizedString(t));
extIds.add(typeIndex, extId);
typeIndex++;
}
}
int valueIndex = 0;
ap.selectXPath("//common:external-id/common:external-id-value");
while (ap.evalXPath() != -1) {
int t = vn.getText();
if (t >= 0) {
extIds.get(valueIndex).setValue(vn.toNormalizedString(t));
valueIndex++;
}
}
int relationshipIndex = 0;
ap.selectXPath("//common:external-id/common:external-id-relationship");
while (ap.evalXPath() != -1) {
int t = vn.getText();
if (t >= 0) {
extIds.get(relationshipIndex).setRelationShip(vn.toNormalizedString(t));
relationshipIndex++;
}
}
if (typeIndex == valueIndex) {
return extIds;
}
return new ArrayList<ExternalId>();
}
private static List<Contributor> getContributors(VTDGen vg, VTDNav vn, AutoPilot ap)
throws XPathParseException, NavException, XPathEvalException {
List<Contributor> contributors = new ArrayList<Contributor>();
ap.selectXPath("//work:contributors/work:contributor");
while (ap.evalXPath() != -1) {
Contributor contributor = new Contributor();
if (vn.toElement(VTDNav.FIRST_CHILD, "work:credit-name")) {
int val = vn.getText();
if (val != -1) {
contributor.setCreditName(vn.toNormalizedString(val));
}
vn.toElement(VTDNav.PARENT);
}
if (vn.toElement(VTDNav.FIRST_CHILD, "work:contributor-attributes")) {
if (vn.toElement(VTDNav.FIRST_CHILD, "work:contributor-sequence")) {
int val = vn.getText();
if (val != -1) {
contributor.setSequence(vn.toNormalizedString(val));
}
vn.toElement(VTDNav.PARENT);
}
if (vn.toElement(VTDNav.FIRST_CHILD, "work:contributor-role")) {
int val = vn.getText();
if (val != -1) {
contributor.setRole(vn.toNormalizedString(val));
}
vn.toElement(VTDNav.PARENT);
}
vn.toElement(VTDNav.PARENT);
}
contributors.add(contributor);
}
return contributors;
}
}

@ -1,6 +1,6 @@
[
{"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true},
{"paramName":"d", "paramLongName":"hdfsOrcidDefaultPath", "paramDescription": "the default work path", "paramRequired": true},
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the default work path", "paramRequired": true},
{"paramName":"f", "paramLongName":"activitiesFileNameTarGz", "paramDescription": "the name of the activities orcid file", "paramRequired": true},
{"paramName":"o", "paramLongName":"outputAuthorsDOIsPath", "paramDescription": "the relative folder of the sequencial file to write", "paramRequired": true}
]

@ -23,76 +23,74 @@
</parameters>
<start to="ExtractCrossrefToOAF"/>
<start to="ImportCrossRef"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="ResetWorkingPath">
<fs>
<delete path='${workingPath}/input/crossref/index_dump'/>
<!-- <mkdir path='${workingPath}/input/crossref'/>-->
</fs>
<ok to="ImportCrossRef"/>
<error to="Kill"/>
</action>
<action name="ImportCrossRef">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.crossref.CrossrefImporter</main-class>
<arg>-t</arg><arg>${workingPath}/input/crossref/index_dump_1</arg>
<arg>-t</arg><arg>${workingPath}/input/crossref/index_update</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-ts</arg><arg>${timestamp}</arg>
</java>
<ok to="End"/>
<ok to="GenerateDataset"/>
<error to="Kill"/>
</action>
<action name="GenerateDataset">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>ExtractCrossrefToOAF</name>
<class>eu.dnetlib.doiboost.crossref.CrossrefDataset</class>
<jar>dhp-doiboost-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
${sparkExtraOPT}
</spark-opts>
<arg>--workingPath</arg><arg>/data/doiboost/input/crossref</arg>
<arg>--master</arg><arg>yarn-cluster</arg>
</spark>
<ok to="RenameDataset"/>
<error to="Kill"/>
</action>
<action name="ExtractCrossrefToOAF">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>ExtractCrossrefToOAF</name>
<class>eu.dnetlib.doiboost.crossref.SparkMapDumpIntoOAF</class>
<jar>dhp-doiboost-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
${sparkExtraOPT}
</spark-opts>
<arg>--sourcePath</arg><arg>${workingPath}/input/crossref/index_dump,${workingPath}/input/crossref/index_dump_1,${workingPath}/crossref/index_dump</arg>
<arg>--targetPath</arg><arg>${workingPath}/input/crossref</arg>
<arg>--master</arg><arg>yarn-cluster</arg>
</spark>
<ok to="End"/>
<action name="RenameDataset">
<fs>
<delete path='${workingPath}/input/crossref/crossref_ds'/>
<move source="${workingPath}/input/crossref/crossref_ds_updated"
target="${workingPath}/input/crossref/crossref_ds"/>
</fs>
<ok to="ConvertCrossrefToOAF"/>
<error to="Kill"/>
</action>
<action name="GenerateDataset">
<action name="ConvertCrossrefToOAF">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>ExtractCrossrefToOAF</name>
<class>eu.dnetlib.doiboost.crossref.CrossrefDataset</class>
<name>ConvertCrossrefToOAF</name>
<class>eu.dnetlib.doiboost.crossref.SparkMapDumpIntoOAF</class>
<jar>dhp-doiboost-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
${sparkExtraOPT}
</spark-opts>
<arg>--sourcePath</arg><arg>/data/doiboost/crossref/cr_dataset</arg>
<arg>--targetPath</arg><arg>/data/doiboost/crossref/crossrefDataset</arg>
<arg>--sourcePath</arg><arg>${workingPath}/input/crossref/crossref_ds</arg>
<arg>--targetPath</arg><arg>${workingPath}/process/</arg>
<arg>--master</arg><arg>yarn-cluster</arg>
</spark>
<ok to="End"/>

@ -1,6 +1,5 @@
[
{"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the path of the sequencial file to read", "paramRequired": true},
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the working dir path", "paramRequired": true},
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the working dir path", "paramRequired": true},
{"paramName":"m", "paramLongName":"master", "paramDescription": "the master name", "paramRequired": true}
]

@ -1,6 +1,6 @@
[
{"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true},
{"paramName":"d", "paramLongName":"hdfsOrcidDefaultPath", "paramDescription": "the default work path", "paramRequired": true},
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the default work path", "paramRequired": true},
{"paramName":"f", "paramLongName":"lambdaFileName", "paramDescription": "the name of the lambda file", "paramRequired": true},
{"paramName":"o", "paramLongName":"outputPath", "paramDescription": "the relative folder of the sequencial file to write", "paramRequired": true},
{"paramName":"t", "paramLongName":"token", "paramDescription": "token to grant access", "paramRequired": true}

@ -0,0 +1,7 @@
[
{"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true},
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the default work path", "paramRequired": true},
{"paramName":"f", "paramLongName":"activitiesFileNameTarGz", "paramDescription": "the name of the activities orcid file", "paramRequired": true},
{"paramName":"ow", "paramLongName":"outputWorksPath", "paramDescription": "the relative folder of the sequencial file to write", "paramRequired": true},
{"paramName":"oew", "paramLongName":"outputEnrichedWorksPath", "paramDescription": "the relative folder of the sequencial file to write the data", "paramRequired": true}
]

@ -1,6 +1,6 @@
[
{"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true},
{"paramName":"d", "paramLongName":"hdfsOrcidDefaultPath", "paramDescription": "the default work path", "paramRequired": true},
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the default work path", "paramRequired": true},
{"paramName":"f", "paramLongName":"summariesFileNameTarGz", "paramDescription": "the name of the summaries orcid file", "paramRequired": true},
{"paramName":"o", "paramLongName":"outputAuthorsPath", "paramDescription": "the relative folder of the sequencial file to write", "paramRequired": true}
]

@ -1,4 +0,0 @@
[{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the working path", "paramRequired": true},
{"paramName":"t", "paramLongName":"token", "paramDescription": "token to grant access", "paramRequired": true},
{"paramName":"o", "paramLongName":"outputAuthorsPath", "paramDescription": "the relative folder of the sequencial file to write the authors data", "paramRequired": true}
]

@ -0,0 +1,7 @@
[
{"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true},
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the default work path", "paramRequired": true},
{"paramName":"f", "paramLongName":"activitiesFileNameTarGz", "paramDescription": "the name of the activities orcid file", "paramRequired": true},
{"paramName":"ow", "paramLongName":"outputWorksPath", "paramDescription": "the relative folder of the sequencial file to write", "paramRequired": true},
{"paramName":"oew", "paramLongName":"outputEnrichedWorksPath", "paramDescription": "the relative folder of the sequencial file to write the data", "paramRequired": true}
]

@ -39,14 +39,7 @@
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="ResetWorkingPath">
<fs>
<delete path='${workingDirPath}'/>
<mkdir path='${workingDirPath}'/>
</fs>
<ok to="CreateDOIBoost"/>
<error to="Kill"/>
</action>
<action name="CreateDOIBoost">
<spark xmlns="uri:oozie:spark-action:0.2">

@ -8,6 +8,10 @@
<name>targetPath</name>
<description>the working dir base path</description>
</property>
<property>
<name>workingPath</name>
<description>the working dir base path</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
@ -31,10 +35,10 @@
<action name="ResetWorkingPath">
<fs>
<delete path='${targetPath}'/>
<mkdir path='${targetPath}'/>
<delete path='${workingPath}'/>
<mkdir path='${workingPath}'/>
</fs>
<ok to="PreprocessMag"/>
<ok to="ConvertMagToDataset"/>
<error to="Kill"/>
</action>
@ -52,10 +56,10 @@
${sparkExtraOPT}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--targetPath</arg><arg>${targetPath}</arg>
<arg>--targetPath</arg><arg>${workingPath}</arg>
<arg>--master</arg><arg>yarn-cluster</arg>
</spark>
<ok to="End"/>
<ok to="PreprocessMag"/>
<error to="Kill"/>
</action>
@ -65,7 +69,7 @@
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Convert Mag to Dataset</name>
<name>Convert Mag to OAF Dataset</name>
<class>eu.dnetlib.doiboost.mag.SparkPreProcessMAG</class>
<jar>dhp-doiboost-${projectVersion}.jar</jar>
<spark-opts>
@ -75,7 +79,8 @@
--conf spark.sql.shuffle.partitions=3840
${sparkExtraOPT}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--sourcePath</arg><arg>${workingPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}/process</arg>
<arg>--targetPath</arg><arg>${targetPath}</arg>
<arg>--master</arg><arg>yarn-cluster</arg>
</spark>

@ -1,6 +1,7 @@
[
{"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the base path of MAG input", "paramRequired": true},
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the working dir path", "paramRequired": true},
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the target dir path", "paramRequired": true},
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the working dir path", "paramRequired": true},
{"paramName":"m", "paramLongName":"master", "paramDescription": "the master name", "paramRequired": true}
]

@ -0,0 +1,31 @@
<configuration>
<property>
<name>oozie.action.sharelib.for.java</name>
<value>spark2</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
<property>
<name>oozie.launcher.mapreduce.map.java.opts</name>
<value>-Xmx2g</value>
</property>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
</configuration>

@ -0,0 +1,542 @@
<workflow-app name="Gen Orcid Works-no-doi From Activities" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>workingPath</name>
<description>the working dir base path</description>
</property>
<property>
<name>shell_cmd_0</name>
<value>wget -O /tmp/ORCID_2020_10_activites_0.tar.gz https://orcid.figshare.com/ndownloader/files/25002232 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_0.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_0.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_0.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 0</description>
</property>
<property>
<name>shell_cmd_1</name>
<value>wget -O /tmp/ORCID_2020_10_activites_1.tar.gz https://orcid.figshare.com/ndownloader/files/25002088 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_1.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_1.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_1.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 1</description>
</property>
<property>
<name>shell_cmd_2</name>
<value>wget -O /tmp/ORCID_2020_10_activites_2.tar.gz https://orcid.figshare.com/ndownloader/files/25000596 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_2.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_2.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_2.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 2</description>
</property>
<property>
<name>shell_cmd_3</name>
<value>wget -O /tmp/ORCID_2020_10_activites_3.tar.gz https://orcid.figshare.com/ndownloader/files/25015150 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_3.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_3.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_3.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 3</description>
</property>
<property>
<name>shell_cmd_4</name>
<value>wget -O /tmp/ORCID_2020_10_activites_4.tar.gz https://orcid.figshare.com/ndownloader/files/25033643 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_4.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_4.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_4.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 4</description>
</property>
<property>
<name>shell_cmd_5</name>
<value>wget -O /tmp/ORCID_2020_10_activites_5.tar.gz https://orcid.figshare.com/ndownloader/files/25005483 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_5.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_5.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_5.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 5</description>
</property>
<property>
<name>shell_cmd_6</name>
<value>wget -O /tmp/ORCID_2020_10_activites_6.tar.gz https://orcid.figshare.com/ndownloader/files/25005425 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_6.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_6.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_6.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 6</description>
</property>
<property>
<name>shell_cmd_7</name>
<value>wget -O /tmp/ORCID_2020_10_activites_7.tar.gz https://orcid.figshare.com/ndownloader/files/25012016 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_7.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_7.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_7.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 7</description>
</property>
<property>
<name>shell_cmd_8</name>
<value>wget -O /tmp/ORCID_2020_10_activites_8.tar.gz https://orcid.figshare.com/ndownloader/files/25012079 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_8.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_8.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_8.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 8</description>
</property>
<property>
<name>shell_cmd_9</name>
<value>wget -O /tmp/ORCID_2020_10_activites_9.tar.gz https://orcid.figshare.com/ndownloader/files/25010727 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_9.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_9.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_9.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file 9</description>
</property>
<property>
<name>shell_cmd_X</name>
<value>wget -O /tmp/ORCID_2020_10_activites_X.tar.gz https://orcid.figshare.com/ndownloader/files/25011025 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_X.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_X.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_X.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid activity file X</description>
</property>
</parameters>
<start to="ResetWorkingPath"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="ResetWorkingPath">
<fs>
<delete path='${workingPath}/no_doi_works/*'/>
</fs>
<ok to="fork_check_download_files"/>
<error to="Kill"/>
</action>
<fork name = "fork_check_download_files">
<path start = "check_exist_on_hdfs_activities_0"/>
<path start = "check_exist_on_hdfs_activities_1"/>
<path start = "check_exist_on_hdfs_activities_2"/>
<path start = "check_exist_on_hdfs_activities_3"/>
<path start = "check_exist_on_hdfs_activities_4"/>
<path start = "check_exist_on_hdfs_activities_5"/>
<path start = "check_exist_on_hdfs_activities_6"/>
<path start = "check_exist_on_hdfs_activities_7"/>
<path start = "check_exist_on_hdfs_activities_8"/>
<path start = "check_exist_on_hdfs_activities_9"/>
<path start = "check_exist_on_hdfs_activities_X"/>
</fork>
<decision name="check_exist_on_hdfs_activities_0">
<switch>
<case to="wait_download_phase_node">
${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_0.tar.gz'))}
</case>
<default to="Download_0" />
</switch>
</decision>
<action name="Download_0">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_0}</argument>
<capture-output/>
</shell>
<ok to="wait_download_phase_node"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_0">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2020_10_activites_0.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_0.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_1">
<switch>
<case to="wait_download_phase_node">
${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_1.tar.gz'))}
</case>
<default to="Download_1" />
</switch>
</decision>
<action name="Download_1">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_1}</argument>
<capture-output/>
</shell>
<ok to="wait_download_phase_node"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_1">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2020_10_activites_1.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_1.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_2">
<switch>
<case to="wait_download_phase_node">
${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_2.tar.gz'))}
</case>
<default to="Download_2" />
</switch>
</decision>
<action name="Download_2">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_2}</argument>
<capture-output/>
</shell>
<ok to="wait_download_phase_node"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_2">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2020_10_activites_2.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_2.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_3">
<switch>
<case to="wait_download_phase_node">
${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_3.tar.gz'))}
</case>
<default to="Download_3" />
</switch>
</decision>
<action name="Download_3">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_3}</argument>
<capture-output/>
</shell>
<ok to="wait_download_phase_node"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_3">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2020_10_activites_3.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_3.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_4">
<switch>
<case to="wait_download_phase_node">
${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_4.tar.gz'))}
</case>
<default to="Download_4" />
</switch>
</decision>
<action name="Download_4">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_4}</argument>
<capture-output/>
</shell>
<ok to="wait_download_phase_node"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_4">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2020_10_activites_4.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_4.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_5">
<switch>
<case to="wait_download_phase_node">
${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_5.tar.gz'))}
</case>
<default to="Download_5" />
</switch>
</decision>
<action name="Download_5">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_5}</argument>
<capture-output/>
</shell>
<ok to="wait_download_phase_node"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_5">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2020_10_activites_5.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_5.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_6">
<switch>
<case to="wait_download_phase_node">
${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_6.tar.gz'))}
</case>
<default to="Download_6" />
</switch>
</decision>
<action name="Download_6">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_6}</argument>
<capture-output/>
</shell>
<ok to="wait_download_phase_node"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_6">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2020_10_activites_6.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_6.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_7">
<switch>
<case to="wait_download_phase_node">
${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_7.tar.gz'))}
</case>
<default to="Download_7" />
</switch>
</decision>
<action name="Download_7">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_7}</argument>
<capture-output/>
</shell>
<ok to="wait_download_phase_node"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_7">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2020_10_activites_7.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_7.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_8">
<switch>
<case to="wait_download_phase_node">
${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_8.tar.gz'))}
</case>
<default to="Download_8" />
</switch>
</decision>
<action name="Download_8">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_8}</argument>
<capture-output/>
</shell>
<ok to="wait_download_phase_node"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_8">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2020_10_activites_8.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_8.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_9">
<switch>
<case to="wait_download_phase_node">
${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_9.tar.gz'))}
</case>
<default to="Download_9" />
</switch>
</decision>
<action name="Download_9">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_9}</argument>
<capture-output/>
</shell>
<ok to="wait_download_phase_node"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_9">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2020_10_activites_9.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_9.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_activities_X">
<switch>
<case to="wait_download_phase_node">
${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_X.tar.gz'))}
</case>
<default to="Download_X" />
</switch>
</decision>
<action name="Download_X">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_X}</argument>
<capture-output/>
</shell>
<ok to="wait_download_phase_node"/>
<error to="Kill"/>
</action>
<action name="GenOrcidAuthorWork_X">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2020_10_activites_X.tar.gz</arg>
<arg>-ow</arg><arg>no_doi_works/works_X.seq</arg>
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<join name = "wait_download_phase_node" to = "fork_gen_orcid_author_work"/>
<fork name = "fork_gen_orcid_author_work">
<path start = "GenOrcidAuthorWork_0"/>
<path start = "GenOrcidAuthorWork_1"/>
<path start = "GenOrcidAuthorWork_2"/>
<path start = "GenOrcidAuthorWork_3"/>
<path start = "GenOrcidAuthorWork_4"/>
<path start = "GenOrcidAuthorWork_5"/>
<path start = "GenOrcidAuthorWork_6"/>
<path start = "GenOrcidAuthorWork_7"/>
<path start = "GenOrcidAuthorWork_8"/>
<path start = "GenOrcidAuthorWork_9"/>
<path start = "GenOrcidAuthorWork_X"/>
</fork>
<join name = "join_node" to = "End"/>
<!-- <join name = "join_node" to = "fork_gen_orcid_author_work_2"/>-->
<!-- <fork name = "fork_gen_orcid_author_work_2">-->
<!-- <path start = "GenOrcidAuthorWork_6"/>-->
<!-- <path start = "GenOrcidAuthorWork_7"/>-->
<!-- <path start = "GenOrcidAuthorWork_8"/>-->
<!-- <path start = "GenOrcidAuthorWork_9"/>-->
<!-- <path start = "GenOrcidAuthorWork_X"/>-->
<!-- </fork>-->
<!-- <join name = "join_node_2" to = "End"/>-->
<end name="End"/>
</workflow-app>

@ -1,45 +0,0 @@
<workflow-app name="Orcid Download" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>workingPathOrcid</name>
<description>the working dir base path</description>
</property>
<property>
<name>token</name>
<description>access token</description>
</property>
</parameters>
<start to="ResetWorkingPath"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="ResetWorkingPath">
<fs>
<delete path='${workingPathOrcid}/download'/>
<mkdir path='${workingPathOrcid}/download'/>
</fs>
<ok to="DownloadOrcidData"/>
<error to="Kill"/>
</action>
<action name="DownloadOrcidData">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcid.OrcidDownloader</main-class>
<arg>-d</arg><arg>${workingPathOrcid}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>last_modified.csv</arg>
<arg>-o</arg><arg>download/</arg>
<arg>-t</arg><arg>${token}</arg>
</java>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

@ -0,0 +1,232 @@
<workflow-app name="Extract Orcid XML Works From Activities" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>workingPath</name>
<description>the working dir base path</description>
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>oozie.action.sharelib.for.java</name>
<value>${oozieActionShareLibForSpark2}</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
<property>
<name>oozie.launcher.mapreduce.map.java.opts</name>
<value>-Xmx2g</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
</configuration>
</global>
<start to="ResetWorkingPath"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="ResetWorkingPath">
<fs>
<delete path='${workingPath}/xml/works'/>
<mkdir path='${workingPath}/xml/works'/>
</fs>
<ok to="fork_node"/>
<error to="Kill"/>
</action>
<fork name = "fork_node">
<path start = "ExtractXMLWorkActivities_0"/>
<path start = "ExtractXMLWorkActivities_1"/>
<path start = "ExtractXMLWorkActivities_2"/>
<path start = "ExtractXMLWorkActivities_3"/>
<path start = "ExtractXMLWorkActivities_4"/>
<path start = "ExtractXMLWorkActivities_5"/>
<path start = "ExtractXMLWorkActivities_6"/>
<path start = "ExtractXMLWorkActivities_7"/>
<path start = "ExtractXMLWorkActivities_8"/>
<path start = "ExtractXMLWorkActivities_9"/>
<path start = "ExtractXMLWorkActivities_X"/>
</fork>
<action name="ExtractXMLWorkActivities_0">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2020_10_activites_0.tar.gz</arg>
<arg>-ow</arg><arg>xml/works/xml_works_0.seq</arg>
<arg>-oew</arg><arg>---</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<action name="ExtractXMLWorkActivities_1">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2020_10_activites_1.tar.gz</arg>
<arg>-ow</arg><arg>xml/works/xml_works_1.seq</arg>
<arg>-oew</arg><arg>---</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<action name="ExtractXMLWorkActivities_2">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2020_10_activites_2.tar.gz</arg>
<arg>-ow</arg><arg>xml/works/xml_works_2.seq</arg>
<arg>-oew</arg><arg>---</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<action name="ExtractXMLWorkActivities_3">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2020_10_activites_3.tar.gz</arg>
<arg>-ow</arg><arg>xml/works/xml_works_3.seq</arg>
<arg>-oew</arg><arg>---</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<action name="ExtractXMLWorkActivities_4">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2020_10_activites_4.tar.gz</arg>
<arg>-ow</arg><arg>xml/works/xml_works_4.seq</arg>
<arg>-oew</arg><arg>---</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<action name="ExtractXMLWorkActivities_5">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2020_10_activites_5.tar.gz</arg>
<arg>-ow</arg><arg>xml/works/xml_works_5.seq</arg>
<arg>-oew</arg><arg>---</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<action name="ExtractXMLWorkActivities_6">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2020_10_activites_6.tar.gz</arg>
<arg>-ow</arg><arg>xml/works/xml_works_6.seq</arg>
<arg>-oew</arg><arg>---</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<action name="ExtractXMLWorkActivities_7">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2020_10_activites_7.tar.gz</arg>
<arg>-ow</arg><arg>xml/works/xml_works_7.seq</arg>
<arg>-oew</arg><arg>---</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<action name="ExtractXMLWorkActivities_8">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2020_10_activites_8.tar.gz</arg>
<arg>-ow</arg><arg>xml/works/xml_works_8.seq</arg>
<arg>-oew</arg><arg>---</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<action name="ExtractXMLWorkActivities_9">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2020_10_activites_9.tar.gz</arg>
<arg>-ow</arg><arg>xml/works/xml_works_9.seq</arg>
<arg>-oew</arg><arg>---</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<action name="ExtractXMLWorkActivities_X">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2020_10_activites_X.tar.gz</arg>
<arg>-ow</arg><arg>xml/works/xml_works_X.seq</arg>
<arg>-oew</arg><arg>---</arg>
</java>
<ok to="join_node"/>
<error to="Kill"/>
</action>
<join name = "join_node" to = "End"/>
<end name="End"/>
</workflow-app>

@ -0,0 +1,26 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
<property>
<name>oozie.launcher.mapreduce.map.java.opts</name>
<value>-Xmx8g</value>
</property>
</configuration>

@ -1,41 +1,40 @@
<workflow-app name="import Orcid" xmlns="uri:oozie:workflow:0.5">
<workflow-app name="Extract Orcid XML Authors From Summaries" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>workingPath</name>
<description>the working dir base path</description>
</property>
</parameters>
<start to="ResetWorkingPath"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="ResetWorkingPath">
<fs>
<delete path='${workingPath}/output'/>
<mkdir path='${workingPath}/output'/>
<delete path='${workingPath}/xml/authors'/>
<mkdir path='${workingPath}/xml/authors'/>
</fs>
<ok to="ImportOrcidSummary"/>
<ok to="ExtractXMLAuthorsSummaries"/>
<error to="Kill"/>
</action>
<action name="ImportOrcidSummary">
<action name="ExtractXMLAuthorsSummaries">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcid.OrcidDSManager</main-class>
<arg>-d</arg><arg>${workingPath}/</arg>
<main-class>eu.dnetlib.doiboost.orcid.ExtractXMLSummariesData</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2019_summaries.tar.gz</arg>
<arg>-o</arg><arg>output/</arg>
<arg>-f</arg><arg>ORCID_2020_10_summaries.tar.gz</arg>
<arg>-o</arg><arg>xml/authors/</arg>
</java>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

@ -1,22 +0,0 @@
<configuration>
<property>
<name>jobTracker</name>
<value>hadoop-rm3.garr-pa1.d4science.org:8032</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://hadoop-rm1.garr-pa1.d4science.org:8020</value>
</property>
<property>
<name>queueName</name>
<value>default</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
</configuration>

@ -1,83 +0,0 @@
<workflow-app name="Gen Orcid Authors" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>workingPath</name>
<description>the working dir base path</description>
</property>
<property>
<name>token</name>
<description>access token</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
<property>
<name>outputPath</name>
<description>the working dir base path</description>
</property>
</parameters>
<start to="ResetWorkingPath"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="ResetWorkingPath">
<fs>
<delete path='${workingPath_activities}/authors'/>
</fs>
<ok to="Gen_Orcid_Authors"/>
<error to="Kill"/>
</action>
<action name="Split_Lambda_File">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn</master>
<mode>cluster</mode>
<name>Split_Lambda_File</name>
<class>eu.dnetlib.doiboost.orcid.SparkPartitionLambdaFile</class>
<jar>dhp-doiboost-1.2.1-SNAPSHOT.jar</jar>
<spark-opts>--num-executors 24 --conf spark.yarn.jars=&quot;hdfs://hadoop-rm1.garr-pa1.d4science.org:8020/user/oozie/share/lib/lib_20180405103059/spark2&quot; --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory}
</spark-opts>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-o</arg><arg>authors/</arg>
<arg>-t</arg><arg>${token}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<action name="Gen_Orcid_Authors">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn</master>
<mode>cluster</mode>
<name>Gen_Orcid_Authors</name>
<class>eu.dnetlib.doiboost.orcid.SparkOrcidGenerateAuthors</class>
<jar>dhp-doiboost-1.2.1-SNAPSHOT.jar</jar>
<spark-opts>--num-executors 20 --conf spark.yarn.jars=&quot;hdfs://hadoop-rm1.garr-pa1.d4science.org:8020/user/oozie/share/lib/lib_20180405103059/spark2&quot; --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory}
</spark-opts>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-o</arg><arg>authors/</arg>
<arg>-t</arg><arg>${token}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

@ -0,0 +1,26 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
<property>
<name>oozie.launcher.mapreduce.map.java.opts</name>
<value>-Xmx8g</value>
</property>
</configuration>

@ -0,0 +1,68 @@
<workflow-app name="Gen Orcid Authors From Summaries" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>workingPath</name>
<description>the working dir base path</description>
</property>
<property>
<name>shell_cmd_0</name>
<value>wget -O /tmp/ORCID_2020_10_summaries.tar.gz https://orcid.figshare.com/ndownloader/files/25032905 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_summaries.tar.gz /data/orcid_activities_2020/ORCID_2020_10_summaries.tar.gz ; rm -f /tmp/ORCID_2020_10_summaries.tar.gz
</value>
<description>the shell command that downloads and puts to hdfs orcid summaries</description>
</property>
</parameters>
<start to="ResetWorkingPath"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="ResetWorkingPath">
<fs>
<delete path='${workingPath}/authors'/>
<mkdir path='${workingPath}/authors'/>
</fs>
<ok to="check_exist_on_hdfs_summaries"/>
<error to="Kill"/>
</action>
<decision name="check_exist_on_hdfs_summaries">
<switch>
<case to="ImportOrcidSummaries">
${fs:exists(concat(workingPath,'/ORCID_2020_10_summaries.tar.gz'))}
</case>
<default to="DownloadSummaries" />
</switch>
</decision>
<action name="DownloadSummaries">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd_0}</argument>
<capture-output/>
</shell>
<ok to="ImportOrcidSummaries"/>
<error to="Kill"/>
</action>
<action name="ImportOrcidSummaries">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcid.OrcidDSManager</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>ORCID_2020_10_summaries.tar.gz</arg>
<arg>-o</arg><arg>authors/</arg>
</java>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

@ -0,0 +1,168 @@
<workflow-app name="Orcid Updates Download" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>workingPath</name>
<description>the working dir base path</description>
</property>
<property>
<name>token</name>
<description>access token</description>
</property>
<property>
<name>shell_cmd</name>
<value>wget -O /tmp/last_modified.csv.tar http://74804fb637bd8e2fba5b-e0a029c2f87486cddec3b416996a6057.r3.cf1.rackcdn.com/last_modified.csv.tar ; hdfs dfs -copyFromLocal /tmp/last_modified.csv.tar /data/orcid_activities_2020/last_modified.csv.tar ; rm -f /tmp/last_modified.csv.tar
</value>
<description>the shell command that downloads the lambda file from orcid containing last orcid update informations</description>
</property>
<property>
<name>sparkDriverMemory</name>
<value>7G</value>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<value>2G</value>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<value>1</value>
<description>number of cores used by single executor</description>
</property>
<property>
<name>spark2MaxExecutors</name>
<value>10</value>
</property>
<property>
<name>oozieActionShareLibForSpark2</name>
<description>oozie action sharelib for spark 2.*</description>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
<description>spark 2.* extra listeners classname</description>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
<description>spark 2.* sql query execution listeners classname</description>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<description>spark 2.* yarn history server address</description>
</property>
<property>
<name>spark2EventLogDir</name>
<description>spark 2.* event log dir location</description>
</property>
</parameters>
<global>
<configuration>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>${oozieActionShareLibForSpark2}</value>
</property>
</configuration>
</global>
<start to="DownloadOrcidAuthors"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="ResetWorkingPath">
<fs>
<delete path='${workingPath}/downloads'/>
<delete path='${workingPath}/last_modified.csv.tar'/>
<mkdir path='${workingPath}/downloads'/>
</fs>
<ok to="DownloadLambdaFile"/>
<error to="Kill"/>
</action>
<action name="DownloadLambdaFile">
<shell xmlns="uri:oozie:shell-action:0.1">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<exec>bash</exec>
<argument>-c</argument>
<argument>${shell_cmd}</argument>
<capture-output/>
</shell>
<ok to="DownloadUpdatedXMLAuthors"/>
<error to="Kill"/>
</action>
<action name="DownloadUpdatedXMLAuthors">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.doiboost.orcid.OrcidDownloader</main-class>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>last_modified.csv.tar</arg>
<arg>-o</arg><arg>downloads/</arg>
<arg>-t</arg><arg>${token}</arg>
</java>
<ok to="End"/>
<error to="Kill"/>
</action>
<action name="GenLastModifiedSeq">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>GenLastModifiedSeq</name>
<class>eu.dnetlib.doiboost.orcid.SparkGenLastModifiedSeq</class>
<jar>dhp-doiboost-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
</spark-opts>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>last_modified.csv.tar</arg>
<arg>-o</arg><arg>last_modified.seq</arg>
<arg>-t</arg><arg>-</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<action name="DownloadOrcidAuthors">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>DownloadOrcidAuthors</name>
<class>eu.dnetlib.doiboost.orcid.SparkDownloadOrcidAuthors</class>
<jar>dhp-doiboost-${projectVersion}.jar</jar>
<spark-opts>
--conf spark.dynamicAllocation.enabled=true
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
</spark-opts>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>last_modified.seq</arg>
<arg>-o</arg><arg>downloads/updated_authors</arg>
<arg>-t</arg><arg>${token}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

@ -0,0 +1,41 @@
{
"reference-entry": {"cobj":"0013", "value": "Part of book or chapter of book"},
"report": {"cobj":"0017", "value": "Report"},
"dataset": {"cobj":"0021", "value": "Dataset"},
"journal-article": {"cobj":"0001", "value": "Article"},
"reference-book": {"cobj":"0002", "value": "Book"},
"other": {"cobj":"0020", "value": "Other ORP type"},
"proceedings-article": {"cobj":"0004", "value": "Conference object"},
"standard": {"cobj":"0038", "value": "Other literature type"},
"book-part": {"cobj":"0002", "value": "Book"},
"monograph": {"cobj":"0002", "value": "Book"},
"report-series": {"cobj":"0017", "value": "Report"},
"book": {"cobj":"0002", "value": "Book"},
"book-chapter": {"cobj":"0013", "value": "Part of book or chapter of book"},
"peer-review": {"cobj":"0015", "value": "Review"},
"book-section": {"cobj":"0013", "value": "Part of book or chapter of book"},
"book-review": {"cobj":"0015", "value": "Review"},
"conference-abstract": {"cobj":"0004", "value": "Conference object"},
"conference-paper": {"cobj":"0004", "value": "Conference object"},
"conference-poster": {"cobj":"0004", "value": "Conference object"},
"data-set": {"cobj":"0021", "value": "Dataset"},
"dictionary-entry": {"cobj":"0038", "value": "Other literature type"},
"disclosure": {"cobj":"0038", "value": "Other literature type"},
"dissertation": {"cobj":"0006", "value": "Doctoral thesis"},
"edited-book": {"cobj":"0002", "value": "Book"},
"encyclopedia-entry": {"cobj":"0038", "value": "Other literature type"},
"lecture-speech": {"cobj":"0010", "value": "Lecture"},
"license": {"cobj":"0038", "value": "Other literature type"},
"magazine-article": {"cobj":"0005", "value": "Contribution for newspaper or weekly magazine"},
"manual": {"cobj":"0038", "value": "Other literature type"},
"newsletter-article": {"cobj":"0012", "value": "Newsletter"},
"newspaper-article": {"cobj":"0005", "value": "Contribution for newspaper or weekly magazine"},
"patent": {"cobj":"0019", "value": "Patent"},
"research-technique": {"cobj":"0020", "value": "Other ORP type"},
"research-tool": {"cobj":"0020", "value": "Other ORP type"},
"standards-and-policy": {"cobj":"0038", "value": "Other literature type"},
"supervised-student-publication": {"cobj":"0001", "value": "Article"},
"technical-standard": {"cobj":"0038", "value": "Other literature type"},
"website": {"cobj":"0020", "value": "Other ORP type"},
"working-paper": {"cobj":"0014", "value": "Research"}
}

@ -0,0 +1,95 @@
<workflow-app name="gen_orcid_no_doi_dataset" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
<property>
<name>oozieActionShareLibForSpark2</name>
<description>oozie action sharelib for spark 2.*</description>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
<description>spark 2.* extra listeners classname</description>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
<description>spark 2.* sql query execution listeners classname</description>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<description>spark 2.* yarn history server address</description>
</property>
<property>
<name>spark2EventLogDir</name>
<description>spark 2.* event log dir location</description>
</property>
<property>
<name>workingPath</name>
<description>the working dir base path</description>
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>${oozieActionShareLibForSpark2}</value>
</property>
</configuration>
</global>
<start to="ResetWorkingPath"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="ResetWorkingPath">
<fs>
<delete path='${workingPath}/no_doi_dataset'/>
</fs>
<ok to="GenOrcidNoDoiDataset"/>
<error to="Kill"/>
</action>
<action name="GenOrcidNoDoiDataset">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>GenOrcidNoDoiDataset</name>
<class>eu.dnetlib.doiboost.orcidnodoi.SparkGenEnrichedOrcidWorks</class>
<jar>dhp-doiboost-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
</spark-opts>
<arg>-w</arg><arg>${workingPath}/</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-f</arg><arg>-</arg>
<arg>-ow</arg><arg>no_doi_works/</arg>
<arg>-oew</arg><arg>no_doi_dataset</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

@ -1,63 +0,0 @@
package eu.dnetlib.dhp.doiboost
import eu.dnetlib.dhp.schema.oaf.{Publication, Relation}
import org.apache.spark.SparkContext
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
import org.codehaus.jackson.map.{ObjectMapper, SerializationConfig}
import org.json4s
import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods._
import scala.collection.JavaConverters._
class QueryTest {
def extract_payload(input:String) :String = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(input)
compact(render((json \ "payload")))
}
def has_ands(r:Relation) :Boolean = {
r.getCollectedfrom!= null && r.getCollectedfrom.asScala.count(k => k.getValue.contains("Australian")) > 0
}
def hasInstanceWithUrl(p:Publication):Boolean = {
val c = p.getInstance.asScala.map(i => i.getUrl!= null && !i.getUrl.isEmpty).size
!(!p.getInstance.isEmpty && c == p.getInstance().size)
}
def hasNullAccessRights(p:Publication):Boolean = {
val c = p.getInstance.asScala.map(i => i.getAccessright!= null && i.getAccessright.getClassname.nonEmpty).size
!p.getInstance.isEmpty && c == p.getInstance().size()
}
def myQuery(spark:SparkSession, sc:SparkContext): Unit = {
implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
val mapper = new ObjectMapper()
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
val ds:Dataset[Publication] = spark.read.load("/tmp/p").as[Publication]
ds.filter(p =>p.getBestaccessright!= null && p.getBestaccessright.getClassname.nonEmpty).count()
}
}

@ -158,7 +158,7 @@ class CrossrefMappingTest {
rels.foreach(s => logger.info(s.getTarget))
assertEquals(rels.size, 3 )
assertEquals(rels.size, 6 )
}

@ -1,5 +1,8 @@
package eu.dnetlib.doiboost.orcid
import eu.dnetlib.dhp.schema.oaf.Publication
import eu.dnetlib.doiboost.orcid.SparkConvertORCIDToOAF.getClass
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
import org.codehaus.jackson.map.ObjectMapper
import org.junit.jupiter.api.Assertions._
import org.junit.jupiter.api.Test
@ -21,6 +24,30 @@ class MappingORCIDToOAFTest {
})
}
// @Test
// def testOAFConvert():Unit ={
//
// val spark: SparkSession =
// SparkSession
// .builder()
// .appName(getClass.getSimpleName)
// .master("local[*]").getOrCreate()
//
//
// SparkConvertORCIDToOAF.run( spark,"/Users/sandro/Downloads/orcid", "/Users/sandro/Downloads/orcid_oaf")
// implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
//
// val df = spark.read.load("/Users/sandro/Downloads/orcid_oaf").as[Publication]
// println(df.first.getId)
// println(mapper.writeValueAsString(df.first()))
//
//
//
//
// }

@ -3,23 +3,34 @@ package eu.dnetlib.doiboost.orcid;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.*;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.time.Duration;
import java.time.LocalDateTime;
import java.time.temporal.TemporalUnit;
import java.util.Arrays;
import java.util.Date;
import java.util.List;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.apache.commons.io.IOUtils;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.spark.sql.catalyst.expressions.objects.AssertNotNull;
import org.junit.jupiter.api.Test;
import org.mortbay.log.Log;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import jdk.nashorn.internal.ir.annotations.Ignore;
public class OrcidClientTest {
final String orcidId = "0000-0001-7291-3210";
@ -32,16 +43,64 @@ public class OrcidClientTest {
String lastUpdate = "2019-09-30 00:00:00";
String shortDate = "2020-05-06 16:06:11";
// curl -i -H "Accept: application/vnd.orcid+xml"
// curl -i -H "Accept: application/vnd.orcid+xml"
// -H 'Authorization: Bearer 78fdb232-7105-4086-8570-e153f4198e3d'
// 'https://api.orcid.org/v3.0/0000-0001-7291-3210/record'
public String testDownloadRecord(String orcidId) throws Exception {
@Test
private void multipleDownloadTest() throws Exception {
int toDownload = 10;
long start = System.currentTimeMillis();
OrcidDownloader downloader = new OrcidDownloader();
TarArchiveInputStream input = new TarArchiveInputStream(
new GzipCompressorInputStream(new FileInputStream("/tmp/last_modified.csv.tar")));
TarArchiveEntry entry = input.getNextTarEntry();
BufferedReader br = null;
StringBuilder sb = new StringBuilder();
int rowNum = 0;
int entryNum = 0;
int modified = 0;
while (entry != null) {
br = new BufferedReader(new InputStreamReader(input)); // Read directly from tarInput
String line;
while ((line = br.readLine()) != null) {
String[] values = line.toString().split(",");
List<String> recordInfo = Arrays.asList(values);
String orcidId = recordInfo.get(0);
if (downloader.isModified(orcidId, recordInfo.get(3))) {
slowedDownDownload(orcidId);
modified++;
}
rowNum++;
if (modified > toDownload) {
break;
}
}
entryNum++;
entry = input.getNextTarEntry();
}
long end = System.currentTimeMillis();
logToFile("start test: " + new Date(start).toString());
logToFile("end test: " + new Date(end).toString());
}
@Test
private void downloadTest(String orcid) throws Exception {
String record = testDownloadRecord(orcid);
String filename = "/tmp/downloaded_".concat(orcid).concat(".xml");
File f = new File(filename);
OutputStream outStream = new FileOutputStream(f);
IOUtils.write(record.getBytes(), outStream);
}
private String testDownloadRecord(String orcidId) throws Exception {
try (CloseableHttpClient client = HttpClients.createDefault()) {
HttpGet httpGet = new HttpGet("https://api.orcid.org/v3.0/" + orcidId + "/record");
httpGet.addHeader("Accept", "application/vnd.orcid+xml");
httpGet.addHeader("Authorization", "Bearer 78fdb232-7105-4086-8570-e153f4198e3d");
logToFile("start connection: " + new Date(System.currentTimeMillis()).toString());
CloseableHttpResponse response = client.execute(httpGet);
logToFile("end connection: " + new Date(System.currentTimeMillis()).toString());
if (response.getStatusLine().getStatusCode() != 200) {
System.out
.println("Downloading " + orcidId + " status code: " + response.getStatusLine().getStatusCode());
@ -53,8 +112,8 @@ public class OrcidClientTest {
return new String("");
}
// @Test
public void testLambdaFileParser() throws Exception {
// @Test
private void testLambdaFileParser() throws Exception {
try (BufferedReader br = new BufferedReader(
new InputStreamReader(this.getClass().getResourceAsStream("last_modified.csv")))) {
String line;
@ -99,8 +158,8 @@ public class OrcidClientTest {
}
}
// @Test
public void getRecordDatestamp() throws ParseException {
// @Test
private void getRecordDatestamp() throws ParseException {
Date toRetrieveDateDt = new SimpleDateFormat(DATE_FORMAT).parse(toRetrieveDate);
Date toNotRetrieveDateDt = new SimpleDateFormat(DATE_FORMAT).parse(toNotRetrieveDate);
Date lastUpdateDt = new SimpleDateFormat(DATE_FORMAT).parse(lastUpdate);
@ -108,7 +167,7 @@ public class OrcidClientTest {
assertTrue(!toNotRetrieveDateDt.after(lastUpdateDt));
}
public void testDate(String value) throws ParseException {
private void testDate(String value) throws ParseException {
System.out.println(value.toString());
if (value.length() != 19) {
value = value.substring(0, 19);
@ -117,20 +176,126 @@ public class OrcidClientTest {
System.out.println(valueDt.toString());
}
// @Test
public void testModifiedDate() throws ParseException {
// @Test
@Ignore
private void testModifiedDate() throws ParseException {
testDate(toRetrieveDate);
testDate(toNotRetrieveDate);
testDate(shortDate);
}
// @Test
public void testReadBase64CompressedRecord() throws Exception {
@Test
private void testReadBase64CompressedRecord() throws Exception {
final String base64CompressedRecord = IOUtils
.toString(getClass().getResourceAsStream("0000-0001-6645-509X.compressed.base64"));
.toString(getClass().getResourceAsStream("0000-0003-3028-6161.compressed.base64"));
final String recordFromSeqFile = ArgumentApplicationParser.decompressValue(base64CompressedRecord);
System.out.println(recordFromSeqFile);
final String downloadedRecord = testDownloadRecord("0000-0001-6645-509X");
logToFile("\n\ndownloaded \n\n" + recordFromSeqFile);
final String downloadedRecord = testDownloadRecord("0000-0003-3028-6161");
assertTrue(recordFromSeqFile.equals(downloadedRecord));
}
@Test
private void lambdaFileReaderTest() throws Exception {
TarArchiveInputStream input = new TarArchiveInputStream(
new GzipCompressorInputStream(new FileInputStream("/develop/last_modified.csv.tar")));
TarArchiveEntry entry = input.getNextTarEntry();
BufferedReader br = null;
StringBuilder sb = new StringBuilder();
int rowNum = 0;
int entryNum = 0;
while (entry != null) {
br = new BufferedReader(new InputStreamReader(input)); // Read directly from tarInput
String line;
while ((line = br.readLine()) != null) {
String[] values = line.toString().split(",");
List<String> recordInfo = Arrays.asList(values);
assertTrue(recordInfo.size() == 4);
rowNum++;
if (rowNum == 1) {
assertTrue(recordInfo.get(3).equals("last_modified"));
} else if (rowNum == 2) {
assertTrue(recordInfo.get(0).equals("0000-0002-0499-7333"));
}
}
entryNum++;
assertTrue(entryNum == 1);
entry = input.getNextTarEntry();
}
}
@Test
private void lambdaFileCounterTest() throws Exception {
final String lastUpdate = "2020-09-29 00:00:00";
OrcidDownloader downloader = new OrcidDownloader();
TarArchiveInputStream input = new TarArchiveInputStream(
new GzipCompressorInputStream(new FileInputStream("/tmp/last_modified.csv.tar")));
TarArchiveEntry entry = input.getNextTarEntry();
BufferedReader br = null;
StringBuilder sb = new StringBuilder();
int rowNum = 0;
int entryNum = 0;
int modified = 0;
while (entry != null) {
br = new BufferedReader(new InputStreamReader(input)); // Read directly from tarInput
String line;
while ((line = br.readLine()) != null) {
String[] values = line.toString().split(",");
List<String> recordInfo = Arrays.asList(values);
String orcidId = recordInfo.get(0);
if (downloader.isModified(orcidId, recordInfo.get(3))) {
modified++;
}
rowNum++;
}
entryNum++;
entry = input.getNextTarEntry();
}
logToFile("rowNum: " + rowNum);
logToFile("modified: " + modified);
}
private void logToFile(String log)
throws IOException {
log = log.concat("\n");
Path path = Paths.get("/tmp/orcid_log.txt");
Files.write(path, log.getBytes(), StandardOpenOption.APPEND);
}
@Test
private void slowedDownDownloadTest() throws Exception {
String orcid = "0000-0001-5496-1243";
String record = slowedDownDownload(orcid);
String filename = "/tmp/downloaded_".concat(orcid).concat(".xml");
File f = new File(filename);
OutputStream outStream = new FileOutputStream(f);
IOUtils.write(record.getBytes(), outStream);
}
private String slowedDownDownload(String orcidId) throws Exception {
try (CloseableHttpClient client = HttpClients.createDefault()) {
HttpGet httpGet = new HttpGet("https://api.orcid.org/v3.0/" + orcidId + "/record");
httpGet.addHeader("Accept", "application/vnd.orcid+xml");
httpGet.addHeader("Authorization", "Bearer 78fdb232-7105-4086-8570-e153f4198e3d");
long start = System.currentTimeMillis();
CloseableHttpResponse response = client.execute(httpGet);
long endReq = System.currentTimeMillis();
long reqSessionDuration = endReq - start;
logToFile("req time (millisec): " + reqSessionDuration);
if (reqSessionDuration < 1000) {
logToFile("wait ....");
Thread.sleep(1000 - reqSessionDuration);
}
long end = System.currentTimeMillis();
long total = end - start;
logToFile("total time (millisec): " + total);
if (response.getStatusLine().getStatusCode() != 200) {
logToFile("Downloading " + orcidId + " status code: " + response.getStatusLine().getStatusCode());
}
return IOUtils.toString(response.getEntity().getContent());
} catch (Throwable e) {
e.printStackTrace();
}
return new String("");
}
}

@ -2,17 +2,19 @@
package eu.dnetlib.doiboost.orcid.xml;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
import org.apache.commons.io.IOUtils;
import org.junit.jupiter.api.Test;
import eu.dnetlib.doiboost.orcid.model.AuthorData;
import eu.dnetlib.dhp.schema.orcid.AuthorData;
import eu.dnetlib.doiboost.orcid.model.WorkData;
import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter;
public class XMLRecordParserTest {
@Test
public void testOrcidAuthorDataXMLParser() throws Exception {
private void testOrcidAuthorDataXMLParser() throws Exception {
String xml = IOUtils.toString(this.getClass().getResourceAsStream("summary_0000-0001-6828-479X.xml"));
@ -27,7 +29,7 @@ public class XMLRecordParserTest {
}
@Test
public void testOrcidXMLErrorRecordParser() throws Exception {
private void testOrcidXMLErrorRecordParser() throws Exception {
String xml = IOUtils.toString(this.getClass().getResourceAsStream("summary_error.xml"));
@ -40,11 +42,11 @@ public class XMLRecordParserTest {
}
@Test
public void testOrcidWorkDataXMLParser() throws Exception {
private void testOrcidWorkDataXMLParser() throws Exception {
String xml = IOUtils
.toString(
this.getClass().getResourceAsStream("activity_work_0000-0002-5982-8983.xml"));
this.getClass().getResourceAsStream("activity_work_0000-0003-2760-1191.xml"));
XMLRecordParser p = new XMLRecordParser();
@ -55,4 +57,21 @@ public class XMLRecordParserTest {
assertNotNull(workData.getDoi());
System.out.println("doi: " + workData.getDoi());
}
@Test
public void testOrcidOtherNamesXMLParser() throws Exception {
String xml = IOUtils
.toString(
this.getClass().getResourceAsStream("summary_0000-0001-5109-1000_othername.xml"));
XMLRecordParser p = new XMLRecordParser();
AuthorData authorData = XMLRecordParser.VTDParseAuthorData(xml.getBytes());
assertNotNull(authorData);
assertNotNull(authorData.getOtherNames());
assertTrue(authorData.getOtherNames().get(0).equals("Andrew C. Porteus"));
String jsonData = JsonWriter.create(authorData);
assertNotNull(jsonData);
}
}

@ -0,0 +1,78 @@
package eu.dnetlib.doiboost.orcidnodoi;
import static org.junit.jupiter.api.Assertions.*;
import org.apache.commons.io.IOUtils;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.gson.JsonElement;
import com.google.gson.JsonParser;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.doiboost.orcidnodoi.oaf.PublicationToOaf;
import jdk.nashorn.internal.ir.annotations.Ignore;
public class PublicationToOafTest {
private static final Logger logger = LoggerFactory.getLogger(PublicationToOafTest.class);
@Test
@Ignore
private void convertOafPublicationTest() throws Exception {
String jsonPublication = IOUtils
.toString(
PublicationToOafTest.class.getResourceAsStream("publication.json"));
JsonElement j = new JsonParser().parse(jsonPublication);
logger.info("json publication loaded: " + j.toString());
PublicationToOaf publicationToOaf = new PublicationToOaf();
Publication oafPublication = (Publication) publicationToOaf
.generatePublicationActionsFromDump(j.getAsJsonObject());
assertNotNull(oafPublication.getId());
assertNotNull(oafPublication.getOriginalId());
assertEquals(oafPublication.getOriginalId().get(0), "60153327");
logger.info("oafPublication.getId(): " + oafPublication.getId());
assertEquals(
oafPublication.getTitle().get(0).getValue(),
"Evaluation of a percutaneous optical fibre glucose sensor (FiberSense) across the glycemic range with rapid glucoseexcursions using the glucose clamp");
assertNotNull(oafPublication.getLastupdatetimestamp());
assertNotNull(oafPublication.getDateofcollection());
assertNotNull(oafPublication.getDateoftransformation());
assertTrue(oafPublication.getAuthor().size() == 7);
oafPublication.getAuthor().forEach(a -> {
assertNotNull(a.getFullname());
assertNotNull(a.getRank());
logger.info("a.getFullname(): " + a.getFullname());
if (a.getName() != null) {
logger.info("a.getName(): " + a.getName());
}
if (a.getSurname() != null) {
logger.info("a.getSurname(): " + a.getSurname());
}
logger.info("a.getRank(): " + a.getRank());
if (a.getPid() != null) {
logger.info("a.getPid(): " + a.getPid().get(0).getValue());
}
});
assertNotNull(oafPublication.getCollectedfrom());
if (oafPublication.getSource() != null) {
logger.info((oafPublication.getSource().get(0).getValue()));
}
if (oafPublication.getExternalReference() != null) {
oafPublication.getExternalReference().forEach(e -> {
assertNotNull(e.getRefidentifier());
assertEquals(e.getQualifier().getSchemeid(), "dnet:pid_types");
});
}
assertNotNull(oafPublication.getInstance());
oafPublication.getInstance().forEach(i -> {
assertNotNull(i.getInstancetype().getClassid());
logger.info("i.getInstancetype().getClassid(): " + i.getInstancetype().getClassid());
assertNotNull(i.getInstancetype().getClassname());
logger.info("i.getInstancetype().getClassname(): " + i.getInstancetype().getClassname());
});
}
}

@ -0,0 +1,348 @@
package eu.dnetlib.doiboost.orcidnodoi.xml;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.io.IOException;
import java.util.*;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.collect.Lists;
import com.ximpleware.NavException;
import com.ximpleware.ParseException;
import com.ximpleware.XPathEvalException;
import com.ximpleware.XPathParseException;
import eu.dnetlib.dhp.parser.utility.VtdException;
import eu.dnetlib.dhp.schema.orcid.AuthorData;
import eu.dnetlib.doiboost.orcidnodoi.model.Contributor;
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher;
public class OrcidNoDoiTest {
private static final Logger logger = LoggerFactory.getLogger(OrcidNoDoiTest.class);
static String nameA = "Khairy";
static String surnameA = "Abdel Dayem";
static String orcidIdA = "0000-0003-2760-1191";
@Test
public void readPublicationFieldsTest()
throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException {
logger.info("running loadPublicationFieldsTest ....");
String xml = IOUtils
.toString(
OrcidNoDoiTest.class.getResourceAsStream("activity_work_0000-0002-2536-4498.xml"));
if (xml == null) {
logger.info("Resource not found");
}
XMLRecordParserNoDoi p = new XMLRecordParserNoDoi();
if (p == null) {
logger.info("XMLRecordParserNoDoi null");
}
WorkDataNoDoi workData = null;
try {
workData = p.VTDParseWorkData(xml.getBytes());
} catch (Exception e) {
logger.error("parsing xml", e);
}
assertNotNull(workData);
assertNotNull(workData.getOid());
logger.info("oid: " + workData.getOid());
assertNotNull(workData.getTitles());
logger.info("titles: ");
workData.getTitles().forEach(t -> {
logger.info(t);
});
logger.info("source: " + workData.getSourceName());
logger.info("type: " + workData.getType());
logger.info("urls: ");
workData.getUrls().forEach(u -> {
logger.info(u);
});
logger.info("publication date: ");
workData.getPublicationDates().forEach(d -> {
logger.info(d.getYear() + " - " + d.getMonth() + " - " + d.getDay());
});
logger.info("external id: ");
workData.getExtIds().removeIf(e -> e.getRelationShip() != null && !e.getRelationShip().equals("self"));
workData.getExtIds().forEach(e -> {
logger.info(e.getType() + " - " + e.getValue() + " - " + e.getRelationShip());
});
logger.info("contributors: ");
workData.getContributors().forEach(c -> {
logger
.info(
c.getName() + " - " + c.getRole() + " - " + c.getSequence());
});
}
@Test
public void authorDoubleMatchTest() throws Exception {
logger.info("running authorSimpleMatchTest ....");
String orcidWork = "activity_work_0000-0003-2760-1191-similarity.xml";
AuthorData author = new AuthorData();
author.setName(nameA);
author.setSurname(surnameA);
author.setOid(orcidIdA);
String xml = IOUtils
.toString(
OrcidNoDoiTest.class.getResourceAsStream(orcidWork));
if (xml == null) {
logger.info("Resource not found");
}
XMLRecordParserNoDoi p = new XMLRecordParserNoDoi();
if (p == null) {
logger.info("XMLRecordParserNoDoi null");
}
WorkDataNoDoi workData = null;
try {
workData = p.VTDParseWorkData(xml.getBytes());
} catch (Exception e) {
logger.error("parsing xml", e);
}
assertNotNull(workData);
Contributor a = workData.getContributors().get(0);
assertTrue(a.getCreditName().equals("Abdel-Dayem K"));
AuthorMatcher.match(author, workData.getContributors());
assertTrue(workData.getContributors().size() == 6);
}
@Test
public void readContributorsTest()
throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException {
logger.info("running loadPublicationFieldsTest ....");
String xml = IOUtils
.toString(
OrcidNoDoiTest.class.getResourceAsStream("activity_work_0000-0003-2760-1191_contributors.xml"));
if (xml == null) {
logger.info("Resource not found");
}
XMLRecordParserNoDoi p = new XMLRecordParserNoDoi();
if (p == null) {
logger.info("XMLRecordParserNoDoi null");
}
WorkDataNoDoi workData = null;
try {
workData = p.VTDParseWorkData(xml.getBytes());
} catch (Exception e) {
logger.error("parsing xml", e);
}
assertNotNull(workData.getContributors());
assertTrue(workData.getContributors().size() == 5);
assertTrue(StringUtils.isBlank(workData.getContributors().get(0).getCreditName()));
assertTrue(workData.getContributors().get(0).getSequence().equals("seq0"));
assertTrue(workData.getContributors().get(0).getRole().equals("role0"));
assertTrue(workData.getContributors().get(1).getCreditName().equals("creditname1"));
assertTrue(StringUtils.isBlank(workData.getContributors().get(1).getSequence()));
assertTrue(StringUtils.isBlank(workData.getContributors().get(1).getRole()));
assertTrue(workData.getContributors().get(2).getCreditName().equals("creditname2"));
assertTrue(workData.getContributors().get(2).getSequence().equals("seq2"));
assertTrue(StringUtils.isBlank(workData.getContributors().get(2).getRole()));
assertTrue(workData.getContributors().get(3).getCreditName().equals("creditname3"));
assertTrue(StringUtils.isBlank(workData.getContributors().get(3).getSequence()));
assertTrue(workData.getContributors().get(3).getRole().equals("role3"));
assertTrue(StringUtils.isBlank(workData.getContributors().get(4).getCreditName()));
assertTrue(workData.getContributors().get(4).getSequence().equals("seq4"));
assertTrue(workData.getContributors().get(4).getRole().equals("role4"));
}
@Test
public void authorSimpleMatchTest() throws Exception {
String orcidWork = "activity_work_0000-0002-5982-8983.xml";
AuthorData author = new AuthorData();
author.setName("Parkhouse");
author.setSurname("H.");
author.setOid("0000-0002-5982-8983");
String xml = IOUtils
.toString(
OrcidNoDoiTest.class.getResourceAsStream(orcidWork));
if (xml == null) {
logger.info("Resource not found");
}
XMLRecordParserNoDoi p = new XMLRecordParserNoDoi();
if (p == null) {
logger.info("XMLRecordParserNoDoi null");
}
WorkDataNoDoi workData = null;
try {
workData = p.VTDParseWorkData(xml.getBytes());
} catch (Exception e) {
logger.error("parsing xml", e);
}
assertNotNull(workData);
Contributor a = workData.getContributors().get(0);
assertTrue(a.getCreditName().equals("Parkhouse, H."));
AuthorMatcher.match(author, workData.getContributors());
assertTrue(workData.getContributors().size() == 2);
Contributor c = workData.getContributors().get(0);
assertTrue(c.getOid().equals("0000-0002-5982-8983"));
assertTrue(c.getName().equals("Parkhouse"));
assertTrue(c.getSurname().equals("H."));
assertTrue(c.getCreditName().equals("Parkhouse, H."));
}
@Test
public void match() {
AuthorData author = new AuthorData();
author.setName("Joe");
author.setSurname("Dodge");
author.setOid("0000-1111-2222-3333");
Contributor contributor = new Contributor();
contributor.setCreditName("Joe Dodge");
List<Contributor> contributors = Arrays.asList(contributor);
AuthorMatcher am = new AuthorMatcher();
int matchCounter = 0;
List<Integer> matchCounters = Arrays.asList(matchCounter);
contributors
.stream()
.filter(c -> !StringUtils.isBlank(c.getCreditName()))
.forEach(c -> {
if (am.simpleMatch(c.getCreditName(), author.getName()) ||
am.simpleMatch(c.getCreditName(), author.getSurname()) ||
am.simpleMatchOnOtherNames(c.getCreditName(), author.getOtherNames())) {
matchCounters.set(0, matchCounters.get(0) + 1);
c.setSimpleMatch(true);
}
});
assertTrue(matchCounters.get(0) == 1);
am.updateAuthorsSimpleMatch(contributors, author);
assertTrue(contributors.get(0).getName().equals("Joe"));
assertTrue(contributors.get(0).getSurname().equals("Dodge"));
assertTrue(contributors.get(0).getCreditName().equals("Joe Dodge"));
assertTrue(contributors.get(0).getOid().equals("0000-1111-2222-3333"));
AuthorData authorX = new AuthorData();
authorX.setName(nameA);
authorX.setSurname(surnameA);
authorX.setOid(orcidIdA);
Contributor contributorA = new Contributor();
contributorA.setCreditName("Abdel-Dayem Khai");
Contributor contributorB = new Contributor();
contributorB.setCreditName("Abdel-Dayem Fake");
List<Contributor> contributorList = new ArrayList<>();
contributorList.add(contributorA);
contributorList.add(contributorB);
int matchCounter2 = 0;
List<Integer> matchCounters2 = Arrays.asList(matchCounter2);
contributorList
.stream()
.filter(c -> !StringUtils.isBlank(c.getCreditName()))
.forEach(c -> {
if (am.simpleMatch(c.getCreditName(), authorX.getName()) ||
am.simpleMatch(c.getCreditName(), authorX.getSurname()) ||
am.simpleMatchOnOtherNames(c.getCreditName(), author.getOtherNames())) {
int currentCounter = matchCounters2.get(0);
currentCounter += 1;
matchCounters2.set(0, currentCounter);
c.setSimpleMatch(true);
}
});
assertTrue(matchCounters2.get(0) == 2);
assertTrue(contributorList.get(0).isSimpleMatch());
assertTrue(contributorList.get(1).isSimpleMatch());
Optional<Contributor> optCon = contributorList
.stream()
.filter(c -> c.isSimpleMatch())
.filter(c -> !StringUtils.isBlank(c.getCreditName()))
.map(c -> {
c.setScore(am.bestMatch(authorX.getName(), authorX.getSurname(), c.getCreditName()));
return c;
})
.filter(c -> c.getScore() >= AuthorMatcher.threshold)
.max(Comparator.comparing(c -> c.getScore()));
assertTrue(optCon.isPresent());
final Contributor bestMatchContributor = optCon.get();
bestMatchContributor.setBestMatch(true);
assertTrue(bestMatchContributor.getCreditName().equals("Abdel-Dayem Khai"));
assertTrue(contributorList.get(0).isBestMatch());
assertTrue(!contributorList.get(1).isBestMatch());
am.updateAuthorsSimilarityMatch(contributorList, authorX);
assertTrue(contributorList.get(0).getName().equals(nameA));
assertTrue(contributorList.get(0).getSurname().equals(surnameA));
assertTrue(contributorList.get(0).getCreditName().equals("Abdel-Dayem Khai"));
assertTrue(contributorList.get(0).getOid().equals(orcidIdA));
assertTrue(StringUtils.isBlank(contributorList.get(1).getOid()));
}
@Test
public void authorBestMatchTest() throws Exception {
String name = "Khairy";
String surname = "Abdel Dayem";
String orcidWork = "activity_work_0000-0003-2760-1191.xml";
AuthorData author = new AuthorData();
author.setName(name);
author.setSurname(surname);
author.setOid(orcidIdA);
String xml = IOUtils
.toString(
OrcidNoDoiTest.class.getResourceAsStream(orcidWork));
if (xml == null) {
logger.info("Resource not found");
}
XMLRecordParserNoDoi p = new XMLRecordParserNoDoi();
if (p == null) {
logger.info("XMLRecordParserNoDoi null");
}
WorkDataNoDoi workData = null;
try {
workData = p.VTDParseWorkData(xml.getBytes());
} catch (Exception e) {
logger.error("parsing xml", e);
}
AuthorMatcher.match(author, workData.getContributors());
assertTrue(workData.getContributors().size() == 5);
List<Contributor> c = workData.getContributors();
assertTrue(c.get(0).getName().equals(name));
assertTrue(c.get(0).getSurname().equals(surname));
assertTrue(c.get(0).getCreditName().equals("Khair Abde Daye"));
assertTrue(c.get(0).getOid().equals(orcidIdA));
}
@Test
public void otherNamesMatchTest()
throws VtdException, ParseException, IOException, XPathEvalException, NavException, XPathParseException {
AuthorData author = new AuthorData();
author.setName("Joe");
author.setSurname("Dodge");
author.setOid("0000-1111-2222-3333");
String otherName1 = new String("Joe Dr. Dodge");
String otherName2 = new String("XY");
List<String> others = Lists.newArrayList();
others.add(otherName1);
others.add(otherName2);
author.setOtherNames(others);
Contributor contributor = new Contributor();
contributor.setCreditName("XY");
List<Contributor> contributors = Arrays.asList(contributor);
AuthorMatcher.match(author, contributors);
assertTrue(contributors.get(0).getName().equals("Joe"));
assertTrue(contributors.get(0).getSurname().equals("Dodge"));
assertTrue(contributors.get(0).getOid().equals("0000-1111-2222-3333"));
}
}

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save