forked from D-Net/dnet-hadoop

commit bece04b330: Merge branch 'master' of https://code-repo.d4science.org/D-Net/dnet-hadoop
@@ -0,0 +1,30 @@
package eu.dnetlib.dhp.common;

import com.google.common.collect.Maps;

import java.util.Map;

public class Constants {

	public static final Map<String, String> accessRightsCoarMap = Maps.newHashMap();
	public static final Map<String, String> coarCodeLabelMap = Maps.newHashMap();

	public static String COAR_ACCESS_RIGHT_SCHEMA = "http://vocabularies.coar-repositories.org/documentation/access_rights/";

	static {
		accessRightsCoarMap.put("OPEN", "c_abf2");
		accessRightsCoarMap.put("RESTRICTED", "c_16ec");
		accessRightsCoarMap.put("OPEN SOURCE", "c_abf2");
		accessRightsCoarMap.put("CLOSED", "c_14cb");
		accessRightsCoarMap.put("EMBARGO", "c_f1cf");
	}

	static {
		coarCodeLabelMap.put("c_abf2", "OPEN");
		coarCodeLabelMap.put("c_16ec", "RESTRICTED");
		coarCodeLabelMap.put("c_14cb", "CLOSED");
		coarCodeLabelMap.put("c_f1cf", "EMBARGO");
	}

}
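The two maps above are meant to be read together: an OpenAIRE access-right classid resolves to a COAR code, and the code resolves back to a human-readable label. A minimal sketch of that lookup, not part of the commit, assuming the AccessRight.newInstance(code, label, scheme) factory from the dump schema that the mapper below also uses:

// Illustrative sketch only (not part of the commit): shows how the Constants maps are meant to be consumed.
import eu.dnetlib.dhp.common.Constants;
import eu.dnetlib.dhp.schema.dump.oaf.AccessRight;

public class CoarLookupSketch {

	// Translates an OpenAIRE access-right classid (e.g. "OPEN") into a COAR access right,
	// mirroring what GraphResultMapper does below; returns null for unmapped values such as UNKNOWN.
	public static AccessRight toCoar(String classid) {
		if (!Constants.accessRightsCoarMap.containsKey(classid)) {
			return null;
		}
		String code = Constants.accessRightsCoarMap.get(classid); // e.g. "OPEN" -> "c_abf2"
		return AccessRight
			.newInstance(code, Constants.coarCodeLabelMap.get(code), Constants.COAR_ACCESS_RIGHT_SCHEMA);
	}
}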
@@ -0,0 +1,415 @@
package eu.dnetlib.dhp.common;

import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.dump.oaf.*;
import eu.dnetlib.dhp.schema.dump.oaf.community.CommunityInstance;
import eu.dnetlib.dhp.schema.dump.oaf.community.CommunityResult;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.Journal;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;

import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;

public class GraphResultMapper implements Serializable {

	public static <E extends eu.dnetlib.dhp.schema.oaf.OafEntity> Result map(E in) {

		CommunityResult out = new CommunityResult();

		eu.dnetlib.dhp.schema.oaf.Result input = (eu.dnetlib.dhp.schema.oaf.Result) in;
		Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> ort = Optional.ofNullable(input.getResulttype());
		if (ort.isPresent()) {
			switch (ort.get().getClassid()) {
				case "publication":
					Optional<Journal> journal = Optional
						.ofNullable(((eu.dnetlib.dhp.schema.oaf.Publication) input).getJournal());
					if (journal.isPresent()) {
						Journal j = journal.get();
						Container c = new Container();
						c.setConferencedate(j.getConferencedate());
						c.setConferenceplace(j.getConferenceplace());
						c.setEdition(j.getEdition());
						c.setEp(j.getEp());
						c.setIss(j.getIss());
						c.setIssnLinking(j.getIssnLinking());
						c.setIssnOnline(j.getIssnOnline());
						c.setIssnPrinted(j.getIssnPrinted());
						c.setName(j.getName());
						c.setSp(j.getSp());
						c.setVol(j.getVol());
						out.setContainer(c);
						out.setType(ModelConstants.PUBLICATION_DEFAULT_RESULTTYPE.getClassname());
					}
					break;
				case "dataset":
					eu.dnetlib.dhp.schema.oaf.Dataset id = (eu.dnetlib.dhp.schema.oaf.Dataset) input;
					Optional.ofNullable(id.getSize()).ifPresent(v -> out.setSize(v.getValue()));
					Optional.ofNullable(id.getVersion()).ifPresent(v -> out.setVersion(v.getValue()));

					out
						.setGeolocation(
							Optional
								.ofNullable(id.getGeolocation())
								.map(
									igl -> igl
										.stream()
										.filter(Objects::nonNull)
										.map(gli -> {
											GeoLocation gl = new GeoLocation();
											gl.setBox(gli.getBox());
											gl.setPlace(gli.getPlace());
											gl.setPoint(gli.getPoint());
											return gl;
										})
										.collect(Collectors.toList()))
								.orElse(null));

					out.setType(ModelConstants.DATASET_DEFAULT_RESULTTYPE.getClassname());
					break;
				case "software":

					eu.dnetlib.dhp.schema.oaf.Software is = (eu.dnetlib.dhp.schema.oaf.Software) input;
					Optional
						.ofNullable(is.getCodeRepositoryUrl())
						.ifPresent(value -> out.setCodeRepositoryUrl(value.getValue()));
					Optional
						.ofNullable(is.getDocumentationUrl())
						.ifPresent(
							value -> out
								.setDocumentationUrl(
									value
										.stream()
										.map(v -> v.getValue())
										.collect(Collectors.toList())));

					Optional
						.ofNullable(is.getProgrammingLanguage())
						.ifPresent(value -> out.setProgrammingLanguage(value.getClassid()));

					out.setType(ModelConstants.SOFTWARE_DEFAULT_RESULTTYPE.getClassname());
					break;
				case "other":

					eu.dnetlib.dhp.schema.oaf.OtherResearchProduct ir = (eu.dnetlib.dhp.schema.oaf.OtherResearchProduct) input;
					out
						.setContactgroup(
							Optional
								.ofNullable(ir.getContactgroup())
								.map(value -> value.stream().map(cg -> cg.getValue()).collect(Collectors.toList()))
								.orElse(null));

					out
						.setContactperson(
							Optional
								.ofNullable(ir.getContactperson())
								.map(value -> value.stream().map(cp -> cp.getValue()).collect(Collectors.toList()))
								.orElse(null));
					out
						.setTool(
							Optional
								.ofNullable(ir.getTool())
								.map(value -> value.stream().map(t -> t.getValue()).collect(Collectors.toList()))
								.orElse(null));

					out.setType(ModelConstants.ORP_DEFAULT_RESULTTYPE.getClassname());

					break;
			}

			Optional
				.ofNullable(input.getAuthor())
				.ifPresent(ats -> out.setAuthor(ats.stream().map(at -> getAuthor(at)).collect(Collectors.toList())));

			// I do not map Access Right UNKNOWN or OTHER

			Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> oar = Optional.ofNullable(input.getBestaccessright());
			if (oar.isPresent()) {
				if (Constants.accessRightsCoarMap.containsKey(oar.get().getClassid())) {
					String code = Constants.accessRightsCoarMap.get(oar.get().getClassid());
					out
						.setBestaccessright(
							AccessRight
								.newInstance(
									code,
									Constants.coarCodeLabelMap.get(code),
									Constants.COAR_ACCESS_RIGHT_SCHEMA));
				}
			}

			final List<String> contributorList = new ArrayList<>();
			Optional
				.ofNullable(input.getContributor())
				.ifPresent(value -> value.stream().forEach(c -> contributorList.add(c.getValue())));
			out.setContributor(contributorList);

			Optional
				.ofNullable(input.getCountry())
				.ifPresent(
					value -> out
						.setCountry(
							value
								.stream()
								.map(
									c -> {
										if (c.getClassid().equals((ModelConstants.UNKNOWN))) {
											return null;
										}
										Country country = new Country();
										country.setCode(c.getClassid());
										country.setLabel(c.getClassname());
										Optional
											.ofNullable(c.getDataInfo())
											.ifPresent(
												provenance -> country
													.setProvenance(
														Provenance
															.newInstance(
																provenance
																	.getProvenanceaction()
																	.getClassname(),
																c.getDataInfo().getTrust())));
										return country;
									})
								.filter(Objects::nonNull)
								.collect(Collectors.toList())));

			final List<String> coverageList = new ArrayList<>();
			Optional
				.ofNullable(input.getCoverage())
				.ifPresent(value -> value.stream().forEach(c -> coverageList.add(c.getValue())));
			out.setCoverage(coverageList);

			out.setDateofcollection(input.getDateofcollection());

			final List<String> descriptionList = new ArrayList<>();
			Optional
				.ofNullable(input.getDescription())
				.ifPresent(value -> value.forEach(d -> descriptionList.add(d.getValue())));
			out.setDescription(descriptionList);
			Optional<Field<String>> oStr = Optional.ofNullable(input.getEmbargoenddate());
			if (oStr.isPresent()) {
				out.setEmbargoenddate(oStr.get().getValue());
			}

			final List<String> formatList = new ArrayList<>();
			Optional
				.ofNullable(input.getFormat())
				.ifPresent(value -> value.stream().forEach(f -> formatList.add(f.getValue())));
			out.setFormat(formatList);
			out.setId(input.getId());
			out.setOriginalId(input.getOriginalId());

			Optional<List<eu.dnetlib.dhp.schema.oaf.Instance>> oInst = Optional
				.ofNullable(input.getInstance());

			if (oInst.isPresent()) {
				out
					.setInstance(
						oInst.get().stream().map(i -> getInstance(i)).collect(Collectors.toList()));

			}

			Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> oL = Optional.ofNullable(input.getLanguage());
			if (oL.isPresent()) {
				eu.dnetlib.dhp.schema.oaf.Qualifier language = oL.get();
				out.setLanguage(Qualifier.newInstance(language.getClassid(), language.getClassname()));
			}
			Optional<Long> oLong = Optional.ofNullable(input.getLastupdatetimestamp());
			if (oLong.isPresent()) {
				out.setLastupdatetimestamp(oLong.get());
			}
			Optional<List<StructuredProperty>> otitle = Optional.ofNullable(input.getTitle());
			if (otitle.isPresent()) {
				List<StructuredProperty> iTitle = otitle
					.get()
					.stream()
					.filter(t -> t.getQualifier().getClassid().equalsIgnoreCase("main title"))
					.collect(Collectors.toList());
				if (iTitle.size() > 0) {
					out.setMaintitle(iTitle.get(0).getValue());
				}

				iTitle = otitle
					.get()
					.stream()
					.filter(t -> t.getQualifier().getClassid().equalsIgnoreCase("subtitle"))
					.collect(Collectors.toList());
				if (iTitle.size() > 0) {
					out.setSubtitle(iTitle.get(0).getValue());
				}

			}

			List<ControlledField> pids = new ArrayList<>();
			Optional
				.ofNullable(input.getPid())
				.ifPresent(
					value -> value
						.stream()
						.forEach(
							p -> pids
								.add(
									ControlledField
										.newInstance(p.getQualifier().getClassid(), p.getValue()))));
			out.setPid(pids);
			oStr = Optional.ofNullable(input.getDateofacceptance());
			if (oStr.isPresent()) {
				out.setPublicationdate(oStr.get().getValue());
			}
			oStr = Optional.ofNullable(input.getPublisher());
			if (oStr.isPresent()) {
				out.setPublisher(oStr.get().getValue());
			}

			List<String> sourceList = new ArrayList<>();
			Optional
				.ofNullable(input.getSource())
				.ifPresent(value -> value.stream().forEach(s -> sourceList.add(s.getValue())));
			// out.setSource(input.getSource().stream().map(s -> s.getValue()).collect(Collectors.toList()));
			List<Subject> subjectList = new ArrayList<>();
			Optional
				.ofNullable(input.getSubject())
				.ifPresent(
					value -> value
						.forEach(s -> subjectList.add(getSubject(s))));

			out.setSubjects(subjectList);

			out.setType(input.getResulttype().getClassid());
		}

		out
			.setCollectedfrom(
				input
					.getCollectedfrom()
					.stream()
					.map(cf -> KeyValue.newInstance(cf.getKey(), cf.getValue()))
					.collect(Collectors.toList()));

		return out;

	}

	private static CommunityInstance getInstance(eu.dnetlib.dhp.schema.oaf.Instance i) {
		CommunityInstance instance = new CommunityInstance();

		setCommonValue(i, instance);

		instance
			.setCollectedfrom(
				KeyValue
					.newInstance(i.getCollectedfrom().getKey(), i.getCollectedfrom().getValue()));

		instance
			.setHostedby(
				KeyValue.newInstance(i.getHostedby().getKey(), i.getHostedby().getValue()));

		return instance;

	}

	private static <I extends Instance> void setCommonValue(eu.dnetlib.dhp.schema.oaf.Instance i, I instance) {
		Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> opAr = Optional
			.ofNullable(i.getAccessright());
		if (opAr.isPresent()) {
			if (Constants.accessRightsCoarMap.containsKey(opAr.get().getClassid())) {
				String code = Constants.accessRightsCoarMap.get(opAr.get().getClassid());
				instance
					.setAccessright(
						AccessRight
							.newInstance(
								code,
								Constants.coarCodeLabelMap.get(code),
								Constants.COAR_ACCESS_RIGHT_SCHEMA));
			}
		}

		Optional
			.ofNullable(i.getLicense())
			.ifPresent(value -> instance.setLicense(value.getValue()));
		Optional
			.ofNullable(i.getDateofacceptance())
			.ifPresent(value -> instance.setPublicationdate(value.getValue()));
		Optional
			.ofNullable(i.getRefereed())
			.ifPresent(value -> instance.setRefereed(value.getClassname()));
		Optional
			.ofNullable(i.getInstancetype())
			.ifPresent(value -> instance.setType(value.getClassname()));
		Optional.ofNullable(i.getUrl()).ifPresent(value -> instance.setUrl(value));

	}

	private static Subject getSubject(StructuredProperty s) {
		Subject subject = new Subject();
		subject.setSubject(ControlledField.newInstance(s.getQualifier().getClassid(), s.getValue()));
		Optional<DataInfo> di = Optional.ofNullable(s.getDataInfo());
		if (di.isPresent()) {
			Provenance p = new Provenance();
			p.setProvenance(di.get().getProvenanceaction().getClassname());
			p.setTrust(di.get().getTrust());
			subject.setProvenance(p);
		}

		return subject;
	}

	private static Author getAuthor(eu.dnetlib.dhp.schema.oaf.Author oa) {
		Author a = new Author();
		a.setFullname(oa.getFullname());
		a.setName(oa.getName());
		a.setSurname(oa.getSurname());
		a.setRank(oa.getRank());

		Optional<List<StructuredProperty>> oPids = Optional
			.ofNullable(oa.getPid());
		if (oPids.isPresent()) {
			Pid pid = getOrcid(oPids.get());
			if (pid != null) {
				a.setPid(pid);
			}
		}

		return a;
	}

	private static Pid getOrcid(List<StructuredProperty> p) {
		for (StructuredProperty pid : p) {
			if (pid.getQualifier().getClassid().equals(ModelConstants.ORCID)) {
				Optional<DataInfo> di = Optional.ofNullable(pid.getDataInfo());
				if (di.isPresent()) {
					return Pid
						.newInstance(
							ControlledField
								.newInstance(
									pid.getQualifier().getClassid(),
									pid.getValue()),
							Provenance
								.newInstance(
									di.get().getProvenanceaction().getClassname(),
									di.get().getTrust()));
				} else {
					return Pid
						.newInstance(
							ControlledField
								.newInstance(
									pid.getQualifier().getClassid(),
									pid.getValue()));
				}
			}
		}
		return null;
	}

}
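As a quick orientation for the new mapper, a hedged usage sketch follows; it is illustrative only and not part of the commit. It assumes the usual bean setters on the oaf model classes (setId, setResulttype, setCollectedfrom) and that the dump-schema Result exposes getType() for the value the mapper sets.

// Illustrative sketch only (not part of the commit); the identifier below is hypothetical.
import java.util.ArrayList;

import eu.dnetlib.dhp.common.GraphResultMapper;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.dump.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.Publication;

public class GraphResultMapperSketch {
	public static void main(String[] args) {
		Publication in = new Publication();
		in.setId("50|doi_________::deadbeef");                    // hypothetical OpenAIRE identifier
		in.setResulttype(ModelConstants.PUBLICATION_DEFAULT_RESULTTYPE);
		in.setCollectedfrom(new ArrayList<>());                   // map() streams collectedfrom, so it must not be null

		Result dumped = GraphResultMapper.map(in);                // returns a CommunityResult instance
		System.out.println(dumped.getType());                     // expected: "publication"
	}
}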
@@ -90,9 +90,6 @@ public class MakeTarArchive implements Serializable {
 			String p_string = p.toString();
 			if (!p_string.endsWith("_SUCCESS")) {
 				String name = p_string.substring(p_string.lastIndexOf("/") + 1);
-				if (name.trim().equalsIgnoreCase("communities_infrastructures")) {
-					name = "communities_infrastructures.json";
-				}
 				TarArchiveEntry entry = new TarArchiveEntry(dir_name + "/" + name);
 				entry.setSize(fileStatus.getLen());
 				current_size += fileStatus.getLen();
@@ -6,7 +6,7 @@
 		<groupId>eu.dnetlib.dhp</groupId>
 		<artifactId>dhp</artifactId>
 		<version>1.2.4-SNAPSHOT</version>
-		<relativePath>../</relativePath>
+		<relativePath>../pom.xml</relativePath>
 	</parent>
 
 	<artifactId>dhp-schemas</artifactId>
@@ -7,6 +7,10 @@ import eu.dnetlib.dhp.schema.oaf.Qualifier;
 
 public class ModelConstants {
 
+	public static final String ORCID = "orcid";
+	public static final String ORCID_PENDING = "orcid_pending";
+	public static final String ORCID_CLASSNAME = "Open Researcher and Contributor ID";
+
 	public static final String DNET_SUBJECT_TYPOLOGIES = "dnet:subject_classification_typologies";
 	public static final String DNET_RESULT_TYPOLOGIES = "dnet:result_typologies";
 	public static final String DNET_PUBLICATION_RESOURCE = "dnet:publication_resource";
@@ -2,8 +2,12 @@
 package eu.dnetlib.dhp.schema.oaf;
 
 import java.io.Serializable;
+import java.util.Collection;
 import java.util.List;
 import java.util.Objects;
+import java.util.Optional;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
 
 public abstract class Oaf implements Serializable {
@@ -40,9 +44,35 @@ public abstract class Oaf implements Serializable {
 		this.lastupdatetimestamp = lastupdatetimestamp;
 	}
 
-	public void mergeOAFDataInfo(Oaf e) {
-		if (e.getDataInfo() != null && compareTrust(this, e) < 0)
-			dataInfo = e.getDataInfo();
+	public void mergeFrom(Oaf o) {
+		if (Objects.isNull(o)) {
+			return;
+		}
+		setCollectedfrom(
+			Stream
+				.concat(
+					Optional
+						.ofNullable(getCollectedfrom())
+						.map(Collection::stream)
+						.orElse(Stream.empty()),
+					Optional
+						.ofNullable(o.getCollectedfrom())
+						.map(Collection::stream)
+						.orElse(Stream.empty()))
+				.distinct() // relies on KeyValue.equals
+				.collect(Collectors.toList()));
+
+		mergeOAFDataInfo(o);
+
+		setLastupdatetimestamp(
+			Math.max(
+				Optional.ofNullable(getLastupdatetimestamp()).orElse(0L),
+				Optional.ofNullable(o.getLastupdatetimestamp()).orElse(0L)));
+	}
+
+	public void mergeOAFDataInfo(Oaf o) {
+		if (o.getDataInfo() != null && compareTrust(this, o) < 0)
+			dataInfo = o.getDataInfo();
 	}
 
 	protected String extractTrust(Oaf e) {
@@ -62,7 +92,7 @@ public abstract class Oaf implements Serializable {
 		if (o == null || getClass() != o.getClass())
 			return false;
 		Oaf oaf = (Oaf) o;
-		return Objects.equals(dataInfo, oaf.dataInfo)
+		return Objects.equals(getDataInfo(), oaf.getDataInfo())
 			&& Objects.equals(lastupdatetimestamp, oaf.lastupdatetimestamp);
 	}
 
@@ -78,14 +78,10 @@ public abstract class OafEntity extends Oaf implements Serializable {
 	}
 
 	public void mergeFrom(OafEntity e) {
-		if (e == null)
-			return;
+		super.mergeFrom(e);
 
 		originalId = mergeLists(originalId, e.getOriginalId());
 
-		collectedfrom = mergeLists(collectedfrom, e.getCollectedfrom());
-
 		pid = mergeLists(pid, e.getPid());
 
 		if (e.getDateofcollection() != null && compareTrust(this, e) < 0)
@@ -130,19 +130,7 @@ public class Relation extends Oaf {
 				Objects.equals(getSubRelType(), r.getSubRelType()), "subRelType(s) must be equal");
 		checkArgument(Objects.equals(getRelClass(), r.getRelClass()), "relClass(es) must be equal");
 
-		setCollectedfrom(
-			Stream
-				.concat(
-					Optional
-						.ofNullable(getCollectedfrom())
-						.map(Collection::stream)
-						.orElse(Stream.empty()),
-					Optional
-						.ofNullable(r.getCollectedfrom())
-						.map(Collection::stream)
-						.orElse(Stream.empty()))
-				.distinct() // relies on KeyValue.equals
-				.collect(Collectors.toList()));
+		super.mergeFrom(r);
 	}
 
 	@Override
@@ -1,7 +1,14 @@
 
-package eu.dnetlib.doiboost.orcid.model;
+package eu.dnetlib.dhp.schema.orcid;
 
 import java.io.Serializable;
+import java.util.List;
+
+import com.google.common.collect.Lists;
+
+/**
+ * This class models the data that are retrieved from orcid publication
+ */
 
 public class AuthorData implements Serializable {
@@ -10,6 +17,7 @@ public class AuthorData implements Serializable {
 	private String surname;
 	private String creditName;
 	private String errorCode;
+	private List<String> otherNames;
 
 	public String getErrorCode() {
 		return errorCode;
@@ -50,4 +58,15 @@ public class AuthorData implements Serializable {
 	public void setOid(String oid) {
 		this.oid = oid;
 	}
+
+	public List<String> getOtherNames() {
+		return otherNames;
+	}
+
+	public void setOtherNames(List<String> otherNames) {
+		if (this.otherNames == null) {
+			this.otherNames = Lists.newArrayList();
+		}
+		this.otherNames = otherNames;
+	}
 }
@@ -0,0 +1,25 @@
package eu.dnetlib.dhp.schema.orcid;

import java.util.List;

public class OrcidDOI {
	private String doi;
	private List<AuthorData> authors;

	public String getDoi() {
		return doi;
	}

	public void setDoi(String doi) {
		this.doi = doi;
	}

	public List<AuthorData> getAuthors() {
		return authors;
	}

	public void setAuthors(List<AuthorData> authors) {
		this.authors = authors;
	}
}
@@ -4,8 +4,13 @@ package eu.dnetlib.dhp.broker.oa;
 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
 
 import java.io.IOException;
+import java.util.Arrays;
+import java.util.HashSet;
 import java.util.Optional;
+import java.util.Set;
+import java.util.stream.Collectors;
 
+import org.apache.commons.codec.digest.DigestUtils;
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.hadoop.conf.Configuration;
@@ -13,6 +18,8 @@ import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.function.FilterFunction;
+import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SaveMode;
 import org.slf4j.Logger;
@@ -29,7 +36,7 @@ import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;
 public class PartitionEventsByDsIdJob {
 
 	private static final Logger log = LoggerFactory.getLogger(PartitionEventsByDsIdJob.class);
-	private static final String OPENDOAR_NSPREFIX = "opendoar____::";
+	private static final String OPENDOAR_NSPREFIX = "10|opendoar____::";
 
 	public static void main(final String[] args) throws Exception {
 
@@ -37,7 +44,7 @@ public class PartitionEventsByDsIdJob {
 			IOUtils
 				.toString(
 					PartitionEventsByDsIdJob.class
-						.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json")));
+						.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/od_partitions_params.json")));
 		parser.parseArgument(args);
 
 		final Boolean isSparkSessionManaged = Optional
@@ -54,14 +61,32 @@ public class PartitionEventsByDsIdJob {
 		final String partitionPath = parser.get("workingPath") + "/eventsByOpendoarId";
 		log.info("partitionPath: {}", partitionPath);
 
+		final String opendoarIds = parser.get("opendoarIds");
+		log.info("opendoarIds: {}", opendoarIds);
+
+		final Set<String> validOpendoarIds = new HashSet<>();
+		if (!opendoarIds.trim().equals("-")) {
+			validOpendoarIds
+				.addAll(
+					Arrays
+						.stream(opendoarIds.split(","))
+						.map(String::trim)
+						.filter(StringUtils::isNotBlank)
+						.map(s -> OPENDOAR_NSPREFIX + DigestUtils.md5Hex(s))
+						.collect(Collectors.toSet()));
+		}
+		log.info("validOpendoarIds: {}", validOpendoarIds);
+
 		runWithSparkSession(conf, isSparkSessionManaged, spark -> {
 
 			ClusterUtils
 				.readPath(spark, eventsPath, Event.class)
-				.filter(e -> StringUtils.isNotBlank(e.getMap().getTargetDatasourceId()))
-				.filter(e -> e.getMap().getTargetDatasourceId().contains(OPENDOAR_NSPREFIX))
-				.limit(10000)
-				.map(e -> messageFromNotification(e), Encoders.bean(ShortEventMessageWithGroupId.class))
+				.filter((FilterFunction<Event>) e -> StringUtils.isNotBlank(e.getMap().getTargetDatasourceId()))
+				.filter((FilterFunction<Event>) e -> e.getMap().getTargetDatasourceId().startsWith(OPENDOAR_NSPREFIX))
+				.filter((FilterFunction<Event>) e -> validOpendoarIds.contains(e.getMap().getTargetDatasourceId()))
+				.map(
+					(MapFunction<Event, ShortEventMessageWithGroupId>) e -> messageFromNotification(e),
+					Encoders.bean(ShortEventMessageWithGroupId.class))
 				.coalesce(1)
 				.write()
 				.partitionBy("group")
@@ -0,0 +1,14 @@
[
  {
    "paramName": "o",
    "paramLongName": "workingPath",
    "paramDescription": "the path where the temporary data will be stored",
    "paramRequired": true
  },
  {
    "paramName": "list",
    "paramLongName": "opendoarIds",
    "paramDescription": "the opendoar IDs whitelist (comma separated)",
    "paramRequired": true
  }
]
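The opendoarIds parameter declared above feeds the whitelist handling added to PartitionEventsByDsIdJob: each raw OpenDOAR id is trimmed, md5-hashed, and prefixed with 10|opendoar____:: before being compared against the event's target datasource id. An illustrative sketch of that normalization, not part of the commit (the sample ids are hypothetical):

// Illustrative sketch only: mirrors the whitelist normalization added in PartitionEventsByDsIdJob.
import java.util.Arrays;
import java.util.Collections;
import java.util.Set;
import java.util.stream.Collectors;

import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang3.StringUtils;

public class OpendoarWhitelistSketch {

	private static final String OPENDOAR_NSPREFIX = "10|opendoar____::";

	public static Set<String> normalize(String opendoarIds) {
		if (opendoarIds.trim().equals("-")) {
			return Collections.emptySet(); // "-" means an empty whitelist
		}
		return Arrays
			.stream(opendoarIds.split(","))
			.map(String::trim)
			.filter(StringUtils::isNotBlank)
			.map(s -> OPENDOAR_NSPREFIX + DigestUtils.md5Hex(s)) // datasource ids carry the md5 of the OpenDOAR id
			.collect(Collectors.toSet());
	}

	public static void main(String[] args) {
		System.out.println(normalize("2659, 1234")); // "2659" and "1234" are hypothetical OpenDOAR ids
	}
}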
@@ -1,60 +1,13 @@
-<workflow-app name="create broker events - partial" xmlns="uri:oozie:workflow:0.5">
+<workflow-app name="partitionEventsByOpendoarIds" xmlns="uri:oozie:workflow:0.5">
 
     <parameters>
         <property>
-            <name>graphInputPath</name>
-            <description>the path where the graph is stored</description>
+            <name>opendoarIds</name>
+            <description>the opendoar IDs whitelist (comma separated)</description>
         </property>
         <property>
             <name>workingPath</name>
             <description>the path where the the generated data will be stored</description>
-        </property>
-        <property>
-            <name>datasourceIdWhitelist</name>
-            <value>-</value>
-            <description>a white list (comma separeted, - for empty list) of datasource ids</description>
-        </property>
-        <property>
-            <name>datasourceTypeWhitelist</name>
-            <value>-</value>
-            <description>a white list (comma separeted, - for empty list) of datasource types</description>
-        </property>
-        <property>
-            <name>datasourceIdBlacklist</name>
-            <value>-</value>
-            <description>a black list (comma separeted, - for empty list) of datasource ids</description>
-        </property>
-        <property>
-            <name>esEventIndexName</name>
-            <description>the elasticsearch index name for events</description>
-        </property>
-        <property>
-            <name>esNotificationsIndexName</name>
-            <description>the elasticsearch index name for notifications</description>
-        </property>
-        <property>
-            <name>esIndexHost</name>
-            <description>the elasticsearch host</description>
-        </property>
-        <property>
-            <name>maxIndexedEventsForDsAndTopic</name>
-            <description>the max number of events for each couple (ds/topic)</description>
-        </property>
-        <property>
-            <name>brokerApiBaseUrl</name>
-            <description>the url of the broker service api</description>
-        </property>
-        <property>
-            <name>brokerDbUrl</name>
-            <description>the url of the broker database</description>
-        </property>
-        <property>
-            <name>brokerDbUser</name>
-            <description>the user of the broker database</description>
-        </property>
-        <property>
-            <name>brokerDbPassword</name>
-            <description>the password of the broker database</description>
         </property>
         <property>
             <name>sparkDriverMemory</name>
@@ -111,13 +64,13 @@
         </configuration>
     </global>
 
-    <start to="partition"/>
+    <start to="opendoarPartition"/>
 
     <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>
 
-    <action name="partition">
+    <action name="opendoarPartition">
         <spark xmlns="uri:oozie:spark-action:0.2">
             <master>yarn</master>
             <mode>cluster</mode>
@@ -134,8 +87,8 @@
             --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
             --conf spark.sql.shuffle.partitions=3840
             </spark-opts>
-            <arg>--graphPath</arg><arg>${graphInputPath}</arg>
             <arg>--workingPath</arg><arg>${workingPath}</arg>
+            <arg>--opendoarIds</arg><arg>${opendoarIds}</arg>
         </spark>
         <ok to="End"/>
         <error to="Kill"/>
@@ -14,7 +14,7 @@
             <plugin>
                 <groupId>net.alchim31.maven</groupId>
                 <artifactId>scala-maven-plugin</artifactId>
-                <version>4.0.1</version>
+                <version>${net.alchim31.maven.version}</version>
                 <executions>
                     <execution>
                         <id>scala-compile-first</id>
@@ -51,7 +51,6 @@
         <dependency>
             <groupId>org.apache.httpcomponents</groupId>
             <artifactId>httpclient</artifactId>
-            <version>4.3.4</version>
         </dependency>
         <dependency>
             <groupId>eu.dnetlib.dhp</groupId>
@@ -84,6 +83,11 @@
             <artifactId>spark-sql_2.11</artifactId>
         </dependency>
 
+        <dependency>
+            <groupId>org.apache.commons</groupId>
+            <artifactId>commons-text</artifactId>
+        </dependency>
+
 
 
     </dependencies>
@@ -62,7 +62,7 @@ object SparkGenerateDoiBoost {
     val orcidPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/orcidPublication").as[Publication].map(p => (p.getId, p))
     fj.joinWith(orcidPublication, fj("_1").equalTo(orcidPublication("_1")), "left").map(applyMerge).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/secondJoin")
 
-    logger.info("Phase 3) Join Result with MAG")
+    logger.info("Phase 4) Join Result with MAG")
     val sj: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/secondJoin").as[Publication].map(p => (p.getId, p))
 
     val magPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/magPublication").as[Publication].map(p => (p.getId, p))
@@ -200,7 +200,7 @@ case object Crossref2Oaf {
       a.setSurname(family)
       a.setFullname(s"$given $family")
       if (StringUtils.isNotBlank(orcid))
-        a.setPid(List(createSP(orcid, ORCID, PID_TYPES)).asJava)
+        a.setPid(List(createSP(orcid, ORCID, PID_TYPES, generateDataInfo())).asJava)
 
       a
     }
@@ -248,7 +248,7 @@ case object Crossref2Oaf {
 
 
   def snsfRule(award:String): String = {
-    var tmp1 = StringUtils.substringAfter(award,"_")
+    val tmp1 = StringUtils.substringAfter(award,"_")
     val tmp2 = StringUtils.substringBefore(tmp1,"/")
     logger.debug(s"From $award to $tmp2")
     tmp2
@@ -294,7 +294,7 @@ case object Crossref2Oaf {
   }
 
   def getProjectId (nsPrefix:String, targetId:String):String = {
-    "40|$nsPrefix::$targetId"
+    s"40|$nsPrefix::$targetId"
   }
 
 
@@ -2,6 +2,7 @@ package eu.dnetlib.doiboost.crossref
 
 import eu.dnetlib.dhp.application.ArgumentApplicationParser
 import org.apache.commons.io.IOUtils
+import org.apache.hadoop.io.{IntWritable, Text}
 import org.apache.spark.SparkConf
 import org.apache.spark.sql.expressions.Aggregator
 import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
@@ -12,21 +13,23 @@ import org.slf4j.{Logger, LoggerFactory}
 
 object CrossrefDataset {
 
-  def extractTimestamp(input:String): Long = {
+  val logger: Logger = LoggerFactory.getLogger(SparkMapDumpIntoOAF.getClass)
+
+  def to_item(input:String):CrossrefDT = {
 
     implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
     lazy val json: json4s.JValue = parse(input)
-    (json\"indexed"\"timestamp").extractOrElse[Long](0)
+    val ts:Long = (json \ "indexed" \ "timestamp").extract[Long]
+    val doi:String = (json \ "DOI").extract[String]
+    CrossrefDT(doi, input, ts)
+
   }
 
 
   def main(args: Array[String]): Unit = {
 
 
-    val logger: Logger = LoggerFactory.getLogger(SparkMapDumpIntoOAF.getClass)
     val conf: SparkConf = new SparkConf()
     val parser = new ArgumentApplicationParser(IOUtils.toString(CrossrefDataset.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/crossref_to_dataset_params.json")))
     parser.parseArgument(args)
@@ -49,9 +52,8 @@ object CrossrefDataset {
       if (a == null)
         return b
 
-      val tb = extractTimestamp(b.json)
-      val ta = extractTimestamp(a.json)
-      if(ta >tb) {
+      if(a.timestamp >b.timestamp) {
         return a
       }
       b
@@ -63,9 +65,7 @@ object CrossrefDataset {
       if (a == null)
        return b
 
-      val tb = extractTimestamp(b.json)
-      val ta = extractTimestamp(a.json)
-      if(ta >tb) {
+      if(a.timestamp >b.timestamp) {
        return a
      }
      b
@@ -78,15 +78,21 @@ object CrossrefDataset {
       override def finish(reduction: CrossrefDT): CrossrefDT = reduction
     }
 
-    val sourcePath:String = parser.get("sourcePath")
-    val targetPath:String = parser.get("targetPath")
+    val workingPath:String = parser.get("workingPath")
 
-    val ds:Dataset[CrossrefDT] = spark.read.load(sourcePath).as[CrossrefDT]
 
-    ds.groupByKey(_.doi)
+    val main_ds:Dataset[CrossrefDT] = spark.read.load(s"$workingPath/crossref_ds").as[CrossrefDT]
+
+    val update =
+      spark.createDataset(spark.sparkContext.sequenceFile(s"$workingPath/index_update", classOf[IntWritable], classOf[Text])
+        .map(i =>CrossrefImporter.decompressBlob(i._2.toString))
+        .map(i =>to_item(i)))
+
+    main_ds.union(update).groupByKey(_.doi)
       .agg(crossrefAggregator.toColumn)
       .map(s=>s._2)
-      .write.mode(SaveMode.Overwrite).save(targetPath)
+      .write.mode(SaveMode.Overwrite).save(s"$workingPath/crossref_ds_updated")
 
   }
 
@@ -34,85 +34,21 @@ object SparkMapDumpIntoOAF {
     implicit val mapEncoderRelatons: Encoder[Relation] = Encoders.kryo[Relation]
     implicit val mapEncoderDatasets: Encoder[oaf.Dataset] = Encoders.kryo[OafDataset]
 
-    val sc = spark.sparkContext
     val targetPath = parser.get("targetPath")
     import spark.implicits._
 
 
     spark.read.load(parser.get("sourcePath")).as[CrossrefDT]
       .flatMap(k => Crossref2Oaf.convert(k.json))
       .filter(o => o != null)
       .write.mode(SaveMode.Overwrite).save(s"$targetPath/mixObject")
 
 
     val ds:Dataset[Oaf] = spark.read.load(s"$targetPath/mixObject").as[Oaf]
 
-    ds.filter(o => o.isInstanceOf[Publication]).map(o => o.asInstanceOf[Publication]).write.save(s"$targetPath/publication")
+    ds.filter(o => o.isInstanceOf[Publication]).map(o => o.asInstanceOf[Publication]).write.mode(SaveMode.Overwrite).save(s"$targetPath/crossrefPublication")
 
-    ds.filter(o => o.isInstanceOf[Relation]).map(o => o.asInstanceOf[Relation]).write.save(s"$targetPath/relation")
+    ds.filter(o => o.isInstanceOf[Relation]).map(o => o.asInstanceOf[Relation]).write.mode(SaveMode.Overwrite).save(s"$targetPath/crossrefRelation")
 
-    ds.filter(o => o.isInstanceOf[OafDataset]).map(o => o.asInstanceOf[OafDataset]).write.save(s"$targetPath/dataset")
+    ds.filter(o => o.isInstanceOf[OafDataset]).map(o => o.asInstanceOf[OafDataset]).write.mode(SaveMode.Overwrite).save(s"$targetPath/crossrefDataset")
 
 
-
-//
-//
-//
-//    sc.sequenceFile(parser.get("sourcePath"), classOf[IntWritable], classOf[Text])
-//      .map(k => k._2.toString).map(CrossrefImporter.decompressBlob)
-//      .flatMap(k => Crossref2Oaf.convert(k)).saveAsObjectFile(s"${targetPath}/mixObject")
-//
-//    val inputRDD = sc.objectFile[Oaf](s"${targetPath}/mixObject").filter(p=> p!= null)
-//
-//    val distinctPubs:RDD[Publication] = inputRDD.filter(k => k != null && k.isInstanceOf[Publication])
-//      .map(k => k.asInstanceOf[Publication]).map { p: Publication => Tuple2(p.getId, p) }.reduceByKey { case (p1: Publication, p2: Publication) =>
-//        var r = if (p1 == null) p2 else p1
-//        if (p1 != null && p2 != null) {
-//          if (p1.getLastupdatetimestamp != null && p2.getLastupdatetimestamp != null) {
-//            if (p1.getLastupdatetimestamp < p2.getLastupdatetimestamp)
-//              r = p2
-//            else
-//              r = p1
-//          } else {
-//            r = if (p1.getLastupdatetimestamp == null) p2 else p1
-//          }
-//        }
-//        r
-//      }.map(_._2)
-//
-//    val pubs:Dataset[Publication] = spark.createDataset(distinctPubs)
-//    pubs.write.mode(SaveMode.Overwrite).save(s"${targetPath}/publication")
-//
-//
-//    val distincDatasets:RDD[OafDataset] = inputRDD.filter(k => k != null && k.isInstanceOf[OafDataset])
-//      .map(k => k.asInstanceOf[OafDataset]).map(p => Tuple2(p.getId, p)).reduceByKey { case (p1: OafDataset, p2: OafDataset) =>
-//        var r = if (p1 == null) p2 else p1
-//        if (p1 != null && p2 != null) {
-//          if (p1.getLastupdatetimestamp != null && p2.getLastupdatetimestamp != null) {
-//            if (p1.getLastupdatetimestamp < p2.getLastupdatetimestamp)
-//              r = p2
-//            else
-//              r = p1
-//          } else {
-//            r = if (p1.getLastupdatetimestamp == null) p2 else p1
-//          }
-//        }
-//        r
-//      }.map(_._2)
-//
-//    spark.createDataset(distincDatasets).write.mode(SaveMode.Overwrite).save(s"${targetPath}/dataset")
-//
-//
-//
-//    val distinctRels =inputRDD.filter(k => k != null && k.isInstanceOf[Relation])
-//      .map(k => k.asInstanceOf[Relation]).map(r=> (s"${r.getSource}::${r.getTarget}",r))
-//      .reduceByKey { case (p1: Relation, p2: Relation) =>
-//        if (p1 == null) p2 else p1
-//      }.map(_._2)
-//
-//    val rels: Dataset[Relation] = spark.createDataset(distinctRels)
-//
-//    rels.write.mode(SaveMode.Overwrite).save(s"${targetPath}/relations")
   }
 
 
@ -21,15 +21,17 @@ object SparkImportMagIntoDataset {
|
||||||
|
|
||||||
|
|
||||||
val stream = Map(
|
val stream = Map(
|
||||||
"Affiliations" -> Tuple2("mag/Affiliations.txt", Seq("AffiliationId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "GridId:string", "OfficialPage:string", "WikiPage:string", "PaperCount:long", "CitationCount:long", "Latitude:float?", "Longitude:float?", "CreatedDate:DateTime")),
|
"Affiliations" -> Tuple2("mag/Affiliations.txt", Seq("AffiliationId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "GridId:string", "OfficialPage:string", "WikiPage:string", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "Iso3166Code:string", "Latitude:float?", "Longitude:float?", "CreatedDate:DateTime")),
|
||||||
"Authors" -> Tuple2("mag/Authors.txt", Seq("AuthorId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "LastKnownAffiliationId:long?", "PaperCount:long", "CitationCount:long", "CreatedDate:DateTime")),
|
"AuthorExtendedAttributes" -> Tuple2("mag/AuthorExtendedAttributes.txt", Seq("AuthorId:long", "AttributeType:int", "AttributeValue:string")),
|
||||||
"ConferenceInstances" -> Tuple2("mag/ConferenceInstances.txt", Seq("ConferenceInstanceId:long", "NormalizedName:string", "DisplayName:string", "ConferenceSeriesId:long", "Location:string", "OfficialUrl:string", "StartDate:DateTime?", "EndDate:DateTime?", "AbstractRegistrationDate:DateTime?", "SubmissionDeadlineDate:DateTime?", "NotificationDueDate:DateTime?", "FinalVersionDueDate:DateTime?", "PaperCount:long", "CitationCount:long", "Latitude:float?", "Longitude:float?", "CreatedDate:DateTime")),
|
"Authors" -> Tuple2("mag/Authors.txt", Seq("AuthorId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "LastKnownAffiliationId:long?", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")),
|
||||||
|
"ConferenceInstances" -> Tuple2("mag/ConferenceInstances.txt", Seq("ConferenceInstanceId:long", "NormalizedName:string", "DisplayName:string", "ConferenceSeriesId:long", "Location:string", "OfficialUrl:string", "StartDate:DateTime?", "EndDate:DateTime?", "AbstractRegistrationDate:DateTime?", "SubmissionDeadlineDate:DateTime?", "NotificationDueDate:DateTime?", "FinalVersionDueDate:DateTime?", "PaperCount:long", "PaperFamilyCount:long" ,"CitationCount:long", "Latitude:float?", "Longitude:float?", "CreatedDate:DateTime")),
|
||||||
"ConferenceSeries" -> Tuple2("mag/ConferenceSeries.txt", Seq("ConferenceSeriesId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "PaperCount:long", "CitationCount:long", "CreatedDate:DateTime")),
|
"ConferenceSeries" -> Tuple2("mag/ConferenceSeries.txt", Seq("ConferenceSeriesId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "PaperCount:long", "CitationCount:long", "CreatedDate:DateTime")),
|
||||||
"EntityRelatedEntities" -> Tuple2("advanced/EntityRelatedEntities.txt", Seq("EntityId:long", "EntityType:string", "RelatedEntityId:long", "RelatedEntityType:string", "RelatedType:int", "Score:float")),
|
"EntityRelatedEntities" -> Tuple2("advanced/EntityRelatedEntities.txt", Seq("EntityId:long", "EntityType:string", "RelatedEntityId:long", "RelatedEntityType:string", "RelatedType:int", "Score:float")),
|
||||||
"FieldOfStudyChildren" -> Tuple2("advanced/FieldOfStudyChildren.txt", Seq("FieldOfStudyId:long", "ChildFieldOfStudyId:long")),
|
"FieldOfStudyChildren" -> Tuple2("advanced/FieldOfStudyChildren.txt", Seq("FieldOfStudyId:long", "ChildFieldOfStudyId:long")),
|
||||||
"FieldOfStudyExtendedAttributes" -> Tuple2("advanced/FieldOfStudyExtendedAttributes.txt", Seq("FieldOfStudyId:long", "AttributeType:int", "AttributeValue:string")),
|
"FieldOfStudyExtendedAttributes" -> Tuple2("advanced/FieldOfStudyExtendedAttributes.txt", Seq("FieldOfStudyId:long", "AttributeType:int", "AttributeValue:string")),
|
||||||
"FieldsOfStudy" -> Tuple2("advanced/FieldsOfStudy.txt", Seq("FieldOfStudyId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "MainType:string", "Level:int", "PaperCount:long", "CitationCount:long", "CreatedDate:DateTime")),
|
// ['FieldOfStudyId:long', 'Rank:uint', 'NormalizedName:string', 'DisplayName:string', 'MainType:string', 'Level:int', 'PaperCount:long', 'PaperFamilyCount:long', 'CitationCount:long', 'CreatedDate:DateTime']
|
||||||
"Journals" -> Tuple2("mag/Journals.txt", Seq("JournalId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "Issn:string", "Publisher:string", "Webpage:string", "PaperCount:long", "CitationCount:long", "CreatedDate:DateTime")),
|
"FieldsOfStudy" -> Tuple2("advanced/FieldsOfStudy.txt", Seq("FieldOfStudyId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "MainType:string", "Level:int", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")),
|
||||||
|
"Journals" -> Tuple2("mag/Journals.txt", Seq("JournalId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "Issn:string", "Publisher:string", "Webpage:string", "PaperCount:long", "PaperFamilyCount:long" ,"CitationCount:long", "CreatedDate:DateTime")),
"PaperAbstractsInvertedIndex" -> Tuple2("nlp/PaperAbstractsInvertedIndex.txt.*", Seq("PaperId:long", "IndexedAbstract:string")),
"PaperAuthorAffiliations" -> Tuple2("mag/PaperAuthorAffiliations.txt", Seq("PaperId:long", "AuthorId:long", "AffiliationId:long?", "AuthorSequenceNumber:uint", "OriginalAuthor:string", "OriginalAffiliation:string")),
"PaperCitationContexts" -> Tuple2("nlp/PaperCitationContexts.txt", Seq("PaperId:long", "PaperReferenceId:long", "CitationContext:string")),

@ -39,7 +41,7 @@ object SparkImportMagIntoDataset {
"PaperReferences" -> Tuple2("mag/PaperReferences.txt", Seq("PaperId:long", "PaperReferenceId:long")),
"PaperResources" -> Tuple2("mag/PaperResources.txt", Seq("PaperId:long", "ResourceType:int", "ResourceUrl:string", "SourceUrl:string", "RelationshipType:int")),
"PaperUrls" -> Tuple2("mag/PaperUrls.txt", Seq("PaperId:long", "SourceType:int?", "SourceUrl:string", "LanguageCode:string")),
- "Papers" -> Tuple2("mag/Papers.txt", Seq("PaperId:long", "Rank:uint", "Doi:string", "DocType:string", "PaperTitle:string", "OriginalTitle:string", "BookTitle:string", "Year:int?", "Date:DateTime?", "Publisher:string", "JournalId:long?", "ConferenceSeriesId:long?", "ConferenceInstanceId:long?", "Volume:string", "Issue:string", "FirstPage:string", "LastPage:string", "ReferenceCount:long", "CitationCount:long", "EstimatedCitation:long", "OriginalVenue:string", "FamilyId:long?", "CreatedDate:DateTime")),
+ "Papers" -> Tuple2("mag/Papers.txt", Seq("PaperId:long", "Rank:uint", "Doi:string", "DocType:string", "PaperTitle:string", "OriginalTitle:string", "BookTitle:string", "Year:int?", "Date:DateTime?", "OnlineDate:DateTime?", "Publisher:string", "JournalId:long?", "ConferenceSeriesId:long?", "ConferenceInstanceId:long?", "Volume:string", "Issue:string", "FirstPage:string", "LastPage:string", "ReferenceCount:long", "CitationCount:long", "EstimatedCitation:long", "OriginalVenue:string", "FamilyId:long?", "FamilyRank:uint?", "CreatedDate:DateTime")),
"RelatedFieldOfStudy" -> Tuple2("advanced/RelatedFieldOfStudy.txt", Seq("FieldOfStudyId1:long", "Type1:string", "FieldOfStudyId2:long", "Type2:string", "Rank:float"))
)
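Note on the schema strings above: each entry follows a "Name:type" convention, with a trailing "?" marking a nullable column. As a rough illustration only (the helper below is not part of this commit, and the exact type mapping is an assumption), such tokens could be turned into Spark fields like this:

import org.apache.spark.sql.types._

// Illustrative sketch: map one "Name:type" token to a StructField (assumed type mapping).
def toStructField(token: String): StructField = {
  val Array(name, rawType) = token.split(":")
  val dataType = rawType.stripSuffix("?") match {
    case "long"         => LongType
    case "int" | "uint" => IntegerType
    case "float"        => FloatType
    case "DateTime"     => DateType
    case _              => StringType
  }
  StructField(name, dataType, nullable = rawType.endsWith("?"))
}

// e.g. StructType(Seq("PaperId:long", "Doi:string", "Year:int?").map(toStructField))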
@ -26,12 +26,15 @@ object SparkPreProcessMAG {
.master(parser.get("master")).getOrCreate()

val sourcePath = parser.get("sourcePath")
+ val workingPath = parser.get("workingPath")
+ val targetPath = parser.get("targetPath")

import spark.implicits._
implicit val mapEncoderPubs: Encoder[Publication] = org.apache.spark.sql.Encoders.kryo[Publication]
implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPubs)

logger.info("Phase 1) make uninque DOI in Papers:")
- val d: Dataset[MagPapers] = spark.read.load(s"${parser.get("sourcePath")}/Papers").as[MagPapers]
+ val d: Dataset[MagPapers] = spark.read.load(s"$sourcePath/Papers").as[MagPapers]

// Filtering Papers with DOI, and since for the same DOI we have multiple version of item with different PapersId we get the last one
val result: RDD[MagPapers] = d.where(col("Doi").isNotNull)
@ -41,11 +44,12 @@ object SparkPreProcessMAG {
.map(_._2)

val distinctPaper: Dataset[MagPapers] = spark.createDataset(result)
- distinctPaper.write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/Papers_distinct")
+ distinctPaper.write.mode(SaveMode.Overwrite).save(s"$workingPath/Papers_distinct")

logger.info("Phase 0) Enrich Publication with description")
- val pa = spark.read.load(s"${parser.get("sourcePath")}/PaperAbstractsInvertedIndex").as[MagPaperAbstract]
+ val pa = spark.read.load(s"$sourcePath/PaperAbstractsInvertedIndex").as[MagPaperAbstract]
- pa.map(ConversionUtil.transformPaperAbstract).write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/PaperAbstract")
+ pa.map(ConversionUtil.transformPaperAbstract).write.mode(SaveMode.Overwrite).save(s"$workingPath/PaperAbstract")

logger.info("Phase 3) Group Author by PaperId")
val authors = spark.read.load(s"$sourcePath/Authors").as[MagAuthor]
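The comment in the hunk above ("for the same DOI we have multiple version of item with different PapersId we get the last one") describes the Phase 1 deduplication. A minimal sketch of that idea, using a reduced stand-in for MagPapers (the real job works on the full MagPapers case class and its own selection rule):

import org.apache.spark.rdd.RDD

case class MagPaperStub(PaperId: Long, Doi: String)   // assumed minimal shape, for illustration

def dedupByDoi(papers: RDD[MagPaperStub]): RDD[MagPaperStub] =
  papers
    .filter(p => p.Doi != null)
    .map(p => (p.Doi.toLowerCase, p))
    .reduceByKey((a, b) => if (a.PaperId > b.PaperId) a else b)   // keep the "last" PaperId per DOI
    .values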
@ -64,24 +68,24 @@ object SparkPreProcessMAG {
} else
mpa
}).groupBy("PaperId").agg(collect_list(struct($"author", $"affiliation")).as("authors"))
- .write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/merge_step_1_paper_authors")
+ .write.mode(SaveMode.Overwrite).save(s"$workingPath/merge_step_1_paper_authors")

logger.info("Phase 4) create First Version of publication Entity with Paper Journal and Authors")

val journals = spark.read.load(s"$sourcePath/Journals").as[MagJournal]

- val papers = spark.read.load((s"${parser.get("targetPath")}/Papers_distinct")).as[MagPapers]
+ val papers = spark.read.load((s"$workingPath/Papers_distinct")).as[MagPapers]

- val paperWithAuthors = spark.read.load(s"${parser.get("targetPath")}/merge_step_1_paper_authors").as[MagPaperWithAuthorList]
+ val paperWithAuthors = spark.read.load(s"$workingPath/merge_step_1_paper_authors").as[MagPaperWithAuthorList]

val firstJoin = papers.joinWith(journals, papers("JournalId").equalTo(journals("JournalId")), "left")
firstJoin.joinWith(paperWithAuthors, firstJoin("_1.PaperId").equalTo(paperWithAuthors("PaperId")), "left")
.map { a => ConversionUtil.createOAFFromJournalAuthorPaper(a) }
- .write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/merge_step_2")
+ .write.mode(SaveMode.Overwrite).save(s"$workingPath/merge_step_2")

var magPubs: Dataset[(String, Publication)] =
- spark.read.load(s"${parser.get("targetPath")}/merge_step_2").as[Publication]
+ spark.read.load(s"$workingPath/merge_step_2").as[Publication]
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)]
@ -95,10 +99,10 @@ object SparkPreProcessMAG {
.map(item => ConversionUtil.updatePubsWithConferenceInfo(item))
.write
.mode(SaveMode.Overwrite)
- .save(s"${parser.get("targetPath")}/merge_step_2_conference")
+ .save(s"$workingPath/merge_step_2_conference")

- magPubs= spark.read.load(s"${parser.get("targetPath")}/merge_step_2_conference").as[Publication]
+ magPubs= spark.read.load(s"$workingPath/merge_step_2_conference").as[Publication]
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)]

val paperUrlDataset = spark.read.load(s"$sourcePath/PaperUrls").as[MagPaperUrl].groupBy("PaperId").agg(collect_list(struct("sourceUrl")).as("instances")).as[MagUrl]
@ -108,27 +112,27 @@ object SparkPreProcessMAG {
magPubs.joinWith(paperUrlDataset, col("_1").equalTo(paperUrlDataset("PaperId")), "left")
.map { a: ((String, Publication), MagUrl) => ConversionUtil.addInstances((a._1._2, a._2)) }
.write.mode(SaveMode.Overwrite)
- .save(s"${parser.get("targetPath")}/merge_step_3")
+ .save(s"$workingPath/merge_step_3")

// logger.info("Phase 6) Enrich Publication with description")
// val pa = spark.read.load(s"${parser.get("sourcePath")}/PaperAbstractsInvertedIndex").as[MagPaperAbstract]
// pa.map(ConversionUtil.transformPaperAbstract).write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/PaperAbstract")

- val paperAbstract = spark.read.load((s"${parser.get("targetPath")}/PaperAbstract")).as[MagPaperAbstract]
+ val paperAbstract = spark.read.load((s"$workingPath/PaperAbstract")).as[MagPaperAbstract]

- magPubs = spark.read.load(s"${parser.get("targetPath")}/merge_step_3").as[Publication]
+ magPubs = spark.read.load(s"$workingPath/merge_step_3").as[Publication]
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)]

magPubs.joinWith(paperAbstract, col("_1").equalTo(paperAbstract("PaperId")), "left")
.map(item => ConversionUtil.updatePubsWithDescription(item)
- ).write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/merge_step_4")
+ ).write.mode(SaveMode.Overwrite).save(s"$workingPath/merge_step_4")

logger.info("Phase 7) Enrich Publication with FieldOfStudy")

- magPubs = spark.read.load(s"${parser.get("targetPath")}/merge_step_4").as[Publication]
+ magPubs = spark.read.load(s"$workingPath/merge_step_4").as[Publication]
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)]

val fos = spark.read.load(s"$sourcePath/FieldsOfStudy").select($"FieldOfStudyId".alias("fos"), $"DisplayName", $"MainType")
@ -144,14 +148,14 @@ object SparkPreProcessMAG {
.equalTo(paperField("PaperId")), "left")
.map(item => ConversionUtil.updatePubsWithSubject(item))
.write.mode(SaveMode.Overwrite)
- .save(s"${parser.get("targetPath")}/mag_publication")
+ .save(s"$workingPath/mag_publication")

- val s:RDD[Publication] = spark.read.load(s"${parser.get("targetPath")}/mag_publication").as[Publication]
+ val s:RDD[Publication] = spark.read.load(s"$workingPath/mag_publication").as[Publication]
.map(p=>Tuple2(p.getId, p)).rdd.reduceByKey((a:Publication, b:Publication) => ConversionUtil.mergePublication(a,b))
.map(_._2)

- spark.createDataset(s).as[Publication].write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/mag_publication_u")
+ spark.createDataset(s).as[Publication].write.mode(SaveMode.Overwrite).save(s"$targetPath/magPublication")

}
}
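A quick way to sanity-check the refactored output path (a sketch only; it assumes a Spark shell with the dhp Publication class on the classpath and a $targetPath/magPublication directory produced by the job above):

import eu.dnetlib.dhp.schema.oaf.Publication
import org.apache.spark.sql.{Encoder, Encoders}

implicit val pubEncoder: Encoder[Publication] = Encoders.kryo[Publication]
val pubs = spark.read.load(s"$targetPath/magPublication").as[Publication]
println(s"MAG publications written: ${pubs.count()}")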
@ -17,11 +17,12 @@ import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
+ import org.apache.hadoop.io.compress.GzipCodec;
import org.mortbay.log.Log;

- import eu.dnetlib.doiboost.orcid.json.JsonWriter;
import eu.dnetlib.doiboost.orcid.model.WorkData;
import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser;
+ import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter;

public class ActivitiesDecompressor {
@ -143,4 +144,64 @@ public class ActivitiesDecompressor {
|
||||||
Log.info("Error from Orcid found: " + errorFromOrcidFound);
|
Log.info("Error from Orcid found: " + errorFromOrcidFound);
|
||||||
Log.info("Error parsing xml work found: " + xmlParserErrorFound);
|
Log.info("Error parsing xml work found: " + xmlParserErrorFound);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static void extractXML(Configuration conf, String inputUri, Path outputPath)
|
||||||
|
throws Exception {
|
||||||
|
String uri = inputUri;
|
||||||
|
FileSystem fs = FileSystem.get(URI.create(uri), conf);
|
||||||
|
Path inputPath = new Path(uri);
|
||||||
|
CompressionCodecFactory factory = new CompressionCodecFactory(conf);
|
||||||
|
CompressionCodec codec = factory.getCodec(inputPath);
|
||||||
|
if (codec == null) {
|
||||||
|
System.err.println("No codec found for " + uri);
|
||||||
|
System.exit(1);
|
||||||
|
}
|
||||||
|
CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());
|
||||||
|
InputStream gzipInputStream = null;
|
||||||
|
try {
|
||||||
|
gzipInputStream = codec.createInputStream(fs.open(inputPath));
|
||||||
|
int counter = 0;
|
||||||
|
try (TarArchiveInputStream tais = new TarArchiveInputStream(gzipInputStream)) {
|
||||||
|
TarArchiveEntry entry = null;
|
||||||
|
try (SequenceFile.Writer writer = SequenceFile
|
||||||
|
.createWriter(
|
||||||
|
conf,
|
||||||
|
SequenceFile.Writer.file(outputPath),
|
||||||
|
SequenceFile.Writer.keyClass(Text.class),
|
||||||
|
SequenceFile.Writer.valueClass(Text.class),
|
||||||
|
SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new GzipCodec()))) {
|
||||||
|
while ((entry = tais.getNextTarEntry()) != null) {
|
||||||
|
String filename = entry.getName();
|
||||||
|
if (entry.isDirectory() || !filename.contains("works")) {
|
||||||
|
} else {
|
||||||
|
counter++;
|
||||||
|
BufferedReader br = new BufferedReader(new InputStreamReader(tais));
|
||||||
|
String line;
|
||||||
|
StringBuffer buffer = new StringBuffer();
|
||||||
|
while ((line = br.readLine()) != null) {
|
||||||
|
buffer.append(line);
|
||||||
|
}
|
||||||
|
String xml = buffer.toString();
|
||||||
|
String[] filenameParts = filename.split("/");
|
||||||
|
final Text key = new Text(
|
||||||
|
XMLRecordParser
|
||||||
|
.retrieveOrcidIdFromActivity(
|
||||||
|
xml.getBytes(), filenameParts[filenameParts.length - 1]));
|
||||||
|
final Text value = new Text(xml);
|
||||||
|
writer.append(key, value);
|
||||||
|
if ((counter % 100000) == 0) {
|
||||||
|
Log.info("Current xml works extracted: " + counter);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Log.info("Activities extraction completed");
|
||||||
|
Log.info("Total XML works parsed: " + counter);
|
||||||
|
} finally {
|
||||||
|
Log.debug("Closing gzip stream");
|
||||||
|
IOUtils.closeStream(gzipInputStream);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
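The extractXML helper added above can also be driven directly, outside the Oozie wiring; a minimal sketch (all paths and the namenode URI are placeholders, not values from this commit):

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import eu.dnetlib.doiboost.orcid.ActivitiesDecompressor

val conf = new Configuration()
conf.set("fs.defaultFS", "hdfs://namenode:8020")   // assumed cluster URI
ActivitiesDecompressor.extractXML(
  conf,
  "hdfs://namenode:8020/data/orcid/ORCID_activities.tar.gz",        // placeholder input dump
  new Path("hdfs://namenode:8020/data/orcid/xml_works/works.seq"))  // placeholder output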
@ -0,0 +1,54 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.doiboost.orcid;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
|
import org.apache.hadoop.fs.Path;
|
||||||
|
import org.mortbay.log.Log;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
import eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork;
|
||||||
|
|
||||||
|
public class ExtractXMLActivitiesData extends OrcidDSManager {
|
||||||
|
private String outputWorksPath;
|
||||||
|
private String activitiesFileNameTarGz;
|
||||||
|
|
||||||
|
public static void main(String[] args) throws IOException, Exception {
|
||||||
|
ExtractXMLActivitiesData extractXMLActivitiesData = new ExtractXMLActivitiesData();
|
||||||
|
extractXMLActivitiesData.loadArgs(args);
|
||||||
|
extractXMLActivitiesData.extractWorks();
|
||||||
|
}
|
||||||
|
|
||||||
|
private void loadArgs(String[] args) throws IOException, Exception {
|
||||||
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||||
|
IOUtils
|
||||||
|
.toString(
|
||||||
|
GenOrcidAuthorWork.class
|
||||||
|
.getResourceAsStream(
|
||||||
|
"/eu/dnetlib/dhp/doiboost/gen_orcid_works-no-doi_from_activities.json")));
|
||||||
|
parser.parseArgument(args);
|
||||||
|
|
||||||
|
hdfsServerUri = parser.get("hdfsServerUri");
|
||||||
|
Log.info("HDFS URI: " + hdfsServerUri);
|
||||||
|
workingPath = parser.get("workingPath");
|
||||||
|
Log.info("Working Path: " + workingPath);
|
||||||
|
activitiesFileNameTarGz = parser.get("activitiesFileNameTarGz");
|
||||||
|
Log.info("Activities File Name: " + activitiesFileNameTarGz);
|
||||||
|
outputWorksPath = parser.get("outputWorksPath");
|
||||||
|
Log.info("Output Author Work Data: " + outputWorksPath);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void extractWorks() throws Exception {
|
||||||
|
Configuration conf = initConfigurationObject();
|
||||||
|
FileSystem fs = initFileSystemObject(conf);
|
||||||
|
String tarGzUri = hdfsServerUri.concat(workingPath).concat(activitiesFileNameTarGz);
|
||||||
|
Path outputPath = new Path(
|
||||||
|
hdfsServerUri
|
||||||
|
.concat(workingPath)
|
||||||
|
.concat(outputWorksPath));
|
||||||
|
ActivitiesDecompressor.extractXML(conf, tarGzUri, outputPath);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,56 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.doiboost.orcid;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
|
import org.apache.hadoop.fs.Path;
|
||||||
|
import org.mortbay.log.Log;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
import eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork;
|
||||||
|
|
||||||
|
public class ExtractXMLSummariesData extends OrcidDSManager {
|
||||||
|
|
||||||
|
private String outputAuthorsPath;
|
||||||
|
private String summariesFileNameTarGz;
|
||||||
|
|
||||||
|
public static void main(String[] args) throws IOException, Exception {
|
||||||
|
ExtractXMLSummariesData extractXMLSummariesData = new ExtractXMLSummariesData();
|
||||||
|
extractXMLSummariesData.loadArgs(args);
|
||||||
|
extractXMLSummariesData.extractAuthors();
|
||||||
|
}
|
||||||
|
|
||||||
|
private void loadArgs(String[] args) throws IOException, Exception {
|
||||||
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||||
|
IOUtils
|
||||||
|
.toString(
|
||||||
|
GenOrcidAuthorWork.class
|
||||||
|
.getResourceAsStream(
|
||||||
|
"/eu/dnetlib/dhp/doiboost/gen_orcid_authors_from_summaries.json")));
|
||||||
|
parser.parseArgument(args);
|
||||||
|
|
||||||
|
hdfsServerUri = parser.get("hdfsServerUri");
|
||||||
|
Log.info("HDFS URI: " + hdfsServerUri);
|
||||||
|
workingPath = parser.get("workingPath");
|
||||||
|
Log.info("Working Path: " + workingPath);
|
||||||
|
summariesFileNameTarGz = parser.get("summariesFileNameTarGz");
|
||||||
|
Log.info("Summaries File Name: " + summariesFileNameTarGz);
|
||||||
|
outputAuthorsPath = parser.get("outputAuthorsPath");
|
||||||
|
Log.info("Output Authors Data: " + outputAuthorsPath);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void extractAuthors() throws Exception {
|
||||||
|
Configuration conf = initConfigurationObject();
|
||||||
|
FileSystem fs = initFileSystemObject(conf);
|
||||||
|
String tarGzUri = hdfsServerUri.concat(workingPath).concat(summariesFileNameTarGz);
|
||||||
|
Path outputPath = new Path(
|
||||||
|
hdfsServerUri
|
||||||
|
.concat(workingPath)
|
||||||
|
.concat(outputAuthorsPath)
|
||||||
|
.concat("xml_authors.seq"));
|
||||||
|
SummariesDecompressor.extractXML(conf, tarGzUri, outputPath);
|
||||||
|
}
|
||||||
|
}
|
|
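Both extractor classes above end up with Text/Text SequenceFile output keyed by ORCID iD, with the record XML as value. A small read-back sketch, with an assumed path:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{SequenceFile, Text}

val reader = new SequenceFile.Reader(
  new Configuration(),
  SequenceFile.Reader.file(new Path("hdfs:///data/orcid/xml_authors.seq")))  // placeholder path
val key = new Text(); val value = new Text()
var shown = 0
while (reader.next(key, value) && shown < 3) {
  println(s"$key -> ${value.toString.take(80)}...")
  shown += 1
}
reader.close()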
@ -1,6 +1,7 @@
package eu.dnetlib.doiboost.orcid

- import eu.dnetlib.dhp.schema.oaf.{Author, Publication}
+ import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Publication}
+ import eu.dnetlib.dhp.schema.orcid.OrcidDOI
import eu.dnetlib.doiboost.DoiBoostMappingUtil
import eu.dnetlib.doiboost.DoiBoostMappingUtil.{ORCID, PID_TYPES, createSP, generateDataInfo, generateIdentifier}
import org.apache.commons.lang.StringUtils
@ -43,16 +44,19 @@ object ORCIDToOAF {
}


- def convertTOOAF(input:ORCIDElement) :Publication = {
+ def convertTOOAF(input:OrcidDOI) :Publication = {
- val doi = input.doi
+ val doi = input.getDoi
val pub:Publication = new Publication
- pub.setPid(List(createSP(doi, "doi", PID_TYPES)).asJava)
+ pub.setPid(List(createSP(doi.toLowerCase, "doi", PID_TYPES)).asJava)
pub.setDataInfo(generateDataInfo())
pub.setId(generateIdentifier(pub, doi.toLowerCase))
try{
- pub.setAuthor(input.authors.map(a=> {
- generateAuthor(a.name, a.surname, a.creditName, a.oid)
- }).asJava)
+ val l:List[Author]= input.getAuthors.asScala.map(a=> {
+ generateAuthor(a.getName, a.getSurname, a.getCreditName, a.getOid)
+ })(collection.breakOut)
+
+ pub.setAuthor(l.asJava)
pub.setCollectedfrom(List(DoiBoostMappingUtil.createORIDCollectedFrom()).asJava)
pub.setDataInfo(DoiBoostMappingUtil.generateDataInfo())
pub
@ -63,6 +67,13 @@ object ORCIDToOAF {
}
}

+ def generateOricPIDDatainfo():DataInfo = {
+ val di =DoiBoostMappingUtil.generateDataInfo("0.91")
+ di.getProvenanceaction.setClassid("sysimport:crosswalk:entityregistry")
+ di.getProvenanceaction.setClassname("Harvested")
+ di
+ }
+
def generateAuthor(given: String, family: String, fullName:String, orcid: String): Author = {
val a = new Author
a.setName(given)
@ -72,7 +83,7 @@ object ORCIDToOAF {
else
a.setFullname(s"$given $family")
if (StringUtils.isNotBlank(orcid))
- a.setPid(List(createSP(orcid, ORCID, PID_TYPES)).asJava)
+ a.setPid(List(createSP(orcid, ORCID, PID_TYPES, generateOricPIDDatainfo())).asJava)

a
}
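To see the effect of the change above, a small usage sketch of the author mapping; the ORCID iD is the well-known documentation example and the getters are assumed to follow the OAF model:

val author = ORCIDToOAF.generateAuthor("Josiah", "Carberry", "Josiah Carberry", "0000-0002-1825-0097")
// The ORCID pid now carries the dataInfo built by generateOricPIDDatainfo()
// (trust "0.91", provenance "sysimport:crosswalk:entityregistry").
println(author.getPid.get(0).getValue)                                    // 0000-0002-1825-0097
println(author.getPid.get(0).getDataInfo.getProvenanceaction.getClassid)  // sysimport:crosswalk:entityregistry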
@ -25,8 +25,8 @@ public class OrcidAuthorsDOIsDataGen extends OrcidDSManager {
public void generateAuthorsDOIsData() throws Exception {
Configuration conf = initConfigurationObject();
FileSystem fs = initFileSystemObject(conf);
- String tarGzUri = hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(activitiesFileNameTarGz);
+ String tarGzUri = hdfsServerUri.concat(workingPath).concat(activitiesFileNameTarGz);
- Path outputPath = new Path(hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(outputAuthorsDOIsPath));
+ Path outputPath = new Path(hdfsServerUri.concat(workingPath).concat(outputAuthorsDOIsPath));
ActivitiesDecompressor.parseGzActivities(conf, tarGzUri, outputPath);
}
@ -41,8 +41,8 @@ public class OrcidAuthorsDOIsDataGen extends OrcidDSManager {

hdfsServerUri = parser.get("hdfsServerUri");
Log.info("HDFS URI: " + hdfsServerUri);
- hdfsOrcidDefaultPath = parser.get("hdfsOrcidDefaultPath");
+ workingPath = parser.get("workingPath");
- Log.info("Default Path: " + hdfsOrcidDefaultPath);
+ Log.info("Default Path: " + workingPath);
activitiesFileNameTarGz = parser.get("activitiesFileNameTarGz");
Log.info("Activities File Name: " + activitiesFileNameTarGz);
outputAuthorsDOIsPath = parser.get("outputAuthorsDOIsPath");
@ -15,7 +15,7 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser;
public class OrcidDSManager {

protected String hdfsServerUri;
- protected String hdfsOrcidDefaultPath;
+ protected String workingPath;
private String summariesFileNameTarGz;
private String outputAuthorsPath;
@ -28,10 +28,10 @@ public class OrcidDSManager {
public void generateAuthors() throws Exception {
Configuration conf = initConfigurationObject();
FileSystem fs = initFileSystemObject(conf);
- String tarGzUri = hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(summariesFileNameTarGz);
+ String tarGzUri = hdfsServerUri.concat(workingPath).concat(summariesFileNameTarGz);
Path outputPath = new Path(
hdfsServerUri
- .concat(hdfsOrcidDefaultPath)
+ .concat(workingPath)
.concat(outputAuthorsPath)
.concat("authors.seq"));
SummariesDecompressor.parseGzSummaries(conf, tarGzUri, outputPath);
@ -41,22 +41,18 @@ public class OrcidDSManager {
// ====== Init HDFS File System Object
Configuration conf = new Configuration();
// Set FileSystem URI
- conf.set("fs.defaultFS", hdfsServerUri.concat(hdfsOrcidDefaultPath));
+ conf.set("fs.defaultFS", hdfsServerUri.concat(workingPath));
// Because of Maven
conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
return conf;
}

- protected FileSystem initFileSystemObject(Configuration conf) {
+ protected FileSystem initFileSystemObject(Configuration conf) throws IOException {
// Get the filesystem - HDFS
+ // if there is an exception, it will be propagate
FileSystem fs = null;
- try {
- fs = FileSystem.get(URI.create(hdfsServerUri.concat(hdfsOrcidDefaultPath)), conf);
- } catch (IOException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
+ fs = FileSystem.get(URI.create(hdfsServerUri.concat(workingPath)), conf);
return fs;
}
@ -66,13 +62,13 @@ public class OrcidDSManager {
.toString(
OrcidDSManager.class
.getResourceAsStream(
- "/eu/dnetlib/dhp/doiboost/create_orcid_authors_data.json")));
+ "/eu/dnetlib/dhp/doiboost/gen_orcid_authors_from_summaries.json")));
parser.parseArgument(args);

hdfsServerUri = parser.get("hdfsServerUri");
Log.info("HDFS URI: " + hdfsServerUri);
- hdfsOrcidDefaultPath = parser.get("hdfsOrcidDefaultPath");
+ workingPath = parser.get("workingPath");
- Log.info("Default Path: " + hdfsOrcidDefaultPath);
+ Log.info("Working Path: " + workingPath);
summariesFileNameTarGz = parser.get("summariesFileNameTarGz");
Log.info("Summaries File Name: " + summariesFileNameTarGz);
outputAuthorsPath = parser.get("outputAuthorsPath");
@ -1,14 +1,15 @@

package eu.dnetlib.doiboost.orcid;

- import java.io.BufferedReader;
- import java.io.IOException;
- import java.io.InputStreamReader;
+ import java.io.*;
import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.Date;
import java.util.List;

+ import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
+ import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
+ import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
@ -16,6 +17,7 @@ import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
+ import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
@ -27,10 +29,10 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser;
public class OrcidDownloader extends OrcidDSManager {

static final int REQ_LIMIT = 24;
- // static final int REQ_MAX_TEST = 100;
+ static final int REQ_MAX_TEST = -1;
- static final int RECORD_PARSED_COUNTER_LOG_INTERVAL = 10000;
+ static final int RECORD_PARSED_COUNTER_LOG_INTERVAL = 500;
static final String DATE_FORMAT = "yyyy-MM-dd HH:mm:ss";
- static final String lastUpdate = "2019-09-30 00:00:00";
+ static final String lastUpdate = "2020-09-29 00:00:00";
private String lambdaFileName;
private String outputPath;
private String token;
@ -41,7 +43,7 @@ public class OrcidDownloader extends OrcidDSManager {
orcidDownloader.parseLambdaFile();
}

- private String downloadRecord(String orcidId) {
+ private String downloadRecord(String orcidId) throws IOException {
try (CloseableHttpClient client = HttpClients.createDefault()) {
HttpGet httpGet = new HttpGet("https://api.orcid.org/v3.0/" + orcidId + "/record");
httpGet.addHeader("Accept", "application/vnd.orcid+xml");
@ -49,17 +51,23 @@ public class OrcidDownloader extends OrcidDSManager {
CloseableHttpResponse response = client.execute(httpGet);
if (response.getStatusLine().getStatusCode() != 200) {
Log
- .warn(
+ .info(
"Downloading " + orcidId + " status code: " + response.getStatusLine().getStatusCode());
return new String("");
}
- return IOUtils.toString(response.getEntity().getContent());
- } catch (Throwable e) {
- Log.warn("Downloading " + orcidId, e.getMessage());
+ // return IOUtils.toString(response.getEntity().getContent());
+ return xmlStreamToString(response.getEntity().getContent());
}
- return new String("");
+ }
+
+ private String xmlStreamToString(InputStream xmlStream) throws IOException {
+ BufferedReader br = new BufferedReader(new InputStreamReader(xmlStream));
+ String line;
+ StringBuffer buffer = new StringBuffer();
+ while ((line = br.readLine()) != null) {
+ buffer.append(line);
+ }
+ return buffer.toString();
}

public void parseLambdaFile() throws Exception {
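The xmlStreamToString helper introduced above replaces IOUtils.toString and concatenates the response line by line (line terminators inside the XML are dropped). For comparison, an equivalent standalone sketch:

import java.io.{ByteArrayInputStream, InputStream}
import scala.io.Source

// Same behaviour as the new helper: read every line and join them without separators.
def streamToString(in: InputStream): String =
  Source.fromInputStream(in, "UTF-8").getLines().mkString

// streamToString(new ByteArrayInputStream("<record/>".getBytes("UTF-8"))) == "<record/>"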
@ -69,97 +77,94 @@ public class OrcidDownloader extends OrcidDSManager {
|
||||||
long startDownload = 0;
|
long startDownload = 0;
|
||||||
Configuration conf = initConfigurationObject();
|
Configuration conf = initConfigurationObject();
|
||||||
FileSystem fs = initFileSystemObject(conf);
|
FileSystem fs = initFileSystemObject(conf);
|
||||||
String lambdaFileUri = hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(lambdaFileName);
|
String lambdaFileUri = hdfsServerUri.concat(workingPath).concat(lambdaFileName);
|
||||||
Path hdfsreadpath = new Path(lambdaFileUri);
|
Path hdfsreadpath = new Path(lambdaFileUri);
|
||||||
FSDataInputStream lambdaFileStream = fs.open(hdfsreadpath);
|
FSDataInputStream lambdaFileStream = fs.open(hdfsreadpath);
|
||||||
Path hdfsoutputPath = new Path(
|
Path hdfsoutputPath = new Path(
|
||||||
hdfsServerUri
|
hdfsServerUri
|
||||||
.concat(hdfsOrcidDefaultPath)
|
.concat(workingPath)
|
||||||
.concat(outputPath)
|
.concat(outputPath)
|
||||||
.concat("orcid_records.seq"));
|
.concat("updated_xml_authors.seq"));
|
||||||
|
try (TarArchiveInputStream tais = new TarArchiveInputStream(
|
||||||
try (SequenceFile.Writer writer = SequenceFile
|
new GzipCompressorInputStream(lambdaFileStream))) {
|
||||||
.createWriter(
|
TarArchiveEntry entry = null;
|
||||||
conf,
|
StringBuilder sb = new StringBuilder();
|
||||||
SequenceFile.Writer.file(hdfsoutputPath),
|
try (SequenceFile.Writer writer = SequenceFile
|
||||||
SequenceFile.Writer.keyClass(Text.class),
|
.createWriter(
|
||||||
SequenceFile.Writer.valueClass(Text.class))) {
|
conf,
|
||||||
|
SequenceFile.Writer.file(hdfsoutputPath),
|
||||||
try (BufferedReader br = new BufferedReader(new InputStreamReader(lambdaFileStream))) {
|
SequenceFile.Writer.keyClass(Text.class),
|
||||||
String line;
|
SequenceFile.Writer.valueClass(Text.class),
|
||||||
int nReqTmp = 0;
|
SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new GzipCodec()))) {
|
||||||
startDownload = System.currentTimeMillis();
|
startDownload = System.currentTimeMillis();
|
||||||
long startReqTmp = System.currentTimeMillis();
|
while ((entry = tais.getNextTarEntry()) != null) {
|
||||||
while ((line = br.readLine()) != null) {
|
BufferedReader br = new BufferedReader(new InputStreamReader(tais)); // Read directly from tarInput
|
||||||
parsedRecordsCounter++;
|
String line;
|
||||||
// skip headers line
|
while ((line = br.readLine()) != null) {
|
||||||
if (parsedRecordsCounter == 1) {
|
String[] values = line.split(",");
|
||||||
continue;
|
List<String> recordInfo = Arrays.asList(values);
|
||||||
}
|
int nReqTmp = 0;
|
||||||
String[] values = line.split(",");
|
long startReqTmp = System.currentTimeMillis();
|
||||||
List<String> recordInfo = Arrays.asList(values);
|
// skip headers line
|
||||||
String orcidId = recordInfo.get(0);
|
if (parsedRecordsCounter == 0) {
|
||||||
if (isModified(orcidId, recordInfo.get(3))) {
|
parsedRecordsCounter++;
|
||||||
String record = downloadRecord(orcidId);
|
continue;
|
||||||
downloadedRecordsCounter++;
|
}
|
||||||
if (!record.isEmpty()) {
|
parsedRecordsCounter++;
|
||||||
String compressRecord = ArgumentApplicationParser.compressArgument(record);
|
String orcidId = recordInfo.get(0);
|
||||||
final Text key = new Text(recordInfo.get(0));
|
if (isModified(orcidId, recordInfo.get(3))) {
|
||||||
final Text value = new Text(compressRecord);
|
String record = downloadRecord(orcidId);
|
||||||
|
downloadedRecordsCounter++;
|
||||||
try {
|
if (!record.isEmpty()) {
|
||||||
|
// String compressRecord = ArgumentApplicationParser.compressArgument(record);
|
||||||
|
final Text key = new Text(recordInfo.get(0));
|
||||||
|
final Text value = new Text(record);
|
||||||
writer.append(key, value);
|
writer.append(key, value);
|
||||||
savedRecordsCounter++;
|
savedRecordsCounter++;
|
||||||
} catch (IOException e) {
|
}
|
||||||
Log.warn("Writing to sequence file: " + e.getMessage());
|
} else {
|
||||||
Log.warn(e);
|
break;
|
||||||
throw new RuntimeException(e);
|
}
|
||||||
|
long endReq = System.currentTimeMillis();
|
||||||
|
nReqTmp++;
|
||||||
|
if (nReqTmp == REQ_LIMIT) {
|
||||||
|
long reqSessionDuration = endReq - startReqTmp;
|
||||||
|
if (reqSessionDuration <= 1000) {
|
||||||
|
Log
|
||||||
|
.info(
|
||||||
|
"\nreqSessionDuration: "
|
||||||
|
+ reqSessionDuration
|
||||||
|
+ " nReqTmp: "
|
||||||
|
+ nReqTmp
|
||||||
|
+ " wait ....");
|
||||||
|
Thread.sleep(1000 - reqSessionDuration);
|
||||||
|
} else {
|
||||||
|
nReqTmp = 0;
|
||||||
|
startReqTmp = System.currentTimeMillis();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if ((parsedRecordsCounter % RECORD_PARSED_COUNTER_LOG_INTERVAL) == 0) {
|
||||||
|
Log
|
||||||
|
.info(
|
||||||
|
"Current parsed: "
|
||||||
|
+ parsedRecordsCounter
|
||||||
|
+ " downloaded: "
|
||||||
|
+ downloadedRecordsCounter
|
||||||
|
+ " saved: "
|
||||||
|
+ savedRecordsCounter);
|
||||||
|
if (REQ_MAX_TEST != -1 && parsedRecordsCounter > REQ_MAX_TEST) {
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
long endReq = System.currentTimeMillis();
|
long endDownload = System.currentTimeMillis();
|
||||||
nReqTmp++;
|
long downloadTime = endDownload - startDownload;
|
||||||
if (nReqTmp == REQ_LIMIT) {
|
Log.info("Download time: " + ((downloadTime / 1000) / 60) + " minutes");
|
||||||
long reqSessionDuration = endReq - startReqTmp;
|
|
||||||
if (reqSessionDuration <= 1000) {
|
|
||||||
Log
|
|
||||||
.warn(
|
|
||||||
"\nreqSessionDuration: "
|
|
||||||
+ reqSessionDuration
|
|
||||||
+ " nReqTmp: "
|
|
||||||
+ nReqTmp
|
|
||||||
+ " wait ....");
|
|
||||||
Thread.sleep(1000 - reqSessionDuration);
|
|
||||||
} else {
|
|
||||||
nReqTmp = 0;
|
|
||||||
startReqTmp = System.currentTimeMillis();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// if (parsedRecordsCounter > REQ_MAX_TEST) {
|
|
||||||
// break;
|
|
||||||
// }
|
|
||||||
if ((parsedRecordsCounter % RECORD_PARSED_COUNTER_LOG_INTERVAL) == 0) {
|
|
||||||
Log
|
|
||||||
.info(
|
|
||||||
"Current parsed: "
|
|
||||||
+ parsedRecordsCounter
|
|
||||||
+ " downloaded: "
|
|
||||||
+ downloadedRecordsCounter
|
|
||||||
+ " saved: "
|
|
||||||
+ savedRecordsCounter);
|
|
||||||
// if (parsedRecordsCounter > REQ_MAX_TEST) {
|
|
||||||
// break;
|
|
||||||
// }
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
long endDownload = System.currentTimeMillis();
|
|
||||||
long downloadTime = endDownload - startDownload;
|
|
||||||
Log.info("Download time: " + ((downloadTime / 1000) / 60) + " minutes");
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
lambdaFileStream.close();
|
|
||||||
Log.info("Download started at: " + new Date(startDownload).toString());
|
Log.info("Download started at: " + new Date(startDownload).toString());
|
||||||
|
Log.info("Download ended at: " + new Date(System.currentTimeMillis()).toString());
|
||||||
Log.info("Parsed Records Counter: " + parsedRecordsCounter);
|
Log.info("Parsed Records Counter: " + parsedRecordsCounter);
|
||||||
Log.info("Downloaded Records Counter: " + downloadedRecordsCounter);
|
Log.info("Downloaded Records Counter: " + downloadedRecordsCounter);
|
||||||
Log.info("Saved Records Counter: " + savedRecordsCounter);
|
Log.info("Saved Records Counter: " + savedRecordsCounter);
|
||||||
|
@ -176,8 +181,8 @@ public class OrcidDownloader extends OrcidDSManager {

hdfsServerUri = parser.get("hdfsServerUri");
Log.info("HDFS URI: " + hdfsServerUri);
- hdfsOrcidDefaultPath = parser.get("hdfsOrcidDefaultPath");
+ workingPath = parser.get("workingPath");
- Log.info("Default Path: " + hdfsOrcidDefaultPath);
+ Log.info("Default Path: " + workingPath);
lambdaFileName = parser.get("lambdaFileName");
Log.info("Lambda File Name: " + lambdaFileName);
outputPath = parser.get("outputPath");
@ -185,7 +190,7 @@ public class OrcidDownloader extends OrcidDSManager {
token = parser.get("token");
}

- private boolean isModified(String orcidId, String modifiedDate) {
+ public boolean isModified(String orcidId, String modifiedDate) {
Date modifiedDateDt = null;
Date lastUpdateDt = null;
try {

@ -195,7 +200,7 @@ public class OrcidDownloader extends OrcidDSManager {
modifiedDateDt = new SimpleDateFormat(DATE_FORMAT).parse(modifiedDate);
lastUpdateDt = new SimpleDateFormat(DATE_FORMAT).parse(lastUpdate);
} catch (Exception e) {
- Log.warn("[" + orcidId + "] Parsing date: ", e.getMessage());
+ Log.info("[" + orcidId + "] Parsing date: ", e.getMessage());
return true;
}
return modifiedDateDt.after(lastUpdateDt);
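isModified is now public, so the cut-off logic can be exercised directly (dates below are examples; records modified after the new lastUpdate of 2020-09-29 are re-downloaded):

val downloader = new OrcidDownloader()   // assumes the default constructor
println(downloader.isModified("0000-0002-1825-0097", "2020-10-05 12:00:00"))  // true  -> download again
println(downloader.isModified("0000-0002-1825-0097", "2020-01-01 08:30:00"))  // false -> skip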
@ -1,21 +1,72 @@
package eu.dnetlib.doiboost.orcid

+ import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper}
import eu.dnetlib.dhp.application.ArgumentApplicationParser
+ import eu.dnetlib.dhp.oa.merge.AuthorMerger
import eu.dnetlib.dhp.schema.oaf.Publication
+ import eu.dnetlib.dhp.schema.orcid.OrcidDOI
import eu.dnetlib.doiboost.mag.ConversionUtil
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
+ import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory}

object SparkConvertORCIDToOAF {
+ val logger: Logger = LoggerFactory.getLogger(SparkConvertORCIDToOAF.getClass)
+
+ def getPublicationAggregator(): Aggregator[(String, Publication), Publication, Publication] = new Aggregator[(String, Publication), Publication, Publication]{
+
+ override def zero: Publication = new Publication()
+
+ override def reduce(b: Publication, a: (String, Publication)): Publication = {
+ b.mergeFrom(a._2)
+ b.setAuthor(AuthorMerger.mergeAuthor(a._2.getAuthor, b.getAuthor))
+ if (b.getId == null)
+ b.setId(a._2.getId)
+ b
+ }
+
+ override def merge(wx: Publication, wy: Publication): Publication = {
+ wx.mergeFrom(wy)
+ wx.setAuthor(AuthorMerger.mergeAuthor(wy.getAuthor, wx.getAuthor))
+ if(wx.getId == null && wy.getId.nonEmpty)
+ wx.setId(wy.getId)
+ wx
+ }
+ override def finish(reduction: Publication): Publication = reduction
+
+ override def bufferEncoder: Encoder[Publication] =
+ Encoders.kryo(classOf[Publication])
+
+ override def outputEncoder: Encoder[Publication] =
+ Encoders.kryo(classOf[Publication])
+ }
+
+ def run(spark:SparkSession,sourcePath:String, targetPath:String):Unit = {
+ implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
+ implicit val mapOrcid: Encoder[OrcidDOI] = Encoders.kryo[OrcidDOI]
+ implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPubs)
+
+ val mapper = new ObjectMapper()
+ mapper.getDeserializationConfig.withFeatures(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES)
+
+ val dataset:Dataset[OrcidDOI] = spark.createDataset(spark.sparkContext.textFile(sourcePath).map(s => mapper.readValue(s,classOf[OrcidDOI])))
+
+ logger.info("Converting ORCID to OAF")
+ dataset.map(o => ORCIDToOAF.convertTOOAF(o)).filter(p=>p!=null)
+ .map(d => (d.getId, d))
+ .groupByKey(_._1)(Encoders.STRING)
+ .agg(getPublicationAggregator().toColumn)
+ .map(p => p._2)
+ .write.mode(SaveMode.Overwrite).save(targetPath)
+ }

def main(args: Array[String]): Unit = {

- val logger: Logger = LoggerFactory.getLogger(SparkConvertORCIDToOAF.getClass)
val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkConvertORCIDToOAF.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/convert_map_to_oaf_params.json")))
parser.parseArgument(args)
@ -26,19 +77,12 @@ object SparkConvertORCIDToOAF {
.appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()

- implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
- implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPubs)
- import spark.implicits._
val sourcePath = parser.get("sourcePath")
val targetPath = parser.get("targetPath")
- val dataset:Dataset[ORCIDElement] = spark.read.json(sourcePath).as[ORCIDElement]
+ run(spark, sourcePath, targetPath)

- logger.info("Converting ORCID to OAF")
- val d:RDD[Publication] = dataset.map(o => ORCIDToOAF.convertTOOAF(o)).filter(p=>p!=null).map(p=>(p.getId,p)).rdd.reduceByKey(ConversionUtil.mergePublication)
- .map(_._2)
- spark.createDataset(d).as[Publication].write.mode(SaveMode.Overwrite).save(targetPath)
}

}
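The conversion is now wrapped in run(spark, sourcePath, targetPath), which makes a local smoke test straightforward; a sketch with placeholder paths:

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .appName("SparkConvertORCIDToOAF-local")
  .master("local[*]")
  .getOrCreate()

// sourcePath: json-lines of OrcidDOI records; targetPath: output dataset of merged Publications.
SparkConvertORCIDToOAF.run(spark, "/tmp/orcid_dois", "/tmp/orcid_publications")
spark.stop()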
@ -0,0 +1,188 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.doiboost.orcid;
|
||||||
|
|
||||||
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.text.SimpleDateFormat;
|
||||||
|
import java.util.Date;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Optional;
|
||||||
|
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.hadoop.io.Text;
|
||||||
|
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
|
||||||
|
import org.apache.http.client.methods.CloseableHttpResponse;
|
||||||
|
import org.apache.http.client.methods.HttpGet;
|
||||||
|
import org.apache.http.impl.client.CloseableHttpClient;
|
||||||
|
import org.apache.http.impl.client.HttpClients;
|
||||||
|
import org.apache.spark.SparkConf;
|
||||||
|
import org.apache.spark.api.java.JavaPairRDD;
|
||||||
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
|
import org.apache.spark.api.java.function.Function;
|
||||||
|
import org.apache.spark.util.LongAccumulator;
|
||||||
|
import org.mortbay.log.Log;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
import eu.dnetlib.doiboost.orcid.model.DownloadedRecordData;
|
||||||
|
import scala.Tuple2;
|
||||||
|
|
||||||
|
public class SparkDownloadOrcidAuthors {
|
||||||
|
|
||||||
|
static Logger logger = LoggerFactory.getLogger(SparkDownloadOrcidAuthors.class);
|
||||||
|
static final String DATE_FORMAT = "yyyy-MM-dd HH:mm:ss";
|
||||||
|
static final String lastUpdate = "2020-09-29 00:00:00";
|
||||||
|
|
||||||
|
public static void main(String[] args) throws IOException, Exception {
|
||||||
|
|
||||||
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||||
|
IOUtils
|
||||||
|
.toString(
|
||||||
|
SparkDownloadOrcidAuthors.class
|
||||||
|
.getResourceAsStream(
|
||||||
|
"/eu/dnetlib/dhp/doiboost/download_orcid_data.json")));
|
||||||
|
parser.parseArgument(args);
|
||||||
|
Boolean isSparkSessionManaged = Optional
|
||||||
|
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||||
|
.map(Boolean::valueOf)
|
||||||
|
.orElse(Boolean.TRUE);
|
||||||
|
logger.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||||
|
final String workingPath = parser.get("workingPath");
|
||||||
|
logger.info("workingPath: ", workingPath);
|
||||||
|
final String outputPath = parser.get("outputPath");
|
||||||
|
logger.info("outputPath: ", outputPath);
|
||||||
|
final String token = parser.get("token");
|
||||||
|
final String lambdaFileName = parser.get("lambdaFileName");
|
||||||
|
logger.info("lambdaFileName: ", lambdaFileName);
|
||||||
|
|
||||||
|
SparkConf conf = new SparkConf();
|
||||||
|
runWithSparkSession(
|
||||||
|
conf,
|
||||||
|
isSparkSessionManaged,
|
||||||
|
spark -> {
|
||||||
|
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||||
|
|
||||||
|
LongAccumulator parsedRecordsAcc = spark.sparkContext().longAccumulator("parsed_records");
|
||||||
|
LongAccumulator modifiedRecordsAcc = spark.sparkContext().longAccumulator("to_download_records");
|
||||||
|
LongAccumulator downloadedRecordsAcc = spark.sparkContext().longAccumulator("downloaded_records");
|
||||||
|
LongAccumulator errorHTTP403Acc = spark.sparkContext().longAccumulator("error_HTTP_403");
|
||||||
|
LongAccumulator errorHTTP409Acc = spark.sparkContext().longAccumulator("error_HTTP_409");
|
||||||
|
LongAccumulator errorHTTP503Acc = spark.sparkContext().longAccumulator("error_HTTP_503");
|
||||||
|
				LongAccumulator errorHTTP525Acc = spark.sparkContext().longAccumulator("error_HTTP_525");
				LongAccumulator errorHTTPGenericAcc = spark.sparkContext().longAccumulator("error_HTTP_Generic");

				logger.info("Retrieving data from lamda sequence file");
				JavaPairRDD<Text, Text> lamdaFileRDD = sc
					.sequenceFile(workingPath + lambdaFileName, Text.class, Text.class);
				logger.info("Data retrieved: " + lamdaFileRDD.count());

				Function<Tuple2<Text, Text>, Boolean> isModifiedAfterFilter = data -> {
					String orcidId = data._1().toString();
					String lastModifiedDate = data._2().toString();
					parsedRecordsAcc.add(1);
					if (isModified(orcidId, lastModifiedDate)) {
						modifiedRecordsAcc.add(1);
						return true;
					}
					return false;
				};

				Function<Tuple2<Text, Text>, Tuple2<String, String>> downloadRecordFunction = data -> {
					String orcidId = data._1().toString();
					String lastModifiedDate = data._2().toString();
					final DownloadedRecordData downloaded = new DownloadedRecordData();
					downloaded.setOrcidId(orcidId);
					downloaded.setLastModifiedDate(lastModifiedDate);
					try (CloseableHttpClient client = HttpClients.createDefault()) {
						HttpGet httpGet = new HttpGet("https://api.orcid.org/v3.0/" + orcidId + "/record");
						httpGet.addHeader("Accept", "application/vnd.orcid+xml");
						httpGet.addHeader("Authorization", String.format("Bearer %s", token));
						long startReq = System.currentTimeMillis();
						CloseableHttpResponse response = client.execute(httpGet);
						long endReq = System.currentTimeMillis();
						long reqTime = endReq - startReq;
						if (reqTime < 1000) {
							// throttle to at most ~one request per second per task
							Thread.sleep(1000 - reqTime);
						}
						int statusCode = response.getStatusLine().getStatusCode();
						downloaded.setStatusCode(statusCode);
						if (statusCode != 200) {
							switch (statusCode) {
								case 403:
									errorHTTP403Acc.add(1);
									break;
								case 409:
									errorHTTP409Acc.add(1);
									break;
								case 503:
									errorHTTP503Acc.add(1);
									throw new RuntimeException("Orcid request rate limit reached (HTTP 503)");
								case 525:
									errorHTTP525Acc.add(1);
									break;
								default:
									errorHTTPGenericAcc.add(1);
									logger
										.info(
											"Downloading " + orcidId + " status code: "
												+ response.getStatusLine().getStatusCode());
							}
							return downloaded.toTuple2();
						}
						downloadedRecordsAcc.add(1);
						downloaded
							.setCompressedData(
								ArgumentApplicationParser
									.compressArgument(IOUtils.toString(response.getEntity().getContent())));
					} catch (Throwable e) {
						logger.info("Downloading " + orcidId + ": " + e.getMessage());
						downloaded.setErrorMessage(e.getMessage());
						return downloaded.toTuple2();
					}
					return downloaded.toTuple2();
				};

				sc.hadoopConfiguration().set("mapreduce.output.fileoutputformat.compress", "true");

				logger.info("Start execution ...");
				JavaPairRDD<Text, Text> authorsModifiedRDD = lamdaFileRDD.filter(isModifiedAfterFilter);
				logger.info("Authors modified count: " + authorsModifiedRDD.count());
				logger.info("Start downloading ...");
				authorsModifiedRDD
					.repartition(10)
					.map(downloadRecordFunction)
					.mapToPair(t -> new Tuple2<>(new Text(t._1()), new Text(t._2())))
					.saveAsNewAPIHadoopFile(
						workingPath.concat(outputPath),
						Text.class,
						Text.class,
						SequenceFileOutputFormat.class,
						sc.hadoopConfiguration());
				logger.info("parsedRecordsAcc: " + parsedRecordsAcc.value().toString());
				logger.info("modifiedRecordsAcc: " + modifiedRecordsAcc.value().toString());
				logger.info("downloadedRecordsAcc: " + downloadedRecordsAcc.value().toString());
				logger.info("errorHTTP403Acc: " + errorHTTP403Acc.value().toString());
				logger.info("errorHTTP409Acc: " + errorHTTP409Acc.value().toString());
				logger.info("errorHTTP503Acc: " + errorHTTP503Acc.value().toString());
				logger.info("errorHTTP525Acc: " + errorHTTP525Acc.value().toString());
				logger.info("errorHTTPGenericAcc: " + errorHTTPGenericAcc.value().toString());
			});

	}

	private static boolean isModified(String orcidId, String modifiedDate) {
		Date modifiedDateDt = null;
		Date lastUpdateDt = null;
		try {
			if (modifiedDate.length() != 19) {
				modifiedDate = modifiedDate.substring(0, 19);
			}
			modifiedDateDt = new SimpleDateFormat(DATE_FORMAT).parse(modifiedDate);
			lastUpdateDt = new SimpleDateFormat(DATE_FORMAT).parse(lastUpdate);
		} catch (Exception e) {
			logger.info("[" + orcidId + "] Parsing date: " + e.getMessage());
			return true;
		}
		return modifiedDateDt.after(lastUpdateDt);
	}
}

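The download function above times each call and sleeps for the remainder of one second, so every Spark task stays at or below roughly one ORCID API request per second, while the per-status accumulators make failures visible in the job logs. The snippet below is a minimal, self-contained sketch of that throttling pattern; the class name, method name and error handling are illustrative and not part of the job.

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

public class ThrottledOrcidClient {

	// "token" is assumed to be a valid ORCID read-public access token
	public static String fetchRecord(String orcidId, String token) throws Exception {
		try (CloseableHttpClient client = HttpClients.createDefault()) {
			HttpGet httpGet = new HttpGet("https://api.orcid.org/v3.0/" + orcidId + "/record");
			httpGet.addHeader("Accept", "application/vnd.orcid+xml");
			httpGet.addHeader("Authorization", "Bearer " + token);
			long start = System.currentTimeMillis();
			try (CloseableHttpResponse response = client.execute(httpGet)) {
				long elapsed = System.currentTimeMillis() - start;
				if (elapsed < 1000) {
					Thread.sleep(1000 - elapsed); // keep the request rate at ~1/s
				}
				if (response.getStatusLine().getStatusCode() != 200) {
					return null; // the caller decides how to account for the failure
				}
				return EntityUtils.toString(response.getEntity());
			}
		}
	}
}
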
@ -0,0 +1,99 @@

package eu.dnetlib.doiboost.orcid;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.Arrays;
import java.util.List;
import java.util.Optional;

import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.SparkConf;
import org.mortbay.log.Log;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;

public class SparkGenLastModifiedSeq {

	private static String hdfsServerUri;
	private static String workingPath;
	private static String outputPath;
	private static String lambdaFileName;

	public static void main(String[] args) throws IOException, Exception {
		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
			IOUtils
				.toString(
					SparkGenLastModifiedSeq.class
						.getResourceAsStream(
							"/eu/dnetlib/dhp/doiboost/download_orcid_data.json")));
		parser.parseArgument(args);
		Boolean isSparkSessionManaged = Optional
			.ofNullable(parser.get("isSparkSessionManaged"))
			.map(Boolean::valueOf)
			.orElse(Boolean.TRUE);
		hdfsServerUri = parser.get("hdfsServerUri");
		workingPath = parser.get("workingPath");
		outputPath = parser.get("outputPath");
		lambdaFileName = parser.get("lambdaFileName");
		String lambdaFileUri = hdfsServerUri.concat(workingPath).concat(lambdaFileName);

		SparkConf sparkConf = new SparkConf();
		runWithSparkSession(
			sparkConf,
			isSparkSessionManaged,
			spark -> {
				int rowsNum = 0;
				Path output = new Path(
					hdfsServerUri
						.concat(workingPath)
						.concat(outputPath));
				Path hdfsreadpath = new Path(lambdaFileUri);
				Configuration conf = new Configuration();
				conf.set("fs.defaultFS", hdfsServerUri.concat(workingPath));
				conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
				conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
				FileSystem fs = FileSystem.get(URI.create(hdfsServerUri.concat(workingPath)), conf);
				FSDataInputStream lambdaFileStream = fs.open(hdfsreadpath);
				try (TarArchiveInputStream tais = new TarArchiveInputStream(
					new GzipCompressorInputStream(lambdaFileStream))) {
					TarArchiveEntry entry = null;
					try (SequenceFile.Writer writer = SequenceFile
						.createWriter(
							conf,
							SequenceFile.Writer.file(output),
							SequenceFile.Writer.keyClass(Text.class),
							SequenceFile.Writer.valueClass(Text.class),
							SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new GzipCodec()))) {
						while ((entry = tais.getNextTarEntry()) != null) {
							BufferedReader br = new BufferedReader(new InputStreamReader(tais));
							String line;
							while ((line = br.readLine()) != null) {
								String[] values = line.split(",");
								List<String> recordInfo = Arrays.asList(values);
								String orcidId = recordInfo.get(0);
								final Text key = new Text(orcidId);
								final Text value = new Text(recordInfo.get(3));
								writer.append(key, value);
								rowsNum++;
							}
						}
					}
				}
				Log.info("Saved rows from lamda csv tar file: " + rowsNum);
			});
	}
}

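SparkGenLastModifiedSeq unpacks the last-modified CSV ("lambda" file) from its tar.gz archive and writes one block-compressed SequenceFile record per line, keyed by the ORCID iD with the last-modified date as value (fields 0 and 3 of each CSV line). The following is a minimal sketch of that per-line convention as a hypothetical helper; the example values in the comments are illustrative only.

import org.apache.hadoop.io.Text;

import scala.Tuple2;

public class LambdaLineParser {

	public static Tuple2<Text, Text> toKeyValue(String csvLine) {
		String[] values = csvLine.split(",");
		String orcidId = values[0]; // e.g. "0000-0002-1825-0097" (illustrative)
		String lastModifiedDate = values[3]; // e.g. "2020-06-15 11:34:23" (illustrative)
		return new Tuple2<>(new Text(orcidId), new Text(lastModifiedDate));
	}
}
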
@ -13,9 +13,6 @@ import java.util.stream.Stream;
 import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.compress.GzipCodec;
-import org.apache.hadoop.mapreduce.Job;
-import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.JavaRDD;

@ -33,7 +30,7 @@ import com.google.gson.JsonElement;
 import com.google.gson.JsonParser;

 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.doiboost.orcid.model.AuthorData;
+import eu.dnetlib.dhp.schema.orcid.AuthorData;
 import eu.dnetlib.doiboost.orcid.model.WorkData;
 import scala.Tuple2;

@ -1,165 +0,0 @@

package eu.dnetlib.doiboost.orcid;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;
import java.util.Optional;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.util.LongAccumulator;
import org.mortbay.log.Log;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.doiboost.orcid.model.DownloadedRecordData;
import scala.Tuple2;

public class SparkOrcidGenerateAuthors {

	static final String DATE_FORMAT = "yyyy-MM-dd HH:mm:ss";
	static final String lastUpdate = "2019-09-30 00:00:00";

	public static void main(String[] args) throws IOException, Exception {
		Logger logger = LoggerFactory.getLogger(SparkOrcidGenerateAuthors.class);
		logger.info("[ SparkOrcidGenerateAuthors STARTED]");

		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
			IOUtils
				.toString(
					SparkOrcidGenerateAuthors.class
						.getResourceAsStream(
							"/eu/dnetlib/dhp/doiboost/gen_orcid_authors_parameters.json")));
		parser.parseArgument(args);
		Boolean isSparkSessionManaged = Optional
			.ofNullable(parser.get("isSparkSessionManaged"))
			.map(Boolean::valueOf)
			.orElse(Boolean.TRUE);
		logger.info("isSparkSessionManaged: {}", isSparkSessionManaged);
		final String workingPath = parser.get("workingPath");
		logger.info("workingPath: ", workingPath);
		final String outputAuthorsPath = parser.get("outputAuthorsPath");
		logger.info("outputAuthorsPath: ", outputAuthorsPath);
		final String token = parser.get("token");

		SparkConf conf = new SparkConf();
		runWithSparkSession(
			conf,
			isSparkSessionManaged,
			spark -> {
				JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

				LongAccumulator parsedRecordsAcc = sc.sc().longAccumulator("parsedRecords");
				LongAccumulator modifiedRecordsAcc = sc.sc().longAccumulator("modifiedRecords");
				LongAccumulator downloadedRecordsAcc = sc.sc().longAccumulator("downloadedRecords");
				LongAccumulator alreadyDownloadedRecords = sc.sc().longAccumulator("alreadyDownloadedRecords");
				JavaRDD<String> lamdaFileRDD = sc.textFile(workingPath + "lamdafiles");

				JavaRDD<String> downloadedRDD = sc.textFile(workingPath + "downloaded");
				Function<String, String> getOrcidIdFunction = line -> {
					try {
						String[] values = line.split(",");
						return values[0].substring(1);
					} catch (Exception e) {
						return new String("");
					}
				};
				List<String> downloadedRecords = downloadedRDD.map(getOrcidIdFunction).collect();

				Function<String, Boolean> isModifiedAfterFilter = line -> {
					String[] values = line.split(",");
					String orcidId = values[0];
					parsedRecordsAcc.add(1);
					if (isModified(orcidId, values[3])) {
						modifiedRecordsAcc.add(1);
						return true;
					}
					return false;
				};
				Function<String, Boolean> isNotDownloadedFilter = line -> {
					String[] values = line.split(",");
					String orcidId = values[0];
					if (downloadedRecords.contains(orcidId)) {
						alreadyDownloadedRecords.add(1);
						return false;
					}
					return true;
				};
				Function<String, Tuple2<String, String>> downloadRecordFunction = line -> {
					String[] values = line.split(",");
					String orcidId = values[0];
					String modifiedDate = values[3];
					return downloadRecord(orcidId, modifiedDate, token, downloadedRecordsAcc);
				};

				lamdaFileRDD
					.filter(isModifiedAfterFilter)
					.filter(isNotDownloadedFilter)
					.map(downloadRecordFunction)
					.rdd()
					.saveAsTextFile(workingPath.concat(outputAuthorsPath));
			});

	}

	private static boolean isModified(String orcidId, String modifiedDate) {
		Date modifiedDateDt = null;
		Date lastUpdateDt = null;
		try {
			if (modifiedDate.length() != 19) {
				modifiedDate = modifiedDate.substring(0, 19);
			}
			modifiedDateDt = new SimpleDateFormat(DATE_FORMAT).parse(modifiedDate);
			lastUpdateDt = new SimpleDateFormat(DATE_FORMAT).parse(lastUpdate);
		} catch (Exception e) {
			Log.warn("[" + orcidId + "] Parsing date: ", e.getMessage());
			return true;
		}
		return modifiedDateDt.after(lastUpdateDt);
	}

	private static Tuple2<String, String> downloadRecord(String orcidId, String modifiedDate, String token,
		LongAccumulator downloadedRecordsAcc) {
		final DownloadedRecordData data = new DownloadedRecordData();
		data.setOrcidId(orcidId);
		data.setModifiedDate(modifiedDate);
		try (CloseableHttpClient client = HttpClients.createDefault()) {
			HttpGet httpGet = new HttpGet("https://api.orcid.org/v3.0/" + orcidId + "/record");
			httpGet.addHeader("Accept", "application/vnd.orcid+xml");
			httpGet.addHeader("Authorization", String.format("Bearer %s", token));
			CloseableHttpResponse response = client.execute(httpGet);
			int statusCode = response.getStatusLine().getStatusCode();
			data.setStatusCode(statusCode);
			if (statusCode != 200) {
				Log
					.warn(
						"Downloading " + orcidId + " status code: " + response.getStatusLine().getStatusCode());
				return data.toTuple2();
			}
			downloadedRecordsAcc.add(1);
			data
				.setCompressedData(
					ArgumentApplicationParser.compressArgument(IOUtils.toString(response.getEntity().getContent())));
		} catch (Throwable e) {
			Log.warn("Downloading " + orcidId, e.getMessage());
			data.setErrorMessage(e.getMessage());
			return data.toTuple2();
		}
		return data.toTuple2();
	}
}

@ -1,50 +0,0 @@

package eu.dnetlib.doiboost.orcid;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.io.IOException;
import java.util.Optional;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;

public class SparkPartitionLambdaFile {

	public static void main(String[] args) throws IOException, Exception {
		Logger logger = LoggerFactory.getLogger(SparkOrcidGenerateAuthors.class);

		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
			IOUtils
				.toString(
					SparkOrcidGenerateAuthors.class
						.getResourceAsStream(
							"/eu/dnetlib/dhp/doiboost/gen_orcid_authors_parameters.json")));
		parser.parseArgument(args);
		Boolean isSparkSessionManaged = Optional
			.ofNullable(parser.get("isSparkSessionManaged"))
			.map(Boolean::valueOf)
			.orElse(Boolean.TRUE);
		final String workingPath = parser.get("workingPath");

		SparkConf conf = new SparkConf();
		runWithSparkSession(
			conf,
			isSparkSessionManaged,
			spark -> {
				JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
				JavaRDD<String> lamdaFileRDD = sc.textFile(workingPath + "last_modified.csv");

				lamdaFileRDD
					.repartition(20)
					.saveAsTextFile(workingPath.concat("lamdafiles"));
			});
	}

}

@ -17,11 +17,12 @@ import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.compress.CompressionCodec;
 import org.apache.hadoop.io.compress.CompressionCodecFactory;
+import org.apache.hadoop.io.compress.GzipCodec;
 import org.mortbay.log.Log;

-import eu.dnetlib.doiboost.orcid.json.JsonWriter;
-import eu.dnetlib.doiboost.orcid.model.AuthorData;
+import eu.dnetlib.dhp.schema.orcid.AuthorData;
 import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser;
+import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter;

 public class SummariesDecompressor {

@ -56,6 +57,7 @@ public class SummariesDecompressor {
 		int nameFound = 0;
 		int surnameFound = 0;
 		int creditNameFound = 0;
+		int otherNamesFound = 0;
 		int errorFromOrcidFound = 0;
 		int xmlParserErrorFound = 0;
 		try (TarArchiveInputStream tais = new TarArchiveInputStream(gzipInputStream)) {

@ -117,6 +119,9 @@ public class SummariesDecompressor {
 					if (authorData.getCreditName() != null) {
 						creditNameFound += 1;
 					}
+					if (authorData.getOtherNames() != null && authorData.getOtherNames().size() > 1) {
+						otherNamesFound += authorData.getOtherNames().size();
+					}
 				} else {
 					Log.warn("Data not retrievable [" + entry.getName() + "] " + buffer.toString());

@ -152,7 +157,71 @@ public class SummariesDecompressor {
 		Log.info("Name found: " + nameFound);
 		Log.info("Surname found: " + surnameFound);
 		Log.info("Credit name found: " + creditNameFound);
+		Log.info("Other names found: " + otherNamesFound);
 		Log.info("Error from Orcid found: " + errorFromOrcidFound);
 		Log.info("Error parsing xml record found: " + xmlParserErrorFound);
 	}
+
+	public static void extractXML(Configuration conf, String inputUri, Path outputPath)
+		throws Exception {
+		String uri = inputUri;
+		FileSystem fs = FileSystem.get(URI.create(uri), conf);
+		Path inputPath = new Path(uri);
+		CompressionCodecFactory factory = new CompressionCodecFactory(conf);
+		CompressionCodec codec = factory.getCodec(inputPath);
+		if (codec == null) {
+			System.err.println("No codec found for " + uri);
+			System.exit(1);
+		}
+		CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());
+		InputStream gzipInputStream = null;
+		try {
+			gzipInputStream = codec.createInputStream(fs.open(inputPath));
+			int counter = 0;
+			try (TarArchiveInputStream tais = new TarArchiveInputStream(gzipInputStream)) {
+				TarArchiveEntry entry = null;
+				CompressionCodec Codec = new GzipCodec();
+				org.apache.hadoop.io.SequenceFile.Writer.Option optCom = SequenceFile.Writer
+					.compression(SequenceFile.CompressionType.RECORD, Codec);
+				try (SequenceFile.Writer writer = SequenceFile
+					.createWriter(
+						conf,
+						SequenceFile.Writer.file(outputPath),
+						SequenceFile.Writer.keyClass(Text.class),
+						SequenceFile.Writer.valueClass(Text.class), optCom)) {
+					while ((entry = tais.getNextTarEntry()) != null) {
+						String filename = entry.getName();
+						if (entry.isDirectory()) {
+							Log.debug("Directory entry name: " + entry.getName());
+						} else {
+							Log.debug("XML record entry name: " + entry.getName());
+							counter++;
+							BufferedReader br = new BufferedReader(new InputStreamReader(tais));
+							String line;
+							StringBuffer buffer = new StringBuffer();
+							while ((line = br.readLine()) != null) {
+								buffer.append(line);
+							}
+							String xml = buffer.toString();
+							final Text key = new Text(
+								XMLRecordParser
+									.retrieveOrcidIdFromSummary(
+										xml.getBytes(), filename.split("/")[2].substring(0, 19)));
+							final Text value = new Text(xml);
+							writer.append(key, value);
+						}
+						if ((counter % 100000) == 0) {
+							Log.info("Current xml records extracted: " + counter);
+						}
+					}
+				}
+			}
+			Log.info("Summaries extract completed");
+			Log.info("Total XML records parsed: " + counter);
+
+		} finally {
+			Log.debug("Closing gzip stream");
+			IOUtils.closeStream(gzipInputStream);
+		}
+	}
 }

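extractXML above walks the summaries tar.gz and stores each record in a record-compressed SequenceFile keyed by the ORCID iD recovered from the XML (falling back to an id derived from the file name). A small reader-side sketch, handy for spot-checking that output; the class and method names are illustrative and not part of the module.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class SummariesSeqReader {

	public static void dump(Configuration conf, Path seqFile) throws Exception {
		try (SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(seqFile))) {
			Text key = new Text();
			Text value = new Text();
			while (reader.next(key, value)) {
				// key: ORCID iD, value: one ORCID summary record in XML
				System.out.println(key + " -> " + value.toString().length() + " xml chars");
			}
		}
	}
}
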
@ -0,0 +1,13 @@

package eu.dnetlib.doiboost.orcid.json;

import com.google.gson.Gson;

import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;

public class JsonHelper {

	public static String createOidWork(WorkDataNoDoi workData) {
		return new Gson().toJson(workData);
	}
}

@ -1,28 +0,0 @@

package eu.dnetlib.doiboost.orcid.json;

import com.google.gson.JsonObject;

import eu.dnetlib.doiboost.orcid.model.AuthorData;
import eu.dnetlib.doiboost.orcid.model.WorkData;

public class JsonWriter {

	public static String create(AuthorData authorData) {
		JsonObject author = new JsonObject();
		author.addProperty("oid", authorData.getOid());
		author.addProperty("name", authorData.getName());
		author.addProperty("surname", authorData.getSurname());
		if (authorData.getCreditName() != null) {
			author.addProperty("creditname", authorData.getCreditName());
		}
		return author.toString();
	}

	public static String create(WorkData workData) {
		JsonObject work = new JsonObject();
		work.addProperty("oid", workData.getOid());
		work.addProperty("doi", workData.getDoi());
		return work.toString();
	}
}

@ -3,8 +3,6 @@ package eu.dnetlib.doiboost.orcid.model;

 import java.io.Serializable;

-import org.apache.hadoop.io.Text;
-
 import com.google.gson.JsonObject;

 import scala.Tuple2;

@ -12,7 +10,7 @@ import scala.Tuple2;
 public class DownloadedRecordData implements Serializable {

 	private String orcidId;
-	private String modifiedDate;
+	private String lastModifiedDate;
 	private String statusCode;
 	private String compressedData;
 	private String errorMessage;

@ -20,7 +18,7 @@ public class DownloadedRecordData implements Serializable {
 	public Tuple2<String, String> toTuple2() {
 		JsonObject data = new JsonObject();
 		data.addProperty("statusCode", getStatusCode());
-		data.addProperty("modifiedDate", getModifiedDate());
+		data.addProperty("lastModifiedDate", getLastModifiedDate());
 		if (getCompressedData() != null) {
 			data.addProperty("compressedData", getCompressedData());
 		}

@ -66,11 +64,11 @@ public class DownloadedRecordData implements Serializable {
 		this.compressedData = compressedData;
 	}

-	public String getModifiedDate() {
-		return modifiedDate;
+	public String getLastModifiedDate() {
+		return lastModifiedDate;
 	}

-	public void setModifiedDate(String modifiedDate) {
-		this.modifiedDate = modifiedDate;
+	public void setLastModifiedDate(String lastModifiedDate) {
+		this.lastModifiedDate = lastModifiedDate;
 	}
 }

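The rename from modifiedDate to lastModifiedDate also changes the JSON emitted by toTuple2. A hypothetical usage sketch; the iD and timestamp below are made-up values.

import eu.dnetlib.doiboost.orcid.model.DownloadedRecordData;
import scala.Tuple2;

public class DownloadedRecordDataExample {
	public static void main(String[] args) {
		DownloadedRecordData downloaded = new DownloadedRecordData();
		downloaded.setOrcidId("0000-0002-1825-0097"); // illustrative iD
		downloaded.setLastModifiedDate("2020-06-15 11:34:23"); // illustrative timestamp
		downloaded.setStatusCode(200);
		Tuple2<String, String> tuple = downloaded.toTuple2();
		// the JSON value now carries "lastModifiedDate" instead of "modifiedDate"
		System.out.println(tuple._2());
	}
}
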
@ -4,6 +4,8 @@ package eu.dnetlib.doiboost.orcid.xml;
 import java.util.Arrays;
 import java.util.List;

+import org.mortbay.log.Log;
+
 import com.ximpleware.AutoPilot;
 import com.ximpleware.EOFException;
 import com.ximpleware.EncodingException;

@ -14,7 +16,7 @@ import com.ximpleware.VTDNav;

 import eu.dnetlib.dhp.parser.utility.VtdException;
 import eu.dnetlib.dhp.parser.utility.VtdUtilityParser;
-import eu.dnetlib.doiboost.orcid.model.AuthorData;
+import eu.dnetlib.dhp.schema.orcid.AuthorData;
 import eu.dnetlib.doiboost.orcid.model.WorkData;

 public class XMLRecordParser {

@ -81,6 +83,12 @@ public class XMLRecordParser {
 		if (!creditNames.isEmpty()) {
 			authorData.setCreditName(creditNames.get(0));
 		}
+
+		final List<String> otherNames = VtdUtilityParser.getTextValue(ap, vn, "//other-name:content");
+		if (!otherNames.isEmpty()) {
+			authorData.setOtherNames(otherNames);
+		}
+
 		return authorData;
 	}

@ -120,4 +128,33 @@ public class XMLRecordParser {
 		}
 		return workData;
 	}
+
+	public static String retrieveOrcidIdFromSummary(byte[] bytes, String defaultValue)
+		throws VtdException, ParseException {
+		return retrieveOrcidId(bytes, defaultValue, NS_RECORD, NS_RECORD_URL, "//record:record", "path").substring(1);
+	}
+
+	public static String retrieveOrcidIdFromActivity(byte[] bytes, String defaultValue)
+		throws VtdException, ParseException {
+		return retrieveOrcidId(bytes, defaultValue, NS_WORK, NS_WORK_URL, "//work:work", "put-code");
+	}
+
+	private static String retrieveOrcidId(byte[] bytes, String defaultValue, String ns, String nsUrl, String xpath,
+		String idAttributeName)
+		throws VtdException, ParseException {
+		final VTDGen vg = new VTDGen();
+		vg.setDoc(bytes);
+		vg.parse(true);
+		final VTDNav vn = vg.getNav();
+		final AutoPilot ap = new AutoPilot(vn);
+		ap.declareXPathNameSpace(ns, nsUrl);
+		List<VtdUtilityParser.Node> recordNodes = VtdUtilityParser
+			.getTextValuesWithAttributes(
+				ap, vn, xpath, Arrays.asList(idAttributeName));
+		if (!recordNodes.isEmpty()) {
+			return (recordNodes.get(0).getAttributes().get(idAttributeName));
+		}
+		Log.info("id not found - default: " + defaultValue);
+		return defaultValue;
+	}
 }

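The new helpers expose the ORCID iD embedded in a summary or activity record via VTD-XML, with a caller-supplied fallback when the attribute is missing. A hedged usage sketch; the file path and the fallback value are illustrative.

import java.nio.file.Files;
import java.nio.file.Paths;

import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser;

public class OrcidIdExtractionExample {
	public static void main(String[] args) throws Exception {
		byte[] summaryXml = Files.readAllBytes(Paths.get("summary.xml")); // illustrative path
		String orcidId = XMLRecordParser.retrieveOrcidIdFromSummary(summaryXml, "0000-0002-1825-0097");
		System.out.println("summary belongs to: " + orcidId);
	}
}
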
@ -0,0 +1,154 @@

package eu.dnetlib.doiboost.orcidnodoi;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URI;

import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.mortbay.log.Log;

import eu.dnetlib.doiboost.orcid.json.JsonHelper;
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
import eu.dnetlib.doiboost.orcidnodoi.xml.XMLRecordParserNoDoi;

/**
 * This class writes one sequence file on hdfs: the key is an orcid identifier and the
 * value is an orcid publication in json format
 */

public class ActivitiesDumpReader {

	private static final int MAX_XML_WORKS_PARSED = -1;
	private static final int XML_WORKS_PARSED_COUNTER_LOG_INTERVAL = 100000;

	public static void parseGzActivities(Configuration conf, String inputUri, Path outputPath)
		throws Exception {
		String uri = inputUri;
		FileSystem fs = FileSystem.get(URI.create(uri), conf);
		Path inputPath = new Path(uri);
		CompressionCodecFactory factory = new CompressionCodecFactory(conf);
		CompressionCodec codec = factory.getCodec(inputPath);
		if (codec == null) {
			System.err.println("No codec found for " + uri);
			System.exit(1);
		}
		CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());
		InputStream gzipInputStream = null;
		try {
			gzipInputStream = codec.createInputStream(fs.open(inputPath));
			parseTarActivities(fs, conf, gzipInputStream, outputPath);

		} finally {
			Log.debug("Closing gzip stream");
			IOUtils.closeStream(gzipInputStream);
		}
	}

	private static void parseTarActivities(
		FileSystem fs, Configuration conf, InputStream gzipInputStream, Path outputPath) {
		int counter = 0;
		int noDoiFound = 0;
		int errorFromOrcidFound = 0;
		int xmlParserErrorFound = 0;
		try (TarArchiveInputStream tais = new TarArchiveInputStream(gzipInputStream)) {
			TarArchiveEntry entry = null;

			try (SequenceFile.Writer writer = SequenceFile
				.createWriter(
					conf,
					SequenceFile.Writer.file(outputPath),
					SequenceFile.Writer.keyClass(Text.class),
					SequenceFile.Writer.valueClass(Text.class))) {
				while ((entry = tais.getNextTarEntry()) != null) {
					String filename = entry.getName();
					StringBuffer buffer = new StringBuffer();
					try {
						if (entry.isDirectory() || !filename.contains("works")) {

						} else {
							Log.debug("XML work entry name: " + entry.getName());
							counter++;
							BufferedReader br = new BufferedReader(new InputStreamReader(tais)); // read directly from the tar input
							String line;
							buffer = new StringBuffer();
							while ((line = br.readLine()) != null) {
								buffer.append(line);
							}
							WorkDataNoDoi workDataNoDoi = XMLRecordParserNoDoi
								.VTDParseWorkData(buffer.toString().getBytes());
							if (workDataNoDoi != null) {
								if (workDataNoDoi.getErrorCode() != null) {
									errorFromOrcidFound += 1;
									Log
										.debug(
											"error from Orcid with code "
												+ workDataNoDoi.getErrorCode()
												+ " for entry "
												+ entry.getName());
									continue;
								}
								boolean isDoiFound = workDataNoDoi
									.getExtIds()
									.stream()
									.filter(e -> e.getType() != null)
									.anyMatch(e -> e.getType().equals("doi"));
								if (!isDoiFound) {
									String jsonData = JsonHelper.createOidWork(workDataNoDoi);
									Log.debug("oid: " + workDataNoDoi.getOid() + " data: " + jsonData);

									final Text key = new Text(workDataNoDoi.getOid());
									final Text value = new Text(jsonData);

									try {
										writer.append(key, value);
									} catch (IOException e) {
										Log.debug("Writing to sequence file: " + e.getMessage());
										Log.debug(e);
										throw new RuntimeException(e);
									}
									noDoiFound += 1;
								}

							} else {
								Log.warn("Data not retrievable [" + entry.getName() + "] " + buffer.toString());
								xmlParserErrorFound += 1;
							}
						}
					} catch (Exception e) {
						throw new Exception(filename, e);
					}

					if ((counter % XML_WORKS_PARSED_COUNTER_LOG_INTERVAL) == 0) {
						Log.info("Current xml works parsed: " + counter);
					}

					if ((MAX_XML_WORKS_PARSED > -1) && (counter > MAX_XML_WORKS_PARSED)) {
						break;
					}
				}
			}
		} catch (Exception e) {
			Log.warn("Parsing work from gzip archive: " + e.getMessage());
			Log.warn(e);
			throw new RuntimeException(e);
		}
		Log.info("Activities parse completed");
		Log.info("Total XML works parsed: " + counter);
		Log.info("Total no doi work found: " + noDoiFound);
		Log.info("Error from Orcid found: " + errorFromOrcidFound);
		Log.info("Error parsing xml work found: " + xmlParserErrorFound);
	}
}

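The core selection rule above keeps only works whose external ids contain no DOI. A minimal sketch of that predicate in isolation; the class and method names are illustrative.

import java.util.List;

import eu.dnetlib.doiboost.orcidnodoi.model.ExternalId;

public class NoDoiFilter {

	public static boolean hasNoDoi(List<ExternalId> extIds) {
		return extIds
			.stream()
			.filter(e -> e.getType() != null)
			.noneMatch(e -> "doi".equals(e.getType()));
	}
}
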
@ -0,0 +1,57 @@

package eu.dnetlib.doiboost.orcidnodoi;

import java.io.IOException;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.mortbay.log.Log;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.doiboost.orcid.OrcidDSManager;

/**
 * This job generates one sequence file: the key is an orcid identifier and the
 * value is an orcid publication in json format
 */

public class GenOrcidAuthorWork extends OrcidDSManager {

	private String activitiesFileNameTarGz;
	private String outputWorksPath;

	public static void main(String[] args) throws IOException, Exception {
		GenOrcidAuthorWork genOrcidAuthorWork = new GenOrcidAuthorWork();
		genOrcidAuthorWork.loadArgs(args);
		genOrcidAuthorWork.generateAuthorsDOIsData();
	}

	public void generateAuthorsDOIsData() throws Exception {
		Configuration conf = initConfigurationObject();
		FileSystem fs = initFileSystemObject(conf);
		String tarGzUri = hdfsServerUri.concat(workingPath).concat(activitiesFileNameTarGz);
		Path outputPath = new Path(hdfsServerUri.concat(workingPath).concat(outputWorksPath));
		ActivitiesDumpReader.parseGzActivities(conf, tarGzUri, outputPath);
	}

	private void loadArgs(String[] args) throws IOException, Exception {
		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
			IOUtils
				.toString(
					GenOrcidAuthorWork.class
						.getResourceAsStream(
							"/eu/dnetlib/dhp/doiboost/gen_orcid_works-no-doi_from_activities.json")));
		parser.parseArgument(args);

		hdfsServerUri = parser.get("hdfsServerUri");
		Log.info("HDFS URI: " + hdfsServerUri);
		workingPath = parser.get("workingPath");
		Log.info("Working Path: " + workingPath);
		activitiesFileNameTarGz = parser.get("activitiesFileNameTarGz");
		Log.info("Activities File Name: " + activitiesFileNameTarGz);
		outputWorksPath = parser.get("outputWorksPath");
		Log.info("Output Author Work Data: " + outputWorksPath);
	}
}

@ -0,0 +1,180 @@

package eu.dnetlib.doiboost.orcidnodoi;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.io.IOException;
import java.util.Objects;
import java.util.Optional;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.util.LongAccumulator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.gson.Gson;
import com.google.gson.JsonElement;
import com.google.gson.JsonParser;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.orcid.AuthorData;
import eu.dnetlib.doiboost.orcid.json.JsonHelper;
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
import eu.dnetlib.doiboost.orcidnodoi.oaf.PublicationToOaf;
import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher;
import scala.Tuple2;

/**
 * This spark job generates one parquet file containing the orcid publications dataset
 */

public class SparkGenEnrichedOrcidWorks {

	static Logger logger = LoggerFactory.getLogger(SparkGenEnrichedOrcidWorks.class);

	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

	public static void main(String[] args) throws IOException, Exception {

		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
			IOUtils
				.toString(
					SparkGenEnrichedOrcidWorks.class
						.getResourceAsStream(
							"/eu/dnetlib/dhp/doiboost/gen_enriched_orcid_works_parameters.json")));
		parser.parseArgument(args);
		Boolean isSparkSessionManaged = Optional
			.ofNullable(parser.get("isSparkSessionManaged"))
			.map(Boolean::valueOf)
			.orElse(Boolean.TRUE);
		final String workingPath = parser.get("workingPath");
		final String outputEnrichedWorksPath = parser.get("outputEnrichedWorksPath");
		final String outputWorksPath = parser.get("outputWorksPath");
		final String hdfsServerUri = parser.get("hdfsServerUri");

		SparkConf conf = new SparkConf();
		runWithSparkSession(
			conf,
			isSparkSessionManaged,
			spark -> {
				JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

				JavaPairRDD<Text, Text> summariesRDD = sc
					.sequenceFile(workingPath + "authors/authors.seq", Text.class, Text.class);
				Dataset<AuthorData> summariesDataset = spark
					.createDataset(
						summariesRDD.map(seq -> loadAuthorFromJson(seq._1(), seq._2())).rdd(),
						Encoders.bean(AuthorData.class));
				logger.info("Authors data loaded: " + summariesDataset.count());

				JavaPairRDD<Text, Text> activitiesRDD = sc
					.sequenceFile(workingPath + outputWorksPath + "*.seq", Text.class, Text.class);
				Dataset<WorkDataNoDoi> activitiesDataset = spark
					.createDataset(
						activitiesRDD.map(seq -> loadWorkFromJson(seq._1(), seq._2())).rdd(),
						Encoders.bean(WorkDataNoDoi.class));
				logger.info("Works data loaded: " + activitiesDataset.count());

				JavaRDD<Tuple2<String, String>> enrichedWorksRDD = activitiesDataset
					.joinWith(
						summariesDataset,
						activitiesDataset.col("oid").equalTo(summariesDataset.col("oid")), "inner")
					.map(
						(MapFunction<Tuple2<WorkDataNoDoi, AuthorData>, Tuple2<String, String>>) value -> {
							WorkDataNoDoi w = value._1;
							AuthorData a = value._2;
							AuthorMatcher.match(a, w.getContributors());
							return new Tuple2<>(a.getOid(), JsonHelper.createOidWork(w));
						},
						Encoders.tuple(Encoders.STRING(), Encoders.STRING()))
					.filter(Objects::nonNull)
					.toJavaRDD();
				logger.info("Enriched works RDD ready.");

				final LongAccumulator parsedPublications = spark.sparkContext().longAccumulator("parsedPublications");
				final LongAccumulator enrichedPublications = spark
					.sparkContext()
					.longAccumulator("enrichedPublications");
				final LongAccumulator errorsGeneric = spark.sparkContext().longAccumulator("errorsGeneric");
				final LongAccumulator errorsInvalidTitle = spark.sparkContext().longAccumulator("errorsInvalidTitle");
				final LongAccumulator errorsNotFoundAuthors = spark
					.sparkContext()
					.longAccumulator("errorsNotFoundAuthors");
				final LongAccumulator errorsInvalidType = spark.sparkContext().longAccumulator("errorsInvalidType");
				final PublicationToOaf publicationToOaf = new PublicationToOaf(
					parsedPublications,
					enrichedPublications,
					errorsGeneric,
					errorsInvalidTitle,
					errorsNotFoundAuthors,
					errorsInvalidType);
				JavaRDD<Publication> oafPublicationRDD = enrichedWorksRDD
					.map(
						e -> {
							return (Publication) publicationToOaf
								.generatePublicationActionsFromJson(e._2());
						})
					.filter(p -> p != null);

				sc.hadoopConfiguration().set("mapreduce.output.fileoutputformat.compress", "true");

				oafPublicationRDD
					.mapToPair(
						p -> new Tuple2<>(p.getClass().toString(),
							OBJECT_MAPPER.writeValueAsString(new AtomicAction<>(Publication.class, (Publication) p))))
					.mapToPair(t -> new Tuple2(new Text(t._1()), new Text(t._2())))
					.saveAsNewAPIHadoopFile(
						workingPath.concat(outputEnrichedWorksPath),
						Text.class,
						Text.class,
						SequenceFileOutputFormat.class,
						sc.hadoopConfiguration());

				logger.info("parsedPublications: " + parsedPublications.value().toString());
				logger.info("enrichedPublications: " + enrichedPublications.value().toString());
				logger.info("errorsGeneric: " + errorsGeneric.value().toString());
				logger.info("errorsInvalidTitle: " + errorsInvalidTitle.value().toString());
				logger.info("errorsNotFoundAuthors: " + errorsNotFoundAuthors.value().toString());
				logger.info("errorsInvalidType: " + errorsInvalidType.value().toString());
			});
	}

	private static AuthorData loadAuthorFromJson(Text orcidId, Text json) {
		AuthorData authorData = new AuthorData();
		authorData.setOid(orcidId.toString());
		JsonElement jElement = new JsonParser().parse(json.toString());
		authorData.setName(getJsonValue(jElement, "name"));
		authorData.setSurname(getJsonValue(jElement, "surname"));
		authorData.setCreditName(getJsonValue(jElement, "creditname"));
		return authorData;
	}

	private static WorkDataNoDoi loadWorkFromJson(Text orcidId, Text json) {

		WorkDataNoDoi workData = new Gson().fromJson(json.toString(), WorkDataNoDoi.class);
		return workData;
	}

	private static String getJsonValue(JsonElement jElement, String property) {
		if (jElement.getAsJsonObject().has(property)) {
			JsonElement name = null;
			name = jElement.getAsJsonObject().get(property);
			if (name != null && !name.isJsonNull()) {
				return name.getAsString();
			}
		}
		return new String("");
	}
}

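Both inputs are loaded as typed Datasets through Encoders.bean before the inner join on oid. A small, self-contained sketch of that pattern; the sample AuthorData values are illustrative only.

import java.util.Arrays;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;

import eu.dnetlib.dhp.schema.orcid.AuthorData;

public class BeanDatasetExample {

	public static Dataset<AuthorData> sampleAuthors(SparkSession spark) {
		AuthorData a = new AuthorData();
		a.setOid("0000-0002-1825-0097"); // illustrative iD
		a.setName("Josiah");
		a.setSurname("Carberry");
		return spark.createDataset(Arrays.asList(a), Encoders.bean(AuthorData.class));
	}
}
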
@ -0,0 +1,31 @@

package eu.dnetlib.doiboost.orcidnodoi.json;

import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.gson.JsonObject;

import eu.dnetlib.dhp.schema.orcid.AuthorData;
import eu.dnetlib.doiboost.orcid.model.WorkData;

/**
 * This class converts an object to json and vice versa
 */

public class JsonWriter {

	public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper()
		.setSerializationInclusion(JsonInclude.Include.NON_NULL);

	public static String create(AuthorData authorData) throws JsonProcessingException {
		return OBJECT_MAPPER.writeValueAsString(authorData);
	}

	public static String create(WorkData workData) {
		JsonObject work = new JsonObject();
		work.addProperty("oid", workData.getOid());
		work.addProperty("doi", workData.getDoi());
		return work.toString();
	}
}

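Because the shared mapper is configured with JsonInclude.Include.NON_NULL, unset AuthorData fields are simply omitted from the output. A usage sketch with illustrative values; the class name is not part of the module.

import eu.dnetlib.dhp.schema.orcid.AuthorData;
import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter;

public class JsonWriterExample {
	public static void main(String[] args) throws Exception {
		AuthorData author = new AuthorData();
		author.setOid("0000-0002-1825-0097"); // illustrative iD
		author.setName("Josiah");
		author.setSurname("Carberry");
		// the credit name was never set, so no corresponding key appears in the JSON
		System.out.println(JsonWriter.create(author));
	}
}
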
@ -0,0 +1,58 @@

package eu.dnetlib.doiboost.orcidnodoi.model;

import java.io.Serializable;

import eu.dnetlib.dhp.schema.orcid.AuthorData;

/**
 * This class models the data related to a contributor retrieved from an orcid publication
 */

public class Contributor extends AuthorData implements Serializable {
	private String sequence;
	private String role;
	private transient boolean simpleMatch = false;
	private transient Double score = 0.0;
	private transient boolean bestMatch = false;

	public String getSequence() {
		return sequence;
	}

	public void setSequence(String sequence) {
		this.sequence = sequence;
	}

	public String getRole() {
		return role;
	}

	public void setRole(String role) {
		this.role = role;
	}

	public boolean isSimpleMatch() {
		return simpleMatch;
	}

	public void setSimpleMatch(boolean simpleMatch) {
		this.simpleMatch = simpleMatch;
	}

	public Double getScore() {
		return score;
	}

	public void setScore(Double score) {
		this.score = score;
	}

	public boolean isBestMatch() {
		return bestMatch;
	}

	public void setBestMatch(boolean bestMatch) {
		this.bestMatch = bestMatch;
	}
}

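Worth noting: the match bookkeeping fields (simpleMatch, score, bestMatch) are declared transient, and Gson skips transient fields by default, so they should not end up in the JSON produced by JsonHelper.createOidWork. A tiny sketch of that behaviour, with illustrative values and class name.

import com.google.gson.Gson;

import eu.dnetlib.doiboost.orcidnodoi.model.Contributor;

public class ContributorJsonExample {
	public static void main(String[] args) {
		Contributor c = new Contributor();
		c.setRole("author"); // illustrative value
		c.setSequence("first"); // illustrative value
		c.setBestMatch(true); // transient: not serialized
		System.out.println(new Gson().toJson(c)); // contains role/sequence, no match flags
	}
}
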
@ -0,0 +1,36 @@

package eu.dnetlib.doiboost.orcidnodoi.model;

/**
 * This class models the data related to an external id retrieved from an orcid publication
 */

public class ExternalId {
	private String type;
	private String value;
	private String relationShip;

	public String getType() {
		return type;
	}

	public void setType(String type) {
		this.type = type;
	}

	public String getValue() {
		return value;
	}

	public void setValue(String value) {
		this.value = value;
	}

	public String getRelationShip() {
		return relationShip;
	}

	public void setRelationShip(String relationShip) {
		this.relationShip = relationShip;
	}
}

@ -0,0 +1,36 @@

package eu.dnetlib.doiboost.orcidnodoi.model;

/**
 * This class models the data related to a publication date retrieved from an orcid publication
 */

public class PublicationDate {
	private String year;
	private String month;
	private String day;

	public String getYear() {
		return year;
	}

	public void setYear(String year) {
		this.year = year;
	}

	public String getMonth() {
		return month;
	}

	public void setMonth(String month) {
		this.month = month;
	}

	public String getDay() {
		return day;
	}

	public void setDay(String day) {
		this.day = day;
	}
}

@ -0,0 +1,104 @@

package eu.dnetlib.doiboost.orcidnodoi.model;

import java.io.Serializable;
import java.util.List;

/**
 * This class models the data retrieved from an orcid publication
 */

public class WorkDataNoDoi implements Serializable {

	private String oid;
	private String id;
	private String sourceName;
	private String type;
	private String errorCode;
	private List<String> titles;
	private List<String> urls;
	List<ExternalId> extIds;
	List<PublicationDate> publicationDates;
	List<Contributor> contributors;

	public String getOid() {
		return oid;
	}

	public void setOid(String oid) {
		this.oid = oid;
	}

	public String getErrorCode() {
		return errorCode;
	}

	public void setErrorCode(String errorCode) {
		this.errorCode = errorCode;
	}

	public String getId() {
		return id;
	}

	public void setId(String id) {
		this.id = id;
	}

	public List<String> getTitles() {
		return titles;
	}

	public void setTitles(List<String> titles) {
		this.titles = titles;
	}

	public String getSourceName() {
		return sourceName;
	}

	public void setSourceName(String sourceName) {
		this.sourceName = sourceName;
	}

	public String getType() {
		return type;
	}

	public void setType(String type) {
		this.type = type;
	}

	public List<String> getUrls() {
		return urls;
	}

	public void setUrls(List<String> urls) {
		this.urls = urls;
	}

	public List<ExternalId> getExtIds() {
		return extIds;
	}

	public void setExtIds(List<ExternalId> extIds) {
		this.extIds = extIds;
	}

	public List<PublicationDate> getPublicationDates() {
		return publicationDates;
	}

	public void setPublicationDates(List<PublicationDate> publicationDates) {
		this.publicationDates = publicationDates;
	}

	public List<Contributor> getContributors() {
		return contributors;
	}

	public void setContributors(List<Contributor> contributors) {
		this.contributors = contributors;
	}
}

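For reference, a sketch of the kind of JSON that JsonHelper.createOidWork produces for this model; every field value below is made up for the example and the exact key set and order depend on Gson and on which fields are populated.

import java.util.Arrays;

import eu.dnetlib.doiboost.orcid.json.JsonHelper;
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;

public class WorkDataNoDoiExample {
	public static void main(String[] args) {
		WorkDataNoDoi work = new WorkDataNoDoi();
		work.setOid("0000-0002-1825-0097"); // illustrative iD
		work.setId("123456"); // illustrative identifier
		work.setType("journal-article");
		work.setTitles(Arrays.asList("An example work without a DOI"));
		// prints a JSON object with the populated fields only, e.g. oid, id, type and titles
		System.out.println(JsonHelper.createOidWork(work));
	}
}
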
@ -0,0 +1,543 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.doiboost.orcidnodoi.oaf;
|
||||||
|
|
||||||
|
import static eu.dnetlib.doiboost.orcidnodoi.util.DumpToActionsUtility.*;
|
||||||
|
|
||||||
|
import java.io.Serializable;
|
||||||
|
import java.util.*;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
import org.apache.spark.util.LongAccumulator;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import com.google.gson.*;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.common.PacePerson;
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
|
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||||
|
import eu.dnetlib.doiboost.orcidnodoi.util.DumpToActionsUtility;
|
||||||
|
import eu.dnetlib.doiboost.orcidnodoi.util.Pair;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This class converts an orcid publication from json format to oaf
|
||||||
|
*/
|
||||||
|
|
||||||
|
public class PublicationToOaf implements Serializable {
|
||||||
|
|
||||||
|
static Logger logger = LoggerFactory.getLogger(PublicationToOaf.class);
|
||||||
|
|
||||||
|
public static final String ORCID = "ORCID";
|
||||||
|
public final static String orcidPREFIX = "orcid_______";
|
||||||
|
public static final String OPENAIRE_PREFIX = "openaire____";
|
||||||
|
public static final String SEPARATOR = "::";
|
||||||
|
|
||||||
|
private final LongAccumulator parsedPublications;
|
||||||
|
private final LongAccumulator enrichedPublications;
|
||||||
|
private final LongAccumulator errorsGeneric;
|
||||||
|
private final LongAccumulator errorsInvalidTitle;
|
||||||
|
private final LongAccumulator errorsNotFoundAuthors;
|
||||||
|
private final LongAccumulator errorsInvalidType;
|
||||||
|
|
||||||
|
public PublicationToOaf(
|
||||||
|
LongAccumulator parsedPublications,
|
||||||
|
LongAccumulator enrichedPublications,
|
||||||
|
LongAccumulator errorsGeneric,
|
||||||
|
LongAccumulator errorsInvalidTitle,
|
||||||
|
LongAccumulator errorsNotFoundAuthors,
|
||||||
|
LongAccumulator errorsInvalidType) {
|
||||||
|
this.parsedPublications = parsedPublications;
|
||||||
|
this.enrichedPublications = enrichedPublications;
|
||||||
|
this.errorsGeneric = errorsGeneric;
|
||||||
|
this.errorsInvalidTitle = errorsInvalidTitle;
|
||||||
|
this.errorsNotFoundAuthors = errorsNotFoundAuthors;
|
||||||
|
this.errorsInvalidType = errorsInvalidType;
|
||||||
|
}
|
||||||
|
|
||||||
|
public PublicationToOaf() {
|
||||||
|
this.parsedPublications = null;
|
||||||
|
this.enrichedPublications = null;
|
||||||
|
this.errorsGeneric = null;
|
||||||
|
this.errorsInvalidTitle = null;
|
||||||
|
this.errorsNotFoundAuthors = null;
|
||||||
|
this.errorsInvalidType = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static Map<String, Pair<String, String>> datasources = new HashMap<String, Pair<String, String>>() {
|
||||||
|
|
||||||
|
{
|
||||||
|
put(ORCID.toLowerCase(), new Pair<>(ORCID, OPENAIRE_PREFIX + SEPARATOR + "orcid"));
|
||||||
|
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// maps each JSON external id type to the pair of values used for oaf:pid/@classid and oaf:pid/@classname
|
||||||
|
private static Map<String, Pair<String, String>> externalIds = new HashMap<String, Pair<String, String>>() {
|
||||||
|
|
||||||
|
{
|
||||||
|
put("ark".toLowerCase(), new Pair<>("ark", "ark"));
|
||||||
|
put("arxiv".toLowerCase(), new Pair<>("arxiv", "arXiv"));
|
||||||
|
put("pmc".toLowerCase(), new Pair<>("pmc", "pmc"));
|
||||||
|
put("pmid".toLowerCase(), new Pair<>("pmid", "pmid"));
|
||||||
|
put("source-work-id".toLowerCase(), new Pair<>("orcidworkid", "orcidworkid"));
|
||||||
|
put("urn".toLowerCase(), new Pair<>("urn", "urn"));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
static Map<String, Map<String, String>> typologiesMapping;
|
||||||
|
|
||||||
|
static {
|
||||||
|
try {
|
||||||
|
final String tt = IOUtils
|
||||||
|
.toString(
|
||||||
|
PublicationToOaf.class
|
||||||
|
.getResourceAsStream(
|
||||||
|
"/eu/dnetlib/dhp/doiboost/orcidnodoi/mappings/typologies.json"));
|
||||||
|
typologiesMapping = new Gson().fromJson(tt, Map.class);
|
||||||
|
} catch (Exception e) {
|
||||||
|
throw new RuntimeException("loading typologies", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public static final String PID_TYPES = "dnet:pid_types";
|
||||||
|
|
||||||
|
public Oaf generatePublicationActionsFromJson(final String json) {
|
||||||
|
try {
|
||||||
|
if (parsedPublications != null) {
|
||||||
|
parsedPublications.add(1);
|
||||||
|
}
|
||||||
|
JsonElement jElement = new JsonParser().parse(json);
|
||||||
|
JsonObject jObject = jElement.getAsJsonObject();
|
||||||
|
return generatePublicationActionsFromDump(jObject);
|
||||||
|
} catch (Throwable t) {
|
||||||
|
logger.error("creating publication: " + t.getMessage());
|
||||||
|
if (errorsGeneric != null) {
|
||||||
|
errorsGeneric.add(1);
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public Oaf generatePublicationActionsFromDump(final JsonObject rootElement) {
|
||||||
|
|
||||||
|
if (!isValid(rootElement)) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
Publication publication = new Publication();
|
||||||
|
|
||||||
|
final DataInfo dataInfo = new DataInfo();
|
||||||
|
dataInfo.setDeletedbyinference(false);
|
||||||
|
dataInfo.setInferred(false);
|
||||||
|
dataInfo.setTrust("0.9");
|
||||||
|
dataInfo
|
||||||
|
.setProvenanceaction(
|
||||||
|
mapQualifier(
|
||||||
|
"sysimport:actionset:orcidworks-no-doi",
|
||||||
|
"sysimport:actionset:orcidworks-no-doi",
|
||||||
|
"dnet:provenanceActions",
|
||||||
|
"dnet:provenanceActions"));
|
||||||
|
publication.setDataInfo(dataInfo);
|
||||||
|
|
||||||
|
publication.setLastupdatetimestamp(new Date().getTime());
|
||||||
|
|
||||||
|
publication.setDateofcollection("2020-10-14");
|
||||||
|
publication.setDateoftransformation(DumpToActionsUtility.now_ISO8601());
|
||||||
|
|
||||||
|
// Adding external ids
|
||||||
|
externalIds
|
||||||
|
.keySet()
|
||||||
|
.stream()
|
||||||
|
.forEach(jsonExtId -> {
|
||||||
|
final String classid = externalIds.get(jsonExtId.toLowerCase()).getValue();
|
||||||
|
final String classname = externalIds.get(jsonExtId.toLowerCase()).getKey();
|
||||||
|
final String extId = getStringValue(rootElement, jsonExtId);
|
||||||
|
if (StringUtils.isNotBlank(extId)) {
|
||||||
|
publication
|
||||||
|
.getExternalReference()
|
||||||
|
.add(
|
||||||
|
convertExtRef(extId, classid, classname, "dnet:pid_types", "dnet:pid_types"));
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
// Adding source
|
||||||
|
final String source = getStringValue(rootElement, "sourceName");
|
||||||
|
if (StringUtils.isNotBlank(source)) {
|
||||||
|
Field<String> sourceField = mapStringField(source, null);
|
||||||
|
if (sourceField == null) {
|
||||||
|
publication.setSource(null);
|
||||||
|
} else {
|
||||||
|
publication.setSource(Arrays.asList(sourceField));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Adding titles
|
||||||
|
final List<String> titles = createRepeatedField(rootElement, "titles");
|
||||||
|
if (titles == null || titles.isEmpty()) {
|
||||||
|
if (errorsInvalidTitle != null) {
|
||||||
|
errorsInvalidTitle.add(1);
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
Qualifier q = mapQualifier("main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title");
|
||||||
|
publication
|
||||||
|
.setTitle(
|
||||||
|
titles
|
||||||
|
.stream()
|
||||||
|
.map(t -> {
|
||||||
|
return mapStructuredProperty(t, q, null);
|
||||||
|
})
|
||||||
|
.filter(s -> s != null)
|
||||||
|
.collect(Collectors.toList()));
|
||||||
|
// Adding identifier
|
||||||
|
final String id = getStringValue(rootElement, "id");
|
||||||
|
String sourceId = null;
|
||||||
|
if (id != null) {
|
||||||
|
publication.setOriginalId(Arrays.asList(id));
|
||||||
|
sourceId = String.format("50|%s" + SEPARATOR + "%s", orcidPREFIX, DHPUtils.md5(id.toLowerCase()));
|
||||||
|
} else {
|
||||||
|
String mergedTitle = titles.stream().map(Object::toString).collect(Collectors.joining(","));
|
||||||
|
sourceId = String.format("50|%s" + SEPARATOR + "%s", orcidPREFIX, DHPUtils.md5(mergedTitle.toLowerCase()));
|
||||||
|
}
|
||||||
|
publication.setId(sourceId);
|
||||||
|
|
||||||
|
// Adding relevant date
|
||||||
|
settingRelevantDate(rootElement, publication, "publication_date", "issued", true);
|
||||||
|
|
||||||
|
// Adding collectedfrom
|
||||||
|
publication.setCollectedfrom(Arrays.asList(createCollectedFrom()));
|
||||||
|
|
||||||
|
// Adding type
|
||||||
|
final String type = getStringValue(rootElement, "type");
|
||||||
|
String cobjValue = "";
|
||||||
|
if (StringUtils.isNotBlank(type)) {
|
||||||
|
publication.setResourcetype(mapQualifier(type, type, "dnet:dataCite_resource", "dnet:dataCite_resource"));
|
||||||
|
|
||||||
|
final String typeValue = typologiesMapping.get(type).get("value");
|
||||||
|
cobjValue = typologiesMapping.get(type).get("cobj");
|
||||||
|
final Instance instance = new Instance();
|
||||||
|
|
||||||
|
// Adding hostedby
|
||||||
|
instance.setHostedby(createHostedBy());
|
||||||
|
|
||||||
|
// Adding url
|
||||||
|
final List<String> urls = createRepeatedField(rootElement, "urls");
|
||||||
|
if (urls != null && !urls.isEmpty()) {
|
||||||
|
instance.setUrl(urls);
|
||||||
|
} else {
|
||||||
|
dataInfo.setInvisible(true);
|
||||||
|
}
|
||||||
|
|
||||||
|
final String pubDate = getPublicationDate(rootElement, "publicationDates");
|
||||||
|
if (StringUtils.isNotBlank(pubDate)) {
|
||||||
|
instance.setDateofacceptance(mapStringField(pubDate, null));
|
||||||
|
}
|
||||||
|
|
||||||
|
instance.setCollectedfrom(createCollectedFrom());
|
||||||
|
|
||||||
|
// Adding accessright
|
||||||
|
instance.setAccessright(mapQualifier("UNKNOWN", "UNKNOWN", "dnet:access_modes", "dnet:access_modes"));
|
||||||
|
|
||||||
|
// Adding type
|
||||||
|
instance
|
||||||
|
.setInstancetype(
|
||||||
|
mapQualifier(cobjValue, typeValue, "dnet:publication_resource", "dnet:publication_resource"));
|
||||||
|
|
||||||
|
publication.setInstance(Arrays.asList(instance));
|
||||||
|
} else {
|
||||||
|
if (errorsInvalidType != null) {
|
||||||
|
errorsInvalidType.add(1);
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Adding authors
|
||||||
|
final List<Author> authors = createAuthors(rootElement);
|
||||||
|
if (authors != null && authors.size() > 0) {
|
||||||
|
publication.setAuthor(authors);
|
||||||
|
} else {
|
||||||
|
if (errorsNotFoundAuthors != null) {
|
||||||
|
errorsNotFoundAuthors.add(1);
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
String classValue = getDefaultResulttype(cobjValue);
|
||||||
|
publication
|
||||||
|
.setResulttype(mapQualifier(classValue, classValue, "dnet:result_typologies", "dnet:result_typologies"));
|
||||||
|
if (enrichedPublications != null) {
|
||||||
|
enrichedPublications.add(1);
|
||||||
|
}
|
||||||
|
return publication;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<Author> createAuthors(final JsonObject root) {
|
||||||
|
|
||||||
|
final String authorsJSONFieldName = "contributors";
|
||||||
|
|
||||||
|
if (root.has(authorsJSONFieldName) && root.get(authorsJSONFieldName).isJsonArray()) {
|
||||||
|
|
||||||
|
final List<Author> authors = new ArrayList<>();
|
||||||
|
final JsonArray jsonAuthors = root.getAsJsonArray(authorsJSONFieldName);
|
||||||
|
int firstCounter = 0;
|
||||||
|
int defaultCounter = 0;
|
||||||
|
int rank = 1;
|
||||||
|
int currentRank = 0;
|
||||||
|
|
||||||
|
for (final JsonElement item : jsonAuthors) {
|
||||||
|
final JsonObject jsonAuthor = item.getAsJsonObject();
|
||||||
|
final Author author = new Author();
|
||||||
|
if (item.isJsonObject()) {
|
||||||
|
final String creditname = getStringValue(jsonAuthor, "creditName");
|
||||||
|
final String surname = getStringValue(jsonAuthor, "surname");
|
||||||
|
final String name = getStringValue(jsonAuthor, "name");
|
||||||
|
final String oid = getStringValue(jsonAuthor, "oid");
|
||||||
|
final String seq = getStringValue(jsonAuthor, "sequence");
|
||||||
|
if (StringUtils.isNotBlank(seq)) {
|
||||||
|
if (seq.equals("first")) {
|
||||||
|
firstCounter += 1;
|
||||||
|
rank = firstCounter;
|
||||||
|
|
||||||
|
} else if (seq.equals("additional")) {
|
||||||
|
rank = currentRank + 1;
|
||||||
|
} else {
|
||||||
|
defaultCounter += 1;
|
||||||
|
rank = defaultCounter;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (StringUtils.isNotBlank(oid)) {
|
||||||
|
author.setPid(Arrays.asList(mapAuthorId(oid)));
|
||||||
|
author.setFullname(name + " " + surname);
|
||||||
|
if (StringUtils.isNotBlank(name)) {
|
||||||
|
author.setName(name);
|
||||||
|
}
|
||||||
|
if (StringUtils.isNotBlank(surname)) {
|
||||||
|
author.setSurname(surname);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
PacePerson p = new PacePerson(creditname, false);
|
||||||
|
if (p.isAccurate()) {
|
||||||
|
author.setName(p.getNormalisedFirstName());
|
||||||
|
author.setSurname(p.getNormalisedSurname());
|
||||||
|
author.setFullname(p.getNormalisedFullname());
|
||||||
|
} else {
|
||||||
|
author.setFullname(creditname);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
author.setRank(rank);
|
||||||
|
authors.add(author);
|
||||||
|
currentRank = rank;
|
||||||
|
}
|
||||||
|
return authors;
|
||||||
|
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<String> createRepeatedField(final JsonObject rootElement, final String fieldName) {
|
||||||
|
if (!rootElement.has(fieldName)) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
if (rootElement.has(fieldName) && rootElement.get(fieldName).isJsonNull()) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
if (rootElement.get(fieldName).isJsonArray()) {
|
||||||
|
if (!isValidJsonArray(rootElement, fieldName)) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
return getArrayValues(rootElement, fieldName);
|
||||||
|
} else {
|
||||||
|
String field = getStringValue(rootElement, fieldName);
|
||||||
|
return Arrays.asList(cleanField(field));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private String cleanField(String value) {
|
||||||
|
if (value != null && !value.isEmpty() && value.charAt(0) == '"' && value.charAt(value.length() - 1) == '"') {
|
||||||
|
value = value.substring(1, value.length() - 1);
|
||||||
|
}
|
||||||
|
return value;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void settingRelevantDate(final JsonObject rootElement,
|
||||||
|
final Publication publication,
|
||||||
|
final String jsonKey,
|
||||||
|
final String dictionaryKey,
|
||||||
|
final boolean addToDateOfAcceptance) {
|
||||||
|
|
||||||
|
final String pubDate = getPublicationDate(rootElement, "publication_date");
|
||||||
|
if (StringUtils.isNotBlank(pubDate)) {
|
||||||
|
if (addToDateOfAcceptance) {
|
||||||
|
publication.setDateofacceptance(mapStringField(pubDate, null));
|
||||||
|
}
|
||||||
|
Qualifier q = mapQualifier(dictionaryKey, dictionaryKey, "dnet:dataCite_date", "dnet:dataCite_date");
|
||||||
|
publication
|
||||||
|
.setRelevantdate(
|
||||||
|
Arrays
|
||||||
|
.asList(pubDate)
|
||||||
|
.stream()
|
||||||
|
.map(r -> {
|
||||||
|
return mapStructuredProperty(r, q, null);
|
||||||
|
})
|
||||||
|
.filter(s -> s != null)
|
||||||
|
.collect(Collectors.toList()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private String getPublicationDate(final JsonObject rootElement,
|
||||||
|
final String jsonKey) {
|
||||||
|
|
||||||
|
JsonObject pubDateJson = null;
|
||||||
|
try {
|
||||||
|
pubDateJson = rootElement.getAsJsonObject(jsonKey);
|
||||||
|
} catch (Exception e) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
if (pubDateJson == null) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
final String year = getStringValue(pubDateJson, "year");
|
||||||
|
final String month = getStringValue(pubDateJson, "month");
|
||||||
|
final String day = getStringValue(pubDateJson, "day");
|
||||||
|
|
||||||
|
if (StringUtils.isBlank(year)) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
String pubDate = "".concat(year);
|
||||||
|
if (StringUtils.isNotBlank(month)) {
|
||||||
|
pubDate = pubDate.concat("-" + month);
|
||||||
|
if (StringUtils.isNotBlank(day)) {
|
||||||
|
pubDate = pubDate.concat("-" + day);
|
||||||
|
} else {
|
||||||
|
pubDate += "-01";
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
pubDate += "-01-01";
|
||||||
|
}
|
||||||
|
if (isValidDate(pubDate)) {
|
||||||
|
return pubDate;
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected boolean isValid(final JsonObject rootElement/* , final Reporter context */) {
|
||||||
|
|
||||||
|
final String type = getStringValue(rootElement, "type");
|
||||||
|
if (!typologiesMapping.containsKey(type)) {
|
||||||
|
logger.error("unknowntype_" + type);
|
||||||
|
if (errorsInvalidType != null) {
|
||||||
|
errorsInvalidType.add(1);
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!isValidJsonArray(rootElement, "titles")) {
|
||||||
|
if (errorsInvalidTitle != null) {
|
||||||
|
errorsInvalidTitle.add(1);
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean isValidJsonArray(final JsonObject rootElement, final String fieldName) {
|
||||||
|
if (!rootElement.has(fieldName)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
final JsonElement jsonElement = rootElement.get(fieldName);
|
||||||
|
if (jsonElement.isJsonNull()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (jsonElement.isJsonArray()) {
|
||||||
|
final JsonArray jsonArray = jsonElement.getAsJsonArray();
|
||||||
|
if (jsonArray.isJsonNull()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (jsonArray.get(0).isJsonNull()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
private Qualifier mapQualifier(String classId, String className, String schemeId, String schemeName) {
|
||||||
|
final Qualifier qualifier = new Qualifier();
|
||||||
|
qualifier.setClassid(classId);
|
||||||
|
qualifier.setClassname(className);
|
||||||
|
qualifier.setSchemeid(schemeId);
|
||||||
|
qualifier.setSchemename(schemeName);
|
||||||
|
return qualifier;
|
||||||
|
}
|
||||||
|
|
||||||
|
private ExternalReference convertExtRef(String extId, String classId, String className, String schemeId,
|
||||||
|
String schemeName) {
|
||||||
|
ExternalReference ex = new ExternalReference();
|
||||||
|
ex.setRefidentifier(extId);
|
||||||
|
ex.setQualifier(mapQualifier(classId, className, schemeId, schemeName));
|
||||||
|
return ex;
|
||||||
|
}
|
||||||
|
|
||||||
|
private StructuredProperty mapStructuredProperty(String value, Qualifier qualifier, DataInfo dataInfo) {
|
||||||
|
if (value == null || StringUtils.isBlank(value)) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
final StructuredProperty structuredProperty = new StructuredProperty();
|
||||||
|
structuredProperty.setValue(value);
|
||||||
|
structuredProperty.setQualifier(qualifier);
|
||||||
|
structuredProperty.setDataInfo(dataInfo);
|
||||||
|
return structuredProperty;
|
||||||
|
}
|
||||||
|
|
||||||
|
private Field<String> mapStringField(String value, DataInfo dataInfo) {
|
||||||
|
if (value == null || StringUtils.isBlank(value)) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
final Field<String> stringField = new Field<>();
|
||||||
|
stringField.setValue(value);
|
||||||
|
stringField.setDataInfo(dataInfo);
|
||||||
|
return stringField;
|
||||||
|
}
|
||||||
|
|
||||||
|
private KeyValue createCollectedFrom() {
|
||||||
|
KeyValue cf = new KeyValue();
|
||||||
|
cf.setValue(ORCID);
|
||||||
|
cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "806360c771262b4d6770e7cdf04b5c5a");
|
||||||
|
return cf;
|
||||||
|
}
|
||||||
|
|
||||||
|
private KeyValue createHostedBy() {
|
||||||
|
KeyValue hb = new KeyValue();
|
||||||
|
hb.setValue("Unknown Repository");
|
||||||
|
hb.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "55045bd2a65019fd8e6741a755395c8c");
|
||||||
|
return hb;
|
||||||
|
}
|
||||||
|
|
||||||
|
private StructuredProperty mapAuthorId(String orcidId) {
|
||||||
|
final StructuredProperty sp = new StructuredProperty();
|
||||||
|
sp.setValue(orcidId);
|
||||||
|
final Qualifier q = new Qualifier();
|
||||||
|
q.setClassid(ORCID.toLowerCase());
|
||||||
|
q.setClassname(ORCID.toLowerCase());
|
||||||
|
q.setSchemeid(ModelConstants.DNET_PID_TYPES);
|
||||||
|
q.setSchemename(ModelConstants.DNET_PID_TYPES);
|
||||||
|
sp.setQualifier(q);
|
||||||
|
final DataInfo dataInfo = new DataInfo();
|
||||||
|
dataInfo.setDeletedbyinference(false);
|
||||||
|
dataInfo.setInferred(false);
|
||||||
|
dataInfo.setTrust("0.9");
|
||||||
|
dataInfo
|
||||||
|
.setProvenanceaction(
|
||||||
|
mapQualifier(
|
||||||
|
"sysimport:crosswalk:entityregistry",
|
||||||
|
"Harvested",
|
||||||
|
"dnet:provenanceActions",
|
||||||
|
"dnet:provenanceActions"));
|
||||||
|
sp.setDataInfo(dataInfo);
|
||||||
|
return sp;
|
||||||
|
}
|
||||||
|
}
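
For orientation, a minimal usage sketch of the converter above. The JSON literal is an invented example (the real input is a record of the enriched ORCID works dataset), "journal-article" is assumed to be a key of typologies.json, and the no-argument constructor is used so all accumulators stay null.

	// Hypothetical driver snippet, not part of the original sources.
	PublicationToOaf converter = new PublicationToOaf();
	String jsonWork = "{\"id\":\"12345678\",\"type\":\"journal-article\","
		+ "\"titles\":[\"A sample title\"],"
		+ "\"contributors\":[{\"creditName\":\"Jane Doe\",\"sequence\":\"first\",\"role\":\"author\"}]}";
	Oaf oaf = converter.generatePublicationActionsFromJson(jsonWork);
	// generatePublicationActionsFromJson returns null when the record is rejected (invalid type,
	// missing titles or authors), so the result must be checked before use.
	if (oaf instanceof Publication) {
		Publication p = (Publication) oaf;
		System.out.println(p.getId() + " - " + p.getTitle().get(0).getValue());
	}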
|
|
@ -0,0 +1,217 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.doiboost.orcidnodoi.similarity;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.text.Normalizer;
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
import org.apache.commons.text.similarity.JaroWinklerSimilarity;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import com.google.gson.Gson;
|
||||||
|
import com.google.gson.GsonBuilder;
|
||||||
|
import com.ximpleware.NavException;
|
||||||
|
import com.ximpleware.ParseException;
|
||||||
|
import com.ximpleware.XPathEvalException;
|
||||||
|
import com.ximpleware.XPathParseException;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.parser.utility.VtdException;
|
||||||
|
import eu.dnetlib.dhp.schema.orcid.AuthorData;
|
||||||
|
import eu.dnetlib.doiboost.orcidnodoi.model.Contributor;
|
||||||
|
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
|
||||||
|
|
||||||
|
/**
 * This class searches a list of publication contributors for a specific author by running a
 * similarity check of the author's name and surname against the credit name of each contributor
 * in the list; as soon as a match is found (if one exists), the author's information is used to
 * enrich the matched contributor inside the contributors list.
 */
|
||||||
|
|
||||||
|
public class AuthorMatcher {
|
||||||
|
|
||||||
|
private static final Logger logger = LoggerFactory.getLogger(AuthorMatcher.class);
|
||||||
|
public static final Double threshold = 0.8;
|
||||||
|
|
||||||
|
public static void match(AuthorData author, List<Contributor> contributors)
|
||||||
|
throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException {
|
||||||
|
|
||||||
|
int matchCounter = 0;
|
||||||
|
List<Integer> matchCounters = Arrays.asList(matchCounter);
|
||||||
|
Contributor contributor = null;
|
||||||
|
contributors
|
||||||
|
.stream()
|
||||||
|
.filter(c -> !StringUtils.isBlank(c.getCreditName()))
|
||||||
|
.forEach(c -> {
|
||||||
|
if (simpleMatch(c.getCreditName(), author.getName()) ||
|
||||||
|
simpleMatch(c.getCreditName(), author.getSurname()) ||
|
||||||
|
simpleMatchOnOtherNames(c.getCreditName(), author.getOtherNames())) {
|
||||||
|
matchCounters.set(0, matchCounters.get(0) + 1);
|
||||||
|
c.setSimpleMatch(true);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
if (matchCounters.get(0) == 1) {
|
||||||
|
updateAuthorsSimpleMatch(contributors, author);
|
||||||
|
} else if (matchCounters.get(0) == 0) {
|
||||||
|
Optional<Contributor> optCon = contributors
|
||||||
|
.stream()
|
||||||
|
.filter(c -> !StringUtils.isBlank(c.getCreditName()))
|
||||||
|
.map(c -> {
|
||||||
|
c.setScore(bestMatch(author.getName(), author.getSurname(), c.getCreditName()));
|
||||||
|
return c;
|
||||||
|
})
|
||||||
|
.filter(c -> c.getScore() >= threshold)
|
||||||
|
.max(Comparator.comparing(c -> c.getScore()));
|
||||||
|
Contributor bestMatchContributor = null;
|
||||||
|
if (optCon.isPresent()) {
|
||||||
|
bestMatchContributor = optCon.get();
|
||||||
|
bestMatchContributor.setBestMatch(true);
|
||||||
|
updateAuthorsSimilarityMatch(contributors, author);
|
||||||
|
}
|
||||||
|
} else if (matchCounters.get(0) > 1) {
|
||||||
|
Optional<Contributor> optCon = contributors
|
||||||
|
.stream()
|
||||||
|
.filter(c -> c.isSimpleMatch())
|
||||||
|
.filter(c -> !StringUtils.isBlank(c.getCreditName()))
|
||||||
|
.map(c -> {
|
||||||
|
c.setScore(bestMatch(author.getName(), author.getSurname(), c.getCreditName()));
|
||||||
|
return c;
|
||||||
|
})
|
||||||
|
.filter(c -> c.getScore() >= threshold)
|
||||||
|
.max(Comparator.comparing(c -> c.getScore()));
|
||||||
|
Contributor bestMatchContributor = null;
|
||||||
|
if (optCon.isPresent()) {
|
||||||
|
bestMatchContributor = optCon.get();
|
||||||
|
bestMatchContributor.setBestMatch(true);
|
||||||
|
updateAuthorsSimilarityMatch(contributors, author);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public static boolean simpleMatchOnOtherNames(String name, List<String> otherNames) {
|
||||||
|
if (otherNames == null || otherNames.isEmpty()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return otherNames.stream().filter(o -> simpleMatch(name, o)).count() > 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static boolean simpleMatch(String name, String searchValue) {
|
||||||
|
if (searchValue == null) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return normalize(name).contains(normalize(searchValue));
|
||||||
|
}
|
||||||
|
|
||||||
|
public static Double bestMatch(String authorSurname, String authorName, String contributor) {
|
||||||
|
String[] contributorSplitted = contributor.split(" ");
|
||||||
|
if (contributorSplitted.length == 0) {
|
||||||
|
return 0.0;
|
||||||
|
}
|
||||||
|
final String contributorName = contributorSplitted[contributorSplitted.length - 1];
|
||||||
|
String contributorSurname = "";
|
||||||
|
if (contributorSplitted.length > 1) {
|
||||||
|
StringJoiner joiner = new StringJoiner(" ");
|
||||||
|
for (int i = 0; i < contributorSplitted.length - 1; i++) {
|
||||||
|
joiner.add(contributorSplitted[i]);
|
||||||
|
}
|
||||||
|
contributorSurname = joiner.toString();
|
||||||
|
}
|
||||||
|
String authorNameNrm = normalize(authorName);
|
||||||
|
String authorSurnameNrm = normalize(authorSurname);
|
||||||
|
String contributorNameNrm = normalize(contributorName);
|
||||||
|
String contributorSurnameNrm = normalize(contributorSurname);
|
||||||
|
Double sm1 = similarity(authorNameNrm, authorSurnameNrm, contributorNameNrm, contributorSurnameNrm);
|
||||||
|
Double sm2 = similarity(authorNameNrm, authorSurnameNrm, contributorSurnameNrm, contributorNameNrm);
|
||||||
|
if (sm1.compareTo(sm2) >= 0) {
|
||||||
|
return sm1;
|
||||||
|
}
|
||||||
|
return sm2;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static Double similarity(String nameA, String surnameA, String nameB, String surnameB) {
|
||||||
|
Double score = similarityJaroWinkler(nameA, surnameA, nameB, surnameB);
|
||||||
|
return score;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static Double similarityJaroWinkler(String nameA, String surnameA, String nameB, String surnameB) {
|
||||||
|
return new JaroWinklerSimilarity().apply(normalize(parse(nameA, surnameA)), normalize(parse(nameB, surnameB)));
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String normalize(final String s) {
|
||||||
|
if (s == null) {
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
return nfd(s)
|
||||||
|
.toLowerCase()
|
||||||
|
// do not compact the regexes in a single expression,
// would cause StackOverflowError in case of large input strings
|
||||||
|
.replaceAll("(\\W)+", " ")
|
||||||
|
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
|
||||||
|
.replaceAll("(\\p{Punct})+", " ")
|
||||||
|
.replaceAll("(\\d)+", " ")
|
||||||
|
.replaceAll("(\\n)+", " ")
|
||||||
|
.trim();
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String nfd(final String s) {
|
||||||
|
return Normalizer.normalize(s, Normalizer.Form.NFD);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String parse(String name, String surname) {
|
||||||
|
return surname + " " + name;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void updateAuthorsSimpleMatch(List<Contributor> contributors, AuthorData author) {
|
||||||
|
contributors.forEach(c -> {
|
||||||
|
if (c.isSimpleMatch()) {
|
||||||
|
c.setName(author.getName());
|
||||||
|
c.setSurname(author.getSurname());
|
||||||
|
c.setOid(author.getOid());
|
||||||
|
}
|
||||||
|
});
|
||||||
|
updateRanks(contributors);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void updateAuthorsSimilarityMatch(List<Contributor> contributors, AuthorData author) {
|
||||||
|
contributors
|
||||||
|
.stream()
|
||||||
|
.filter(c -> c.isBestMatch())
|
||||||
|
.forEach(c -> {
|
||||||
|
c.setName(author.getName());
|
||||||
|
c.setSurname(author.getSurname());
|
||||||
|
c.setOid(author.getOid());
|
||||||
|
});
|
||||||
|
updateRanks(contributors);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void updateRanks(List<Contributor> contributors) {
|
||||||
|
boolean seqFound = false;
|
||||||
|
if (contributors
|
||||||
|
.stream()
|
||||||
|
.filter(
|
||||||
|
c -> c.getRole() != null && c.getSequence() != null &&
|
||||||
|
c.getRole().equals("author") && (c.getSequence().equals("first") ||
|
||||||
|
c.getSequence().equals("additional")))
|
||||||
|
.count() > 0) {
|
||||||
|
seqFound = true;
|
||||||
|
}
|
||||||
|
if (!seqFound) {
|
||||||
|
List<Integer> seqIds = Arrays.asList(0);
|
||||||
|
contributors.forEach(c -> {
|
||||||
|
int currentSeq = seqIds.get(0) + 1;
|
||||||
|
seqIds.set(0, currentSeq);
|
||||||
|
c.setSequence(Integer.toString(seqIds.get(0)));
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String toJson(WorkDataNoDoi work) {
|
||||||
|
GsonBuilder builder = new GsonBuilder();
|
||||||
|
Gson gson = builder.create();
|
||||||
|
return gson.toJson(work);
|
||||||
|
}
|
||||||
|
}
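
A brief, hypothetical illustration of the matcher. The author and contributor values are invented for the example (in the real job they come from the ORCID summaries and from the parsed work XML), the usual bean setters on AuthorData and Contributor are assumed, and the checked exceptions declared by match() are not handled here.

	AuthorData author = new AuthorData();
	author.setName("Jane");
	author.setSurname("Doe");
	author.setOid("0000-0002-1825-0097");

	Contributor contributor = new Contributor();
	contributor.setCreditName("J. Doe");
	List<Contributor> contributors = new ArrayList<>(Arrays.asList(contributor));

	// normalize("J. Doe") = "j doe" contains the normalized surname "doe", so this is a simple
	// match: the contributor is enriched in place with the author's name, surname and ORCID iD.
	AuthorMatcher.match(author, contributors);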
|
|
@ -0,0 +1,113 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.doiboost.orcidnodoi.util;
|
||||||
|
|
||||||
|
import java.text.SimpleDateFormat;
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
|
import com.google.gson.JsonArray;
|
||||||
|
import com.google.gson.JsonObject;
|
||||||
|
|
||||||
|
/**
 * Utility class with JSON accessors, date helpers and the default result-type mapping used by the orcidnodoi workflow
 */
|
||||||
|
|
||||||
|
public class DumpToActionsUtility {
|
||||||
|
|
||||||
|
private static final SimpleDateFormat ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US);
|
||||||
|
|
||||||
|
public static String getStringValue(final JsonObject root, final String key) {
|
||||||
|
if (root.has(key) && !root.get(key).isJsonNull())
|
||||||
|
return root.get(key).getAsString();
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
|
||||||
|
public static List<String> getArrayValues(final JsonObject root, final String key) {
|
||||||
|
if (root.has(key) && root.get(key).isJsonArray()) {
|
||||||
|
final JsonArray asJsonArray = root.get(key).getAsJsonArray();
|
||||||
|
final List<String> result = new ArrayList<>();
|
||||||
|
|
||||||
|
asJsonArray.forEach(it -> {
|
||||||
|
if (StringUtils.isNotBlank(it.getAsString())) {
|
||||||
|
result.add(it.getAsString());
|
||||||
|
}
|
||||||
|
});
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
return new ArrayList<>();
|
||||||
|
}
|
||||||
|
|
||||||
|
public static List<JsonObject> getArrayObjects(final JsonObject root, final String key) {
|
||||||
|
if (root.has(key) && root.get(key).isJsonArray()) {
|
||||||
|
final JsonArray asJsonArray = root.get(key).getAsJsonArray();
|
||||||
|
final List<JsonObject> result = new ArrayList<>();
|
||||||
|
asJsonArray.forEach(it -> {
|
||||||
|
if (it.getAsJsonObject() != null) {
|
||||||
|
result.add(it.getAsJsonObject());
|
||||||
|
}
|
||||||
|
});
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
return new ArrayList<>();
|
||||||
|
}
|
||||||
|
|
||||||
|
public static boolean isValidDate(final String date) {
|
||||||
|
return date.matches("\\d{4}-\\d{2}-\\d{2}");
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String now_ISO8601() { // NOPMD
|
||||||
|
String result;
|
||||||
|
synchronized (ISO8601FORMAT) {
|
||||||
|
result = ISO8601FORMAT.format(new Date());
|
||||||
|
}
|
||||||
|
// convert YYYYMMDDTHH:mm:ss+HH00 into YYYYMMDDTHH:mm:ss+HH:00
|
||||||
|
// - note the added colon for the Timezone
|
||||||
|
return result.substring(0, result.length() - 2) + ":" + result.substring(result.length() - 2);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String getDefaultResulttype(final String cobjcategory) {
|
||||||
|
switch (cobjcategory) {
|
||||||
|
case "0029":
|
||||||
|
return "software";
|
||||||
|
case "0021":
|
||||||
|
case "0024":
|
||||||
|
case "0025":
|
||||||
|
case "0030":
|
||||||
|
return "dataset";
|
||||||
|
case "0000":
|
||||||
|
case "0010":
|
||||||
|
case "0018":
|
||||||
|
case "0020":
|
||||||
|
case "0022":
|
||||||
|
case "0023":
|
||||||
|
case "0026":
|
||||||
|
case "0027":
|
||||||
|
case "0028":
|
||||||
|
case "0037":
|
||||||
|
return "other";
|
||||||
|
case "0001":
|
||||||
|
case "0002":
|
||||||
|
case "0004":
|
||||||
|
case "0005":
|
||||||
|
case "0006":
|
||||||
|
case "0007":
|
||||||
|
case "0008":
|
||||||
|
case "0009":
|
||||||
|
case "0011":
|
||||||
|
case "0012":
|
||||||
|
case "0013":
|
||||||
|
case "0014":
|
||||||
|
case "0015":
|
||||||
|
case "0016":
|
||||||
|
case "0017":
|
||||||
|
case "0019":
|
||||||
|
case "0031":
|
||||||
|
case "0032":
|
||||||
|
return "publication";
|
||||||
|
default:
|
||||||
|
return "publication";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
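
A few illustrative calls to the helpers above; the values in the comments are what these methods return for the chosen inputs, and the timestamp is of course machine dependent.

	String resultType = DumpToActionsUtility.getDefaultResulttype("0029"); // "software"
	boolean full = DumpToActionsUtility.isValidDate("2020-10-14");         // true
	boolean partial = DumpToActionsUtility.isValidDate("2020-10");         // false: the day is missing
	String now = DumpToActionsUtility.now_ISO8601();                       // e.g. "2020-10-14T10:30:00+02:00"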
|
|
@@ -0,0 +1,32 @@

package eu.dnetlib.doiboost.orcidnodoi.util;

public class Pair<K, V> {

	private K k;

	private V v;

	public Pair(K k, V v) {
		this.k = k;
		this.v = v;
	}

	public K getKey() {
		return k;
	}

	public V getValue() {
		return v;
	}

	@Override
	public boolean equals(Object obj) {
		if (obj instanceof Pair<?, ?>) {
			Pair<?, ?> tmp = (Pair<?, ?>) obj;
			return k.equals(tmp.getKey()) && v.equals(tmp.getValue());
		} else
			return false;
	}

}
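
For clarity, this is how the pair is used earlier in this patch, for instance in the externalIds map of PublicationToOaf:

	Pair<String, String> arxiv = new Pair<>("arxiv", "arXiv");
	arxiv.getKey();   // "arxiv"
	arxiv.getValue(); // "arXiv"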
|
|
@ -0,0 +1,217 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.doiboost.orcidnodoi.xml;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import com.ximpleware.*;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.parser.utility.VtdException;
|
||||||
|
import eu.dnetlib.dhp.parser.utility.VtdUtilityParser;
|
||||||
|
import eu.dnetlib.doiboost.orcidnodoi.model.Contributor;
|
||||||
|
import eu.dnetlib.doiboost.orcidnodoi.model.ExternalId;
|
||||||
|
import eu.dnetlib.doiboost.orcidnodoi.model.PublicationDate;
|
||||||
|
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
|
||||||
|
|
||||||
|
/**
 * This class is used for parsing ORCID work XML data with the VTD-XML parser
 */
|
||||||
|
|
||||||
|
public class XMLRecordParserNoDoi {
|
||||||
|
|
||||||
|
private static final Logger logger = LoggerFactory.getLogger(XMLRecordParserNoDoi.class);
|
||||||
|
|
||||||
|
private static final String NS_COMMON_URL = "http://www.orcid.org/ns/common";
|
||||||
|
private static final String NS_COMMON = "common";
|
||||||
|
private static final String NS_PERSON_URL = "http://www.orcid.org/ns/person";
|
||||||
|
private static final String NS_PERSON = "person";
|
||||||
|
private static final String NS_DETAILS_URL = "http://www.orcid.org/ns/personal-details";
|
||||||
|
private static final String NS_DETAILS = "personal-details";
|
||||||
|
private static final String NS_OTHER_URL = "http://www.orcid.org/ns/other-name";
|
||||||
|
private static final String NS_OTHER = "other-name";
|
||||||
|
private static final String NS_RECORD_URL = "http://www.orcid.org/ns/record";
|
||||||
|
private static final String NS_RECORD = "record";
|
||||||
|
private static final String NS_ERROR_URL = "http://www.orcid.org/ns/error";
|
||||||
|
|
||||||
|
private static final String NS_WORK = "work";
|
||||||
|
private static final String NS_WORK_URL = "http://www.orcid.org/ns/work";
|
||||||
|
|
||||||
|
private static final String NS_ERROR = "error";
|
||||||
|
|
||||||
|
public static WorkDataNoDoi VTDParseWorkData(byte[] bytes)
|
||||||
|
throws VtdException, EncodingException, EOFException, EntityException, ParseException, XPathParseException,
|
||||||
|
NavException, XPathEvalException {
|
||||||
|
final VTDGen vg = new VTDGen();
|
||||||
|
vg.setDoc(bytes);
|
||||||
|
vg.parse(true);
|
||||||
|
final VTDNav vn = vg.getNav();
|
||||||
|
final AutoPilot ap = new AutoPilot(vn);
|
||||||
|
ap.declareXPathNameSpace(NS_COMMON, NS_COMMON_URL);
|
||||||
|
ap.declareXPathNameSpace(NS_WORK, NS_WORK_URL);
|
||||||
|
ap.declareXPathNameSpace(NS_ERROR, NS_ERROR_URL);
|
||||||
|
|
||||||
|
WorkDataNoDoi workData = new WorkDataNoDoi();
|
||||||
|
final List<String> errors = VtdUtilityParser.getTextValue(ap, vn, "//error:response-code");
|
||||||
|
if (!errors.isEmpty()) {
|
||||||
|
workData.setErrorCode(errors.get(0));
|
||||||
|
return workData;
|
||||||
|
}
|
||||||
|
|
||||||
|
List<VtdUtilityParser.Node> workNodes = VtdUtilityParser
|
||||||
|
.getTextValuesWithAttributes(ap, vn, "//work:work", Arrays.asList("path", "put-code"));
|
||||||
|
if (!workNodes.isEmpty()) {
|
||||||
|
final String oid = (workNodes.get(0).getAttributes().get("path")).split("/")[1];
|
||||||
|
workData.setOid(oid);
|
||||||
|
final String id = (workNodes.get(0).getAttributes().get("put-code"));
|
||||||
|
workData.setId(id);
|
||||||
|
} else {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
final List<String> titles = VtdUtilityParser
|
||||||
|
.getTextValue(
|
||||||
|
ap, vn, "//common:title");
|
||||||
|
if (!titles.isEmpty()) {
|
||||||
|
workData.setTitles(titles);
|
||||||
|
}
|
||||||
|
|
||||||
|
final List<String> sourceNames = VtdUtilityParser
|
||||||
|
.getTextValue(
|
||||||
|
ap, vn, "//common:source-name");
|
||||||
|
if (!sourceNames.isEmpty()) {
|
||||||
|
workData.setSourceName(sourceNames.get(0));
|
||||||
|
}
|
||||||
|
|
||||||
|
final List<String> types = VtdUtilityParser
|
||||||
|
.getTextValue(
|
||||||
|
ap, vn, "//work:type");
|
||||||
|
if (!types.isEmpty()) {
|
||||||
|
workData.setType(types.get(0));
|
||||||
|
}
|
||||||
|
|
||||||
|
final List<String> urls = VtdUtilityParser
|
||||||
|
.getTextValue(
|
||||||
|
ap, vn, "//common:url");
|
||||||
|
if (!urls.isEmpty()) {
|
||||||
|
workData.setUrls(urls);
|
||||||
|
}
|
||||||
|
|
||||||
|
workData.setPublicationDates(getPublicationDates(vg, vn, ap));
|
||||||
|
workData.setExtIds(getExternalIds(vg, vn, ap));
|
||||||
|
workData.setContributors(getContributors(vg, vn, ap));
|
||||||
|
return workData;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private static List<PublicationDate> getPublicationDates(VTDGen vg, VTDNav vn, AutoPilot ap)
|
||||||
|
throws XPathParseException, NavException, XPathEvalException {
|
||||||
|
List<PublicationDate> publicationDates = new ArrayList<PublicationDate>();
|
||||||
|
int yearIndex = 0;
|
||||||
|
ap.selectXPath("//common:publication-date/common:year");
|
||||||
|
while (ap.evalXPath() != -1) {
|
||||||
|
PublicationDate publicationDate = new PublicationDate();
|
||||||
|
int t = vn.getText();
|
||||||
|
if (t >= 0) {
|
||||||
|
publicationDate.setYear(vn.toNormalizedString(t));
|
||||||
|
publicationDates.add(yearIndex, publicationDate);
|
||||||
|
yearIndex++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
int monthIndex = 0;
|
||||||
|
ap.selectXPath("//common:publication-date/common:month");
|
||||||
|
while (ap.evalXPath() != -1) {
|
||||||
|
int t = vn.getText();
|
||||||
|
if (t >= 0) {
|
||||||
|
publicationDates.get(monthIndex).setMonth(vn.toNormalizedString(t));
|
||||||
|
monthIndex++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
int dayIndex = 0;
|
||||||
|
ap.selectXPath("//common:publication-date/common:day");
|
||||||
|
while (ap.evalXPath() != -1) {
|
||||||
|
int t = vn.getText();
|
||||||
|
if (t >= 0) {
|
||||||
|
publicationDates.get(dayIndex).setDay(vn.toNormalizedString(t));
|
||||||
|
dayIndex++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return publicationDates;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static List<ExternalId> getExternalIds(VTDGen vg, VTDNav vn, AutoPilot ap)
|
||||||
|
throws XPathParseException, NavException, XPathEvalException {
|
||||||
|
List<ExternalId> extIds = new ArrayList<ExternalId>();
|
||||||
|
int typeIndex = 0;
|
||||||
|
ap.selectXPath("//common:external-id/common:external-id-type");
|
||||||
|
while (ap.evalXPath() != -1) {
|
||||||
|
ExternalId extId = new ExternalId();
|
||||||
|
int t = vn.getText();
|
||||||
|
if (t >= 0) {
|
||||||
|
extId.setType(vn.toNormalizedString(t));
|
||||||
|
extIds.add(typeIndex, extId);
|
||||||
|
typeIndex++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
int valueIndex = 0;
|
||||||
|
ap.selectXPath("//common:external-id/common:external-id-value");
|
||||||
|
while (ap.evalXPath() != -1) {
|
||||||
|
int t = vn.getText();
|
||||||
|
if (t >= 0) {
|
||||||
|
extIds.get(valueIndex).setValue(vn.toNormalizedString(t));
|
||||||
|
valueIndex++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
int relationshipIndex = 0;
|
||||||
|
ap.selectXPath("//common:external-id/common:external-id-relationship");
|
||||||
|
while (ap.evalXPath() != -1) {
|
||||||
|
int t = vn.getText();
|
||||||
|
if (t >= 0) {
|
||||||
|
extIds.get(relationshipIndex).setRelationShip(vn.toNormalizedString(t));
|
||||||
|
relationshipIndex++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (typeIndex == valueIndex) {
|
||||||
|
return extIds;
|
||||||
|
}
|
||||||
|
return new ArrayList<ExternalId>();
|
||||||
|
}
|
||||||
|
|
||||||
|
private static List<Contributor> getContributors(VTDGen vg, VTDNav vn, AutoPilot ap)
|
||||||
|
throws XPathParseException, NavException, XPathEvalException {
|
||||||
|
List<Contributor> contributors = new ArrayList<Contributor>();
|
||||||
|
ap.selectXPath("//work:contributors/work:contributor");
|
||||||
|
while (ap.evalXPath() != -1) {
|
||||||
|
Contributor contributor = new Contributor();
|
||||||
|
if (vn.toElement(VTDNav.FIRST_CHILD, "work:credit-name")) {
|
||||||
|
int val = vn.getText();
|
||||||
|
if (val != -1) {
|
||||||
|
contributor.setCreditName(vn.toNormalizedString(val));
|
||||||
|
}
|
||||||
|
vn.toElement(VTDNav.PARENT);
|
||||||
|
}
|
||||||
|
if (vn.toElement(VTDNav.FIRST_CHILD, "work:contributor-attributes")) {
|
||||||
|
if (vn.toElement(VTDNav.FIRST_CHILD, "work:contributor-sequence")) {
|
||||||
|
int val = vn.getText();
|
||||||
|
if (val != -1) {
|
||||||
|
contributor.setSequence(vn.toNormalizedString(val));
|
||||||
|
}
|
||||||
|
vn.toElement(VTDNav.PARENT);
|
||||||
|
}
|
||||||
|
if (vn.toElement(VTDNav.FIRST_CHILD, "work:contributor-role")) {
|
||||||
|
int val = vn.getText();
|
||||||
|
if (val != -1) {
|
||||||
|
contributor.setRole(vn.toNormalizedString(val));
|
||||||
|
}
|
||||||
|
vn.toElement(VTDNav.PARENT);
|
||||||
|
}
|
||||||
|
vn.toElement(VTDNav.PARENT);
|
||||||
|
}
|
||||||
|
contributors.add(contributor);
|
||||||
|
}
|
||||||
|
return contributors;
|
||||||
|
}
|
||||||
|
}
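
A minimal sketch of how the parser might be driven. The file name is a made-up example (the real bytes come from the entries of the ORCID activities tar.gz), java.nio.file imports are assumed, and the checked exceptions thrown by Files.readAllBytes and VTDParseWorkData are not handled here.

	byte[] xml = Files.readAllBytes(Paths.get("0000-0002-1825-0097_works_12345678.xml"));
	WorkDataNoDoi work = XMLRecordParserNoDoi.VTDParseWorkData(xml);
	// VTDParseWorkData returns null when no work:work node is found.
	if (work != null && work.getTitles() != null && !work.getTitles().isEmpty()) {
		System.out.println(work.getType() + " : " + work.getTitles().get(0));
	}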
|
|
@ -1,6 +1,6 @@
|
||||||
[
|
[
|
||||||
{"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true},
|
{"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true},
|
||||||
{"paramName":"d", "paramLongName":"hdfsOrcidDefaultPath", "paramDescription": "the default work path", "paramRequired": true},
|
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the default work path", "paramRequired": true},
|
||||||
{"paramName":"f", "paramLongName":"activitiesFileNameTarGz", "paramDescription": "the name of the activities orcid file", "paramRequired": true},
|
{"paramName":"f", "paramLongName":"activitiesFileNameTarGz", "paramDescription": "the name of the activities orcid file", "paramRequired": true},
|
||||||
{"paramName":"o", "paramLongName":"outputAuthorsDOIsPath", "paramDescription": "the relative folder of the sequencial file to write", "paramRequired": true}
|
{"paramName":"o", "paramLongName":"outputAuthorsDOIsPath", "paramDescription": "the relative folder of the sequencial file to write", "paramRequired": true}
|
||||||
]
|
]
|
|
@ -16,88 +16,86 @@
|
||||||
<name>sparkExecutorCores</name>
|
<name>sparkExecutorCores</name>
|
||||||
<description>number of cores used by single executor</description>
|
<description>number of cores used by single executor</description>
|
||||||
</property>
|
</property>
|
||||||
<!-- <property>-->
|
<property>
|
||||||
<!-- <name>timestamp</name>-->
|
<name>timestamp</name>
|
||||||
<!-- <description>Timestamp for incremental Harvesting</description>-->
|
<description>Timestamp for incremental Harvesting</description>
|
||||||
<!-- </property>-->
|
</property>
|
||||||
|
|
||||||
</parameters>
|
</parameters>
|
||||||
|
|
||||||
<start to="ExtractCrossrefToOAF"/>
|
<start to="ImportCrossRef"/>
|
||||||
|
|
||||||
|
|
||||||
<kill name="Kill">
|
<kill name="Kill">
|
||||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
</kill>
|
</kill>
|
||||||
|
|
||||||
<!-- <action name="ResetWorkingPath">-->
|
<action name="ImportCrossRef">
|
||||||
<!-- <fs>-->
|
<java>
|
||||||
<!-- <delete path='${workingPath}/input/crossref/index_dump'/>-->
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
<!--<!– <mkdir path='${workingPath}/input/crossref'/>–>-->
|
<name-node>${nameNode}</name-node>
|
||||||
<!-- </fs>-->
|
<main-class>eu.dnetlib.doiboost.crossref.CrossrefImporter</main-class>
|
||||||
<!-- <ok to="ImportCrossRef"/>-->
|
<arg>-t</arg><arg>${workingPath}/input/crossref/index_update</arg>
|
||||||
<!-- <error to="Kill"/>-->
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
<!-- </action>-->
|
<arg>-ts</arg><arg>${timestamp}</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="GenerateDataset"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="GenerateDataset">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
<master>yarn-cluster</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>ExtractCrossrefToOAF</name>
|
||||||
|
<class>eu.dnetlib.doiboost.crossref.CrossrefDataset</class>
|
||||||
|
<jar>dhp-doiboost-${projectVersion}.jar</jar>
|
||||||
|
<spark-opts>
|
||||||
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
--executor-cores=${sparkExecutorCores}
|
||||||
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.sql.shuffle.partitions=3840
|
||||||
|
${sparkExtraOPT}
|
||||||
|
</spark-opts>
|
||||||
|
<arg>--workingPath</arg><arg>/data/doiboost/input/crossref</arg>
|
||||||
|
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||||
|
</spark>
|
||||||
|
<ok to="RenameDataset"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="RenameDataset">
|
||||||
|
<fs>
|
||||||
|
<delete path='${workingPath}/input/crossref/crossref_ds'/>
|
||||||
|
<move source="${workingPath}/input/crossref/crossref_ds_updated"
|
||||||
|
target="${workingPath}/input/crossref/crossref_ds"/>
|
||||||
|
</fs>
|
||||||
|
<ok to="ConvertCrossrefToOAF"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
|
||||||
|
<action name="ConvertCrossrefToOAF">
|
||||||
<!-- <action name="ImportCrossRef">-->
|
|
||||||
<!-- <java>-->
|
|
||||||
<!-- <job-tracker>${jobTracker}</job-tracker>-->
|
|
||||||
<!-- <name-node>${nameNode}</name-node>-->
|
|
||||||
<!-- <main-class>eu.dnetlib.doiboost.crossref.CrossrefImporter</main-class>-->
|
|
||||||
<!-- <arg>-t</arg><arg>${workingPath}/input/crossref/index_dump_1</arg>-->
|
|
||||||
<!-- <arg>-n</arg><arg>${nameNode}</arg>-->
|
|
||||||
<!-- <arg>-ts</arg><arg>${timestamp}</arg>-->
|
|
||||||
<!-- </java>-->
|
|
||||||
<!-- <ok to="End"/>-->
|
|
||||||
<!-- <error to="Kill"/>-->
|
|
||||||
<!-- </action>-->
|
|
||||||
|
|
||||||
|
|
||||||
<action name="ExtractCrossrefToOAF">
|
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
<master>yarn-cluster</master>
|
<master>yarn-cluster</master>
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>ExtractCrossrefToOAF</name>
|
<name>ConvertCrossrefToOAF</name>
|
||||||
<class>eu.dnetlib.doiboost.crossref.SparkMapDumpIntoOAF</class>
|
<class>eu.dnetlib.doiboost.crossref.SparkMapDumpIntoOAF</class>
|
||||||
<jar>dhp-doiboost-${projectVersion}.jar</jar>
|
<jar>dhp-doiboost-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--driver-memory=${sparkDriverMemory}
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.sql.shuffle.partitions=3840
|
||||||
${sparkExtraOPT}
|
${sparkExtraOPT}
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--sourcePath</arg><arg>${workingPath}/input/crossref/crossref_ds</arg>
|
<arg>--sourcePath</arg><arg>${workingPath}/input/crossref/crossref_ds</arg>
|
||||||
<arg>--targetPath</arg><arg>${workingPath}/input/crossref</arg>
|
<arg>--targetPath</arg><arg>${workingPath}/process/</arg>
|
||||||
<arg>--master</arg><arg>yarn-cluster</arg>
|
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="End"/>
|
<ok to="End"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
<!-- <action name="GenerateDataset">-->
|
|
||||||
<!-- <spark xmlns="uri:oozie:spark-action:0.2">-->
|
|
||||||
<!-- <master>yarn-cluster</master>-->
|
|
||||||
<!-- <mode>cluster</mode>-->
|
|
||||||
<!-- <name>ExtractCrossrefToOAF</name>-->
|
|
||||||
<!-- <class>eu.dnetlib.doiboost.crossref.CrossrefDataset</class>-->
|
|
||||||
<!-- <jar>dhp-doiboost-${projectVersion}.jar</jar>-->
|
|
||||||
<!-- <spark-opts>-->
|
|
||||||
<!-- --executor-memory=${sparkExecutorMemory}-->
|
|
||||||
<!-- --executor-cores=${sparkExecutorCores}-->
|
|
||||||
<!-- --driver-memory=${sparkDriverMemory}-->
|
|
||||||
<!-- ${sparkExtraOPT}-->
|
|
||||||
<!-- </spark-opts>-->
|
|
||||||
<!-- <arg>--sourcePath</arg><arg>/data/doiboost/crossref/cr_dataset</arg>-->
|
|
||||||
<!-- <arg>--targetPath</arg><arg>/data/doiboost/crossref/crossrefDataset</arg>-->
|
|
||||||
<!-- <arg>--master</arg><arg>yarn-cluster</arg>-->
|
|
||||||
<!-- </spark>-->
|
|
||||||
<!-- <ok to="End"/>-->
|
|
||||||
<!-- <error to="Kill"/>-->
|
|
||||||
<!-- </action>-->
|
|
||||||
|
|
||||||
<end name="End"/>
|
<end name="End"/>
|
||||||
</workflow-app>
|
</workflow-app>
|
|
@ -1,6 +1,5 @@
|
||||||
[
|
[
|
||||||
{"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the path of the sequencial file to read", "paramRequired": true},
|
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the working dir path", "paramRequired": true},
|
||||||
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the working dir path", "paramRequired": true},
|
|
||||||
{"paramName":"m", "paramLongName":"master", "paramDescription": "the master name", "paramRequired": true}
|
{"paramName":"m", "paramLongName":"master", "paramDescription": "the master name", "paramRequired": true}
|
||||||
|
|
||||||
]
|
]
|
|
@ -1,6 +1,6 @@
|
||||||
[
|
[
|
||||||
{"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true},
|
{"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true},
|
||||||
{"paramName":"d", "paramLongName":"hdfsOrcidDefaultPath", "paramDescription": "the default work path", "paramRequired": true},
|
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the default work path", "paramRequired": true},
|
||||||
{"paramName":"f", "paramLongName":"lambdaFileName", "paramDescription": "the name of the lambda file", "paramRequired": true},
|
{"paramName":"f", "paramLongName":"lambdaFileName", "paramDescription": "the name of the lambda file", "paramRequired": true},
|
||||||
{"paramName":"o", "paramLongName":"outputPath", "paramDescription": "the relative folder of the sequencial file to write", "paramRequired": true},
|
{"paramName":"o", "paramLongName":"outputPath", "paramDescription": "the relative folder of the sequencial file to write", "paramRequired": true},
|
||||||
{"paramName":"t", "paramLongName":"token", "paramDescription": "token to grant access", "paramRequired": true}
|
{"paramName":"t", "paramLongName":"token", "paramDescription": "token to grant access", "paramRequired": true}
|
||||||
|
|
|
@ -0,0 +1,7 @@
|
||||||
|
[
|
||||||
|
{"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true},
|
||||||
|
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the default work path", "paramRequired": true},
|
||||||
|
{"paramName":"f", "paramLongName":"activitiesFileNameTarGz", "paramDescription": "the name of the activities orcid file", "paramRequired": true},
|
||||||
|
{"paramName":"ow", "paramLongName":"outputWorksPath", "paramDescription": "the relative folder of the sequencial file to write", "paramRequired": true},
|
||||||
|
{"paramName":"oew", "paramLongName":"outputEnrichedWorksPath", "paramDescription": "the relative folder of the sequencial file to write the data", "paramRequired": true}
|
||||||
|
]
|
|
@ -1,6 +1,6 @@
|
||||||
[
|
[
|
||||||
{"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true},
|
{"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true},
|
||||||
{"paramName":"d", "paramLongName":"hdfsOrcidDefaultPath", "paramDescription": "the default work path", "paramRequired": true},
|
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the default work path", "paramRequired": true},
|
||||||
{"paramName":"f", "paramLongName":"summariesFileNameTarGz", "paramDescription": "the name of the summaries orcid file", "paramRequired": true},
|
{"paramName":"f", "paramLongName":"summariesFileNameTarGz", "paramDescription": "the name of the summaries orcid file", "paramRequired": true},
|
||||||
{"paramName":"o", "paramLongName":"outputAuthorsPath", "paramDescription": "the relative folder of the sequencial file to write", "paramRequired": true}
|
{"paramName":"o", "paramLongName":"outputAuthorsPath", "paramDescription": "the relative folder of the sequencial file to write", "paramRequired": true}
|
||||||
]
|
]
|
|
@ -1,4 +0,0 @@
|
||||||
[{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the working path", "paramRequired": true},
|
|
||||||
{"paramName":"t", "paramLongName":"token", "paramDescription": "token to grant access", "paramRequired": true},
|
|
||||||
{"paramName":"o", "paramLongName":"outputAuthorsPath", "paramDescription": "the relative folder of the sequencial file to write the authors data", "paramRequired": true}
|
|
||||||
]
|
|
|
@ -0,0 +1,7 @@
|
||||||
|
[
|
||||||
|
{"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true},
|
||||||
|
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the default work path", "paramRequired": true},
|
||||||
|
{"paramName":"f", "paramLongName":"activitiesFileNameTarGz", "paramDescription": "the name of the activities orcid file", "paramRequired": true},
|
||||||
|
{"paramName":"ow", "paramLongName":"outputWorksPath", "paramDescription": "the relative folder of the sequencial file to write", "paramRequired": true},
|
||||||
|
{"paramName":"oew", "paramLongName":"outputEnrichedWorksPath", "paramDescription": "the relative folder of the sequencial file to write the data", "paramRequired": true}
|
||||||
|
]
|
|
@ -39,14 +39,7 @@
|
||||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
</kill>
|
</kill>
|
||||||
|
|
||||||
<action name="ResetWorkingPath">
|
|
||||||
<fs>
|
|
||||||
<delete path='${workingDirPath}'/>
|
|
||||||
<mkdir path='${workingDirPath}'/>
|
|
||||||
</fs>
|
|
||||||
<ok to="CreateDOIBoost"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="CreateDOIBoost">
|
<action name="CreateDOIBoost">
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
|
|
@ -8,6 +8,10 @@
|
||||||
<name>targetPath</name>
|
<name>targetPath</name>
|
||||||
<description>the working dir base path</description>
|
<description>the working dir base path</description>
|
||||||
</property>
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>workingPath</name>
|
||||||
|
<description>the working dir base path</description>
|
||||||
|
</property>
|
||||||
<property>
|
<property>
|
||||||
<name>sparkDriverMemory</name>
|
<name>sparkDriverMemory</name>
|
||||||
<description>memory for driver process</description>
|
<description>memory for driver process</description>
|
||||||
|
@ -31,10 +35,10 @@
|
||||||
|
|
||||||
<action name="ResetWorkingPath">
|
<action name="ResetWorkingPath">
|
||||||
<fs>
|
<fs>
|
||||||
<delete path='${targetPath}'/>
|
<delete path='${workingPath}'/>
|
||||||
<mkdir path='${targetPath}'/>
|
<mkdir path='${workingPath}'/>
|
||||||
</fs>
|
</fs>
|
||||||
<ok to="PreprocessMag"/>
|
<ok to="ConvertMagToDataset"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
|
@ -52,10 +56,10 @@
|
||||||
${sparkExtraOPT}
|
${sparkExtraOPT}
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||||
<arg>--targetPath</arg><arg>${targetPath}</arg>
|
<arg>--targetPath</arg><arg>${workingPath}</arg>
|
||||||
<arg>--master</arg><arg>yarn-cluster</arg>
|
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="End"/>
|
<ok to="PreprocessMag"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
|
@ -65,7 +69,7 @@
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
<master>yarn-cluster</master>
|
<master>yarn-cluster</master>
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>Convert Mag to Dataset</name>
|
<name>Convert Mag to OAF Dataset</name>
|
||||||
<class>eu.dnetlib.doiboost.mag.SparkPreProcessMAG</class>
|
<class>eu.dnetlib.doiboost.mag.SparkPreProcessMAG</class>
|
||||||
<jar>dhp-doiboost-${projectVersion}.jar</jar>
|
<jar>dhp-doiboost-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
|
@ -75,7 +79,8 @@
|
||||||
--conf spark.sql.shuffle.partitions=3840
|
--conf spark.sql.shuffle.partitions=3840
|
||||||
${sparkExtraOPT}
|
${sparkExtraOPT}
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
<arg>--sourcePath</arg><arg>${workingPath}</arg>
|
||||||
|
<arg>--workingPath</arg><arg>${workingPath}/process</arg>
|
||||||
<arg>--targetPath</arg><arg>${targetPath}</arg>
|
<arg>--targetPath</arg><arg>${targetPath}</arg>
|
||||||
<arg>--master</arg><arg>yarn-cluster</arg>
|
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||||
</spark>
|
</spark>
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
[
|
[
|
||||||
{"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the base path of MAG input", "paramRequired": true},
|
{"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the base path of MAG input", "paramRequired": true},
|
||||||
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the working dir path", "paramRequired": true},
|
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the target dir path", "paramRequired": true},
|
||||||
|
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the working dir path", "paramRequired": true},
|
||||||
{"paramName":"m", "paramLongName":"master", "paramDescription": "the master name", "paramRequired": true}
|
{"paramName":"m", "paramLongName":"master", "paramDescription": "the master name", "paramRequired": true}
|
||||||
|
|
||||||
]
|
]
|
|
@ -1,42 +0,0 @@
|
||||||
<configuration>
|
|
||||||
<property>
|
|
||||||
<name>jobTracker</name>
|
|
||||||
<value>hadoop-rm3.garr-pa1.d4science.org:8032</value>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>nameNode</name>
|
|
||||||
<value>hdfs://hadoop-rm1.garr-pa1.d4science.org:8020</value>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>oozie.use.system.libpath</name>
|
|
||||||
<value>true</value>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>oozie.action.sharelib.for.spark</name>
|
|
||||||
<value>spark2</value>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
|
||||||
<value>true</value>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>hive_metastore_uris</name>
|
|
||||||
<value>thrift://hadoop-edge2.garr-pa1.d4science.org:9083</value>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>spark2YarnHistoryServerAddress</name>
|
|
||||||
<value>http://hadoop-edge1.garr-pa1.d4science.org:18089/</value>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>spark2EventLogDir</name>
|
|
||||||
<value>/user/spark/spark2ApplicationHistory</value>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>spark2ExtraListeners</name>
|
|
||||||
<value>"com.cloudera.spark.lineage.NavigatorAppListener"</value>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>spark2SqlQueryExecutionListeners</name>
|
|
||||||
<value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value>
|
|
||||||
</property>
|
|
||||||
</configuration>
|
|
|
@ -0,0 +1,31 @@
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>oozie.action.sharelib.for.java</name>
|
||||||
|
<value>spark2</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||||
|
<value>true</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.launcher.mapreduce.map.java.opts</name>
|
||||||
|
<value>-Xmx2g</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>jobTracker</name>
|
||||||
|
<value>yarnRM</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>nameNode</name>
|
||||||
|
<value>hdfs://nameservice1</value>
|
||||||
|
</property>
|
||||||
|
|
||||||
|
<property>
|
||||||
|
<name>oozie.use.system.libpath</name>
|
||||||
|
<value>true</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.action.sharelib.for.spark</name>
|
||||||
|
<value>spark2</value>
|
||||||
|
</property>
|
||||||
|
</configuration>
|
|
@ -0,0 +1,542 @@
|
||||||
|
<workflow-app name="Gen Orcid Works-no-doi From Activities" xmlns="uri:oozie:workflow:0.5">
|
||||||
|
<parameters>
|
||||||
|
<property>
|
||||||
|
<name>workingPath</name>
|
||||||
|
<description>the working dir base path</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>shell_cmd_0</name>
|
||||||
|
<value>wget -O /tmp/ORCID_2020_10_activites_0.tar.gz https://orcid.figshare.com/ndownloader/files/25002232 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_0.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_0.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_0.tar.gz
|
||||||
|
</value>
|
||||||
|
<description>the shell command that downloads and puts to hdfs orcid activity file 0</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>shell_cmd_1</name>
|
||||||
|
<value>wget -O /tmp/ORCID_2020_10_activites_1.tar.gz https://orcid.figshare.com/ndownloader/files/25002088 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_1.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_1.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_1.tar.gz
|
||||||
|
</value>
|
||||||
|
<description>the shell command that downloads and puts to hdfs orcid activity file 1</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>shell_cmd_2</name>
|
||||||
|
<value>wget -O /tmp/ORCID_2020_10_activites_2.tar.gz https://orcid.figshare.com/ndownloader/files/25000596 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_2.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_2.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_2.tar.gz
|
||||||
|
</value>
|
||||||
|
<description>the shell command that downloads and puts to hdfs orcid activity file 2</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>shell_cmd_3</name>
|
||||||
|
<value>wget -O /tmp/ORCID_2020_10_activites_3.tar.gz https://orcid.figshare.com/ndownloader/files/25015150 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_3.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_3.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_3.tar.gz
|
||||||
|
</value>
|
||||||
|
<description>the shell command that downloads and puts to hdfs orcid activity file 3</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>shell_cmd_4</name>
|
||||||
|
<value>wget -O /tmp/ORCID_2020_10_activites_4.tar.gz https://orcid.figshare.com/ndownloader/files/25033643 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_4.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_4.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_4.tar.gz
|
||||||
|
</value>
|
||||||
|
<description>the shell command that downloads and puts to hdfs orcid activity file 4</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>shell_cmd_5</name>
|
||||||
|
<value>wget -O /tmp/ORCID_2020_10_activites_5.tar.gz https://orcid.figshare.com/ndownloader/files/25005483 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_5.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_5.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_5.tar.gz
|
||||||
|
</value>
|
||||||
|
<description>the shell command that downloads and puts to hdfs orcid activity file 5</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>shell_cmd_6</name>
|
||||||
|
<value>wget -O /tmp/ORCID_2020_10_activites_6.tar.gz https://orcid.figshare.com/ndownloader/files/25005425 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_6.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_6.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_6.tar.gz
|
||||||
|
</value>
|
||||||
|
<description>the shell command that downloads and puts to hdfs orcid activity file 6</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>shell_cmd_7</name>
|
||||||
|
<value>wget -O /tmp/ORCID_2020_10_activites_7.tar.gz https://orcid.figshare.com/ndownloader/files/25012016 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_7.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_7.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_7.tar.gz
|
||||||
|
</value>
|
||||||
|
<description>the shell command that downloads and puts to hdfs orcid activity file 7</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>shell_cmd_8</name>
|
||||||
|
<value>wget -O /tmp/ORCID_2020_10_activites_8.tar.gz https://orcid.figshare.com/ndownloader/files/25012079 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_8.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_8.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_8.tar.gz
|
||||||
|
</value>
|
||||||
|
<description>the shell command that downloads and puts to hdfs orcid activity file 8</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>shell_cmd_9</name>
|
||||||
|
<value>wget -O /tmp/ORCID_2020_10_activites_9.tar.gz https://orcid.figshare.com/ndownloader/files/25010727 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_9.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_9.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_9.tar.gz
|
||||||
|
</value>
|
||||||
|
<description>the shell command that downloads and puts to hdfs orcid activity file 9</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>shell_cmd_X</name>
|
||||||
|
<value>wget -O /tmp/ORCID_2020_10_activites_X.tar.gz https://orcid.figshare.com/ndownloader/files/25011025 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_activites_X.tar.gz /data/orcid_activities_2020/ORCID_2020_10_activites_X.tar.gz ; rm -f /tmp/ORCID_2020_10_activites_X.tar.gz
|
||||||
|
</value>
|
||||||
|
<description>the shell command that downloads and puts to hdfs orcid activity file X</description>
|
||||||
|
</property>
|
||||||
|
</parameters>
|
||||||
|
|
||||||
|
<start to="ResetWorkingPath"/>
|
||||||
|
|
||||||
|
<kill name="Kill">
|
||||||
|
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
|
</kill>
|
||||||
|
|
||||||
|
<action name="ResetWorkingPath">
|
||||||
|
<fs>
|
||||||
|
<delete path='${workingPath}/no_doi_works/*'/>
|
||||||
|
</fs>
|
||||||
|
<ok to="fork_check_download_files"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<fork name = "fork_check_download_files">
|
||||||
|
<path start = "check_exist_on_hdfs_activities_0"/>
|
||||||
|
<path start = "check_exist_on_hdfs_activities_1"/>
|
||||||
|
<path start = "check_exist_on_hdfs_activities_2"/>
|
||||||
|
<path start = "check_exist_on_hdfs_activities_3"/>
|
||||||
|
<path start = "check_exist_on_hdfs_activities_4"/>
|
||||||
|
<path start = "check_exist_on_hdfs_activities_5"/>
|
||||||
|
<path start = "check_exist_on_hdfs_activities_6"/>
|
||||||
|
<path start = "check_exist_on_hdfs_activities_7"/>
|
||||||
|
<path start = "check_exist_on_hdfs_activities_8"/>
|
||||||
|
<path start = "check_exist_on_hdfs_activities_9"/>
|
||||||
|
<path start = "check_exist_on_hdfs_activities_X"/>
|
||||||
|
</fork>
|
||||||
|
|
||||||
|
<decision name="check_exist_on_hdfs_activities_0">
|
||||||
|
<switch>
|
||||||
|
<case to="wait_download_phase_node">
|
||||||
|
${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_0.tar.gz'))}
|
||||||
|
</case>
|
||||||
|
<default to="Download_0" />
|
||||||
|
</switch>
|
||||||
|
</decision>
|
||||||
|
|
||||||
|
<action name="Download_0">
|
||||||
|
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<exec>bash</exec>
|
||||||
|
<argument>-c</argument>
|
||||||
|
<argument>${shell_cmd_0}</argument>
|
||||||
|
<capture-output/>
|
||||||
|
</shell>
|
||||||
|
<ok to="wait_download_phase_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="GenOrcidAuthorWork_0">
|
||||||
|
<java>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||||
|
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>ORCID_2020_10_activites_0.tar.gz</arg>
|
||||||
|
<arg>-ow</arg><arg>no_doi_works/works_0.seq</arg>
|
||||||
|
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="join_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<decision name="check_exist_on_hdfs_activities_1">
|
||||||
|
<switch>
|
||||||
|
<case to="wait_download_phase_node">
|
||||||
|
${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_1.tar.gz'))}
|
||||||
|
</case>
|
||||||
|
<default to="Download_1" />
|
||||||
|
</switch>
|
||||||
|
</decision>
|
||||||
|
|
||||||
|
<action name="Download_1">
|
||||||
|
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<exec>bash</exec>
|
||||||
|
<argument>-c</argument>
|
||||||
|
<argument>${shell_cmd_1}</argument>
|
||||||
|
<capture-output/>
|
||||||
|
</shell>
|
||||||
|
<ok to="wait_download_phase_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="GenOrcidAuthorWork_1">
|
||||||
|
<java>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||||
|
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>ORCID_2020_10_activites_1.tar.gz</arg>
|
||||||
|
<arg>-ow</arg><arg>no_doi_works/works_1.seq</arg>
|
||||||
|
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="join_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<decision name="check_exist_on_hdfs_activities_2">
|
||||||
|
<switch>
|
||||||
|
<case to="wait_download_phase_node">
|
||||||
|
${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_2.tar.gz'))}
|
||||||
|
</case>
|
||||||
|
<default to="Download_2" />
|
||||||
|
</switch>
|
||||||
|
</decision>
|
||||||
|
|
||||||
|
<action name="Download_2">
|
||||||
|
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<exec>bash</exec>
|
||||||
|
<argument>-c</argument>
|
||||||
|
<argument>${shell_cmd_2}</argument>
|
||||||
|
<capture-output/>
|
||||||
|
</shell>
|
||||||
|
<ok to="wait_download_phase_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="GenOrcidAuthorWork_2">
|
||||||
|
<java>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||||
|
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>ORCID_2020_10_activites_2.tar.gz</arg>
|
||||||
|
<arg>-ow</arg><arg>no_doi_works/works_2.seq</arg>
|
||||||
|
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="join_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<decision name="check_exist_on_hdfs_activities_3">
|
||||||
|
<switch>
|
||||||
|
<case to="wait_download_phase_node">
|
||||||
|
${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_3.tar.gz'))}
|
||||||
|
</case>
|
||||||
|
<default to="Download_3" />
|
||||||
|
</switch>
|
||||||
|
</decision>
|
||||||
|
|
||||||
|
<action name="Download_3">
|
||||||
|
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<exec>bash</exec>
|
||||||
|
<argument>-c</argument>
|
||||||
|
<argument>${shell_cmd_3}</argument>
|
||||||
|
<capture-output/>
|
||||||
|
</shell>
|
||||||
|
<ok to="wait_download_phase_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="GenOrcidAuthorWork_3">
|
||||||
|
<java>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||||
|
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>ORCID_2020_10_activites_3.tar.gz</arg>
|
||||||
|
<arg>-ow</arg><arg>no_doi_works/works_3.seq</arg>
|
||||||
|
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="join_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<decision name="check_exist_on_hdfs_activities_4">
|
||||||
|
<switch>
|
||||||
|
<case to="wait_download_phase_node">
|
||||||
|
${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_4.tar.gz'))}
|
||||||
|
</case>
|
||||||
|
<default to="Download_4" />
|
||||||
|
</switch>
|
||||||
|
</decision>
|
||||||
|
|
||||||
|
<action name="Download_4">
|
||||||
|
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<exec>bash</exec>
|
||||||
|
<argument>-c</argument>
|
||||||
|
<argument>${shell_cmd_4}</argument>
|
||||||
|
<capture-output/>
|
||||||
|
</shell>
|
||||||
|
<ok to="wait_download_phase_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="GenOrcidAuthorWork_4">
|
||||||
|
<java>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||||
|
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>ORCID_2020_10_activites_4.tar.gz</arg>
|
||||||
|
<arg>-ow</arg><arg>no_doi_works/works_4.seq</arg>
|
||||||
|
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="join_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<decision name="check_exist_on_hdfs_activities_5">
|
||||||
|
<switch>
|
||||||
|
<case to="wait_download_phase_node">
|
||||||
|
${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_5.tar.gz'))}
|
||||||
|
</case>
|
||||||
|
<default to="Download_5" />
|
||||||
|
</switch>
|
||||||
|
</decision>
|
||||||
|
|
||||||
|
<action name="Download_5">
|
||||||
|
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<exec>bash</exec>
|
||||||
|
<argument>-c</argument>
|
||||||
|
<argument>${shell_cmd_5}</argument>
|
||||||
|
<capture-output/>
|
||||||
|
</shell>
|
||||||
|
<ok to="wait_download_phase_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="GenOrcidAuthorWork_5">
|
||||||
|
<java>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||||
|
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>ORCID_2020_10_activites_5.tar.gz</arg>
|
||||||
|
<arg>-ow</arg><arg>no_doi_works/works_5.seq</arg>
|
||||||
|
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="join_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<decision name="check_exist_on_hdfs_activities_6">
|
||||||
|
<switch>
|
||||||
|
<case to="wait_download_phase_node">
|
||||||
|
${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_6.tar.gz'))}
|
||||||
|
</case>
|
||||||
|
<default to="Download_6" />
|
||||||
|
</switch>
|
||||||
|
</decision>
|
||||||
|
|
||||||
|
<action name="Download_6">
|
||||||
|
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<exec>bash</exec>
|
||||||
|
<argument>-c</argument>
|
||||||
|
<argument>${shell_cmd_6}</argument>
|
||||||
|
<capture-output/>
|
||||||
|
</shell>
|
||||||
|
<ok to="wait_download_phase_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="GenOrcidAuthorWork_6">
|
||||||
|
<java>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||||
|
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>ORCID_2020_10_activites_6.tar.gz</arg>
|
||||||
|
<arg>-ow</arg><arg>no_doi_works/works_6.seq</arg>
|
||||||
|
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="join_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
|
||||||
|
<decision name="check_exist_on_hdfs_activities_7">
|
||||||
|
<switch>
|
||||||
|
<case to="wait_download_phase_node">
|
||||||
|
${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_7.tar.gz'))}
|
||||||
|
</case>
|
||||||
|
<default to="Download_7" />
|
||||||
|
</switch>
|
||||||
|
</decision>
|
||||||
|
|
||||||
|
<action name="Download_7">
|
||||||
|
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<exec>bash</exec>
|
||||||
|
<argument>-c</argument>
|
||||||
|
<argument>${shell_cmd_7}</argument>
|
||||||
|
<capture-output/>
|
||||||
|
</shell>
|
||||||
|
<ok to="wait_download_phase_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="GenOrcidAuthorWork_7">
|
||||||
|
<java>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||||
|
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>ORCID_2020_10_activites_7.tar.gz</arg>
|
||||||
|
<arg>-ow</arg><arg>no_doi_works/works_7.seq</arg>
|
||||||
|
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="join_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<decision name="check_exist_on_hdfs_activities_8">
|
||||||
|
<switch>
|
||||||
|
<case to="wait_download_phase_node">
|
||||||
|
${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_8.tar.gz'))}
|
||||||
|
</case>
|
||||||
|
<default to="Download_8" />
|
||||||
|
</switch>
|
||||||
|
</decision>
|
||||||
|
|
||||||
|
<action name="Download_8">
|
||||||
|
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<exec>bash</exec>
|
||||||
|
<argument>-c</argument>
|
||||||
|
<argument>${shell_cmd_8}</argument>
|
||||||
|
<capture-output/>
|
||||||
|
</shell>
|
||||||
|
<ok to="wait_download_phase_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="GenOrcidAuthorWork_8">
|
||||||
|
<java>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||||
|
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>ORCID_2020_10_activites_8.tar.gz</arg>
|
||||||
|
<arg>-ow</arg><arg>no_doi_works/works_8.seq</arg>
|
||||||
|
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="join_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<decision name="check_exist_on_hdfs_activities_9">
|
||||||
|
<switch>
|
||||||
|
<case to="wait_download_phase_node">
|
||||||
|
${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_9.tar.gz'))}
|
||||||
|
</case>
|
||||||
|
<default to="Download_9" />
|
||||||
|
</switch>
|
||||||
|
</decision>
|
||||||
|
|
||||||
|
<action name="Download_9">
|
||||||
|
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<exec>bash</exec>
|
||||||
|
<argument>-c</argument>
|
||||||
|
<argument>${shell_cmd_9}</argument>
|
||||||
|
<capture-output/>
|
||||||
|
</shell>
|
||||||
|
<ok to="wait_download_phase_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="GenOrcidAuthorWork_9">
|
||||||
|
<java>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||||
|
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>ORCID_2020_10_activites_9.tar.gz</arg>
|
||||||
|
<arg>-ow</arg><arg>no_doi_works/works_9.seq</arg>
|
||||||
|
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="join_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<decision name="check_exist_on_hdfs_activities_X">
|
||||||
|
<switch>
|
||||||
|
<case to="wait_download_phase_node">
|
||||||
|
${fs:exists(concat(workingPath,'/ORCID_2020_10_activites_X.tar.gz'))}
|
||||||
|
</case>
|
||||||
|
<default to="Download_X" />
|
||||||
|
</switch>
|
||||||
|
</decision>
|
||||||
|
|
||||||
|
<action name="Download_X">
|
||||||
|
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<exec>bash</exec>
|
||||||
|
<argument>-c</argument>
|
||||||
|
<argument>${shell_cmd_X}</argument>
|
||||||
|
<capture-output/>
|
||||||
|
</shell>
|
||||||
|
<ok to="wait_download_phase_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="GenOrcidAuthorWork_X">
|
||||||
|
<java>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<main-class>eu.dnetlib.doiboost.orcidnodoi.GenOrcidAuthorWork</main-class>
|
||||||
|
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>ORCID_2020_10_activites_X.tar.gz</arg>
|
||||||
|
<arg>-ow</arg><arg>no_doi_works/works_X.seq</arg>
|
||||||
|
<arg>-oew</arg><arg>no_doi_enriched_works/</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="join_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<join name = "wait_download_phase_node" to = "fork_gen_orcid_author_work"/>
|
||||||
|
|
||||||
|
<fork name = "fork_gen_orcid_author_work">
|
||||||
|
<path start = "GenOrcidAuthorWork_0"/>
|
||||||
|
<path start = "GenOrcidAuthorWork_1"/>
|
||||||
|
<path start = "GenOrcidAuthorWork_2"/>
|
||||||
|
<path start = "GenOrcidAuthorWork_3"/>
|
||||||
|
<path start = "GenOrcidAuthorWork_4"/>
|
||||||
|
<path start = "GenOrcidAuthorWork_5"/>
|
||||||
|
<path start = "GenOrcidAuthorWork_6"/>
|
||||||
|
<path start = "GenOrcidAuthorWork_7"/>
|
||||||
|
<path start = "GenOrcidAuthorWork_8"/>
|
||||||
|
<path start = "GenOrcidAuthorWork_9"/>
|
||||||
|
<path start = "GenOrcidAuthorWork_X"/>
|
||||||
|
</fork>
|
||||||
|
|
||||||
|
<join name = "join_node" to = "End"/>
|
||||||
|
|
||||||
|
<!-- <join name = "join_node" to = "fork_gen_orcid_author_work_2"/>-->
|
||||||
|
|
||||||
|
<!-- <fork name = "fork_gen_orcid_author_work_2">-->
|
||||||
|
<!-- <path start = "GenOrcidAuthorWork_6"/>-->
|
||||||
|
<!-- <path start = "GenOrcidAuthorWork_7"/>-->
|
||||||
|
<!-- <path start = "GenOrcidAuthorWork_8"/>-->
|
||||||
|
<!-- <path start = "GenOrcidAuthorWork_9"/>-->
|
||||||
|
<!-- <path start = "GenOrcidAuthorWork_X"/>-->
|
||||||
|
<!-- </fork>-->
|
||||||
|
|
||||||
|
<!-- <join name = "join_node_2" to = "End"/>-->
|
||||||
|
|
||||||
|
<end name="End"/>
|
||||||
|
</workflow-app>
|
|
@ -1,45 +0,0 @@
|
||||||
<workflow-app name="Orcid Download" xmlns="uri:oozie:workflow:0.5">
|
|
||||||
<parameters>
|
|
||||||
<property>
|
|
||||||
<name>workingPathOrcid</name>
|
|
||||||
<description>the working dir base path</description>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>token</name>
|
|
||||||
<description>access token</description>
|
|
||||||
</property>
|
|
||||||
</parameters>
|
|
||||||
|
|
||||||
<start to="ResetWorkingPath"/>
|
|
||||||
|
|
||||||
|
|
||||||
<kill name="Kill">
|
|
||||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
|
||||||
</kill>
|
|
||||||
|
|
||||||
<action name="ResetWorkingPath">
|
|
||||||
<fs>
|
|
||||||
<delete path='${workingPathOrcid}/download'/>
|
|
||||||
<mkdir path='${workingPathOrcid}/download'/>
|
|
||||||
</fs>
|
|
||||||
<ok to="DownloadOrcidData"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="DownloadOrcidData">
|
|
||||||
<java>
|
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<main-class>eu.dnetlib.doiboost.orcid.OrcidDownloader</main-class>
|
|
||||||
<arg>-d</arg><arg>${workingPathOrcid}/</arg>
|
|
||||||
<arg>-n</arg><arg>${nameNode}</arg>
|
|
||||||
<arg>-f</arg><arg>last_modified.csv</arg>
|
|
||||||
<arg>-o</arg><arg>download/</arg>
|
|
||||||
<arg>-t</arg><arg>${token}</arg>
|
|
||||||
</java>
|
|
||||||
<ok to="End"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<end name="End"/>
|
|
||||||
</workflow-app>
|
|
|
@ -0,0 +1,232 @@
|
||||||
|
<workflow-app name="Extract Orcid XML Works From Activities" xmlns="uri:oozie:workflow:0.5">
|
||||||
|
<parameters>
|
||||||
|
<property>
|
||||||
|
<name>workingPath</name>
|
||||||
|
<description>the working dir base path</description>
|
||||||
|
</property>
|
||||||
|
</parameters>
|
||||||
|
|
||||||
|
<global>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>oozie.action.sharelib.for.java</name>
|
||||||
|
<value>${oozieActionShareLibForSpark2}</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||||
|
<value>true</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.launcher.mapreduce.map.java.opts</name>
|
||||||
|
<value>-Xmx2g</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.use.system.libpath</name>
|
||||||
|
<value>true</value>
|
||||||
|
</property>
|
||||||
|
</configuration>
|
||||||
|
</global>
|
||||||
|
|
||||||
|
<start to="ResetWorkingPath"/>
|
||||||
|
|
||||||
|
|
||||||
|
<kill name="Kill">
|
||||||
|
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
|
</kill>
|
||||||
|
|
||||||
|
<action name="ResetWorkingPath">
|
||||||
|
<fs>
|
||||||
|
<delete path='${workingPath}/xml/works'/>
|
||||||
|
<mkdir path='${workingPath}/xml/works'/>
|
||||||
|
</fs>
|
||||||
|
<ok to="fork_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<fork name = "fork_node">
|
||||||
|
<path start = "ExtractXMLWorkActivities_0"/>
|
||||||
|
<path start = "ExtractXMLWorkActivities_1"/>
|
||||||
|
<path start = "ExtractXMLWorkActivities_2"/>
|
||||||
|
<path start = "ExtractXMLWorkActivities_3"/>
|
||||||
|
<path start = "ExtractXMLWorkActivities_4"/>
|
||||||
|
<path start = "ExtractXMLWorkActivities_5"/>
|
||||||
|
<path start = "ExtractXMLWorkActivities_6"/>
|
||||||
|
<path start = "ExtractXMLWorkActivities_7"/>
|
||||||
|
<path start = "ExtractXMLWorkActivities_8"/>
|
||||||
|
<path start = "ExtractXMLWorkActivities_9"/>
|
||||||
|
<path start = "ExtractXMLWorkActivities_X"/>
|
||||||
|
</fork>
|
||||||
|
|
||||||
|
<action name="ExtractXMLWorkActivities_0">
|
||||||
|
<java>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<main-class>eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData</main-class>
|
||||||
|
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>ORCID_2020_10_activites_0.tar.gz</arg>
|
||||||
|
<arg>-ow</arg><arg>xml/works/xml_works_0.seq</arg>
|
||||||
|
<arg>-oew</arg><arg>---</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="join_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="ExtractXMLWorkActivities_1">
|
||||||
|
<java>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<main-class>eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData</main-class>
|
||||||
|
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>ORCID_2020_10_activites_1.tar.gz</arg>
|
||||||
|
<arg>-ow</arg><arg>xml/works/xml_works_1.seq</arg>
|
||||||
|
<arg>-oew</arg><arg>---</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="join_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="ExtractXMLWorkActivities_2">
|
||||||
|
<java>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<main-class>eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData</main-class>
|
||||||
|
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>ORCID_2020_10_activites_2.tar.gz</arg>
|
||||||
|
<arg>-ow</arg><arg>xml/works/xml_works_2.seq</arg>
|
||||||
|
<arg>-oew</arg><arg>---</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="join_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="ExtractXMLWorkActivities_3">
|
||||||
|
<java>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<main-class>eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData</main-class>
|
||||||
|
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>ORCID_2020_10_activites_3.tar.gz</arg>
|
||||||
|
<arg>-ow</arg><arg>xml/works/xml_works_3.seq</arg>
|
||||||
|
<arg>-oew</arg><arg>---</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="join_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="ExtractXMLWorkActivities_4">
|
||||||
|
<java>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<main-class>eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData</main-class>
|
||||||
|
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>ORCID_2020_10_activites_4.tar.gz</arg>
|
||||||
|
<arg>-ow</arg><arg>xml/works/xml_works_4.seq</arg>
|
||||||
|
<arg>-oew</arg><arg>---</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="join_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="ExtractXMLWorkActivities_5">
|
||||||
|
<java>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<main-class>eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData</main-class>
|
||||||
|
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>ORCID_2020_10_activites_5.tar.gz</arg>
|
||||||
|
<arg>-ow</arg><arg>xml/works/xml_works_5.seq</arg>
|
||||||
|
<arg>-oew</arg><arg>---</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="join_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
|
||||||
|
<action name="ExtractXMLWorkActivities_6">
|
||||||
|
<java>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<main-class>eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData</main-class>
|
||||||
|
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>ORCID_2020_10_activites_6.tar.gz</arg>
|
||||||
|
<arg>-ow</arg><arg>xml/works/xml_works_6.seq</arg>
|
||||||
|
<arg>-oew</arg><arg>---</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="join_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="ExtractXMLWorkActivities_7">
|
||||||
|
<java>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<main-class>eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData</main-class>
|
||||||
|
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>ORCID_2020_10_activites_7.tar.gz</arg>
|
||||||
|
<arg>-ow</arg><arg>xml/works/xml_works_7.seq</arg>
|
||||||
|
<arg>-oew</arg><arg>---</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="join_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
|
||||||
|
<action name="ExtractXMLWorkActivities_8">
|
||||||
|
<java>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<main-class>eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData</main-class>
|
||||||
|
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>ORCID_2020_10_activites_8.tar.gz</arg>
|
||||||
|
<arg>-ow</arg><arg>xml/works/xml_works_8.seq</arg>
|
||||||
|
<arg>-oew</arg><arg>---</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="join_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="ExtractXMLWorkActivities_9">
|
||||||
|
<java>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<main-class>eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData</main-class>
|
||||||
|
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>ORCID_2020_10_activites_9.tar.gz</arg>
|
||||||
|
<arg>-ow</arg><arg>xml/works/xml_works_9.seq</arg>
|
||||||
|
<arg>-oew</arg><arg>---</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="join_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="ExtractXMLWorkActivities_X">
|
||||||
|
<java>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<main-class>eu.dnetlib.doiboost.orcid.ExtractXMLActivitiesData</main-class>
|
||||||
|
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>ORCID_2020_10_activites_X.tar.gz</arg>
|
||||||
|
<arg>-ow</arg><arg>xml/works/xml_works_X.seq</arg>
|
||||||
|
<arg>-oew</arg><arg>---</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="join_node"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<join name = "join_node" to = "End"/>
|
||||||
|
|
||||||
|
<end name="End"/>
|
||||||
|
</workflow-app>
|
|
@ -0,0 +1,26 @@
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>jobTracker</name>
|
||||||
|
<value>yarnRM</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>nameNode</name>
|
||||||
|
<value>hdfs://nameservice1</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.use.system.libpath</name>
|
||||||
|
<value>true</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.action.sharelib.for.spark</name>
|
||||||
|
<value>spark2</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||||
|
<value>true</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.launcher.mapreduce.map.java.opts</name>
|
||||||
|
<value>-Xmx8g</value>
|
||||||
|
</property>
|
||||||
|
</configuration>
|
|
@ -1,41 +1,40 @@
|
||||||
<workflow-app name="import Orcid" xmlns="uri:oozie:workflow:0.5">
|
<workflow-app name="Extract Orcid XML Authors From Summaries" xmlns="uri:oozie:workflow:0.5">
|
||||||
<parameters>
|
<parameters>
|
||||||
<property>
|
<property>
|
||||||
<name>workingPath</name>
|
<name>workingPath</name>
|
||||||
<description>the working dir base path</description>
|
<description>the working dir base path</description>
|
||||||
</property>
|
</property>
|
||||||
</parameters>
|
</parameters>
|
||||||
|
|
||||||
<start to="ResetWorkingPath"/>
|
<start to="ResetWorkingPath"/>
|
||||||
|
|
||||||
|
|
||||||
<kill name="Kill">
|
<kill name="Kill">
|
||||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
</kill>
|
</kill>
|
||||||
|
|
||||||
<action name="ResetWorkingPath">
|
<action name="ResetWorkingPath">
|
||||||
<fs>
|
<fs>
|
||||||
<delete path='${workingPath}/output'/>
|
<delete path='${workingPath}/xml/authors'/>
|
||||||
<mkdir path='${workingPath}/output'/>
|
<mkdir path='${workingPath}/xml/authors'/>
|
||||||
</fs>
|
</fs>
|
||||||
<ok to="ImportOrcidSummary"/>
|
<ok to="ExtractXMLAuthorsSummaries"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
|
<action name="ExtractXMLAuthorsSummaries">
|
||||||
|
|
||||||
<action name="ImportOrcidSummary">
|
|
||||||
<java>
|
<java>
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
<name-node>${nameNode}</name-node>
|
<name-node>${nameNode}</name-node>
|
||||||
<main-class>eu.dnetlib.doiboost.orcid.OrcidDSManager</main-class>
|
<main-class>eu.dnetlib.doiboost.orcid.ExtractXMLSummariesData</main-class>
|
||||||
<arg>-d</arg><arg>${workingPath}/</arg>
|
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||||
<arg>-n</arg><arg>${nameNode}</arg>
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
<arg>-f</arg><arg>ORCID_2019_summaries.tar.gz</arg>
|
<arg>-f</arg><arg>ORCID_2020_10_summaries.tar.gz</arg>
|
||||||
<arg>-o</arg><arg>output/</arg>
|
<arg>-o</arg><arg>xml/authors/</arg>
|
||||||
</java>
|
</java>
|
||||||
<ok to="End"/>
|
<ok to="End"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
<end name="End"/>
|
<end name="End"/>
|
||||||
</workflow-app>
|
</workflow-app>
|
|
@ -1,22 +0,0 @@
|
||||||
<configuration>
|
|
||||||
<property>
|
|
||||||
<name>jobTracker</name>
|
|
||||||
<value>hadoop-rm3.garr-pa1.d4science.org:8032</value>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>nameNode</name>
|
|
||||||
<value>hdfs://hadoop-rm1.garr-pa1.d4science.org:8020</value>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>queueName</name>
|
|
||||||
<value>default</value>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>oozie.use.system.libpath</name>
|
|
||||||
<value>true</value>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>oozie.action.sharelib.for.spark</name>
|
|
||||||
<value>spark2</value>
|
|
||||||
</property>
|
|
||||||
</configuration>
|
|
|
@ -1,83 +0,0 @@
|
||||||
<workflow-app name="Gen Orcid Authors" xmlns="uri:oozie:workflow:0.5">
|
|
||||||
<parameters>
|
|
||||||
<property>
|
|
||||||
<name>workingPath</name>
|
|
||||||
<description>the working dir base path</description>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>token</name>
|
|
||||||
<description>access token</description>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>sparkDriverMemory</name>
|
|
||||||
<description>memory for driver process</description>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>sparkExecutorMemory</name>
|
|
||||||
<description>memory for individual executor</description>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>sparkExecutorCores</name>
|
|
||||||
<description>number of cores used by single executor</description>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>outputPath</name>
|
|
||||||
<description>the working dir base path</description>
|
|
||||||
</property>
|
|
||||||
</parameters>
|
|
||||||
|
|
||||||
<start to="ResetWorkingPath"/>
|
|
||||||
|
|
||||||
|
|
||||||
<kill name="Kill">
|
|
||||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
|
||||||
</kill>
|
|
||||||
|
|
||||||
<action name="ResetWorkingPath">
|
|
||||||
<fs>
|
|
||||||
<delete path='${workingPath_activities}/authors'/>
|
|
||||||
</fs>
|
|
||||||
<ok to="Gen_Orcid_Authors"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="Split_Lambda_File">
|
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<master>yarn</master>
|
|
||||||
<mode>cluster</mode>
|
|
||||||
<name>Split_Lambda_File</name>
|
|
||||||
<class>eu.dnetlib.doiboost.orcid.SparkPartitionLambdaFile</class>
|
|
||||||
<jar>dhp-doiboost-1.2.1-SNAPSHOT.jar</jar>
|
|
||||||
<spark-opts>--num-executors 24 --conf spark.yarn.jars="hdfs://hadoop-rm1.garr-pa1.d4science.org:8020/user/oozie/share/lib/lib_20180405103059/spark2" --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory}
|
|
||||||
</spark-opts>
|
|
||||||
<arg>-w</arg><arg>${workingPath}/</arg>
|
|
||||||
<arg>-o</arg><arg>authors/</arg>
|
|
||||||
<arg>-t</arg><arg>${token}</arg>
|
|
||||||
</spark>
|
|
||||||
<ok to="End"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="Gen_Orcid_Authors">
|
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<master>yarn</master>
|
|
||||||
<mode>cluster</mode>
|
|
||||||
<name>Gen_Orcid_Authors</name>
|
|
||||||
<class>eu.dnetlib.doiboost.orcid.SparkOrcidGenerateAuthors</class>
|
|
||||||
<jar>dhp-doiboost-1.2.1-SNAPSHOT.jar</jar>
|
|
||||||
<spark-opts>--num-executors 20 --conf spark.yarn.jars="hdfs://hadoop-rm1.garr-pa1.d4science.org:8020/user/oozie/share/lib/lib_20180405103059/spark2" --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory}
|
|
||||||
</spark-opts>
|
|
||||||
<arg>-w</arg><arg>${workingPath}/</arg>
|
|
||||||
<arg>-o</arg><arg>authors/</arg>
|
|
||||||
<arg>-t</arg><arg>${token}</arg>
|
|
||||||
</spark>
|
|
||||||
<ok to="End"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<end name="End"/>
|
|
||||||
</workflow-app>
|
|
|
@ -0,0 +1,26 @@
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>jobTracker</name>
|
||||||
|
<value>yarnRM</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>nameNode</name>
|
||||||
|
<value>hdfs://nameservice1</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.use.system.libpath</name>
|
||||||
|
<value>true</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.action.sharelib.for.spark</name>
|
||||||
|
<value>spark2</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||||
|
<value>true</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.launcher.mapreduce.map.java.opts</name>
|
||||||
|
<value>-Xmx8g</value>
|
||||||
|
</property>
|
||||||
|
</configuration>
|
|
@ -0,0 +1,68 @@
|
||||||
|
<workflow-app name="Gen Orcid Authors From Summaries" xmlns="uri:oozie:workflow:0.5">
|
||||||
|
<parameters>
|
||||||
|
<property>
|
||||||
|
<name>workingPath</name>
|
||||||
|
<description>the working dir base path</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>shell_cmd_0</name>
|
||||||
|
<value>wget -O /tmp/ORCID_2020_10_summaries.tar.gz https://orcid.figshare.com/ndownloader/files/25032905 ; hdfs dfs -copyFromLocal /tmp/ORCID_2020_10_summaries.tar.gz /data/orcid_activities_2020/ORCID_2020_10_summaries.tar.gz ; rm -f /tmp/ORCID_2020_10_summaries.tar.gz
|
||||||
|
</value>
|
||||||
|
<description>the shell command that downloads and puts to hdfs orcid summaries</description>
|
||||||
|
</property>
|
||||||
|
</parameters>
|
||||||
|
|
||||||
|
<start to="ResetWorkingPath"/>
|
||||||
|
|
||||||
|
|
||||||
|
<kill name="Kill">
|
||||||
|
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
|
</kill>
|
||||||
|
|
||||||
|
<action name="ResetWorkingPath">
|
||||||
|
<fs>
|
||||||
|
<delete path='${workingPath}/authors'/>
|
||||||
|
<mkdir path='${workingPath}/authors'/>
|
||||||
|
</fs>
|
||||||
|
<ok to="check_exist_on_hdfs_summaries"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<decision name="check_exist_on_hdfs_summaries">
|
||||||
|
<switch>
|
||||||
|
<case to="ImportOrcidSummaries">
|
||||||
|
${fs:exists(concat(workingPath,'/ORCID_2020_10_summaries.tar.gz'))}
|
||||||
|
</case>
|
||||||
|
<default to="DownloadSummaries" />
|
||||||
|
</switch>
|
||||||
|
</decision>
|
||||||
|
|
||||||
|
<action name="DownloadSummaries">
|
||||||
|
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<exec>bash</exec>
|
||||||
|
<argument>-c</argument>
|
||||||
|
<argument>${shell_cmd_0}</argument>
|
||||||
|
<capture-output/>
|
||||||
|
</shell>
|
||||||
|
<ok to="ImportOrcidSummaries"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="ImportOrcidSummaries">
|
||||||
|
<java>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<main-class>eu.dnetlib.doiboost.orcid.OrcidDSManager</main-class>
|
||||||
|
<arg>-w</arg><arg>${workingPath}/</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-f</arg><arg>ORCID_2020_10_summaries.tar.gz</arg>
|
||||||
|
<arg>-o</arg><arg>authors/</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="End"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<end name="End"/>
|
||||||
|
</workflow-app>
|
|
@ -0,0 +1,168 @@
|
||||||
|
<workflow-app name="Orcid Updates Download" xmlns="uri:oozie:workflow:0.5">
|
||||||
|
<parameters>
|
||||||
|
<property>
|
||||||
|
<name>workingPath</name>
|
||||||
|
<description>the working dir base path</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>token</name>
|
||||||
|
<description>access token</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>shell_cmd</name>
|
||||||
|
<value>wget -O /tmp/last_modified.csv.tar http://74804fb637bd8e2fba5b-e0a029c2f87486cddec3b416996a6057.r3.cf1.rackcdn.com/last_modified.csv.tar ; hdfs dfs -copyFromLocal /tmp/last_modified.csv.tar /data/orcid_activities_2020/last_modified.csv.tar ; rm -f /tmp/last_modified.csv.tar
|
||||||
|
</value>
|
||||||
|
<description>the shell command that downloads the lambda file from orcid containing last orcid update informations</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>sparkDriverMemory</name>
|
||||||
|
<value>7G</value>
|
||||||
|
<description>memory for driver process</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>sparkExecutorMemory</name>
|
||||||
|
<value>2G</value>
|
||||||
|
<description>memory for individual executor</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>sparkExecutorCores</name>
|
||||||
|
<value>1</value>
|
||||||
|
<description>number of cores used by single executor</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>spark2MaxExecutors</name>
|
||||||
|
<value>10</value>
|
||||||
|
        </property>
        <property>
            <name>oozieActionShareLibForSpark2</name>
            <description>oozie action sharelib for spark 2.*</description>
        </property>
        <property>
            <name>spark2ExtraListeners</name>
            <value>com.cloudera.spark.lineage.NavigatorAppListener</value>
            <description>spark 2.* extra listeners classname</description>
        </property>
        <property>
            <name>spark2SqlQueryExecutionListeners</name>
            <value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
            <description>spark 2.* sql query execution listeners classname</description>
        </property>
        <property>
            <name>spark2YarnHistoryServerAddress</name>
            <description>spark 2.* yarn history server address</description>
        </property>
        <property>
            <name>spark2EventLogDir</name>
            <description>spark 2.* event log dir location</description>
        </property>
    </parameters>

    <global>
        <configuration>
            <property>
                <name>oozie.action.sharelib.for.spark</name>
                <value>${oozieActionShareLibForSpark2}</value>
            </property>
        </configuration>
    </global>

    <start to="DownloadOrcidAuthors"/>

    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>

    <action name="ResetWorkingPath">
        <fs>
            <delete path='${workingPath}/downloads'/>
            <delete path='${workingPath}/last_modified.csv.tar'/>
            <mkdir path='${workingPath}/downloads'/>
        </fs>
        <ok to="DownloadLambdaFile"/>
        <error to="Kill"/>
    </action>

    <action name="DownloadLambdaFile">
        <shell xmlns="uri:oozie:shell-action:0.1">
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <exec>bash</exec>
            <argument>-c</argument>
            <argument>${shell_cmd}</argument>
            <capture-output/>
        </shell>
        <ok to="DownloadUpdatedXMLAuthors"/>
        <error to="Kill"/>
    </action>

    <action name="DownloadUpdatedXMLAuthors">
        <java>
            <job-tracker>${jobTracker}</job-tracker>
            <name-node>${nameNode}</name-node>
            <main-class>eu.dnetlib.doiboost.orcid.OrcidDownloader</main-class>
            <arg>-w</arg><arg>${workingPath}/</arg>
            <arg>-n</arg><arg>${nameNode}</arg>
            <arg>-f</arg><arg>last_modified.csv.tar</arg>
            <arg>-o</arg><arg>downloads/</arg>
            <arg>-t</arg><arg>${token}</arg>
        </java>
        <ok to="End"/>
        <error to="Kill"/>
    </action>

    <action name="GenLastModifiedSeq">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn-cluster</master>
            <mode>cluster</mode>
            <name>GenLastModifiedSeq</name>
            <class>eu.dnetlib.doiboost.orcid.SparkGenLastModifiedSeq</class>
            <jar>dhp-doiboost-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
            </spark-opts>
            <arg>-w</arg><arg>${workingPath}/</arg>
            <arg>-n</arg><arg>${nameNode}</arg>
            <arg>-f</arg><arg>last_modified.csv.tar</arg>
            <arg>-o</arg><arg>last_modified.seq</arg>
            <arg>-t</arg><arg>-</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
    </action>

    <action name="DownloadOrcidAuthors">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn-cluster</master>
            <mode>cluster</mode>
            <name>DownloadOrcidAuthors</name>
            <class>eu.dnetlib.doiboost.orcid.SparkDownloadOrcidAuthors</class>
            <jar>dhp-doiboost-${projectVersion}.jar</jar>
            <spark-opts>
                --conf spark.dynamicAllocation.enabled=true
                --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
                --executor-memory=${sparkExecutorMemory}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
            </spark-opts>
            <arg>-w</arg><arg>${workingPath}/</arg>
            <arg>-n</arg><arg>${nameNode}</arg>
            <arg>-f</arg><arg>last_modified.seq</arg>
            <arg>-o</arg><arg>downloads/updated_authors</arg>
            <arg>-t</arg><arg>${token}</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
    </action>

    <end name="End"/>
</workflow-app>
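For reference, the workflow above is a standard Oozie application, so it can be submitted with the regular Oozie Java client once its parameters are filled in. The sketch below is only an illustration: the Oozie endpoint URL, the HDFS application path and the property values are assumptions, not part of this commit.

    import java.util.Properties;
    import org.apache.oozie.client.OozieClient;

    public class SubmitOrcidDownloadWorkflow {
        public static void main(String[] args) throws Exception {
            // Assumed Oozie endpoint; replace with the cluster's actual URL.
            OozieClient oozie = new OozieClient("http://oozie-host:11000/oozie");
            Properties conf = oozie.createConfiguration();
            // Assumed HDFS location where this workflow.xml is deployed.
            conf.setProperty(OozieClient.APP_PATH, "hdfs:///apps/dhp/doiboost/orcid_update");
            conf.setProperty("workingPath", "/data/orcid_activities");   // assumed value
            conf.setProperty("token", "<orcid-api-bearer-token>");       // assumed placeholder
            conf.setProperty("oozieActionShareLibForSpark2", "spark2");  // assumed sharelib name
            String jobId = oozie.run(conf);  // submits and starts the workflow
            System.out.println("Submitted Oozie job " + jobId);
        }
    }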
@ -0,0 +1,41 @@
{
	"reference-entry": {"cobj":"0013", "value": "Part of book or chapter of book"},
	"report": {"cobj":"0017", "value": "Report"},
	"dataset": {"cobj":"0021", "value": "Dataset"},
	"journal-article": {"cobj":"0001", "value": "Article"},
	"reference-book": {"cobj":"0002", "value": "Book"},
	"other": {"cobj":"0020", "value": "Other ORP type"},
	"proceedings-article": {"cobj":"0004", "value": "Conference object"},
	"standard": {"cobj":"0038", "value": "Other literature type"},
	"book-part": {"cobj":"0002", "value": "Book"},
	"monograph": {"cobj":"0002", "value": "Book"},
	"report-series": {"cobj":"0017", "value": "Report"},
	"book": {"cobj":"0002", "value": "Book"},
	"book-chapter": {"cobj":"0013", "value": "Part of book or chapter of book"},
	"peer-review": {"cobj":"0015", "value": "Review"},
	"book-section": {"cobj":"0013", "value": "Part of book or chapter of book"},
	"book-review": {"cobj":"0015", "value": "Review"},
	"conference-abstract": {"cobj":"0004", "value": "Conference object"},
	"conference-paper": {"cobj":"0004", "value": "Conference object"},
	"conference-poster": {"cobj":"0004", "value": "Conference object"},
	"data-set": {"cobj":"0021", "value": "Dataset"},
	"dictionary-entry": {"cobj":"0038", "value": "Other literature type"},
	"disclosure": {"cobj":"0038", "value": "Other literature type"},
	"dissertation": {"cobj":"0006", "value": "Doctoral thesis"},
	"edited-book": {"cobj":"0002", "value": "Book"},
	"encyclopedia-entry": {"cobj":"0038", "value": "Other literature type"},
	"lecture-speech": {"cobj":"0010", "value": "Lecture"},
	"license": {"cobj":"0038", "value": "Other literature type"},
	"magazine-article": {"cobj":"0005", "value": "Contribution for newspaper or weekly magazine"},
	"manual": {"cobj":"0038", "value": "Other literature type"},
	"newsletter-article": {"cobj":"0012", "value": "Newsletter"},
	"newspaper-article": {"cobj":"0005", "value": "Contribution for newspaper or weekly magazine"},
	"patent": {"cobj":"0019", "value": "Patent"},
	"research-technique": {"cobj":"0020", "value": "Other ORP type"},
	"research-tool": {"cobj":"0020", "value": "Other ORP type"},
	"standards-and-policy": {"cobj":"0038", "value": "Other literature type"},
	"supervised-student-publication": {"cobj":"0001", "value": "Article"},
	"technical-standard": {"cobj":"0038", "value": "Other literature type"},
	"website": {"cobj":"0020", "value": "Other ORP type"},
	"working-paper": {"cobj":"0014", "value": "Research"}
}
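The file above maps ORCID work types to dnet result typologies (a cobj code plus a label). A minimal sketch of how such a mapping could be looked up at runtime with Gson, which is already used elsewhere in this commit; the resource path and the helper class name are assumptions made for the example, not the project's actual loader.

    import java.io.InputStreamReader;
    import java.nio.charset.StandardCharsets;
    import com.google.gson.JsonObject;
    import com.google.gson.JsonParser;

    public class WorkTypeMapping {

        private final JsonObject mapping;

        public WorkTypeMapping(String resourcePath) {
            // e.g. "/eu/dnetlib/dhp/doiboost/mappings/typologies.json" (assumed name)
            mapping = new JsonParser()
                .parse(new InputStreamReader(
                    WorkTypeMapping.class.getResourceAsStream(resourcePath), StandardCharsets.UTF_8))
                .getAsJsonObject();
        }

        /** Returns the cobj code for an ORCID work type, or null when the type is unmapped. */
        public String cobjCode(String orcidWorkType) {
            JsonObject entry = mapping.getAsJsonObject(orcidWorkType.toLowerCase());
            return entry == null ? null : entry.get("cobj").getAsString();
        }

        /** Returns the label, e.g. "Conference object" for "conference-paper". */
        public String label(String orcidWorkType) {
            JsonObject entry = mapping.getAsJsonObject(orcidWorkType.toLowerCase());
            return entry == null ? null : entry.get("value").getAsString();
        }
    }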
@ -0,0 +1,95 @@
<workflow-app name="gen_orcid_no_doi_dataset" xmlns="uri:oozie:workflow:0.5">
    <parameters>
        <property>
            <name>sparkDriverMemory</name>
            <description>memory for driver process</description>
        </property>
        <property>
            <name>sparkExecutorMemory</name>
            <description>memory for individual executor</description>
        </property>
        <property>
            <name>sparkExecutorCores</name>
            <description>number of cores used by single executor</description>
        </property>
        <property>
            <name>oozieActionShareLibForSpark2</name>
            <description>oozie action sharelib for spark 2.*</description>
        </property>
        <property>
            <name>spark2ExtraListeners</name>
            <value>com.cloudera.spark.lineage.NavigatorAppListener</value>
            <description>spark 2.* extra listeners classname</description>
        </property>
        <property>
            <name>spark2SqlQueryExecutionListeners</name>
            <value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
            <description>spark 2.* sql query execution listeners classname</description>
        </property>
        <property>
            <name>spark2YarnHistoryServerAddress</name>
            <description>spark 2.* yarn history server address</description>
        </property>
        <property>
            <name>spark2EventLogDir</name>
            <description>spark 2.* event log dir location</description>
        </property>
        <property>
            <name>workingPath</name>
            <description>the working dir base path</description>
        </property>
    </parameters>

    <global>
        <job-tracker>${jobTracker}</job-tracker>
        <name-node>${nameNode}</name-node>
        <configuration>
            <property>
                <name>oozie.action.sharelib.for.spark</name>
                <value>${oozieActionShareLibForSpark2}</value>
            </property>
        </configuration>
    </global>

    <start to="ResetWorkingPath"/>

    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>

    <action name="ResetWorkingPath">
        <fs>
            <delete path='${workingPath}/no_doi_dataset'/>
        </fs>
        <ok to="GenOrcidNoDoiDataset"/>
        <error to="Kill"/>
    </action>

    <action name="GenOrcidNoDoiDataset">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn-cluster</master>
            <mode>cluster</mode>
            <name>GenOrcidNoDoiDataset</name>
            <class>eu.dnetlib.doiboost.orcidnodoi.SparkGenEnrichedOrcidWorks</class>
            <jar>dhp-doiboost-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory=${sparkExecutorMemory}
                --executor-cores=${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
            </spark-opts>
            <arg>-w</arg><arg>${workingPath}/</arg>
            <arg>-n</arg><arg>${nameNode}</arg>
            <arg>-f</arg><arg>-</arg>
            <arg>-ow</arg><arg>no_doi_works/</arg>
            <arg>-oew</arg><arg>no_doi_dataset</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
    </action>

    <end name="End"/>
</workflow-app>
@ -158,7 +158,7 @@ class CrossrefMappingTest {


     rels.foreach(s => logger.info(s.getTarget))
-    assertEquals(rels.size, 3 )
+    assertEquals(rels.size, 6 )


   }
@ -1,5 +1,8 @@
 package eu.dnetlib.doiboost.orcid
 
+import eu.dnetlib.dhp.schema.oaf.Publication
+import eu.dnetlib.doiboost.orcid.SparkConvertORCIDToOAF.getClass
+import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
 import org.codehaus.jackson.map.ObjectMapper
 import org.junit.jupiter.api.Assertions._
 import org.junit.jupiter.api.Test
@ -21,6 +24,30 @@ class MappingORCIDToOAFTest {
     })
   }
 
+//  @Test
+//  def testOAFConvert():Unit ={
+//
+//    val spark: SparkSession =
+//      SparkSession
+//        .builder()
+//        .appName(getClass.getSimpleName)
+//        .master("local[*]").getOrCreate()
+//
+//
+//    SparkConvertORCIDToOAF.run( spark,"/Users/sandro/Downloads/orcid", "/Users/sandro/Downloads/orcid_oaf")
+//    implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
+//
+//    val df = spark.read.load("/Users/sandro/Downloads/orcid_oaf").as[Publication]
+//    println(df.first.getId)
+//    println(mapper.writeValueAsString(df.first()))
+//
+//
+//
+//
+//  }
+
+
+
 
 
 
@ -3,23 +3,34 @@ package eu.dnetlib.doiboost.orcid;
 
 import static org.junit.jupiter.api.Assertions.assertTrue;
 
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStreamReader;
+import java.io.*;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.nio.file.StandardOpenOption;
 import java.text.ParseException;
 import java.text.SimpleDateFormat;
+import java.time.Duration;
+import java.time.LocalDateTime;
+import java.time.temporal.TemporalUnit;
 import java.util.Arrays;
 import java.util.Date;
 import java.util.List;
 
+import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
+import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
+import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
 import org.apache.commons.io.IOUtils;
 import org.apache.http.client.methods.CloseableHttpResponse;
 import org.apache.http.client.methods.HttpGet;
 import org.apache.http.impl.client.CloseableHttpClient;
 import org.apache.http.impl.client.HttpClients;
+import org.apache.spark.sql.catalyst.expressions.objects.AssertNotNull;
 import org.junit.jupiter.api.Test;
+import org.mortbay.log.Log;
 
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import jdk.nashorn.internal.ir.annotations.Ignore;
 
 public class OrcidClientTest {
 	final String orcidId = "0000-0001-7291-3210";
@ -32,16 +43,64 @@
 	String lastUpdate = "2019-09-30 00:00:00";
 	String shortDate = "2020-05-06 16:06:11";
 
 	// curl -i -H "Accept: application/vnd.orcid+xml"
 	// -H 'Authorization: Bearer 78fdb232-7105-4086-8570-e153f4198e3d'
 	// 'https://api.orcid.org/v3.0/0000-0001-7291-3210/record'
 
-	public String testDownloadRecord(String orcidId) throws Exception {
+	@Test
+	private void multipleDownloadTest() throws Exception {
+		int toDownload = 10;
+		long start = System.currentTimeMillis();
+		OrcidDownloader downloader = new OrcidDownloader();
+		TarArchiveInputStream input = new TarArchiveInputStream(
+			new GzipCompressorInputStream(new FileInputStream("/tmp/last_modified.csv.tar")));
+		TarArchiveEntry entry = input.getNextTarEntry();
+		BufferedReader br = null;
+		StringBuilder sb = new StringBuilder();
+		int rowNum = 0;
+		int entryNum = 0;
+		int modified = 0;
+		while (entry != null) {
+			br = new BufferedReader(new InputStreamReader(input)); // Read directly from tarInput
+			String line;
+			while ((line = br.readLine()) != null) {
+				String[] values = line.toString().split(",");
+				List<String> recordInfo = Arrays.asList(values);
+				String orcidId = recordInfo.get(0);
+				if (downloader.isModified(orcidId, recordInfo.get(3))) {
+					slowedDownDownload(orcidId);
+					modified++;
+				}
+				rowNum++;
+				if (modified > toDownload) {
+					break;
+				}
+			}
+			entryNum++;
+			entry = input.getNextTarEntry();
+		}
+		long end = System.currentTimeMillis();
+		logToFile("start test: " + new Date(start).toString());
+		logToFile("end test: " + new Date(end).toString());
+	}
+
+	@Test
+	private void downloadTest(String orcid) throws Exception {
+		String record = testDownloadRecord(orcid);
+		String filename = "/tmp/downloaded_".concat(orcid).concat(".xml");
+		File f = new File(filename);
+		OutputStream outStream = new FileOutputStream(f);
+		IOUtils.write(record.getBytes(), outStream);
+	}
+
+	private String testDownloadRecord(String orcidId) throws Exception {
 		try (CloseableHttpClient client = HttpClients.createDefault()) {
 			HttpGet httpGet = new HttpGet("https://api.orcid.org/v3.0/" + orcidId + "/record");
 			httpGet.addHeader("Accept", "application/vnd.orcid+xml");
 			httpGet.addHeader("Authorization", "Bearer 78fdb232-7105-4086-8570-e153f4198e3d");
+			logToFile("start connection: " + new Date(System.currentTimeMillis()).toString());
 			CloseableHttpResponse response = client.execute(httpGet);
+			logToFile("end connection: " + new Date(System.currentTimeMillis()).toString());
 			if (response.getStatusLine().getStatusCode() != 200) {
 				System.out
 					.println("Downloading " + orcidId + " status code: " + response.getStatusLine().getStatusCode());
@ -53,8 +112,8 @@
 		return new String("");
 	}
 
 	// @Test
-	public void testLambdaFileParser() throws Exception {
+	private void testLambdaFileParser() throws Exception {
 		try (BufferedReader br = new BufferedReader(
 			new InputStreamReader(this.getClass().getResourceAsStream("last_modified.csv")))) {
 			String line;
@ -99,8 +158,8 @@
 		}
 	}
 
 	// @Test
-	public void getRecordDatestamp() throws ParseException {
+	private void getRecordDatestamp() throws ParseException {
 		Date toRetrieveDateDt = new SimpleDateFormat(DATE_FORMAT).parse(toRetrieveDate);
 		Date toNotRetrieveDateDt = new SimpleDateFormat(DATE_FORMAT).parse(toNotRetrieveDate);
 		Date lastUpdateDt = new SimpleDateFormat(DATE_FORMAT).parse(lastUpdate);
@ -108,7 +167,7 @@
 		assertTrue(!toNotRetrieveDateDt.after(lastUpdateDt));
 	}
 
-	public void testDate(String value) throws ParseException {
+	private void testDate(String value) throws ParseException {
 		System.out.println(value.toString());
 		if (value.length() != 19) {
 			value = value.substring(0, 19);
@ -117,20 +176,126 @@
 		System.out.println(valueDt.toString());
 	}
 
 	// @Test
-	public void testModifiedDate() throws ParseException {
+	@Ignore
+	private void testModifiedDate() throws ParseException {
 		testDate(toRetrieveDate);
 		testDate(toNotRetrieveDate);
 		testDate(shortDate);
 	}
 
-	// @Test
-	public void testReadBase64CompressedRecord() throws Exception {
+	@Test
+	private void testReadBase64CompressedRecord() throws Exception {
 		final String base64CompressedRecord = IOUtils
-			.toString(getClass().getResourceAsStream("0000-0001-6645-509X.compressed.base64"));
+			.toString(getClass().getResourceAsStream("0000-0003-3028-6161.compressed.base64"));
 		final String recordFromSeqFile = ArgumentApplicationParser.decompressValue(base64CompressedRecord);
-		System.out.println(recordFromSeqFile);
-		final String downloadedRecord = testDownloadRecord("0000-0001-6645-509X");
+		logToFile("\n\ndownloaded \n\n" + recordFromSeqFile);
+		final String downloadedRecord = testDownloadRecord("0000-0003-3028-6161");
 		assertTrue(recordFromSeqFile.equals(downloadedRecord));
 	}
+
+	@Test
+	private void lambdaFileReaderTest() throws Exception {
+		TarArchiveInputStream input = new TarArchiveInputStream(
+			new GzipCompressorInputStream(new FileInputStream("/develop/last_modified.csv.tar")));
+		TarArchiveEntry entry = input.getNextTarEntry();
+		BufferedReader br = null;
+		StringBuilder sb = new StringBuilder();
+		int rowNum = 0;
+		int entryNum = 0;
+		while (entry != null) {
+			br = new BufferedReader(new InputStreamReader(input)); // Read directly from tarInput
+			String line;
+			while ((line = br.readLine()) != null) {
+				String[] values = line.toString().split(",");
+				List<String> recordInfo = Arrays.asList(values);
+				assertTrue(recordInfo.size() == 4);
+
+				rowNum++;
+				if (rowNum == 1) {
+					assertTrue(recordInfo.get(3).equals("last_modified"));
+				} else if (rowNum == 2) {
+					assertTrue(recordInfo.get(0).equals("0000-0002-0499-7333"));
+				}
+			}
+			entryNum++;
+			assertTrue(entryNum == 1);
+			entry = input.getNextTarEntry();
+		}
+	}
+
+	@Test
+	private void lambdaFileCounterTest() throws Exception {
+		final String lastUpdate = "2020-09-29 00:00:00";
+		OrcidDownloader downloader = new OrcidDownloader();
+		TarArchiveInputStream input = new TarArchiveInputStream(
+			new GzipCompressorInputStream(new FileInputStream("/tmp/last_modified.csv.tar")));
+		TarArchiveEntry entry = input.getNextTarEntry();
+		BufferedReader br = null;
+		StringBuilder sb = new StringBuilder();
+		int rowNum = 0;
+		int entryNum = 0;
+		int modified = 0;
+		while (entry != null) {
+			br = new BufferedReader(new InputStreamReader(input)); // Read directly from tarInput
+			String line;
+			while ((line = br.readLine()) != null) {
+				String[] values = line.toString().split(",");
+				List<String> recordInfo = Arrays.asList(values);
+				String orcidId = recordInfo.get(0);
+				if (downloader.isModified(orcidId, recordInfo.get(3))) {
+					modified++;
+				}
+				rowNum++;
+			}
+			entryNum++;
+			entry = input.getNextTarEntry();
+		}
+		logToFile("rowNum: " + rowNum);
+		logToFile("modified: " + modified);
+	}
+
+	private void logToFile(String log)
+		throws IOException {
+		log = log.concat("\n");
+		Path path = Paths.get("/tmp/orcid_log.txt");
+		Files.write(path, log.getBytes(), StandardOpenOption.APPEND);
+	}
+
+	@Test
+	private void slowedDownDownloadTest() throws Exception {
+		String orcid = "0000-0001-5496-1243";
+		String record = slowedDownDownload(orcid);
+		String filename = "/tmp/downloaded_".concat(orcid).concat(".xml");
+		File f = new File(filename);
+		OutputStream outStream = new FileOutputStream(f);
+		IOUtils.write(record.getBytes(), outStream);
+	}
+
+	private String slowedDownDownload(String orcidId) throws Exception {
+		try (CloseableHttpClient client = HttpClients.createDefault()) {
+			HttpGet httpGet = new HttpGet("https://api.orcid.org/v3.0/" + orcidId + "/record");
+			httpGet.addHeader("Accept", "application/vnd.orcid+xml");
+			httpGet.addHeader("Authorization", "Bearer 78fdb232-7105-4086-8570-e153f4198e3d");
+			long start = System.currentTimeMillis();
+			CloseableHttpResponse response = client.execute(httpGet);
+			long endReq = System.currentTimeMillis();
+			long reqSessionDuration = endReq - start;
+			logToFile("req time (millisec): " + reqSessionDuration);
+			if (reqSessionDuration < 1000) {
+				logToFile("wait ....");
+				Thread.sleep(1000 - reqSessionDuration);
+			}
+			long end = System.currentTimeMillis();
+			long total = end - start;
+			logToFile("total time (millisec): " + total);
+			if (response.getStatusLine().getStatusCode() != 200) {
+				logToFile("Downloading " + orcidId + " status code: " + response.getStatusLine().getStatusCode());
+			}
+			return IOUtils.toString(response.getEntity().getContent());
+		} catch (Throwable e) {
+			e.printStackTrace();
+		}
+		return new String("");
+	}
 }
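Two side notes on the test helpers above, as small illustrative sketches rather than changes to the commit. First, Files.write with only StandardOpenOption.APPEND fails with NoSuchFileException when /tmp/orcid_log.txt does not exist yet; a variant of the logging helper that also creates the file on first use (an assumption, not part of this commit) would be:

    import java.io.IOException;
    import java.nio.charset.StandardCharsets;
    import java.nio.file.Files;
    import java.nio.file.Path;
    import java.nio.file.Paths;
    import java.nio.file.StandardOpenOption;

    final class TestLog {
        private static final Path LOG = Paths.get("/tmp/orcid_log.txt");

        /** Appends one line to the log file, creating it if missing. */
        static void logToFile(String message) throws IOException {
            Files.write(
                LOG,
                (message + "\n").getBytes(StandardCharsets.UTF_8),
                StandardOpenOption.CREATE, StandardOpenOption.APPEND);
        }
    }

Second, testReadBase64CompressedRecord relies on ArgumentApplicationParser.decompressValue to turn a Base64 string back into the original XML. A self-contained equivalent of that step (Base64 decoding followed by gzip inflation; this standalone helper is an illustration, not the project's implementation) looks like:

    import java.io.ByteArrayInputStream;
    import java.nio.charset.StandardCharsets;
    import java.util.Base64;
    import java.util.zip.GZIPInputStream;
    import org.apache.commons.io.IOUtils;

    final class RecordDecompressor {
        /** Decodes the Base64 payload and inflates the gzipped bytes back into XML text. */
        static String decompress(String base64CompressedRecord) throws Exception {
            byte[] bytes = Base64.getDecoder().decode(base64CompressedRecord);
            try (GZIPInputStream gzip = new GZIPInputStream(new ByteArrayInputStream(bytes))) {
                return IOUtils.toString(gzip, StandardCharsets.UTF_8);
            }
        }
    }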
@ -2,17 +2,19 @@
 package eu.dnetlib.doiboost.orcid.xml;
 
 import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
 
 import org.apache.commons.io.IOUtils;
 import org.junit.jupiter.api.Test;
 
-import eu.dnetlib.doiboost.orcid.model.AuthorData;
+import eu.dnetlib.dhp.schema.orcid.AuthorData;
 import eu.dnetlib.doiboost.orcid.model.WorkData;
+import eu.dnetlib.doiboost.orcidnodoi.json.JsonWriter;
 
 public class XMLRecordParserTest {
 
 	@Test
-	public void testOrcidAuthorDataXMLParser() throws Exception {
+	private void testOrcidAuthorDataXMLParser() throws Exception {
 
 		String xml = IOUtils.toString(this.getClass().getResourceAsStream("summary_0000-0001-6828-479X.xml"));
 
@ -27,7 +29,7 @@ public class XMLRecordParserTest {
 	}
 
 	@Test
-	public void testOrcidXMLErrorRecordParser() throws Exception {
+	private void testOrcidXMLErrorRecordParser() throws Exception {
 
 		String xml = IOUtils.toString(this.getClass().getResourceAsStream("summary_error.xml"));
 
@ -40,11 +42,11 @@ public class XMLRecordParserTest {
 	}
 
 	@Test
-	public void testOrcidWorkDataXMLParser() throws Exception {
+	private void testOrcidWorkDataXMLParser() throws Exception {
 
 		String xml = IOUtils
 			.toString(
-				this.getClass().getResourceAsStream("activity_work_0000-0002-5982-8983.xml"));
+				this.getClass().getResourceAsStream("activity_work_0000-0003-2760-1191.xml"));
 
 		XMLRecordParser p = new XMLRecordParser();
 
@ -55,4 +57,21 @@ public class XMLRecordParserTest {
 		assertNotNull(workData.getDoi());
 		System.out.println("doi: " + workData.getDoi());
 	}
+
+	@Test
+	public void testOrcidOtherNamesXMLParser() throws Exception {
+
+		String xml = IOUtils
+			.toString(
+				this.getClass().getResourceAsStream("summary_0000-0001-5109-1000_othername.xml"));
+
+		XMLRecordParser p = new XMLRecordParser();
+
+		AuthorData authorData = XMLRecordParser.VTDParseAuthorData(xml.getBytes());
+		assertNotNull(authorData);
+		assertNotNull(authorData.getOtherNames());
+		assertTrue(authorData.getOtherNames().get(0).equals("Andrew C. Porteus"));
+		String jsonData = JsonWriter.create(authorData);
+		assertNotNull(jsonData);
+	}
 }
@ -0,0 +1,78 @@

package eu.dnetlib.doiboost.orcidnodoi;

import static org.junit.jupiter.api.Assertions.*;

import org.apache.commons.io.IOUtils;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.gson.JsonElement;
import com.google.gson.JsonParser;

import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.doiboost.orcidnodoi.oaf.PublicationToOaf;
import jdk.nashorn.internal.ir.annotations.Ignore;

public class PublicationToOafTest {

	private static final Logger logger = LoggerFactory.getLogger(PublicationToOafTest.class);

	@Test
	@Ignore
	private void convertOafPublicationTest() throws Exception {
		String jsonPublication = IOUtils
			.toString(
				PublicationToOafTest.class.getResourceAsStream("publication.json"));
		JsonElement j = new JsonParser().parse(jsonPublication);
		logger.info("json publication loaded: " + j.toString());
		PublicationToOaf publicationToOaf = new PublicationToOaf();
		Publication oafPublication = (Publication) publicationToOaf
			.generatePublicationActionsFromDump(j.getAsJsonObject());
		assertNotNull(oafPublication.getId());
		assertNotNull(oafPublication.getOriginalId());
		assertEquals(oafPublication.getOriginalId().get(0), "60153327");
		logger.info("oafPublication.getId(): " + oafPublication.getId());
		assertEquals(
			oafPublication.getTitle().get(0).getValue(),
			"Evaluation of a percutaneous optical fibre glucose sensor (FiberSense) across the glycemic range with rapid glucoseexcursions using the glucose clamp");
		assertNotNull(oafPublication.getLastupdatetimestamp());
		assertNotNull(oafPublication.getDateofcollection());
		assertNotNull(oafPublication.getDateoftransformation());
		assertTrue(oafPublication.getAuthor().size() == 7);
		oafPublication.getAuthor().forEach(a -> {
			assertNotNull(a.getFullname());
			assertNotNull(a.getRank());
			logger.info("a.getFullname(): " + a.getFullname());
			if (a.getName() != null) {
				logger.info("a.getName(): " + a.getName());
			}
			if (a.getSurname() != null) {
				logger.info("a.getSurname(): " + a.getSurname());
			}
			logger.info("a.getRank(): " + a.getRank());
			if (a.getPid() != null) {
				logger.info("a.getPid(): " + a.getPid().get(0).getValue());
			}

		});
		assertNotNull(oafPublication.getCollectedfrom());
		if (oafPublication.getSource() != null) {
			logger.info((oafPublication.getSource().get(0).getValue()));
		}
		if (oafPublication.getExternalReference() != null) {
			oafPublication.getExternalReference().forEach(e -> {
				assertNotNull(e.getRefidentifier());
				assertEquals(e.getQualifier().getSchemeid(), "dnet:pid_types");
			});
		}
		assertNotNull(oafPublication.getInstance());
		oafPublication.getInstance().forEach(i -> {
			assertNotNull(i.getInstancetype().getClassid());
			logger.info("i.getInstancetype().getClassid(): " + i.getInstancetype().getClassid());
			assertNotNull(i.getInstancetype().getClassname());
			logger.info("i.getInstancetype().getClassname(): " + i.getInstancetype().getClassname());
		});
	}
}
@ -0,0 +1,348 @@

package eu.dnetlib.doiboost.orcidnodoi.xml;

import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;

import java.io.IOException;
import java.util.*;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.collect.Lists;
import com.ximpleware.NavException;
import com.ximpleware.ParseException;
import com.ximpleware.XPathEvalException;
import com.ximpleware.XPathParseException;

import eu.dnetlib.dhp.parser.utility.VtdException;
import eu.dnetlib.dhp.schema.orcid.AuthorData;
import eu.dnetlib.doiboost.orcidnodoi.model.Contributor;
import eu.dnetlib.doiboost.orcidnodoi.model.WorkDataNoDoi;
import eu.dnetlib.doiboost.orcidnodoi.similarity.AuthorMatcher;

public class OrcidNoDoiTest {

	private static final Logger logger = LoggerFactory.getLogger(OrcidNoDoiTest.class);

	static String nameA = "Khairy";
	static String surnameA = "Abdel Dayem";
	static String orcidIdA = "0000-0003-2760-1191";

	@Test
	public void readPublicationFieldsTest()
		throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException {
		logger.info("running loadPublicationFieldsTest ....");
		String xml = IOUtils
			.toString(
				OrcidNoDoiTest.class.getResourceAsStream("activity_work_0000-0002-2536-4498.xml"));

		if (xml == null) {
			logger.info("Resource not found");
		}
		XMLRecordParserNoDoi p = new XMLRecordParserNoDoi();
		if (p == null) {
			logger.info("XMLRecordParserNoDoi null");
		}
		WorkDataNoDoi workData = null;
		try {
			workData = p.VTDParseWorkData(xml.getBytes());
		} catch (Exception e) {
			logger.error("parsing xml", e);
		}
		assertNotNull(workData);
		assertNotNull(workData.getOid());
		logger.info("oid: " + workData.getOid());
		assertNotNull(workData.getTitles());
		logger.info("titles: ");
		workData.getTitles().forEach(t -> {
			logger.info(t);
		});
		logger.info("source: " + workData.getSourceName());
		logger.info("type: " + workData.getType());
		logger.info("urls: ");
		workData.getUrls().forEach(u -> {
			logger.info(u);
		});
		logger.info("publication date: ");
		workData.getPublicationDates().forEach(d -> {
			logger.info(d.getYear() + " - " + d.getMonth() + " - " + d.getDay());
		});
		logger.info("external id: ");
		workData.getExtIds().removeIf(e -> e.getRelationShip() != null && !e.getRelationShip().equals("self"));
		workData.getExtIds().forEach(e -> {
			logger.info(e.getType() + " - " + e.getValue() + " - " + e.getRelationShip());
		});
		logger.info("contributors: ");
		workData.getContributors().forEach(c -> {
			logger
				.info(
					c.getName() + " - " + c.getRole() + " - " + c.getSequence());
		});

	}

	@Test
	public void authorDoubleMatchTest() throws Exception {
		logger.info("running authorSimpleMatchTest ....");
		String orcidWork = "activity_work_0000-0003-2760-1191-similarity.xml";
		AuthorData author = new AuthorData();
		author.setName(nameA);
		author.setSurname(surnameA);
		author.setOid(orcidIdA);
		String xml = IOUtils
			.toString(
				OrcidNoDoiTest.class.getResourceAsStream(orcidWork));

		if (xml == null) {
			logger.info("Resource not found");
		}
		XMLRecordParserNoDoi p = new XMLRecordParserNoDoi();
		if (p == null) {
			logger.info("XMLRecordParserNoDoi null");
		}
		WorkDataNoDoi workData = null;
		try {
			workData = p.VTDParseWorkData(xml.getBytes());
		} catch (Exception e) {
			logger.error("parsing xml", e);
		}
		assertNotNull(workData);

		Contributor a = workData.getContributors().get(0);
		assertTrue(a.getCreditName().equals("Abdel-Dayem K"));

		AuthorMatcher.match(author, workData.getContributors());

		assertTrue(workData.getContributors().size() == 6);
	}

	@Test
	public void readContributorsTest()
		throws IOException, XPathEvalException, XPathParseException, NavException, VtdException, ParseException {
		logger.info("running loadPublicationFieldsTest ....");
		String xml = IOUtils
			.toString(
				OrcidNoDoiTest.class.getResourceAsStream("activity_work_0000-0003-2760-1191_contributors.xml"));

		if (xml == null) {
			logger.info("Resource not found");
		}
		XMLRecordParserNoDoi p = new XMLRecordParserNoDoi();
		if (p == null) {
			logger.info("XMLRecordParserNoDoi null");
		}
		WorkDataNoDoi workData = null;
		try {
			workData = p.VTDParseWorkData(xml.getBytes());
		} catch (Exception e) {
			logger.error("parsing xml", e);
		}
		assertNotNull(workData.getContributors());
		assertTrue(workData.getContributors().size() == 5);
		assertTrue(StringUtils.isBlank(workData.getContributors().get(0).getCreditName()));
		assertTrue(workData.getContributors().get(0).getSequence().equals("seq0"));
		assertTrue(workData.getContributors().get(0).getRole().equals("role0"));
		assertTrue(workData.getContributors().get(1).getCreditName().equals("creditname1"));
		assertTrue(StringUtils.isBlank(workData.getContributors().get(1).getSequence()));
		assertTrue(StringUtils.isBlank(workData.getContributors().get(1).getRole()));
		assertTrue(workData.getContributors().get(2).getCreditName().equals("creditname2"));
		assertTrue(workData.getContributors().get(2).getSequence().equals("seq2"));
		assertTrue(StringUtils.isBlank(workData.getContributors().get(2).getRole()));
		assertTrue(workData.getContributors().get(3).getCreditName().equals("creditname3"));
		assertTrue(StringUtils.isBlank(workData.getContributors().get(3).getSequence()));
		assertTrue(workData.getContributors().get(3).getRole().equals("role3"));
		assertTrue(StringUtils.isBlank(workData.getContributors().get(4).getCreditName()));
		assertTrue(workData.getContributors().get(4).getSequence().equals("seq4"));
		assertTrue(workData.getContributors().get(4).getRole().equals("role4"));
	}

	@Test
	public void authorSimpleMatchTest() throws Exception {
		String orcidWork = "activity_work_0000-0002-5982-8983.xml";
		AuthorData author = new AuthorData();
		author.setName("Parkhouse");
		author.setSurname("H.");
		author.setOid("0000-0002-5982-8983");
		String xml = IOUtils
			.toString(
				OrcidNoDoiTest.class.getResourceAsStream(orcidWork));

		if (xml == null) {
			logger.info("Resource not found");
		}
		XMLRecordParserNoDoi p = new XMLRecordParserNoDoi();
		if (p == null) {
			logger.info("XMLRecordParserNoDoi null");
		}
		WorkDataNoDoi workData = null;
		try {
			workData = p.VTDParseWorkData(xml.getBytes());
		} catch (Exception e) {
			logger.error("parsing xml", e);
		}
		assertNotNull(workData);

		Contributor a = workData.getContributors().get(0);
		assertTrue(a.getCreditName().equals("Parkhouse, H."));

		AuthorMatcher.match(author, workData.getContributors());

		assertTrue(workData.getContributors().size() == 2);
		Contributor c = workData.getContributors().get(0);
		assertTrue(c.getOid().equals("0000-0002-5982-8983"));
		assertTrue(c.getName().equals("Parkhouse"));
		assertTrue(c.getSurname().equals("H."));
		assertTrue(c.getCreditName().equals("Parkhouse, H."));
	}

	@Test
	public void match() {

		AuthorData author = new AuthorData();
		author.setName("Joe");
		author.setSurname("Dodge");
		author.setOid("0000-1111-2222-3333");
		Contributor contributor = new Contributor();
		contributor.setCreditName("Joe Dodge");
		List<Contributor> contributors = Arrays.asList(contributor);
		AuthorMatcher am = new AuthorMatcher();
		int matchCounter = 0;
		List<Integer> matchCounters = Arrays.asList(matchCounter);
		contributors
			.stream()
			.filter(c -> !StringUtils.isBlank(c.getCreditName()))
			.forEach(c -> {
				if (am.simpleMatch(c.getCreditName(), author.getName()) ||
					am.simpleMatch(c.getCreditName(), author.getSurname()) ||
					am.simpleMatchOnOtherNames(c.getCreditName(), author.getOtherNames())) {
					matchCounters.set(0, matchCounters.get(0) + 1);
					c.setSimpleMatch(true);
				}
			});

		assertTrue(matchCounters.get(0) == 1);
		am.updateAuthorsSimpleMatch(contributors, author);
		assertTrue(contributors.get(0).getName().equals("Joe"));
		assertTrue(contributors.get(0).getSurname().equals("Dodge"));
		assertTrue(contributors.get(0).getCreditName().equals("Joe Dodge"));
		assertTrue(contributors.get(0).getOid().equals("0000-1111-2222-3333"));

		AuthorData authorX = new AuthorData();
		authorX.setName(nameA);
		authorX.setSurname(surnameA);
		authorX.setOid(orcidIdA);
		Contributor contributorA = new Contributor();
		contributorA.setCreditName("Abdel-Dayem Khai");
		Contributor contributorB = new Contributor();
		contributorB.setCreditName("Abdel-Dayem Fake");
		List<Contributor> contributorList = new ArrayList<>();
		contributorList.add(contributorA);
		contributorList.add(contributorB);
		int matchCounter2 = 0;
		List<Integer> matchCounters2 = Arrays.asList(matchCounter2);
		contributorList
			.stream()
			.filter(c -> !StringUtils.isBlank(c.getCreditName()))
			.forEach(c -> {
				if (am.simpleMatch(c.getCreditName(), authorX.getName()) ||
					am.simpleMatch(c.getCreditName(), authorX.getSurname()) ||
					am.simpleMatchOnOtherNames(c.getCreditName(), author.getOtherNames())) {
					int currentCounter = matchCounters2.get(0);
					currentCounter += 1;
					matchCounters2.set(0, currentCounter);
					c.setSimpleMatch(true);
				}
			});

		assertTrue(matchCounters2.get(0) == 2);
		assertTrue(contributorList.get(0).isSimpleMatch());
		assertTrue(contributorList.get(1).isSimpleMatch());

		Optional<Contributor> optCon = contributorList
			.stream()
			.filter(c -> c.isSimpleMatch())
			.filter(c -> !StringUtils.isBlank(c.getCreditName()))
			.map(c -> {
				c.setScore(am.bestMatch(authorX.getName(), authorX.getSurname(), c.getCreditName()));
				return c;
			})
			.filter(c -> c.getScore() >= AuthorMatcher.threshold)
			.max(Comparator.comparing(c -> c.getScore()));
		assertTrue(optCon.isPresent());

		final Contributor bestMatchContributor = optCon.get();
		bestMatchContributor.setBestMatch(true);
		assertTrue(bestMatchContributor.getCreditName().equals("Abdel-Dayem Khai"));
		assertTrue(contributorList.get(0).isBestMatch());
		assertTrue(!contributorList.get(1).isBestMatch());
		am.updateAuthorsSimilarityMatch(contributorList, authorX);
		assertTrue(contributorList.get(0).getName().equals(nameA));
		assertTrue(contributorList.get(0).getSurname().equals(surnameA));
		assertTrue(contributorList.get(0).getCreditName().equals("Abdel-Dayem Khai"));
		assertTrue(contributorList.get(0).getOid().equals(orcidIdA));
		assertTrue(StringUtils.isBlank(contributorList.get(1).getOid()));
	}

	@Test
	public void authorBestMatchTest() throws Exception {
		String name = "Khairy";
		String surname = "Abdel Dayem";
		String orcidWork = "activity_work_0000-0003-2760-1191.xml";
		AuthorData author = new AuthorData();
		author.setName(name);
		author.setSurname(surname);
		author.setOid(orcidIdA);
		String xml = IOUtils
			.toString(
				OrcidNoDoiTest.class.getResourceAsStream(orcidWork));

		if (xml == null) {
			logger.info("Resource not found");
		}
		XMLRecordParserNoDoi p = new XMLRecordParserNoDoi();
		if (p == null) {
			logger.info("XMLRecordParserNoDoi null");
		}
		WorkDataNoDoi workData = null;
		try {
			workData = p.VTDParseWorkData(xml.getBytes());
		} catch (Exception e) {
			logger.error("parsing xml", e);
		}
		AuthorMatcher.match(author, workData.getContributors());
		assertTrue(workData.getContributors().size() == 5);
		List<Contributor> c = workData.getContributors();
		assertTrue(c.get(0).getName().equals(name));
		assertTrue(c.get(0).getSurname().equals(surname));
		assertTrue(c.get(0).getCreditName().equals("Khair Abde Daye"));
		assertTrue(c.get(0).getOid().equals(orcidIdA));
	}

	@Test
	public void otherNamesMatchTest()
		throws VtdException, ParseException, IOException, XPathEvalException, NavException, XPathParseException {

		AuthorData author = new AuthorData();
		author.setName("Joe");
		author.setSurname("Dodge");
		author.setOid("0000-1111-2222-3333");
		String otherName1 = new String("Joe Dr. Dodge");
		String otherName2 = new String("XY");
		List<String> others = Lists.newArrayList();
		others.add(otherName1);
		others.add(otherName2);
		author.setOtherNames(others);
		Contributor contributor = new Contributor();
		contributor.setCreditName("XY");
		List<Contributor> contributors = Arrays.asList(contributor);
		AuthorMatcher.match(author, contributors);
		assertTrue(contributors.get(0).getName().equals("Joe"));
		assertTrue(contributors.get(0).getSurname().equals("Dodge"));
		assertTrue(contributors.get(0).getOid().equals("0000-1111-2222-3333"));
	}
}
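The last tests above exercise AuthorMatcher's two-step strategy: a cheap token match (simpleMatch / simpleMatchOnOtherNames) followed by a similarity score (bestMatch) compared against a threshold to pick the best candidate. A minimal, self-contained sketch of the same idea using Jaro-Winkler similarity from Apache commons-text follows; the threshold value and the scoring function are assumptions for illustration, not the project's actual implementation.

    import java.util.Comparator;
    import java.util.List;
    import java.util.Optional;
    import org.apache.commons.text.similarity.JaroWinklerSimilarity;

    final class NaiveAuthorMatch {
        private static final JaroWinklerSimilarity SIMILARITY = new JaroWinklerSimilarity();
        private static final double THRESHOLD = 0.8; // assumed value

        /** Cheap first pass: does the credit name contain the given token at all? */
        static boolean simpleMatch(String creditName, String token) {
            return token != null && creditName.toLowerCase().contains(token.toLowerCase());
        }

        /** Second pass: pick the credit name most similar to "name surname", if above the threshold. */
        static Optional<String> bestMatch(String name, String surname, List<String> creditNames) {
            final String target = (name + " " + surname).toLowerCase();
            return creditNames
                .stream()
                .filter(c -> SIMILARITY.apply(target, c.toLowerCase()) >= THRESHOLD)
                .max(Comparator.comparingDouble(c -> SIMILARITY.apply(target, c.toLowerCase())));
        }
    }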
@ -1 +0,0 @@
H4sIAAAAAAAAAO1a227bOBB9z1cIepd18SW24aho0wTbAgEWjRdY9I2RaJtbSdSSkhP165eURIm6kHa2SbCLNkBiWDxzhhxyZg7tbN49xZFxhIQinFyZ7sQxDZgEOETJ/sr8Y3trLU2DZiAJQYQTeGUWkJrv/IsNgQEm4bp6MVKQHa5M22E/Fvt1rcViNrfmzupP02AOErpGSQZJAqIr85Bl6dq2Hx8fJ5gEKGR/93ZCbYEQFjDMA5CV01KZNBBhEyKaoSTQW0mgxg6mbCUgg6HGrMEIK5wdILESEEO1VYsRVjGMH1i8DyhVW7WYJhqEYKKJBB8W2ADHsS4A1bhAV1uoRlfjAp2yaWG2S1YIM4AiqrbrIwXDN1g8ah3WgGblMbPWrJwPN9in6gxZKIRJhnYI6mI2BAueXZ5UGaCyrQFNVAjcQcISB+oC0oKEHQhDAqnGpga0WXRE7ABaKaZIf8j7SMHAIvtNbcVHBfLA0gSTQg2uAe0+pREuYhZK3WYJjLD6OwcRC/2pTO/AhC2F5IgCTfLVgO7ZPXVim71hFYLFEOm2tMW02UQhIAFP+pxojm0X186QvSfwiOCjbpoNSNg95JFmV/lof36MgOKc6KI3gJr+hcF+NlX9WJdgKXmqURmRE+RzdsroW+qRLrGxJYsBDe8uvs6qBAzMDphmfuO2AZePq4XY2pVspISVM1zyJCMiHIAI+jDZ2COPa4dayk2dUSL1JEdiJCCwTAErhtkBh/5d2SiskonAcGOrgEMqmj/EiPK+b4Wsq/me464sZ2l53tadrmeLtXc58ZbLry1n32IQ8QjQzIqZeGBBDAWrx7Ztbrnu1puu59P11JksPfdrE/sRm5FlRwDFMPQzkkNpjfXTIZ4Jmoqv7A49s96gxjolKAak0LN0QfU+j+7kpiowdR3SiCZRieSTVplyIWEcEUUPKEIZK85p/hChwKzJxgRYSyJvVXk+2k0abv187rWb1EGP8o1u/QlW3dZLi24lxHqPjjAp1RT1twgkRb4Z6IwO6ATfDsQoKkqs/xmBETIZ0e6GLW2H9LgVe5I2pLqNlmCmLTF120Ovq2gZe9AOa3lEK0Gl5ag0lWxZ6xAhWPSLEqJFJqhFnVB/WnuB6c59qNbG5J5+XSN44aTZ0+qlftg2eEkPWDSPecprY9Aqg2fUyZnlTLfObD2brZ3pZHm5OLNOStOUbjfaWMi47la3XM39Sh/VBqXkaWTfiWPXwFRMte7W0giMiqMvjbVkA7CKtb2yafkkmIpJ0ndaKhmn4uroZi1bF6niG2jCs2pRi1bx1kpdyyYwKg5+edESlABFP3zplOxPbk9wnnaHX9u9zC9VPjpEKZDjQAXYyooU+iFGzfwGg8+iO4Ioh77rTFzXWdnvr69v7u8nPCYTb7X0PNcZ9VNZPctRgknMjv53GBoZAQlF5Q2Wiz2zcQ8Cdu7oafct1/PmwDp1c1FiISyvSc9dOud4llMCoyrZWTHyKYx2o7Qd1PjJGTEbOYkjqJGjuOFJWqZy22XzzApwyG6qly67kCxWjnkqy+0WOSaWWe9LI1BYKAnhE1PNpj4lelqZp+XUmjpbz1szYTt3JjP38hyt3Od9raSXfVR19/TBqHBWEPHjr8192Wr8gl+RSJuzWi5nlrtyp+P3fJ2H3t1/yNS9++uoTn4eMGpsPztAvZCWd4Rrgillt/Q+XfcCoXGsAJXZkqEsOmOLK9g9K1CR9ZFdnBN+kzdu2WnNCTTuQEbQk3HNMp3VvlIXGnflZwfGDhPjI6y+FDC+wBQyJnbHMm7Ze0iMO3yElba7JTg2biIYZATzzzXSA4jwnoDYuEd7lvK0WZRmyhv71KLOb2oK9Hnn5YWam4ryVRqcytlbNznVPF690akcv1SzK/nPangq5An99W8jpIxKXSP4Gf2LlRI+CUAyFERQZJry+DZFuOyb1eeJ6pYjWxRM95fNrJlf+UQfpPPcVOsRS6nKxKebmxvjfXl+60V1x0fUyEBn9LS7rRfvP6rt64/GVlt3vnYXa8ebLJz5T6jt53ObB8OeLl2m2WZvJurP8fviav4cpz+BjF+4znzqzd3TMr5FvryMP5GBPyjjXyC/ZR+/ZPwvGd+Rzh8IQIl1jWOWVkyDf+L/PLMDATSuDyBJYGTdQ67DuYq/ZxUwg/vC+AAoq4fsyXuWtwVF1MA74+bIA/GFlwc2+BHSIgkOBCfoe1kvjC1OuYRPD4WBSi78DRq/szGu+H/p+ddqaiovb9bYVBN4veam8vj/l+6q0PwnNbu7OkOzy3bslxf3ZWNWPThpF4LC91or/va17gefq3e83v0GQZQdAkCgcZPsUQIhQcn+DW4NnbHyqwjxxaP2S0b/YmN3/tnSv/gH9+klwrUpAAA=
|
|
File diff suppressed because one or more lines are too long
|
@ -0,0 +1,106 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<work:work xmlns:address="http://www.orcid.org/ns/address"
	xmlns:email="http://www.orcid.org/ns/email" xmlns:history="http://www.orcid.org/ns/history"
	xmlns:employment="http://www.orcid.org/ns/employment"
	xmlns:education="http://www.orcid.org/ns/education"
	xmlns:other-name="http://www.orcid.org/ns/other-name"
	xmlns:deprecated="http://www.orcid.org/ns/deprecated"
	xmlns:funding="http://www.orcid.org/ns/funding"
	xmlns:research-resource="http://www.orcid.org/ns/research-resource"
	xmlns:service="http://www.orcid.org/ns/service"
	xmlns:researcher-url="http://www.orcid.org/ns/researcher-url"
	xmlns:distinction="http://www.orcid.org/ns/distinction"
	xmlns:internal="http://www.orcid.org/ns/internal"
	xmlns:membership="http://www.orcid.org/ns/membership"
	xmlns:person="http://www.orcid.org/ns/person"
	xmlns:personal-details="http://www.orcid.org/ns/personal-details"
	xmlns:bulk="http://www.orcid.org/ns/bulk" xmlns:common="http://www.orcid.org/ns/common"
	xmlns:record="http://www.orcid.org/ns/record" xmlns:keyword="http://www.orcid.org/ns/keyword"
	xmlns:activities="http://www.orcid.org/ns/activities"
	xmlns:qualification="http://www.orcid.org/ns/qualification"
	xmlns:external-identifier="http://www.orcid.org/ns/external-identifier"
	xmlns:error="http://www.orcid.org/ns/error"
	xmlns:preferences="http://www.orcid.org/ns/preferences"
	xmlns:invited-position="http://www.orcid.org/ns/invited-position"
	xmlns:work="http://www.orcid.org/ns/work"
	xmlns:peer-review="http://www.orcid.org/ns/peer-review" put-code="28776099"
	path="/0000-0003-2760-1191/work/28776099" visibility="public">
	<common:created-date>2016-12-12T23:02:05.233Z</common:created-date>
	<common:last-modified-date>2016-12-13T09:08:16.412Z</common:last-modified-date>
	<common:source>
		<common:source-orcid>
			<common:uri>https://orcid.org/0000-0002-9157-3431</common:uri>
			<common:path>0000-0002-9157-3431</common:path>
			<common:host>orcid.org</common:host>
		</common:source-orcid>
		<common:source-name>Europe PubMed Central</common:source-name>
	</common:source>
	<work:title>
		<common:title>Cutoff Value of Admission N-Terminal Pro-Brain Natriuretic Peptide Which
			Predicts Poor Myocardial Perfusion after Primary Percutaneous Coronary Intervention for
			ST-Segment-Elevation Myocardial Infarction.</common:title>
	</work:title>
	<work:citation>
		<work:citation-type>formatted-unspecified</work:citation-type>
		<work:citation-value>Abdel-Dayem K, Eweda II, El-Sherbiny A, Dimitry MO, Nammas W, Acta
			Cardiologica Sinica, 2016, vol. 32, no. 6, pp. 649-655, 2016</work:citation-value>
	</work:citation>
	<work:type>journal-article</work:type>
	<common:publication-date>
		<common:year>2016</common:year>
		<common:month>11</common:month>
	</common:publication-date>
	<common:external-ids>
		<common:external-id>
			<common:external-id-type>pmid</common:external-id-type>
			<common:external-id-value>27899851</common:external-id-value>
			<common:external-id-normalized transient="true">27899851</common:external-id-normalized>
			<common:external-id-relationship>self</common:external-id-relationship>
		</common:external-id>
		<common:external-id>
			<common:external-id-type>pmc</common:external-id-type>
			<common:external-id-value>PMC5126442</common:external-id-value>
			<common:external-id-normalized transient="true"
				>PMC5126442</common:external-id-normalized>
			<common:external-id-relationship>self</common:external-id-relationship>
		</common:external-id>
	</common:external-ids>
	<common:url>http://europepmc.org/abstract/med/27899851</common:url>
	<work:contributors>
		<work:contributor>
			<work:credit-name>Abdel-Dayem K</work:credit-name>
			<work:contributor-attributes>
				<work:contributor-sequence>first</work:contributor-sequence>
				<work:contributor-role>author</work:contributor-role>
			</work:contributor-attributes>
		</work:contributor>
		<work:contributor>
			<work:credit-name>Eweda II</work:credit-name>
			<work:contributor-attributes>
				<work:contributor-sequence>first</work:contributor-sequence>
				<work:contributor-role>author</work:contributor-role>
			</work:contributor-attributes>
		</work:contributor>
		<work:contributor>
			<work:credit-name>El-Sherbiny A</work:credit-name>
			<work:contributor-attributes>
				<work:contributor-sequence>first</work:contributor-sequence>
				<work:contributor-role>author</work:contributor-role>
			</work:contributor-attributes>
		</work:contributor>
		<work:contributor>
			<work:credit-name>Dimitry MO</work:credit-name>
			<work:contributor-attributes>
				<work:contributor-sequence>first</work:contributor-sequence>
				<work:contributor-role>author</work:contributor-role>
			</work:contributor-attributes>
		</work:contributor>
		<work:contributor>
			<work:credit-name>Nammas W</work:credit-name>
			<work:contributor-attributes>
				<work:contributor-sequence>first</work:contributor-sequence>
				<work:contributor-role>author</work:contributor-role>
			</work:contributor-attributes>
		</work:contributor>
	</work:contributors>
</work:work>
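The fixture above is the kind of ORCID work record the VTD-based parser consumes. For a quick look at its content outside the project, a plain JAXP/XPath read of the same file works with standard JDK APIs only; this is not how XMLRecordParserNoDoi works internally, and the resource name is an assumption since the diff does not show the file name.

    import java.io.InputStream;
    import javax.xml.parsers.DocumentBuilderFactory;
    import javax.xml.xpath.XPath;
    import javax.xml.xpath.XPathConstants;
    import javax.xml.xpath.XPathFactory;
    import org.w3c.dom.Document;
    import org.w3c.dom.NodeList;

    public class WorkFixturePeek {
        public static void main(String[] args) throws Exception {
            DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
            dbf.setNamespaceAware(true); // namespace-aware parse; XPath below matches on local names
            try (InputStream in = WorkFixturePeek.class
                .getResourceAsStream("activity_work_0000-0003-2760-1191.xml")) { // assumed file name
                Document doc = dbf.newDocumentBuilder().parse(in);
                XPath xpath = XPathFactory.newInstance().newXPath();
                // local-name() keeps the query independent of the work: prefix binding.
                NodeList names = (NodeList) xpath
                    .evaluate("//*[local-name()='credit-name']", doc, XPathConstants.NODESET);
                for (int i = 0; i < names.getLength(); i++) {
                    System.out.println(names.item(i).getTextContent());
                }
            }
        }
    }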
@ -0,0 +1,770 @@
<?xml version="1.0" encoding="UTF-8"?>
<record:record path="/8888-8888-8888-8880" xmlns:activities="http://www.orcid.org/ns/activities"
	xmlns:personal-details="http://www.orcid.org/ns/personal-details"
	xmlns:other-name="http://www.orcid.org/ns/other-name"
	xmlns:researcher-url="http://www.orcid.org/ns/researcher-url"
	xmlns:email="http://www.orcid.org/ns/email" xmlns:address="http://www.orcid.org/ns/address"
	xmlns:keyword="http://www.orcid.org/ns/keyword"
	xmlns:external-identifier="http://www.orcid.org/ns/external-identifier"
	xmlns:employment="http://www.orcid.org/ns/employment" xmlns:common="http://www.orcid.org/ns/common"
	xmlns:education="http://www.orcid.org/ns/education" xmlns:funding="http://www.orcid.org/ns/funding"
	xmlns:history="http://www.orcid.org/ns/history" xmlns:person="http://www.orcid.org/ns/person"
	xmlns:preferences="http://www.orcid.org/ns/preferences" xmlns:record="http://www.orcid.org/ns/record"
	xmlns:peer-review="http://www.orcid.org/ns/peer-review" xmlns:work="http://www.orcid.org/ns/work"
	xmlns:distinction="http://www.orcid.org/ns/distinction" xmlns:invited-position="http://www.orcid.org/ns/invited-position"
	xmlns:membership="http://www.orcid.org/ns/membership" xmlns:qualification="http://www.orcid.org/ns/qualification"
	xmlns:service="http://www.orcid.org/ns/service"
	xmlns:research-resource="http://www.orcid.org/ns/research-resource"
	xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation="http://www.orcid.org/ns/record ../record-3.0.xsd ">
	<common:orcid-identifier>
		<common:uri>https://orcid.org/8888-8888-8888-8880</common:uri>
		<common:path>8888-8888-8888-8880</common:path>
		<common:host>orcid.org</common:host>
	</common:orcid-identifier>
	<preferences:preferences>
		<preferences:locale>zh_CN</preferences:locale>
	</preferences:preferences>
	<history:history visibility="private">
		<history:creation-method>API</history:creation-method>
		<history:completion-date>2001-12-31T12:00:00</history:completion-date>
		<history:submission-date>2001-12-31T12:00:00</history:submission-date>
		<common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
		<history:claimed>true</history:claimed>
		<common:source>
			<common:source-orcid>
				<common:uri>https://orcid.org/8888-8888-8888-8880</common:uri>
				<common:path>8888-8888-8888-8880</common:path>
				<common:host>orcid.org</common:host>
			</common:source-orcid>
			<common:source-name />
		</common:source>
		<history:deactivation-date>2001-12-31T12:00:00</history:deactivation-date>
		<history:verified-email>true</history:verified-email>
		<history:verified-primary-email>true</history:verified-primary-email>
	</history:history>
	<person:person path="/8888-8888-8888-8880">
		<common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
		<person:name visibility="public"
			path="/8888-8888-8888-8880/personal-details">
			<common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
			<personal-details:given-names>give-names</personal-details:given-names>
			<personal-details:family-name>family-name</personal-details:family-name>
			<personal-details:credit-name>credit-name</personal-details:credit-name>
		</person:name>
		<other-name:other-names
			path="/8888-8888-8888-8880/other-names">
|
||||||
|
<common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
|
||||||
|
<other-name:other-name visibility="public"
|
||||||
|
put-code="1" display-index="0">
|
||||||
|
<common:created-date>2001-12-31T12:00:00</common:created-date>
|
||||||
|
<common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
|
||||||
|
<common:source>
|
||||||
|
<common:source-orcid>
|
||||||
|
<common:uri>https://orcid.org/8888-8888-8888-8880</common:uri>
|
||||||
|
<common:path>8888-8888-8888-8880</common:path>
|
||||||
|
<common:host>orcid.org</common:host>
|
||||||
|
</common:source-orcid>
|
||||||
|
<common:source-name />
|
||||||
|
</common:source>
|
||||||
|
<other-name:content>other-name-1</other-name:content>
|
||||||
|
</other-name:other-name>
|
||||||
|
</other-name:other-names>
|
||||||
|
<person:biography visibility="public"
|
||||||
|
path="/8888-8888-8888-8880/biography">
|
||||||
|
<common:created-date>2001-12-31T12:00:00</common:created-date>
|
||||||
|
<common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
|
||||||
|
<personal-details:content>biography</personal-details:content>
|
||||||
|
</person:biography>
|
||||||
|
<researcher-url:researcher-urls
|
||||||
|
path="/8888-8888-8888-8880/researcher-urls">
|
||||||
|
<common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
|
||||||
|
<researcher-url:researcher-url
|
||||||
|
put-code="1248" visibility="public" display-index="0">
|
||||||
|
<common:created-date>2001-12-31T12:00:00</common:created-date>
|
||||||
|
<common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
|
||||||
|
<common:source>
|
||||||
|
<common:source-orcid>
|
||||||
|
<common:uri>https://orcid.org/8888-8888-8888-8880</common:uri>
|
||||||
|
<common:path>8888-8888-8888-8880</common:path>
|
||||||
|
<common:host>orcid.org</common:host>
|
||||||
|
</common:source-orcid>
|
||||||
|
<common:source-name />
|
||||||
|
</common:source>
|
||||||
|
<researcher-url:url-name>url-name-1</researcher-url:url-name>
|
||||||
|
<researcher-url:url>http://url.com/</researcher-url:url>
|
||||||
|
</researcher-url:researcher-url>
|
||||||
|
</researcher-url:researcher-urls>
|
||||||
|
<email:emails path="/8888-8888-8888-8880/email">
|
||||||
|
<common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
|
||||||
|
<email:email visibility="public" put-code="0">
|
||||||
|
<common:created-date>2001-12-31T12:00:00</common:created-date>
|
||||||
|
<common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
|
||||||
|
<common:source>
|
||||||
|
<common:source-orcid>
|
||||||
|
<common:uri>https://orcid.org/8888-8888-8888-8880</common:uri>
|
||||||
|
<common:path>8888-8888-8888-8880</common:path>
|
||||||
|
<common:host>orcid.org</common:host>
|
||||||
|
</common:source-orcid>
|
||||||
|
<common:source-name />
|
||||||
|
</common:source>
|
||||||
|
<email:email>user1@email.com</email:email>
|
||||||
|
</email:email>
|
||||||
|
</email:emails>
|
||||||
|
<address:addresses path="/8888-8888-8888-8880/address">
|
||||||
|
<common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
|
||||||
|
<address:address visibility="public" put-code="1"
|
||||||
|
display-index="0">
|
||||||
|
<common:created-date>2001-12-31T12:00:00</common:created-date>
|
||||||
|
<common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
|
||||||
|
<common:source>
|
||||||
|
<common:source-orcid>
|
||||||
|
<common:uri>https://orcid.org/8888-8888-8888-8880</common:uri>
|
||||||
|
<common:path>8888-8888-8888-8880</common:path>
|
||||||
|
<common:host>orcid.org</common:host>
|
||||||
|
</common:source-orcid>
|
||||||
|
<common:source-name />
|
||||||
|
</common:source>
|
||||||
|
<address:country>US</address:country>
|
||||||
|
</address:address>
|
||||||
|
</address:addresses>
|
||||||
|
<keyword:keywords path="/8888-8888-8888-8880/keywords">
|
||||||
|
<common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
|
||||||
|
<keyword:keyword visibility="public" put-code="1"
|
||||||
|
display-index="0">
|
||||||
|
<common:created-date>2001-12-31T12:00:00</common:created-date>
|
||||||
|
<common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
|
||||||
|
<common:source>
|
||||||
|
<common:source-orcid>
|
||||||
|
<common:uri>https://orcid.org/8888-8888-8888-8880</common:uri>
|
||||||
|
<common:path>8888-8888-8888-8880</common:path>
|
||||||
|
<common:host>orcid.org</common:host>
|
||||||
|
</common:source-orcid>
|
||||||
|
<common:source-name />
|
||||||
|
</common:source>
|
||||||
|
<keyword:content>keyword1</keyword:content>
|
||||||
|
</keyword:keyword>
|
||||||
|
</keyword:keywords>
|
||||||
|
<external-identifier:external-identifiers
|
||||||
|
path="/8888-8888-8888-8880/external-identifiers">
|
||||||
|
<common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
|
||||||
|
<external-identifier:external-identifier
|
||||||
|
visibility="public" put-code="1" display-index="0">
|
||||||
|
<common:created-date>2001-12-31T12:00:00</common:created-date>
|
||||||
|
<common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
|
||||||
|
<common:source>
|
||||||
|
<common:source-orcid>
|
||||||
|
<common:uri>https://orcid.org/8888-8888-8888-8880</common:uri>
|
||||||
|
<common:path>8888-8888-8888-8880</common:path>
|
||||||
|
<common:host>orcid.org</common:host>
|
||||||
|
</common:source-orcid>
|
||||||
|
<common:source-name />
|
||||||
|
</common:source>
|
||||||
|
<common:external-id-type>type-1</common:external-id-type>
|
||||||
|
<common:external-id-value>value-1</common:external-id-value>
|
||||||
|
<common:external-id-url>http://url.com/1</common:external-id-url>
|
||||||
|
<common:external-id-relationship>self</common:external-id-relationship>
|
||||||
|
</external-identifier:external-identifier>
|
||||||
|
</external-identifier:external-identifiers>
|
||||||
|
</person:person>
|
||||||
|
<activities:activities-summary>
|
||||||
|
<common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
|
||||||
|
<activities:distinctions path="/8888-8888-8888-8880/distinctions">
|
||||||
|
<common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
|
||||||
|
<activities:affiliation-group>
|
||||||
|
<common:last-modified-date>2001-12-31T12:00:00
|
||||||
|
</common:last-modified-date>
|
||||||
|
<common:external-ids>
|
||||||
|
<common:external-id>
|
||||||
|
<common:external-id-type>agr</common:external-id-type>
|
||||||
|
<common:external-id-value>external-id-value
|
||||||
|
</common:external-id-value>
|
||||||
|
<common:external-id-url>http://orcid.org</common:external-id-url>
|
||||||
|
<common:external-id-relationship>part-of</common:external-id-relationship>
|
||||||
|
</common:external-id>
|
||||||
|
</common:external-ids>
|
||||||
|
<distinction:distinction-summary put-code="0"
|
||||||
|
visibility="private" display-index="0">
|
||||||
|
<common:created-date>2001-12-31T12:00:00</common:created-date>
|
||||||
|
<common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
|
||||||
|
<common:source>
|
||||||
|
<common:source-orcid>
|
||||||
|
<common:uri>https://orcid.org/8888-8888-8888-8880</common:uri>
|
||||||
|
<common:path>8888-8888-8888-8880</common:path>
|
||||||
|
<common:host>orcid.org</common:host>
|
||||||
|
</common:source-orcid>
|
||||||
|
<common:source-name />
|
||||||
|
</common:source>
|
||||||
|
<common:department-name>distinction:department-name</common:department-name>
|
||||||
|
<common:role-title>distinction:role-title</common:role-title>
|
||||||
|
<common:start-date>
|
||||||
|
<common:year>1948</common:year>
|
||||||
|
<common:month>02</common:month>
|
||||||
|
<common:day>02</common:day>
|
||||||
|
</common:start-date>
|
||||||
|
<common:end-date>
|
||||||
|
<common:year>1948</common:year>
|
||||||
|
<common:month>02</common:month>
|
||||||
|
<common:day>02</common:day>
|
||||||
|
</common:end-date>
|
||||||
|
<common:organization>
|
||||||
|
<common:name>distinction-org</common:name>
|
||||||
|
<common:address>
|
||||||
|
<common:city>common:city</common:city>
|
||||||
|
<common:region>common:region</common:region>
|
||||||
|
<common:country>AF</common:country>
|
||||||
|
</common:address>
|
||||||
|
<common:disambiguated-organization>
|
||||||
|
<common:disambiguated-organization-identifier>common:disambiguated-organization-identifier-distinction</common:disambiguated-organization-identifier>
|
||||||
|
<common:disambiguation-source>GRID</common:disambiguation-source>
|
||||||
|
</common:disambiguated-organization>
|
||||||
|
</common:organization>
|
||||||
|
</distinction:distinction-summary>
|
||||||
|
</activities:affiliation-group>
|
||||||
|
</activities:distinctions>
|
||||||
|
<activities:educations>
|
||||||
|
<common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
|
||||||
|
<activities:affiliation-group>
|
||||||
|
<common:last-modified-date>2001-12-31T12:00:00
|
||||||
|
</common:last-modified-date>
|
||||||
|
<common:external-ids>
|
||||||
|
<common:external-id>
|
||||||
|
<common:external-id-type>agr</common:external-id-type>
|
||||||
|
<common:external-id-value>external-id-value
|
||||||
|
</common:external-id-value>
|
||||||
|
<common:external-id-url>http://orcid.org</common:external-id-url>
|
||||||
|
<common:external-id-relationship>part-of</common:external-id-relationship>
|
||||||
|
</common:external-id>
|
||||||
|
</common:external-ids>
|
||||||
|
<education:education-summary put-code="0"
|
||||||
|
visibility="private">
|
||||||
|
<common:created-date>2001-12-31T12:00:00</common:created-date>
|
||||||
|
<common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
|
||||||
|
<common:source>
|
||||||
|
<common:source-orcid>
|
||||||
|
<common:uri>https://orcid.org/8888-8888-8888-8880</common:uri>
|
||||||
|
<common:path>8888-8888-8888-8880</common:path>
|
||||||
|
<common:host>orcid.org</common:host>
|
||||||
|
</common:source-orcid>
|
||||||
|
<common:source-name />
|
||||||
|
</common:source>
|
||||||
|
<common:department-name>education:department-name</common:department-name>
|
||||||
|
<common:role-title>education:role-title</common:role-title>
|
||||||
|
<common:start-date>
|
||||||
|
<common:year>1948</common:year>
|
||||||
|
<common:month>02</common:month>
|
||||||
|
<common:day>02</common:day>
|
||||||
|
</common:start-date>
|
||||||
|
<common:end-date>
|
||||||
|
<common:year>2019</common:year>
|
||||||
|
<common:month>01</common:month>
|
||||||
|
<common:day>01</common:day>
|
||||||
|
</common:end-date>
|
||||||
|
<common:organization>
|
||||||
|
<common:name>education-org</common:name>
|
||||||
|
<common:address>
|
||||||
|
<common:city>common:city</common:city>
|
||||||
|
<common:region>common:region</common:region>
|
||||||
|
<common:country>AF</common:country>
|
||||||
|
</common:address>
|
||||||
|
<common:disambiguated-organization>
|
||||||
|
<common:disambiguated-organization-identifier>common:disambiguated-organization-identifier-education</common:disambiguated-organization-identifier>
|
||||||
|
<common:disambiguation-source>GRID</common:disambiguation-source>
|
||||||
|
</common:disambiguated-organization>
|
||||||
|
</common:organization>
|
||||||
|
</education:education-summary>
|
||||||
|
</activities:affiliation-group>
|
||||||
|
</activities:educations>
|
||||||
|
<activities:employments>
|
||||||
|
<common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
|
||||||
|
<activities:affiliation-group>
|
||||||
|
<common:last-modified-date>2001-12-31T12:00:00
|
||||||
|
</common:last-modified-date>
|
||||||
|
<common:external-ids>
|
||||||
|
<common:external-id>
|
||||||
|
<common:external-id-type>agr</common:external-id-type>
|
||||||
|
<common:external-id-value>external-id-value
|
||||||
|
</common:external-id-value>
|
||||||
|
<common:external-id-url>http://orcid.org</common:external-id-url>
|
||||||
|
<common:external-id-relationship>part-of</common:external-id-relationship>
|
||||||
|
</common:external-id>
|
||||||
|
</common:external-ids>
|
||||||
|
<employment:employment-summary
|
||||||
|
put-code="0" visibility="private">
|
||||||
|
<common:created-date>2001-12-31T12:00:00</common:created-date>
|
||||||
|
<common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
|
||||||
|
<common:source>
|
||||||
|
<common:source-orcid>
|
||||||
|
<common:uri>https://orcid.org/8888-8888-8888-8880</common:uri>
|
||||||
|
<common:path>8888-8888-8888-8880</common:path>
|
||||||
|
<common:host>orcid.org</common:host>
|
||||||
|
</common:source-orcid>
|
||||||
|
<common:source-name />
|
||||||
|
</common:source>
|
||||||
|
<common:department-name>employment:department-name</common:department-name>
|
||||||
|
<common:role-title>employment:role-title</common:role-title>
|
||||||
|
<common:start-date>
|
||||||
|
<common:year>1948</common:year>
|
||||||
|
<common:month>02</common:month>
|
||||||
|
<common:day>02</common:day>
|
||||||
|
</common:start-date>
|
||||||
|
<common:end-date>
|
||||||
|
<common:year>2025</common:year>
|
||||||
|
</common:end-date>
|
||||||
|
<common:organization>
|
||||||
|
<common:name>employment-org</common:name>
|
||||||
|
<common:address>
|
||||||
|
<common:city>common:city</common:city>
|
||||||
|
<common:region>common:region</common:region>
|
||||||
|
<common:country>AF</common:country>
|
||||||
|
</common:address>
|
||||||
|
<common:disambiguated-organization>
|
||||||
|
<common:disambiguated-organization-identifier>common:disambiguated-organization-identifier-employment</common:disambiguated-organization-identifier>
|
||||||
|
<common:disambiguation-source>GRID</common:disambiguation-source>
|
||||||
|
</common:disambiguated-organization>
|
||||||
|
</common:organization>
|
||||||
|
</employment:employment-summary>
|
||||||
|
</activities:affiliation-group>
|
||||||
|
</activities:employments>
|
||||||
|
<activities:fundings>
|
||||||
|
<common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
|
||||||
|
<activities:group>
|
||||||
|
<common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
|
||||||
|
<common:external-ids>
|
||||||
|
<common:external-id>
|
||||||
|
<common:external-id-type>grant_number</common:external-id-type>
|
||||||
|
<common:external-id-value>external-id-value-1</common:external-id-value>
|
||||||
|
</common:external-id>
|
||||||
|
</common:external-ids>
|
||||||
|
<funding:funding-summary put-code="0" visibility="private">
|
||||||
|
<common:created-date>2001-12-31T12:00:00</common:created-date>
|
||||||
|
<common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
|
||||||
|
<common:source>
|
||||||
|
<common:source-orcid>
|
||||||
|
<common:uri>https://orcid.org/8888-8888-8888-8880</common:uri>
|
||||||
|
<common:path>8888-8888-8888-8880</common:path>
|
||||||
|
<common:host>orcid.org</common:host>
|
||||||
|
</common:source-orcid>
|
||||||
|
<common:source-name />
|
||||||
|
</common:source>
|
||||||
|
<funding:title>
|
||||||
|
<common:title>common:title</common:title>
|
||||||
|
<common:translated-title language-code="en">common:translated-title</common:translated-title>
|
||||||
|
</funding:title>
|
||||||
|
<common:external-ids>
|
||||||
|
<common:external-id>
|
||||||
|
<common:external-id-type>grant_number</common:external-id-type>
|
||||||
|
<common:external-id-value>external-id-value-1</common:external-id-value>
|
||||||
|
<common:external-id-url>http://tempuri.org</common:external-id-url>
|
||||||
|
<common:external-id-relationship>self</common:external-id-relationship>
|
||||||
|
</common:external-id>
|
||||||
|
</common:external-ids>
|
||||||
|
<funding:type>grant</funding:type>
|
||||||
|
<common:start-date>
|
||||||
|
<common:year>1948</common:year>
|
||||||
|
<common:month>02</common:month>
|
||||||
|
<common:day>02</common:day>
|
||||||
|
</common:start-date>
|
||||||
|
<common:end-date>
|
||||||
|
<common:year>1948</common:year>
|
||||||
|
<common:month>02</common:month>
|
||||||
|
<common:day>02</common:day>
|
||||||
|
</common:end-date>
|
||||||
|
<common:organization>
|
||||||
|
<common:name>common:name</common:name>
|
||||||
|
<common:address>
|
||||||
|
<common:city>common:city</common:city>
|
||||||
|
<common:region>common:region</common:region>
|
||||||
|
<common:country>AF</common:country>
|
||||||
|
</common:address>
|
||||||
|
<common:disambiguated-organization>
|
||||||
|
<common:disambiguated-organization-identifier>common:disambiguated-organization-identifier-funding</common:disambiguated-organization-identifier>
|
||||||
|
<common:disambiguation-source>FUNDREF</common:disambiguation-source>
|
||||||
|
</common:disambiguated-organization>
|
||||||
|
</common:organization>
|
||||||
|
</funding:funding-summary>
|
||||||
|
</activities:group>
|
||||||
|
</activities:fundings>
|
||||||
|
<activities:invited-positions path="/8888-8888-8888-8880/invited-positions">
|
||||||
|
<common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
|
||||||
|
<activities:affiliation-group>
|
||||||
|
<common:last-modified-date>2001-12-31T12:00:00
|
||||||
|
</common:last-modified-date>
|
||||||
|
<common:external-ids>
|
||||||
|
<common:external-id>
|
||||||
|
<common:external-id-type>agr</common:external-id-type>
|
||||||
|
<common:external-id-value>external-id-value
|
||||||
|
</common:external-id-value>
|
||||||
|
<common:external-id-url>http://orcid.org</common:external-id-url>
|
||||||
|
<common:external-id-relationship>part-of</common:external-id-relationship>
|
||||||
|
</common:external-id>
|
||||||
|
</common:external-ids>
|
||||||
|
<invited-position:invited-position-summary put-code="0"
|
||||||
|
visibility="private" display-index="0">
|
||||||
|
<common:created-date>2001-12-31T12:00:00</common:created-date>
|
||||||
|
<common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
|
||||||
|
<common:source>
|
||||||
|
<common:source-orcid>
|
||||||
|
<common:uri>https://orcid.org/8888-8888-8888-8880</common:uri>
|
||||||
|
<common:path>8888-8888-8888-8880</common:path>
|
||||||
|
<common:host>orcid.org</common:host>
|
||||||
|
</common:source-orcid>
|
||||||
|
<common:source-name />
|
||||||
|
</common:source>
|
||||||
|
<common:department-name>invited-position:department-name</common:department-name>
|
||||||
|
<common:role-title>invited-position:role-title</common:role-title>
|
||||||
|
<common:start-date>
|
||||||
|
<common:year>2019</common:year>
|
||||||
|
<common:month>01</common:month>
|
||||||
|
<common:day>01</common:day>
|
||||||
|
</common:start-date>
|
||||||
|
<common:end-date>
|
||||||
|
<common:year>2025</common:year>
|
||||||
|
<common:month>01</common:month>
|
||||||
|
<common:day>01</common:day>
|
||||||
|
</common:end-date>
|
||||||
|
<common:organization>
|
||||||
|
<common:name>invited-position-org</common:name>
|
||||||
|
<common:address>
|
||||||
|
<common:city>common:city</common:city>
|
||||||
|
<common:region>common:region</common:region>
|
||||||
|
<common:country>AF</common:country>
|
||||||
|
</common:address>
|
||||||
|
<common:disambiguated-organization>
|
||||||
|
<common:disambiguated-organization-identifier>common:disambiguated-organization-identifier-invited-position</common:disambiguated-organization-identifier>
|
||||||
|
<common:disambiguation-source>GRID</common:disambiguation-source>
|
||||||
|
</common:disambiguated-organization>
|
||||||
|
</common:organization>
|
||||||
|
</invited-position:invited-position-summary>
|
||||||
|
</activities:affiliation-group>
|
||||||
|
</activities:invited-positions>
|
||||||
|
<activities:memberships path="/8888-8888-8888-8880/memberships">
|
||||||
|
<common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
|
||||||
|
<activities:affiliation-group>
|
||||||
|
<common:last-modified-date>2001-12-31T12:00:00
|
||||||
|
</common:last-modified-date>
|
||||||
|
<common:external-ids>
|
||||||
|
<common:external-id>
|
||||||
|
<common:external-id-type>agr</common:external-id-type>
|
||||||
|
<common:external-id-value>external-id-value
|
||||||
|
</common:external-id-value>
|
||||||
|
<common:external-id-url>http://orcid.org</common:external-id-url>
|
||||||
|
<common:external-id-relationship>part-of</common:external-id-relationship>
|
||||||
|
</common:external-id>
|
||||||
|
</common:external-ids>
|
||||||
|
<membership:membership-summary put-code="0"
|
||||||
|
visibility="private" display-index="0">
|
||||||
|
<common:created-date>2001-12-31T12:00:00</common:created-date>
|
||||||
|
<common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
|
||||||
|
<common:source>
|
||||||
|
<common:source-orcid>
|
||||||
|
<common:uri>https://orcid.org/8888-8888-8888-8880</common:uri>
|
||||||
|
<common:path>8888-8888-8888-8880</common:path>
|
||||||
|
<common:host>orcid.org</common:host>
|
||||||
|
</common:source-orcid>
|
||||||
|
<common:source-name />
|
||||||
|
</common:source>
|
||||||
|
<common:department-name>membership:department-name</common:department-name>
|
||||||
|
<common:role-title>membership:role-title</common:role-title>
|
||||||
|
<common:start-date>
|
||||||
|
<common:year>1948</common:year>
|
||||||
|
<common:month>02</common:month>
|
||||||
|
<common:day>02</common:day>
|
||||||
|
</common:start-date>
|
||||||
|
<common:organization>
|
||||||
|
<common:name>membership-org</common:name>
|
||||||
|
<common:address>
|
||||||
|
<common:city>common:city</common:city>
|
||||||
|
<common:region>common:region</common:region>
|
||||||
|
<common:country>AF</common:country>
|
||||||
|
</common:address>
|
||||||
|
<common:disambiguated-organization>
|
||||||
|
<common:disambiguated-organization-identifier>common:disambiguated-organization-identifier-membership</common:disambiguated-organization-identifier>
|
||||||
|
<common:disambiguation-source>RINGGOLD</common:disambiguation-source>
|
||||||
|
</common:disambiguated-organization>
|
||||||
|
</common:organization>
|
||||||
|
</membership:membership-summary>
|
||||||
|
</activities:affiliation-group>
|
||||||
|
</activities:memberships>
|
||||||
|
<activities:peer-reviews>
|
||||||
|
<common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
|
||||||
|
<activities:group>
|
||||||
|
<common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
|
||||||
|
<common:external-ids>
|
||||||
|
<common:external-id>
|
||||||
|
<common:external-id-type>something</common:external-id-type>
|
||||||
|
<common:external-id-value>external-id-value</common:external-id-value>
|
||||||
|
<common:external-id-url>http://orcid.org</common:external-id-url>
|
||||||
|
<common:external-id-relationship>self</common:external-id-relationship>
|
||||||
|
</common:external-id>
|
||||||
|
</common:external-ids>
|
||||||
|
<activities:peer-review-group>
|
||||||
|
<common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
|
||||||
|
<common:external-ids>
|
||||||
|
<common:external-id>
|
||||||
|
<common:external-id-type>something</common:external-id-type>
|
||||||
|
<common:external-id-value>external-id-value
|
||||||
|
</common:external-id-value>
|
||||||
|
<common:external-id-url>http://orcid.org</common:external-id-url>
|
||||||
|
<common:external-id-relationship>self</common:external-id-relationship>
|
||||||
|
</common:external-id>
|
||||||
|
</common:external-ids>
|
||||||
|
<peer-review:peer-review-summary put-code="12345"
|
||||||
|
visibility="private" display-index="0">
|
||||||
|
<common:created-date>2001-12-31T12:00:00</common:created-date>
|
||||||
|
<common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
|
||||||
|
<common:source>
|
||||||
|
<common:source-client-id>
|
||||||
|
<common:uri>https://orcid.org/client/APP-9999999999999901</common:uri>
|
||||||
|
<common:path>APP-9999999999999901</common:path>
|
||||||
|
<common:host>orcid.org</common:host>
|
||||||
|
</common:source-client-id>
|
||||||
|
<common:source-name />
|
||||||
|
</common:source>
|
||||||
|
<peer-review:reviewer-role>reviewer</peer-review:reviewer-role>
|
||||||
|
<common:external-ids>
|
||||||
|
<common:external-id>
|
||||||
|
<common:external-id-type>something</common:external-id-type>
|
||||||
|
<common:external-id-value>external-id-value</common:external-id-value>
|
||||||
|
<common:external-id-url>http://orcid.org</common:external-id-url>
|
||||||
|
<common:external-id-relationship>self</common:external-id-relationship>
|
||||||
|
</common:external-id>
|
||||||
|
</common:external-ids>
|
||||||
|
<peer-review:review-url>http://orcid.org</peer-review:review-url>
|
||||||
|
<peer-review:review-type>review</peer-review:review-type>
|
||||||
|
<peer-review:completion-date>
|
||||||
|
<common:year>1948</common:year>
|
||||||
|
<common:month>02</common:month>
|
||||||
|
<common:day>02</common:day>
|
||||||
|
</peer-review:completion-date>
|
||||||
|
<peer-review:review-group-id>orcid-generated:12345</peer-review:review-group-id>
|
||||||
|
<peer-review:convening-organization>
|
||||||
|
<common:name>common:name</common:name>
|
||||||
|
<common:address>
|
||||||
|
<common:city>common:city</common:city>
|
||||||
|
<common:region>common:region</common:region>
|
||||||
|
<common:country>AF</common:country>
|
||||||
|
</common:address>
|
||||||
|
<common:disambiguated-organization>
|
||||||
|
<common:disambiguated-organization-identifier>common:disambiguated-organization-identifier-peer-review</common:disambiguated-organization-identifier>
|
||||||
|
<common:disambiguation-source>RINGGOLD</common:disambiguation-source>
|
||||||
|
</common:disambiguated-organization>
|
||||||
|
</peer-review:convening-organization>
|
||||||
|
</peer-review:peer-review-summary>
|
||||||
|
</activities:peer-review-group>
|
||||||
|
</activities:group>
|
||||||
|
</activities:peer-reviews>
|
||||||
|
<activities:qualifications path="/8888-8888-8888-8880/qualifications">
|
||||||
|
<common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
|
||||||
|
<activities:affiliation-group>
|
||||||
|
<common:last-modified-date>2001-12-31T12:00:00
|
||||||
|
</common:last-modified-date>
|
||||||
|
<common:external-ids>
|
||||||
|
<common:external-id>
|
||||||
|
<common:external-id-type>agr</common:external-id-type>
|
||||||
|
<common:external-id-value>external-id-value
|
||||||
|
</common:external-id-value>
|
||||||
|
<common:external-id-url>http://orcid.org</common:external-id-url>
|
||||||
|
<common:external-id-relationship>part-of</common:external-id-relationship>
|
||||||
|
</common:external-id>
|
||||||
|
</common:external-ids>
|
||||||
|
<qualification:qualification-summary put-code="0"
|
||||||
|
visibility="private" display-index="0">
|
||||||
|
<common:created-date>2001-12-31T12:00:00</common:created-date>
|
||||||
|
<common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
|
||||||
|
<common:source>
|
||||||
|
<common:source-orcid>
|
||||||
|
<common:uri>https://orcid.org/8888-8888-8888-8880</common:uri>
|
||||||
|
<common:path>8888-8888-8888-8880</common:path>
|
||||||
|
<common:host>orcid.org</common:host>
|
||||||
|
</common:source-orcid>
|
||||||
|
<common:source-name />
|
||||||
|
</common:source>
|
||||||
|
<common:department-name>qualification:department-name</common:department-name>
|
||||||
|
<common:role-title>qualification:role-title</common:role-title>
|
||||||
|
<common:start-date>
|
||||||
|
<common:year>1948</common:year>
|
||||||
|
<common:month>02</common:month>
|
||||||
|
<common:day>02</common:day>
|
||||||
|
</common:start-date>
|
||||||
|
<common:end-date>
|
||||||
|
<common:year>2025</common:year>
|
||||||
|
<common:month>12</common:month>
|
||||||
|
</common:end-date>
|
||||||
|
<common:organization>
|
||||||
|
<common:name>qualification-org</common:name>
|
||||||
|
<common:address>
|
||||||
|
<common:city>common:city</common:city>
|
||||||
|
<common:region>common:region</common:region>
|
||||||
|
<common:country>AF</common:country>
|
||||||
|
</common:address>
|
||||||
|
<common:disambiguated-organization>
|
||||||
|
<common:disambiguated-organization-identifier>common:disambiguated-organization-identifier-qualification</common:disambiguated-organization-identifier>
|
||||||
|
<common:disambiguation-source>RINGGOLD</common:disambiguation-source>
|
||||||
|
</common:disambiguated-organization>
|
||||||
|
</common:organization>
|
||||||
|
</qualification:qualification-summary>
|
||||||
|
</activities:affiliation-group>
|
||||||
|
</activities:qualifications>
|
||||||
|
<activities:research-resources path="/8888-8888-8888-8880/research-resources">
|
||||||
|
<common:last-modified-date>2017-01-18T15:06:05.147-06:00</common:last-modified-date>
|
||||||
|
<activities:group>
|
||||||
|
<common:last-modified-date>2017-01-18T15:03:56.856-06:00</common:last-modified-date>
|
||||||
|
<common:external-ids>
|
||||||
|
<common:external-id>
|
||||||
|
<common:external-id-type>proposal_id</common:external-id-type>
|
||||||
|
<common:external-id-value>123456</common:external-id-value>
|
||||||
|
<common:external-id-relationship>self</common:external-id-relationship>
|
||||||
|
</common:external-id>
|
||||||
|
</common:external-ids>
|
||||||
|
<research-resource:research-resource-summary
|
||||||
|
put-code="1234" path="/0000-0003-0902-4386/research-resource/1234" visibility="public">
|
||||||
|
<!-- common metadata -->
|
||||||
|
<common:created-date>2015-06-25T16:01:12.718Z</common:created-date>
|
||||||
|
<common:last-modified-date>2017-09-08T13:31:19.987Z
|
||||||
|
</common:last-modified-date>
|
||||||
|
<common:source>
|
||||||
|
<common:source-orcid>
|
||||||
|
<common:uri>https://orcid.org/0000-0000-0000-0000</common:uri>
|
||||||
|
<common:path>0000-0000-0000-0000</common:path>
|
||||||
|
<common:host>orcid.org</common:host>
|
||||||
|
</common:source-orcid>
|
||||||
|
<common:source-name>XSEDE ORCID integration</common:source-name>
|
||||||
|
</common:source>
|
||||||
|
<!-- proposal title and host(s) -->
|
||||||
|
<research-resource:proposal> <!-- proposal/award/credential section -->
|
||||||
|
<research-resource:title>
|
||||||
|
<common:title>Giant Laser Award</common:title>
|
||||||
|
</research-resource:title>
|
||||||
|
<research-resource:hosts>
|
||||||
|
<common:organization>
|
||||||
|
<common:name>XSEDE</common:name>
|
||||||
|
<common:address>
|
||||||
|
<common:city>city</common:city>
|
||||||
|
<common:region>region</common:region>
|
||||||
|
<common:country>US</common:country>
|
||||||
|
</common:address>
|
||||||
|
<common:disambiguated-organization>
|
||||||
|
<common:disambiguated-organization-identifier>XX</common:disambiguated-organization-identifier>
|
||||||
|
<common:disambiguation-source>grid</common:disambiguation-source>
|
||||||
|
</common:disambiguated-organization>
|
||||||
|
</common:organization>
|
||||||
|
</research-resource:hosts>
|
||||||
|
<common:external-ids>
|
||||||
|
<common:external-id>
|
||||||
|
<common:external-id-type>proposal_id</common:external-id-type>
|
||||||
|
<common:external-id-value>123456</common:external-id-value>
|
||||||
|
<common:external-id-relationship>self</common:external-id-relationship>
|
||||||
|
</common:external-id>
|
||||||
|
</common:external-ids>
|
||||||
|
<common:start-date>
|
||||||
|
<common:year>1999</common:year>
|
||||||
|
<common:month>02</common:month>
|
||||||
|
<common:day>02</common:day>
|
||||||
|
</common:start-date>
|
||||||
|
<common:end-date>
|
||||||
|
<common:year>2012</common:year>
|
||||||
|
<common:month>02</common:month>
|
||||||
|
<common:day>02</common:day>
|
||||||
|
</common:end-date>
|
||||||
|
<common:url>http://xsede.org/GiantLaserAward</common:url>
|
||||||
|
</research-resource:proposal>
|
||||||
|
</research-resource:research-resource-summary>
|
||||||
|
</activities:group>
|
||||||
|
</activities:research-resources>
|
||||||
|
<activities:services path="/8888-8888-8888-8880/services">
|
||||||
|
<common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
|
||||||
|
<activities:affiliation-group>
|
||||||
|
<common:last-modified-date>2001-12-31T12:00:00
|
||||||
|
</common:last-modified-date>
|
||||||
|
<common:external-ids>
|
||||||
|
<common:external-id>
|
||||||
|
<common:external-id-type>agr</common:external-id-type>
|
||||||
|
<common:external-id-value>external-id-value
|
||||||
|
</common:external-id-value>
|
||||||
|
<common:external-id-url>http://orcid.org</common:external-id-url>
|
||||||
|
<common:external-id-relationship>part-of</common:external-id-relationship>
|
||||||
|
</common:external-id>
|
||||||
|
</common:external-ids>
|
||||||
|
<service:service-summary put-code="0"
|
||||||
|
visibility="private" display-index="0">
|
||||||
|
<common:created-date>2001-12-31T12:00:00</common:created-date>
|
||||||
|
<common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
|
||||||
|
<common:source>
|
||||||
|
<common:source-orcid>
|
||||||
|
<common:uri>https://orcid.org/8888-8888-8888-8880</common:uri>
|
||||||
|
<common:path>8888-8888-8888-8880</common:path>
|
||||||
|
<common:host>orcid.org</common:host>
|
||||||
|
</common:source-orcid>
|
||||||
|
<common:source-name />
|
||||||
|
</common:source>
|
||||||
|
<common:department-name>service:department-name</common:department-name>
|
||||||
|
<common:role-title>service:role-title</common:role-title>
|
||||||
|
<common:start-date>
|
||||||
|
<common:year>1948</common:year>
|
||||||
|
<common:month>02</common:month>
|
||||||
|
<common:day>02</common:day>
|
||||||
|
</common:start-date>
|
||||||
|
<common:organization>
|
||||||
|
<common:name>service-org</common:name>
|
||||||
|
<common:address>
|
||||||
|
<common:city>common:city</common:city>
|
||||||
|
<common:region>common:region</common:region>
|
||||||
|
<common:country>AF</common:country>
|
||||||
|
</common:address>
|
||||||
|
<common:disambiguated-organization>
|
||||||
|
<common:disambiguated-organization-identifier>common:disambiguated-organization-identifier-service</common:disambiguated-organization-identifier>
|
||||||
|
<common:disambiguation-source>RINGGOLD</common:disambiguation-source>
|
||||||
|
</common:disambiguated-organization>
|
||||||
|
</common:organization>
|
||||||
|
</service:service-summary>
|
||||||
|
</activities:affiliation-group>
|
||||||
|
</activities:services>
|
||||||
|
<activities:works>
|
||||||
|
<common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
|
||||||
|
<activities:group>
|
||||||
|
<common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
|
||||||
|
<common:external-ids>
|
||||||
|
<common:external-id>
|
||||||
|
<common:external-id-type>agr</common:external-id-type>
|
||||||
|
<common:external-id-value>external-id-value</common:external-id-value>
|
||||||
|
<common:external-id-url>http://orcid.org</common:external-id-url>
|
||||||
|
<common:external-id-relationship>part-of</common:external-id-relationship>
|
||||||
|
</common:external-id>
|
||||||
|
</common:external-ids>
|
||||||
|
<work:work-summary put-code="0" visibility="private">
|
||||||
|
<common:created-date>2001-12-31T12:00:00</common:created-date>
|
||||||
|
<common:last-modified-date>2001-12-31T12:00:00</common:last-modified-date>
|
||||||
|
<common:source>
|
||||||
|
<common:source-client-id>
|
||||||
|
<common:uri>https://orcid.org/client/8888-8888-8888-8880</common:uri>
|
||||||
|
<common:path>8888-8888-8888-8880</common:path>
|
||||||
|
<common:host>orcid.org</common:host>
|
||||||
|
</common:source-client-id>
|
||||||
|
<common:source-name />
|
||||||
|
</common:source>
|
||||||
|
<work:title>
|
||||||
|
<common:title>common:title</common:title>
|
||||||
|
<common:subtitle />
|
||||||
|
<common:translated-title language-code="en">common:translated-title</common:translated-title>
|
||||||
|
</work:title>
|
||||||
|
<common:external-ids>
|
||||||
|
<common:external-id>
|
||||||
|
<common:external-id-type>agr</common:external-id-type>
|
||||||
|
<common:external-id-value>external-id-value</common:external-id-value>
|
||||||
|
<common:external-id-url>http://tempuri.org</common:external-id-url>
|
||||||
|
<common:external-id-relationship>self</common:external-id-relationship>
|
||||||
|
</common:external-id>
|
||||||
|
</common:external-ids>
|
||||||
|
<work:type>artistic-performance</work:type>
|
||||||
|
<common:publication-date>
|
||||||
|
<common:year>1948</common:year>
|
||||||
|
<common:month>02</common:month>
|
||||||
|
<common:day>02</common:day>
|
||||||
|
</common:publication-date>
|
||||||
|
<work:journal-title>Procedia Computer Science</work:journal-title>
|
||||||
|
</work:work-summary>
|
||||||
|
</activities:group>
|
||||||
|
</activities:works>
|
||||||
|
</activities:activities-summary>
|
||||||
|
</record:record>
|
|
@ -0,0 +1,196 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||||
|
<record:record xmlns:address="http://www.orcid.org/ns/address"
|
||||||
|
xmlns:email="http://www.orcid.org/ns/email
|
||||||
|
" xmlns:history="http://www.orcid.org/ns/history"
|
||||||
|
xmlns:employment="http://www.orcid.org/ns/employment"
|
||||||
|
xmlns:education="http://www.orcid.org/ns/education"
|
||||||
|
xmlns:other-name="http://www.orcid.org/ns/other-name"
|
||||||
|
xmlns:deprecated="http://www.orcid.org/ns/deprecated"
|
||||||
|
xmlns:funding="http://www.orcid.org/ns/funding"
|
||||||
|
xmlns:research-resource="http://www.orcid.org/ns/research-resource"
|
||||||
|
xmlns:service="http://www.orcid.org/ns/service"
|
||||||
|
xmlns:researcher-url="http://www.orcid.org/ns/researcher-url"
|
||||||
|
xmlns:distinction="http://www.orcid.org/ns/distinction"
|
||||||
|
xmlns:internal="http://www.orcid.org/ns/internal"
|
||||||
|
xmlns:membership="http://www.orcid.org/ns/membership"
|
||||||
|
xmlns:person="http://www.orcid.org/ns/person"
|
||||||
|
xmlns:personal-details="http://www.orcid.org/ns/personal-details"
|
||||||
|
xmlns:bulk="http://www.orcid.org/ns/bulk" xmlns:common="http://www.orcid.org/ns/common"
|
||||||
|
xmlns:record="http://www.orcid.org/ns/record" xmlns:keyword="http://www.orcid.org/ns/keyword"
|
||||||
|
xmlns:activities="http://www.orcid.org/ns/activities"
|
||||||
|
xmlns:qualification="http://www.orcid.org/ns/qualification"
|
||||||
|
xmlns:external-identifier="http://www.orcid.org/ns/external-identifier"
|
||||||
|
xmlns:error="http://www.orcid.org/ns/error"
|
||||||
|
xmlns:preferences="http://www.orcid.org/ns/preferences"
|
||||||
|
xmlns:invited-position="http://www.orcid.org/ns/invited-position"
|
||||||
|
xmlns:work="http://www.orcid.org/ns/work"
|
||||||
|
xmlns:peer-review="http://www.orcid.org/ns/peer-review" path="/0000-0001-5109-1000">
|
||||||
|
<common:orcid-identifier>
|
||||||
|
<common:uri>https://orcid.org/0000-0001-5109-1000</common:uri>
|
||||||
|
<common:path>0000-0001-5109-1000</common:path>
|
||||||
|
<common:host>orcid.org</common:host>
|
||||||
|
</common:orcid-identifier>
|
||||||
|
<preferences:preferences>
|
||||||
|
<preferences:locale>en</preferences:locale>
|
||||||
|
</preferences:preferences>
|
||||||
|
<history:history>
|
||||||
|
<history:creation-method>Member-referred</history:creation-method>
|
||||||
|
<history:submission-date>2019-05-01T13:04:57.507Z</history:submission-date>
|
||||||
|
<common:last-modified-date>2019-05-01T13:59:54.268Z</common:last-modified-date>
|
||||||
|
<history:claimed>true</history:claimed>
|
||||||
|
<history:verified-email>true</history:verified-email>
|
||||||
|
<history:verified-primary-email>true</history:verified-primary-email>
|
||||||
|
</history:history>
|
||||||
|
<person:person path="/0000-0001-5109-1000/person">
|
||||||
|
<common:last-modified-date>2019-05-01T13:45:47.727Z</common:last-modified-date>
|
||||||
|
<person:name visibility="public" path="0000-0001-5109-1000">
|
||||||
|
<common:created-date>2019-05-01T13:04:57.507Z</common:created-date>
|
||||||
|
<common:last-modified-date>2019-05-01T13:04:57.740Z</common:last-modified-date>
|
||||||
|
<personal-details:given-names>Andrew</personal-details:given-names>
|
||||||
|
<personal-details:family-name>Porteus</personal-details:family-name>
|
||||||
|
</person:name>
|
||||||
|
<other-name:other-names path="/0000-0001-5109-1000/other-names">
|
||||||
|
<common:last-modified-date>2019-05-01T13:44:57.072Z</common:last-modified-date>
|
||||||
|
<other-name:other-name put-code="1238811" visibility="public" path="/0000-0001-5109-1000/other-names/1238811" display-index="1">
|
||||||
|
<common:created-date>2019-05-01T13:44:57.072Z</common:created-date>
|
||||||
|
<common:last-modified-date>2019-05-01T13:44:57.072Z</common:last-modified-date>
|
||||||
|
<common:source>
|
||||||
|
<common:source-orcid>
|
||||||
|
<common:uri>https://orcid.org/0000-0001-5109-1000</common:uri>
|
||||||
|
<common:path>0000-0001-5109-1000</common:path>
|
||||||
|
<common:host>orcid.org</common:host>
|
||||||
|
</common:source-orcid>
|
||||||
|
<common:source-name>Andrew Porteus</common:source-name>
|
||||||
|
</common:source>
|
||||||
|
<other-name:content>Andrew C. Porteus</other-name:content>
|
||||||
|
</other-name:other-name>
|
||||||
|
</other-name:other-names>
|
||||||
|
<person:biography visibility="public" path="/0000-0001-5109-1000/biography">
|
||||||
|
<common:created-date>2019-05-01T13:59:54.263Z</common:created-date>
|
||||||
|
<common:last-modified-date>2019-05-01T13:59:54.263Z</common:last-modified-date>
|
||||||
|
<personal-details:content>Retired Librarian</personal-details:content>
|
||||||
|
</person:biography>
|
||||||
|
<researcher-url:researcher-urls path="/0000-0001-5109-1000/researcher-urls">
|
||||||
|
<common:last-modified-date>2019-05-01T13:45:47.727Z</common:last-modified-date>
|
||||||
|
<researcher-url:researcher-url put-code="1722812" visibility="public" path="/0000-0001-5109-1000/researcher-urls/1722812" display-index="1">
|
||||||
|
<common:created-date>2019-05-01T13:45:47.727Z</common:created-date>
|
||||||
|
<common:last-modified-date>2019-05-01T13:45:47.727Z</common:last-modified-date>
|
||||||
|
<common:source>
|
||||||
|
<common:source-orcid>
|
||||||
|
<common:uri>https://orcid.org/0000-0001-5109-1000</common:uri>
|
||||||
|
<common:path>0000-0001-5109-1000</common:path>
|
||||||
|
<common:host>orcid.org</common:host>
|
||||||
|
</common:source-orcid>
|
||||||
|
<common:source-name>Andrew Porteus</common:source-name>
|
||||||
|
</common:source>
|
||||||
|
<researcher-url:url-name>Niagara Falls Poetry Project</researcher-url:url-name>
|
||||||
|
<researcher-url:url>http://niagarapoetry.ca</researcher-url:url>
|
||||||
|
</researcher-url:researcher-url>
|
||||||
|
</researcher-url:researcher-urls>
|
||||||
|
<email:emails path="/0000-0001-5109-1000/email"/>
|
||||||
|
<address:addresses path="/0000-0001-5109-1000/address">
|
||||||
|
<common:last-modified-date>2019-05-01T13:45:09.764Z</common:last-modified-date>
|
||||||
|
<address:address put-code="1247706" visibility="public" path="/0000-0001-5109-1000/address/1247706" display-index="1">
|
||||||
|
<common:created-date>2019-05-01T13:45:09.764Z</common:created-date>
|
||||||
|
<common:last-modified-date>2019-05-01T13:45:09.764Z</common:last-modified-date>
|
||||||
|
<common:source>
|
||||||
|
<common:source-orcid>
|
||||||
|
<common:uri>https://orcid.org/0000-0001-5109-1000</common:uri>
|
||||||
|
<common:path>0000-0001-5109-1000</common:path>
|
||||||
|
<common:host>orcid.org</common:host>
|
||||||
|
</common:source-orcid>
|
||||||
|
<common:source-name>Andrew Porteus</common:source-name>
|
||||||
|
</common:source>
|
||||||
|
<address:country>CA</address:country>
|
||||||
|
</address:address>
|
||||||
|
</address:addresses>
|
||||||
|
<keyword:keywords path="/0000-0001-5109-1000/keywords"/>
|
||||||
|
<external-identifier:external-identifiers path="/0000-0001-5109-1000/external-identifiers"/>
|
||||||
|
</person:person>
|
||||||
|
<activities:activities-summary path="/0000-0001-5109-1000/activities">
|
||||||
|
<common:last-modified-date>2019-05-01T13:57:45.787Z</common:last-modified-date>
|
||||||
|
<activities:distinctions path="/0000-0001-5109-1000/distinctions"/>
|
||||||
|
<activities:educations path="/0000-0001-5109-1000/educations">
|
||||||
|
<common:last-modified-date>2019-05-01T13:15:26.102Z</common:last-modified-date>
|
||||||
|
<activities:affiliation-group>
|
||||||
|
<common:last-modified-date>2019-05-01T13:15:26.102Z</common:last-modified-date>
|
||||||
|
<common:external-ids/>
|
||||||
|
<education:education-summary put-code="7801952" display-index="1" path="/0000-0001-5109-1000/education/7801952" visibility="public">
|
||||||
|
<common:created-date>2019-05-01T13:15:26.102Z</common:created-date>
|
||||||
|
<common:last-modified-date>2019-05-01T13:15:26.102Z</common:last-modified-date>
|
||||||
|
<common:source>
|
||||||
|
<common:source-orcid>
|
||||||
|
<common:uri>https://orcid.org/0000-0001-5109-1000</common:uri>
|
||||||
|
<common:path>0000-0001-5109-1000</common:path>
|
||||||
|
<common:host>orcid.org</common:host>
|
||||||
|
</common:source-orcid>
|
||||||
|
<common:source-name>Andrew Porteus</common:source-name>
|
||||||
|
</common:source>
|
||||||
|
<common:role-title>Library Technician Diploma</common:role-title>
|
||||||
|
<common:start-date>
|
||||||
|
<common:year>1976</common:year>
|
||||||
|
<common:month>09</common:month>
|
||||||
|
</common:start-date>
|
||||||
|
<common:end-date>
|
||||||
|
<common:year>1978</common:year>
|
||||||
|
<common:month>05</common:month>
|
||||||
|
</common:end-date>
|
||||||
|
<common:organization>
|
||||||
|
<common:name>Niagara College</common:name>
|
||||||
|
<common:address>
|
||||||
|
<common:city>Welland</common:city>
|
||||||
|
<common:region>ON</common:region>
|
||||||
|
<common:country>CA</common:country>
|
||||||
|
</common:address>
|
||||||
|
<common:disambiguated-organization>
|
||||||
|
<common:disambiguated-organization-identifier>125147</common:disambiguated-organization-identifier>
|
||||||
|
<common:disambiguation-source>RINGGOLD</common:disambiguation-source>
|
||||||
|
</common:disambiguated-organization>
|
||||||
|
</common:organization>
|
||||||
|
</education:education-summary>
|
||||||
|
</activities:affiliation-group>
|
||||||
|
</activities:educations>
|
||||||
|
<activities:employments path="/0000-0001-5109-1000/employments"/>
|
||||||
|
<activities:fundings path="/0000-0001-5109-1000/fundings"/>
|
||||||
|
<activities:invited-positions path="/0000-0001-5109-1000/invited-positions"/>
|
||||||
|
<activities:memberships path="/0000-0001-5109-1000/memberships"/>
|
||||||
|
<activities:peer-reviews path="/0000-0001-5109-1000/peer-reviews"/>
|
||||||
|
<activities:qualifications path="/0000-0001-5109-1000/qualifications">
|
||||||
|
<common:last-modified-date>2019-05-01T13:19:49.021Z</common:last-modified-date>
|
||||||
|
<activities:affiliation-group>
|
||||||
|
<common:last-modified-date>2019-05-01T13:19:49.021Z</common:last-modified-date>
|
||||||
|
<common:external-ids/>
|
||||||
|
<qualification:qualification-summary put-code="7801973" display-index="1" path="/0000-0001-5109-1000/qualification/7801973" visibility="public">
|
||||||
|
<common:created-date>2019-05-01T13:19:49.021Z</common:created-date>
|
||||||
|
<common:last-modified-date>2019-05-01T13:19:49.021Z</common:last-modified-date>
|
||||||
|
<common:source>
|
||||||
|
<common:source-orcid>
|
||||||
|
<common:uri>https://orcid.org/0000-0001-5109-1000</common:uri>
|
||||||
|
<common:path>0000-0001-5109-1000</common:path>
|
||||||
|
<common:host>orcid.org</common:host>
|
||||||
|
</common:source-orcid>
|
||||||
|
<common:source-name>Andrew Porteus</common:source-name>
|
||||||
|
</common:source>
|
||||||
|
<common:department-name>Communication, Film &amp; Popular Culture</common:department-name>
|
||||||
|
<common:role-title>Master's Candidate</common:role-title>
|
||||||
|
<common:start-date>
|
||||||
|
<common:year>2018</common:year>
|
||||||
|
<common:month>09</common:month>
|
||||||
|
</common:start-date>
|
||||||
|
<common:organization>
|
||||||
|
<common:name>Brock University</common:name>
|
||||||
|
<common:address>
|
||||||
|
<common:city>Saint Catharines</common:city>
|
||||||
|
<common:region>ON</common:region>
|
||||||
|
<common:country>CA</common:country>
|
||||||
|
</common:address>
|
||||||
|
<common:disambiguated-organization>
|
||||||
|
<common:disambiguated-organization-identifier>7497</common:disambiguated-organization-identifier>
|
||||||
|
<common:disambiguation-source>RINGGOLD</common:disambiguation-source>
|
||||||
|
</common:disambiguated-organization>
|
||||||
|
</common:organization>
|
||||||
|
</qualification:qualification-summary>
|
||||||
|
</activities:affiliation-group>
|
||||||
|
</activities:qualifications>
|
||||||
|
</activities:activities-summary>
|
||||||
|
</record:record>
|
|
@ -0,0 +1 @@
|
||||||
|
{"oid":"0000-0002-4147-3387","id":"60153327","sourceName":"The Chinese University of Hong Kong","type":"conference-paper","titles":["Evaluation of a percutaneous optical fibre glucose sensor (FiberSense) across the glycemic range with rapid glucoseexcursions using the glucose clamp"],"extIds":[{"type":"wosuid","value":"000425015800225","relationShip":"self"},{"type":"other-id","value":"441f521e-ab19-448d-ba32-83157b348ada","relationShip":"self"}],"publicationDates":[],"contributors":[{"sequence":"1","oid":"0000-0002-4147-3387","name":"Elaine","surname":"Chow","creditName":"Elaine Chow"},{"sequence":"2","creditName":"Victor Tsui"},{"sequence":"3","creditName":"Achim Müller"},{"sequence":"4","creditName":"Vincy Lee"},{"sequence":"5","creditName":"Lucia Krivánekova"},{"sequence":"6","creditName":"Roland Krivánek"},{"sequence":"7","creditName":"Juliana CN Chan"}]}
|
|
@ -0,0 +1,72 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||||
|
<work:work xmlns:address="http://www.orcid.org/ns/address"
|
||||||
|
    xmlns:email="http://www.orcid.org/ns/email" xmlns:history="http://www.orcid.org/ns/history"
    xmlns:employment="http://www.orcid.org/ns/employment"
    xmlns:education="http://www.orcid.org/ns/education"
    xmlns:other-name="http://www.orcid.org/ns/other-name"
    xmlns:deprecated="http://www.orcid.org/ns/deprecated"
    xmlns:funding="http://www.orcid.org/ns/funding"
    xmlns:research-resource="http://www.orcid.org/ns/research-resource"
    xmlns:service="http://www.orcid.org/ns/service"
    xmlns:researcher-url="http://www.orcid.org/ns/researcher-url"
    xmlns:distinction="http://www.orcid.org/ns/distinction"
    xmlns:internal="http://www.orcid.org/ns/internal"
    xmlns:membership="http://www.orcid.org/ns/membership"
    xmlns:person="http://www.orcid.org/ns/person"
    xmlns:personal-details="http://www.orcid.org/ns/personal-details"
    xmlns:bulk="http://www.orcid.org/ns/bulk" xmlns:common="http://www.orcid.org/ns/common"
    xmlns:record="http://www.orcid.org/ns/record" xmlns:keyword="http://www.orcid.org/ns/keyword"
    xmlns:activities="http://www.orcid.org/ns/activities"
    xmlns:qualification="http://www.orcid.org/ns/qualification"
    xmlns:external-identifier="http://www.orcid.org/ns/external-identifier"
    xmlns:error="http://www.orcid.org/ns/error"
    xmlns:preferences="http://www.orcid.org/ns/preferences"
    xmlns:invited-position="http://www.orcid.org/ns/invited-position"
    xmlns:work="http://www.orcid.org/ns/work"
    xmlns:peer-review="http://www.orcid.org/ns/peer-review" put-code="63461376"
    path="/0000-0002-2536-4498/work/63461376" visibility="public">
    <common:created-date>2019-10-22T03:18:13.755Z</common:created-date>
    <common:last-modified-date>2020-06-17T11:07:13.703Z</common:last-modified-date>
    <common:source>
        <common:source-client-id>
            <common:uri>https://orcid.org/client/0000-0001-8607-8906</common:uri>
            <common:path>0000-0001-8607-8906</common:path>
            <common:host>orcid.org</common:host>
        </common:source-client-id>
        <common:source-name>INSPIRE-HEP</common:source-name>
    </common:source>
    <work:title>
        <common:title>Measurement of the $t\bar{t}$ production cross-section and lepton differential distributions in $e\mu$ dilepton events from $pp$ collisions at $\sqrt{s}=13$ TeV with the ATLAS detector</common:title>
    </work:title>
    <common:external-ids>
        <common:external-id>
            <common:external-id-type>other-id</common:external-id-type>
            <common:external-id-value>1759875</common:external-id-value>
            <common:external-id-normalized transient="true">1759875</common:external-id-normalized>
            <common:external-id-url>http://inspirehep.net/record/1759875</common:external-id-url>
            <common:external-id-relationship>self</common:external-id-relationship>
        </common:external-id>
        <common:external-id>
            <common:external-id-type>doi</common:external-id-type>
            <common:external-id-value>10.1140/epjc/s10052-020-7907-9</common:external-id-value>
            <common:external-id-normalized transient="true">10.1140/epjc/s10052-020-7907-9</common:external-id-normalized>
            <common:external-id-url>http://dx.doi.org/10.1140/epjc/s10052-020-7907-9</common:external-id-url>
            <common:external-id-relationship>self</common:external-id-relationship>
        </common:external-id>
        <common:external-id>
            <common:external-id-type>arxiv</common:external-id-type>
            <common:external-id-value>1910.08819</common:external-id-value>
            <common:external-id-normalized transient="true">arXiv:1910.08819</common:external-id-normalized>
            <common:external-id-url>http://arxiv.org/abs/1910.08819</common:external-id-url>
            <common:external-id-relationship>self</common:external-id-relationship>
        </common:external-id>
    </common:external-ids>
    <common:url>http://inspirehep.net/record/1759875</common:url>
    <work:type>journal-article</work:type>
    <common:publication-date>
        <common:year>2020</common:year>
        <common:month>06</common:month>
        <common:day>12</common:day>
    </common:publication-date>
    <work:journal-title>Eur.Phys.J.C</work:journal-title>
</work:work>
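Note how the arxiv external-id above carries a raw value (1910.08819) next to a normalized form (arXiv:1910.08819). The following stand-alone sketch is an editorial illustration only, not project code: the class and method names are invented, and the prefixing rule is assumed from the values shown in the record.

    import java.util.Locale;

    public class ArxivIdNormalizerSketch {

        // Illustrative helper (assumed, not part of dnet-hadoop): prefix bare arXiv
        // identifiers with "arXiv:" while leaving already-prefixed values untouched.
        static String normalizeArxivId(String raw) {
            if (raw == null) {
                return null;
            }
            String trimmed = raw.trim();
            if (trimmed.toLowerCase(Locale.ROOT).startsWith("arxiv:")) {
                return "arXiv:" + trimmed.substring("arxiv:".length());
            }
            return "arXiv:" + trimmed;
        }

        public static void main(String[] args) {
            System.out.println(normalizeArxivId("1910.08819"));       // arXiv:1910.08819
            System.out.println(normalizeArxivId("arXiv:1910.08819")); // arXiv:1910.08819
        }
    }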
@@ -0,0 +1,113 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<work:work xmlns:address="http://www.orcid.org/ns/address"
    xmlns:email="http://www.orcid.org/ns/email" xmlns:history="http://www.orcid.org/ns/history"
    xmlns:employment="http://www.orcid.org/ns/employment"
    xmlns:education="http://www.orcid.org/ns/education"
    xmlns:other-name="http://www.orcid.org/ns/other-name"
    xmlns:deprecated="http://www.orcid.org/ns/deprecated"
    xmlns:funding="http://www.orcid.org/ns/funding"
    xmlns:research-resource="http://www.orcid.org/ns/research-resource"
    xmlns:service="http://www.orcid.org/ns/service"
    xmlns:researcher-url="http://www.orcid.org/ns/researcher-url"
    xmlns:distinction="http://www.orcid.org/ns/distinction"
    xmlns:internal="http://www.orcid.org/ns/internal"
    xmlns:membership="http://www.orcid.org/ns/membership"
    xmlns:person="http://www.orcid.org/ns/person"
    xmlns:personal-details="http://www.orcid.org/ns/personal-details"
    xmlns:bulk="http://www.orcid.org/ns/bulk" xmlns:common="http://www.orcid.org/ns/common"
    xmlns:record="http://www.orcid.org/ns/record" xmlns:keyword="http://www.orcid.org/ns/keyword"
    xmlns:activities="http://www.orcid.org/ns/activities"
    xmlns:qualification="http://www.orcid.org/ns/qualification"
    xmlns:external-identifier="http://www.orcid.org/ns/external-identifier"
    xmlns:error="http://www.orcid.org/ns/error"
    xmlns:preferences="http://www.orcid.org/ns/preferences"
    xmlns:invited-position="http://www.orcid.org/ns/invited-position"
    xmlns:work="http://www.orcid.org/ns/work"
    xmlns:peer-review="http://www.orcid.org/ns/peer-review" put-code="28776099"
    path="/0000-0003-2760-1191/work/28776099" visibility="public">
    <common:created-date>2016-12-12T23:02:05.233Z</common:created-date>
    <common:last-modified-date>2016-12-13T09:08:16.412Z</common:last-modified-date>
    <common:source>
        <common:source-orcid>
            <common:uri>https://orcid.org/0000-0002-9157-3431</common:uri>
            <common:path>0000-0002-9157-3431</common:path>
            <common:host>orcid.org</common:host>
        </common:source-orcid>
        <common:source-name>Europe PubMed Central</common:source-name>
    </common:source>
    <work:title>
        <common:title>Cutoff Value of Admission N-Terminal Pro-Brain Natriuretic Peptide Which Predicts Poor Myocardial Perfusion after Primary Percutaneous Coronary Intervention for ST-Segment-Elevation Myocardial Infarction.</common:title>
    </work:title>
    <work:citation>
        <work:citation-type>formatted-unspecified</work:citation-type>
        <work:citation-value>Abdel-Dayem K, Eweda II, El-Sherbiny A, Dimitry MO, Nammas W, Acta Cardiologica Sinica, 2016, vol. 32, no. 6, pp. 649-655, 2016</work:citation-value>
    </work:citation>
    <work:type>journal-article</work:type>
    <common:publication-date>
        <common:year>2016</common:year>
        <common:month>11</common:month>
    </common:publication-date>
    <common:external-ids>
        <common:external-id>
            <common:external-id-type>pmid</common:external-id-type>
            <common:external-id-value>27899851</common:external-id-value>
            <common:external-id-normalized transient="true">27899851</common:external-id-normalized>
            <common:external-id-relationship>self</common:external-id-relationship>
        </common:external-id>
        <common:external-id>
            <common:external-id-type>pmc</common:external-id-type>
            <common:external-id-value>PMC5126442</common:external-id-value>
            <common:external-id-normalized transient="true">PMC5126442</common:external-id-normalized>
            <common:external-id-relationship>self</common:external-id-relationship>
        </common:external-id>
    </common:external-ids>
    <common:url>http://europepmc.org/abstract/med/27899851</common:url>
    <work:contributors>
        <work:contributor>
            <work:credit-name>Abdel-Dayem K</work:credit-name>
            <work:contributor-attributes>
                <work:contributor-sequence>first</work:contributor-sequence>
                <work:contributor-role>author</work:contributor-role>
            </work:contributor-attributes>
        </work:contributor>
        <work:contributor>
            <work:credit-name>Abdel-Dayem Fake</work:credit-name>
            <work:contributor-attributes>
                <work:contributor-sequence>first</work:contributor-sequence>
                <work:contributor-role>author</work:contributor-role>
            </work:contributor-attributes>
        </work:contributor>
        <work:contributor>
            <work:credit-name>Eweda II</work:credit-name>
            <work:contributor-attributes>
                <work:contributor-sequence>first</work:contributor-sequence>
                <work:contributor-role>author</work:contributor-role>
            </work:contributor-attributes>
        </work:contributor>
        <work:contributor>
            <work:credit-name>El-Sherbiny A</work:credit-name>
            <work:contributor-attributes>
                <work:contributor-sequence>first</work:contributor-sequence>
                <work:contributor-role>author</work:contributor-role>
            </work:contributor-attributes>
        </work:contributor>
        <work:contributor>
            <work:credit-name>Dimitry MO</work:credit-name>
            <work:contributor-attributes>
                <work:contributor-sequence>first</work:contributor-sequence>
                <work:contributor-role>author</work:contributor-role>
            </work:contributor-attributes>
        </work:contributor>
        <work:contributor>
            <work:credit-name>Nammas W</work:credit-name>
            <work:contributor-attributes>
                <work:contributor-sequence>first</work:contributor-sequence>
                <work:contributor-role>author</work:contributor-role>
            </work:contributor-attributes>
        </work:contributor>
    </work:contributors>
</work:work>
@@ -0,0 +1,106 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<work:work xmlns:address="http://www.orcid.org/ns/address"
    xmlns:email="http://www.orcid.org/ns/email" xmlns:history="http://www.orcid.org/ns/history"
    xmlns:employment="http://www.orcid.org/ns/employment"
    xmlns:education="http://www.orcid.org/ns/education"
    xmlns:other-name="http://www.orcid.org/ns/other-name"
    xmlns:deprecated="http://www.orcid.org/ns/deprecated"
    xmlns:funding="http://www.orcid.org/ns/funding"
    xmlns:research-resource="http://www.orcid.org/ns/research-resource"
    xmlns:service="http://www.orcid.org/ns/service"
    xmlns:researcher-url="http://www.orcid.org/ns/researcher-url"
    xmlns:distinction="http://www.orcid.org/ns/distinction"
    xmlns:internal="http://www.orcid.org/ns/internal"
    xmlns:membership="http://www.orcid.org/ns/membership"
    xmlns:person="http://www.orcid.org/ns/person"
    xmlns:personal-details="http://www.orcid.org/ns/personal-details"
    xmlns:bulk="http://www.orcid.org/ns/bulk" xmlns:common="http://www.orcid.org/ns/common"
    xmlns:record="http://www.orcid.org/ns/record" xmlns:keyword="http://www.orcid.org/ns/keyword"
    xmlns:activities="http://www.orcid.org/ns/activities"
    xmlns:qualification="http://www.orcid.org/ns/qualification"
    xmlns:external-identifier="http://www.orcid.org/ns/external-identifier"
    xmlns:error="http://www.orcid.org/ns/error"
    xmlns:preferences="http://www.orcid.org/ns/preferences"
    xmlns:invited-position="http://www.orcid.org/ns/invited-position"
    xmlns:work="http://www.orcid.org/ns/work"
    xmlns:peer-review="http://www.orcid.org/ns/peer-review" put-code="28776099"
    path="/0000-0003-2760-1191/work/28776099" visibility="public">
    <common:created-date>2016-12-12T23:02:05.233Z</common:created-date>
    <common:last-modified-date>2016-12-13T09:08:16.412Z</common:last-modified-date>
    <common:source>
        <common:source-orcid>
            <common:uri>https://orcid.org/0000-0002-9157-3431</common:uri>
            <common:path>0000-0002-9157-3431</common:path>
            <common:host>orcid.org</common:host>
        </common:source-orcid>
        <common:source-name>Europe PubMed Central</common:source-name>
    </common:source>
    <work:title>
        <common:title>Cutoff Value of Admission N-Terminal Pro-Brain Natriuretic Peptide Which Predicts Poor Myocardial Perfusion after Primary Percutaneous Coronary Intervention for ST-Segment-Elevation Myocardial Infarction.</common:title>
    </work:title>
    <work:citation>
        <work:citation-type>formatted-unspecified</work:citation-type>
        <work:citation-value>Abdel-Dayem K, Eweda II, El-Sherbiny A, Dimitry MO, Nammas W, Acta Cardiologica Sinica, 2016, vol. 32, no. 6, pp. 649-655, 2016</work:citation-value>
    </work:citation>
    <work:type>journal-article</work:type>
    <common:publication-date>
        <common:year>2016</common:year>
        <common:month>11</common:month>
    </common:publication-date>
    <common:external-ids>
        <common:external-id>
            <common:external-id-type>pmid</common:external-id-type>
            <common:external-id-value>27899851</common:external-id-value>
            <common:external-id-normalized transient="true">27899851</common:external-id-normalized>
            <common:external-id-relationship>self</common:external-id-relationship>
        </common:external-id>
        <common:external-id>
            <common:external-id-type>pmc</common:external-id-type>
            <common:external-id-value>PMC5126442</common:external-id-value>
            <common:external-id-normalized transient="true">PMC5126442</common:external-id-normalized>
            <common:external-id-relationship>self</common:external-id-relationship>
        </common:external-id>
    </common:external-ids>
    <common:url>http://europepmc.org/abstract/med/27899851</common:url>
    <work:contributors>
        <work:contributor>
            <work:credit-name>Khair Abde Daye</work:credit-name>
            <work:contributor-attributes>
                <work:contributor-sequence>first</work:contributor-sequence>
                <work:contributor-role>author</work:contributor-role>
            </work:contributor-attributes>
        </work:contributor>
        <work:contributor>
            <work:credit-name>Eweda II</work:credit-name>
            <work:contributor-attributes>
                <work:contributor-sequence>first</work:contributor-sequence>
                <work:contributor-role>author</work:contributor-role>
            </work:contributor-attributes>
        </work:contributor>
        <work:contributor>
            <work:credit-name>El-Sherbiny A</work:credit-name>
            <work:contributor-attributes>
                <work:contributor-sequence>first</work:contributor-sequence>
                <work:contributor-role>author</work:contributor-role>
            </work:contributor-attributes>
        </work:contributor>
        <work:contributor>
            <work:credit-name>Dimitry MO</work:credit-name>
            <work:contributor-attributes>
                <work:contributor-sequence>first</work:contributor-sequence>
                <work:contributor-role>author</work:contributor-role>
            </work:contributor-attributes>
        </work:contributor>
        <work:contributor>
            <work:credit-name>Nammas W</work:credit-name>
            <work:contributor-attributes>
                <work:contributor-sequence>first</work:contributor-sequence>
                <work:contributor-role>author</work:contributor-role>
            </work:contributor-attributes>
        </work:contributor>
    </work:contributors>
</work:work>
@@ -0,0 +1,101 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<work:work xmlns:address="http://www.orcid.org/ns/address"
    xmlns:email="http://www.orcid.org/ns/email" xmlns:history="http://www.orcid.org/ns/history"
    xmlns:employment="http://www.orcid.org/ns/employment"
    xmlns:education="http://www.orcid.org/ns/education"
    xmlns:other-name="http://www.orcid.org/ns/other-name"
    xmlns:deprecated="http://www.orcid.org/ns/deprecated"
    xmlns:funding="http://www.orcid.org/ns/funding"
    xmlns:research-resource="http://www.orcid.org/ns/research-resource"
    xmlns:service="http://www.orcid.org/ns/service"
    xmlns:researcher-url="http://www.orcid.org/ns/researcher-url"
    xmlns:distinction="http://www.orcid.org/ns/distinction"
    xmlns:internal="http://www.orcid.org/ns/internal"
    xmlns:membership="http://www.orcid.org/ns/membership"
    xmlns:person="http://www.orcid.org/ns/person"
    xmlns:personal-details="http://www.orcid.org/ns/personal-details"
    xmlns:bulk="http://www.orcid.org/ns/bulk" xmlns:common="http://www.orcid.org/ns/common"
    xmlns:record="http://www.orcid.org/ns/record" xmlns:keyword="http://www.orcid.org/ns/keyword"
    xmlns:activities="http://www.orcid.org/ns/activities"
    xmlns:qualification="http://www.orcid.org/ns/qualification"
    xmlns:external-identifier="http://www.orcid.org/ns/external-identifier"
    xmlns:error="http://www.orcid.org/ns/error"
    xmlns:preferences="http://www.orcid.org/ns/preferences"
    xmlns:invited-position="http://www.orcid.org/ns/invited-position"
    xmlns:work="http://www.orcid.org/ns/work"
    xmlns:peer-review="http://www.orcid.org/ns/peer-review" put-code="28776099"
    path="/0000-0003-2760-1191/work/28776099" visibility="public">
    <common:created-date>2016-12-12T23:02:05.233Z</common:created-date>
    <common:last-modified-date>2016-12-13T09:08:16.412Z</common:last-modified-date>
    <common:source>
        <common:source-orcid>
            <common:uri>https://orcid.org/0000-0002-9157-3431</common:uri>
            <common:path>0000-0002-9157-3431</common:path>
            <common:host>orcid.org</common:host>
        </common:source-orcid>
        <common:source-name>Europe PubMed Central</common:source-name>
    </common:source>
    <work:title>
        <common:title>Cutoff Value of Admission N-Terminal Pro-Brain Natriuretic Peptide Which Predicts Poor Myocardial Perfusion after Primary Percutaneous Coronary Intervention for ST-Segment-Elevation Myocardial Infarction.</common:title>
    </work:title>
    <work:citation>
        <work:citation-type>formatted-unspecified</work:citation-type>
        <work:citation-value>Abdel-Dayem K, Eweda II, El-Sherbiny A, Dimitry MO, Nammas W, Acta Cardiologica Sinica, 2016, vol. 32, no. 6, pp. 649-655, 2016</work:citation-value>
    </work:citation>
    <work:type>journal-article</work:type>
    <common:publication-date>
        <common:year>2016</common:year>
        <common:month>11</common:month>
    </common:publication-date>
    <common:external-ids>
        <common:external-id>
            <common:external-id-type>pmid</common:external-id-type>
            <common:external-id-value>27899851</common:external-id-value>
            <common:external-id-normalized transient="true">27899851</common:external-id-normalized>
            <common:external-id-relationship>self</common:external-id-relationship>
        </common:external-id>
        <common:external-id>
            <common:external-id-type>pmc</common:external-id-type>
            <common:external-id-value>PMC5126442</common:external-id-value>
            <common:external-id-normalized transient="true">PMC5126442</common:external-id-normalized>
            <common:external-id-relationship>self</common:external-id-relationship>
        </common:external-id>
    </common:external-ids>
    <common:url>http://europepmc.org/abstract/med/27899851</common:url>
    <work:contributors>
        <work:contributor>
            <work:contributor-attributes>
                <work:contributor-sequence>seq0</work:contributor-sequence>
                <work:contributor-role>role0</work:contributor-role>
            </work:contributor-attributes>
        </work:contributor>
        <work:contributor>
            <work:credit-name>creditname1</work:credit-name>
        </work:contributor>
        <work:contributor>
            <work:credit-name>creditname2</work:credit-name>
            <work:contributor-attributes>
                <work:contributor-sequence>seq2</work:contributor-sequence>
                <work:contributor-role></work:contributor-role>
            </work:contributor-attributes>
        </work:contributor>
        <work:contributor>
            <work:credit-name>creditname3</work:credit-name>
            <work:contributor-attributes>
                <work:contributor-sequence></work:contributor-sequence>
                <work:contributor-role>role3</work:contributor-role>
            </work:contributor-attributes>
        </work:contributor>
        <work:contributor>
            <work:credit-name></work:credit-name>
            <work:contributor-attributes>
                <work:contributor-sequence>seq4</work:contributor-sequence>
                <work:contributor-role>role4</work:contributor-role>
            </work:contributor-attributes>
        </work:contributor>
    </work:contributors>
</work:work>
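The fixture above deliberately exercises contributors with missing or empty credit-name, contributor-sequence, and contributor-role elements. The sketch below is an editorial illustration rather than the module's actual parser: it shows one null-safe way to read such contributors with the JDK's namespace-aware DOM API. The class name, method names, and default values are invented for the example.

    import java.io.ByteArrayInputStream;
    import java.nio.charset.StandardCharsets;
    import javax.xml.parsers.DocumentBuilderFactory;
    import org.w3c.dom.Document;
    import org.w3c.dom.Element;
    import org.w3c.dom.NodeList;

    public class ContributorParseSketch {

        private static final String WORK_NS = "http://www.orcid.org/ns/work";

        public static void main(String[] args) throws Exception {
            // A tiny inline sample mirroring the fixture: one contributor without a credit-name,
            // one with only a credit-name.
            String xml = "<work:contributors xmlns:work=\"" + WORK_NS + "\">"
                + "<work:contributor><work:contributor-attributes>"
                + "<work:contributor-sequence>seq0</work:contributor-sequence>"
                + "<work:contributor-role>role0</work:contributor-role>"
                + "</work:contributor-attributes></work:contributor>"
                + "<work:contributor><work:credit-name>creditname1</work:credit-name></work:contributor>"
                + "</work:contributors>";

            DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
            dbf.setNamespaceAware(true);
            Document doc = dbf
                .newDocumentBuilder()
                .parse(new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)));

            NodeList contributors = doc.getElementsByTagNameNS(WORK_NS, "contributor");
            for (int i = 0; i < contributors.getLength(); i++) {
                Element c = (Element) contributors.item(i);
                System.out.println("credit-name: " + firstTextOrDefault(c, "credit-name", "<missing>"));
                System.out.println("sequence:    " + firstTextOrDefault(c, "contributor-sequence", "<missing>"));
                System.out.println("role:        " + firstTextOrDefault(c, "contributor-role", "<missing>"));
            }
        }

        // Returns the trimmed text of the first matching descendant, or a default when absent or empty.
        private static String firstTextOrDefault(Element parent, String localName, String dflt) {
            NodeList nl = parent.getElementsByTagNameNS(WORK_NS, localName);
            if (nl.getLength() == 0) {
                return dflt;
            }
            String text = nl.item(0).getTextContent();
            return (text == null || text.trim().isEmpty()) ? dflt : text.trim();
        }
    }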
@@ -44,8 +44,6 @@ public class PropagationConstant {
 	public static final String PROPAGATION_ORCID_TO_RESULT_FROM_SEM_REL_CLASS_ID = "authorpid:result";
 	public static final String PROPAGATION_ORCID_TO_RESULT_FROM_SEM_REL_CLASS_NAME = "Propagation of authors pid to result through semantic relations";
 
-	public static final String PROPAGATION_AUTHOR_PID = "ORCID";
-
 	public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
 
 	private static final String cfHbforResultQuery = "select distinct r.id, inst.collectedfrom.key cf, inst.hostedby.key hb "
@@ -22,6 +22,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
 import com.google.gson.Gson;
 
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.oaf.Relation;
 import eu.dnetlib.dhp.schema.oaf.Result;
 
@@ -102,7 +103,8 @@ public class PrepareResultOrcidAssociationStep1 {
 			+ " FROM result "
 			+ " LATERAL VIEW EXPLODE (author) a AS MyT "
 			+ " LATERAL VIEW EXPLODE (MyT.pid) p AS MyP "
-			+ " WHERE lower(MyP.qualifier.classid) = 'orcid') tmp "
+			+ " WHERE lower(MyP.qualifier.classid) = '" + ModelConstants.ORCID + "' or "
+			+ "       lower(MyP.qualifier.classid) = '" + ModelConstants.ORCID_PENDING + "') tmp "
 			+ " GROUP BY id) r_t "
 			+ " JOIN ("
 			+ " SELECT source, target "
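The reworked query above now accepts author pids whose classid is either ModelConstants.ORCID or ModelConstants.ORCID_PENDING, compared in lower case. A minimal stand-alone sketch of that predicate follows; it is an illustration only, and the literal values "orcid" and "orcid_pending", as well as the class name, are assumptions mirroring the constants referenced in the diff.

    import java.util.Arrays;
    import java.util.List;
    import java.util.Locale;

    public class OrcidClassidFilterSketch {

        // Assumed lower-case mirrors of ModelConstants.ORCID and ModelConstants.ORCID_PENDING.
        private static final List<String> ORCID_CLASSIDS = Arrays.asList("orcid", "orcid_pending");

        // Case-insensitive equivalent of the two lower(MyP.qualifier.classid) comparisons above.
        static boolean isOrcidPid(String classid) {
            return classid != null && ORCID_CLASSIDS.contains(classid.toLowerCase(Locale.ROOT));
        }

        public static void main(String[] args) {
            System.out.println(isOrcidPid("ORCID"));          // true
            System.out.println(isOrcidPid("orcid_pending"));  // true
            System.out.println(isOrcidPid("doi"));            // false
        }
    }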
@@ -23,6 +23,7 @@ import com.google.common.collect.Lists;
 
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.common.PacePerson;
+import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.oaf.Author;
 import eu.dnetlib.dhp.schema.oaf.Result;
 import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
@@ -176,7 +177,7 @@ public class SparkOrcidToResultFromSemRelJob {
 				if (toaddpid) {
 					StructuredProperty p = new StructuredProperty();
 					p.setValue(autoritative_author.getOrcid());
-					p.setQualifier(getQualifier(PROPAGATION_AUTHOR_PID, PROPAGATION_AUTHOR_PID));
+					p.setQualifier(getQualifier(ModelConstants.ORCID_PENDING, ModelConstants.ORCID_CLASSNAME));
 					p
 						.setDataInfo(
 							getDataInfo(
@@ -201,7 +202,8 @@ public class SparkOrcidToResultFromSemRelJob {
 			return false;
 		}
 		for (StructuredProperty pid : pids.get()) {
-			if (PROPAGATION_AUTHOR_PID.equals(pid.getQualifier().getClassid())) {
+			if (ModelConstants.ORCID_PENDING.equals(pid.getQualifier().getClassid().toLowerCase()) ||
+				ModelConstants.ORCID.equals(pid.getQualifier().getClassid().toLowerCase())) {
 				return true;
 			}
 		}
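The guard above keeps the propagation job from adding an ORCID to an author who already carries an orcid or orcid_pending pid, regardless of the classid's case. The following self-contained sketch reproduces that idea with simplified stand-in types; the Pid and Author classes and the literal classids are invented for this illustration and are not the schema classes used in the diff.

    import java.util.ArrayList;
    import java.util.List;
    import java.util.Locale;

    public class OrcidPropagationSketch {

        // Simplified stand-in for the schema's StructuredProperty/Qualifier pair.
        static class Pid {
            final String classid;
            final String value;

            Pid(String classid, String value) {
                this.classid = classid;
                this.value = value;
            }
        }

        // Simplified stand-in for the schema's Author.
        static class Author {
            final List<Pid> pids = new ArrayList<>();
        }

        // True when the author already carries an orcid or orcid_pending pid, in any case.
        static boolean hasOrcid(Author a) {
            return a.pids
                .stream()
                .map(p -> p.classid.toLowerCase(Locale.ROOT))
                .anyMatch(c -> c.equals("orcid") || c.equals("orcid_pending"));
        }

        // Attach a propagated identifier as orcid_pending only when no ORCID pid is present yet.
        static void propagateOrcid(Author a, String orcid) {
            if (!hasOrcid(a)) {
                a.pids.add(new Pid("orcid_pending", orcid));
            }
        }

        public static void main(String[] args) {
            Author author = new Author();
            propagateOrcid(author, "0000-0002-8825-3517");
            propagateOrcid(author, "0000-0002-8825-3517"); // second call is a no-op
            System.out.println(author.pids.size()); // 1
        }
    }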
@@ -19,8 +19,11 @@ import org.junit.jupiter.api.Test;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import com.cloudera.org.codehaus.jackson.map.jsontype.impl.ClassNameIdResolver;
 import com.fasterxml.jackson.databind.ObjectMapper;
 
+import eu.dnetlib.dhp.PropagationConstant;
+import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.oaf.Dataset;
 
 public class OrcidPropagationJobTest {
@@ -166,7 +169,9 @@ public class OrcidPropagationJobTest {
 				propagatedAuthors
 					.filter(
 						"id = '50|dedup_wf_001::95b033c0c3961f6a1cdcd41a99a9632e' "
-							+ "and name = 'Vajinder' and surname = 'Kumar' and pidType = 'ORCID'")
+							+ "and name = 'Vajinder' and surname = 'Kumar' and pidType = '" +
+
+							ModelConstants.ORCID_PENDING + "'")
 					.count());
 
 		Assertions.assertEquals(1, propagatedAuthors.filter("pid = '0000-0002-8825-3517'").count());
@@ -189,6 +189,7 @@ public class CleaningFunctions {
 				author.setRank(i++);
 			}
 		}
+
 		for (Author a : r.getAuthor()) {
 			if (Objects.isNull(a.getPid())) {
 				a.setPid(Lists.newArrayList());
@@ -201,13 +202,29 @@ public class CleaningFunctions {
 					.filter(p -> Objects.nonNull(p.getQualifier()))
 					.filter(p -> StringUtils.isNotBlank(p.getValue()))
 					.map(p -> {
+						// hack to distinguish orcid from orcid_pending
+						String pidProvenance = Optional
+							.ofNullable(p.getDataInfo())
+							.map(
+								d -> Optional
+									.ofNullable(d.getProvenanceaction())
+									.map(Qualifier::getClassid)
+									.orElse(""))
+							.orElse("");
+						if (pidProvenance.equals(ModelConstants.SYSIMPORT_CROSSWALK_ENTITYREGISTRY)) {
+							p.getQualifier().setClassid(ModelConstants.ORCID);
+						} else {
+							p.getQualifier().setClassid(ModelConstants.ORCID_PENDING);
+						}
 						p.setValue(p.getValue().trim().replaceAll(ORCID_PREFIX_REGEX, ""));
 						return p;
 					})
 					.collect(
 						Collectors
 							.toMap(
-								StructuredProperty::getValue, Function.identity(), (p1, p2) -> p1,
+								p -> p.getQualifier().getClassid() + p.getValue(),
+								Function.identity(),
+								(p1, p2) -> p1,
 								LinkedHashMap::new))
 					.values()
 					.stream()
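The hunk above does two things: it marks an author pid as orcid only when its provenance is the entity-registry crosswalk (otherwise it is downgraded to orcid_pending), and it de-duplicates pids by classid plus value instead of value alone, so a confirmed and a pending ORCID with the same value are no longer collapsed. The following self-contained sketch reproduces that logic with simplified stand-in types; the Pid class and the literal classid and provenance strings are assumptions mirroring the ModelConstants referenced in the diff, not the project's own classes.

    import java.util.Arrays;
    import java.util.Collection;
    import java.util.LinkedHashMap;
    import java.util.List;
    import java.util.function.Function;
    import java.util.stream.Collectors;

    public class PidCleaningSketch {

        // Stand-in for StructuredProperty: a pid with a classid, a value and a provenance classid.
        static class Pid {
            String classid;
            String value;
            String provenance;

            Pid(String classid, String value, String provenance) {
                this.classid = classid;
                this.value = value;
                this.provenance = provenance;
            }

            @Override
            public String toString() {
                return classid + ":" + value;
            }
        }

        // Assumed mirrors of the ModelConstants used in the diff.
        static final String ORCID = "orcid";
        static final String ORCID_PENDING = "orcid_pending";
        static final String SYSIMPORT_CROSSWALK_ENTITYREGISTRY = "sysimport:crosswalk:entityregistry";

        static Collection<Pid> cleanOrcidPids(List<Pid> pids) {
            return pids
                .stream()
                .map(p -> {
                    // pids coming from the entity registry are trusted as real ORCIDs,
                    // everything else is downgraded to orcid_pending
                    p.classid = SYSIMPORT_CROSSWALK_ENTITYREGISTRY.equals(p.provenance) ? ORCID : ORCID_PENDING;
                    return p;
                })
                // de-duplicate by classid + value, keeping the first occurrence and insertion order
                .collect(
                    Collectors
                        .toMap(
                            p -> p.classid + p.value,
                            Function.identity(),
                            (p1, p2) -> p1,
                            LinkedHashMap::new))
                .values();
        }

        public static void main(String[] args) {
            List<Pid> pids = Arrays
                .asList(
                    new Pid("orcid", "0000-0002-8825-3517", "sysimport:crosswalk:entityregistry"),
                    new Pid("orcid", "0000-0002-8825-3517", "sysimport:actionset"));
            // Prints both entries, since the differing classids now keep them apart:
            // [orcid:0000-0002-8825-3517, orcid_pending:0000-0002-8825-3517]
            System.out.println(cleanOrcidPids(pids));
        }
    }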