merge branch with master
This commit is contained in:
commit
7c86e66697
|
@ -1,10 +1,10 @@
|
|||
|
||||
package eu.dnetlib.dhp.common;
|
||||
|
||||
import com.google.common.collect.Maps;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import com.google.common.collect.Maps;
|
||||
|
||||
public class Constants {
|
||||
|
||||
public static final Map<String, String> accessRightsCoarMap = Maps.newHashMap();
|
||||
|
|
|
@ -1,6 +1,10 @@
|
|||
|
||||
package eu.dnetlib.dhp.common;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.dump.oaf.*;
|
||||
import eu.dnetlib.dhp.schema.dump.oaf.community.CommunityInstance;
|
||||
|
@ -10,406 +14,399 @@ import eu.dnetlib.dhp.schema.oaf.Field;
|
|||
import eu.dnetlib.dhp.schema.oaf.Journal;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
public class GraphResultMapper implements Serializable {
|
||||
|
||||
public static <E extends eu.dnetlib.dhp.schema.oaf.OafEntity> Result map(
|
||||
E in) {
|
||||
public static <E extends eu.dnetlib.dhp.schema.oaf.OafEntity> Result map(
|
||||
E in) {
|
||||
|
||||
CommunityResult out = new CommunityResult();
|
||||
CommunityResult out = new CommunityResult();
|
||||
|
||||
eu.dnetlib.dhp.schema.oaf.Result input = (eu.dnetlib.dhp.schema.oaf.Result) in;
|
||||
Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> ort = Optional.ofNullable(input.getResulttype());
|
||||
if (ort.isPresent()) {
|
||||
switch (ort.get().getClassid()) {
|
||||
case "publication":
|
||||
Optional<Journal> journal = Optional
|
||||
.ofNullable(((eu.dnetlib.dhp.schema.oaf.Publication) input).getJournal());
|
||||
if (journal.isPresent()) {
|
||||
Journal j = journal.get();
|
||||
Container c = new Container();
|
||||
c.setConferencedate(j.getConferencedate());
|
||||
c.setConferenceplace(j.getConferenceplace());
|
||||
c.setEdition(j.getEdition());
|
||||
c.setEp(j.getEp());
|
||||
c.setIss(j.getIss());
|
||||
c.setIssnLinking(j.getIssnLinking());
|
||||
c.setIssnOnline(j.getIssnOnline());
|
||||
c.setIssnPrinted(j.getIssnPrinted());
|
||||
c.setName(j.getName());
|
||||
c.setSp(j.getSp());
|
||||
c.setVol(j.getVol());
|
||||
out.setContainer(c);
|
||||
out.setType(ModelConstants.PUBLICATION_DEFAULT_RESULTTYPE.getClassname());
|
||||
}
|
||||
break;
|
||||
case "dataset":
|
||||
eu.dnetlib.dhp.schema.oaf.Dataset id = (eu.dnetlib.dhp.schema.oaf.Dataset) input;
|
||||
Optional.ofNullable(id.getSize()).ifPresent(v -> out.setSize(v.getValue()));
|
||||
Optional.ofNullable(id.getVersion()).ifPresent(v -> out.setVersion(v.getValue()));
|
||||
eu.dnetlib.dhp.schema.oaf.Result input = (eu.dnetlib.dhp.schema.oaf.Result) in;
|
||||
Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> ort = Optional.ofNullable(input.getResulttype());
|
||||
if (ort.isPresent()) {
|
||||
switch (ort.get().getClassid()) {
|
||||
case "publication":
|
||||
Optional<Journal> journal = Optional
|
||||
.ofNullable(((eu.dnetlib.dhp.schema.oaf.Publication) input).getJournal());
|
||||
if (journal.isPresent()) {
|
||||
Journal j = journal.get();
|
||||
Container c = new Container();
|
||||
c.setConferencedate(j.getConferencedate());
|
||||
c.setConferenceplace(j.getConferenceplace());
|
||||
c.setEdition(j.getEdition());
|
||||
c.setEp(j.getEp());
|
||||
c.setIss(j.getIss());
|
||||
c.setIssnLinking(j.getIssnLinking());
|
||||
c.setIssnOnline(j.getIssnOnline());
|
||||
c.setIssnPrinted(j.getIssnPrinted());
|
||||
c.setName(j.getName());
|
||||
c.setSp(j.getSp());
|
||||
c.setVol(j.getVol());
|
||||
out.setContainer(c);
|
||||
out.setType(ModelConstants.PUBLICATION_DEFAULT_RESULTTYPE.getClassname());
|
||||
}
|
||||
break;
|
||||
case "dataset":
|
||||
eu.dnetlib.dhp.schema.oaf.Dataset id = (eu.dnetlib.dhp.schema.oaf.Dataset) input;
|
||||
Optional.ofNullable(id.getSize()).ifPresent(v -> out.setSize(v.getValue()));
|
||||
Optional.ofNullable(id.getVersion()).ifPresent(v -> out.setVersion(v.getValue()));
|
||||
|
||||
out
|
||||
.setGeolocation(
|
||||
Optional
|
||||
.ofNullable(id.getGeolocation())
|
||||
.map(
|
||||
igl -> igl
|
||||
.stream()
|
||||
.filter(Objects::nonNull)
|
||||
.map(gli -> {
|
||||
GeoLocation gl = new GeoLocation();
|
||||
gl.setBox(gli.getBox());
|
||||
gl.setPlace(gli.getPlace());
|
||||
gl.setPoint(gli.getPoint());
|
||||
return gl;
|
||||
})
|
||||
.collect(Collectors.toList()))
|
||||
.orElse(null));
|
||||
out
|
||||
.setGeolocation(
|
||||
Optional
|
||||
.ofNullable(id.getGeolocation())
|
||||
.map(
|
||||
igl -> igl
|
||||
.stream()
|
||||
.filter(Objects::nonNull)
|
||||
.map(gli -> {
|
||||
GeoLocation gl = new GeoLocation();
|
||||
gl.setBox(gli.getBox());
|
||||
gl.setPlace(gli.getPlace());
|
||||
gl.setPoint(gli.getPoint());
|
||||
return gl;
|
||||
})
|
||||
.collect(Collectors.toList()))
|
||||
.orElse(null));
|
||||
|
||||
out.setType(ModelConstants.DATASET_DEFAULT_RESULTTYPE.getClassname());
|
||||
break;
|
||||
case "software":
|
||||
out.setType(ModelConstants.DATASET_DEFAULT_RESULTTYPE.getClassname());
|
||||
break;
|
||||
case "software":
|
||||
|
||||
eu.dnetlib.dhp.schema.oaf.Software is = (eu.dnetlib.dhp.schema.oaf.Software) input;
|
||||
Optional
|
||||
.ofNullable(is.getCodeRepositoryUrl())
|
||||
.ifPresent(value -> out.setCodeRepositoryUrl(value.getValue()));
|
||||
Optional
|
||||
.ofNullable(is.getDocumentationUrl())
|
||||
.ifPresent(
|
||||
value -> out
|
||||
.setDocumentationUrl(
|
||||
value
|
||||
.stream()
|
||||
.map(v -> v.getValue())
|
||||
.collect(Collectors.toList())));
|
||||
eu.dnetlib.dhp.schema.oaf.Software is = (eu.dnetlib.dhp.schema.oaf.Software) input;
|
||||
Optional
|
||||
.ofNullable(is.getCodeRepositoryUrl())
|
||||
.ifPresent(value -> out.setCodeRepositoryUrl(value.getValue()));
|
||||
Optional
|
||||
.ofNullable(is.getDocumentationUrl())
|
||||
.ifPresent(
|
||||
value -> out
|
||||
.setDocumentationUrl(
|
||||
value
|
||||
.stream()
|
||||
.map(v -> v.getValue())
|
||||
.collect(Collectors.toList())));
|
||||
|
||||
Optional
|
||||
.ofNullable(is.getProgrammingLanguage())
|
||||
.ifPresent(value -> out.setProgrammingLanguage(value.getClassid()));
|
||||
Optional
|
||||
.ofNullable(is.getProgrammingLanguage())
|
||||
.ifPresent(value -> out.setProgrammingLanguage(value.getClassid()));
|
||||
|
||||
out.setType(ModelConstants.SOFTWARE_DEFAULT_RESULTTYPE.getClassname());
|
||||
break;
|
||||
case "other":
|
||||
out.setType(ModelConstants.SOFTWARE_DEFAULT_RESULTTYPE.getClassname());
|
||||
break;
|
||||
case "other":
|
||||
|
||||
eu.dnetlib.dhp.schema.oaf.OtherResearchProduct ir = (eu.dnetlib.dhp.schema.oaf.OtherResearchProduct) input;
|
||||
out
|
||||
.setContactgroup(
|
||||
Optional
|
||||
.ofNullable(ir.getContactgroup())
|
||||
.map(value -> value.stream().map(cg -> cg.getValue()).collect(Collectors.toList()))
|
||||
.orElse(null));
|
||||
eu.dnetlib.dhp.schema.oaf.OtherResearchProduct ir = (eu.dnetlib.dhp.schema.oaf.OtherResearchProduct) input;
|
||||
out
|
||||
.setContactgroup(
|
||||
Optional
|
||||
.ofNullable(ir.getContactgroup())
|
||||
.map(value -> value.stream().map(cg -> cg.getValue()).collect(Collectors.toList()))
|
||||
.orElse(null));
|
||||
|
||||
out
|
||||
.setContactperson(
|
||||
Optional
|
||||
.ofNullable(ir.getContactperson())
|
||||
.map(value -> value.stream().map(cp -> cp.getValue()).collect(Collectors.toList()))
|
||||
.orElse(null));
|
||||
out
|
||||
.setTool(
|
||||
Optional
|
||||
.ofNullable(ir.getTool())
|
||||
.map(value -> value.stream().map(t -> t.getValue()).collect(Collectors.toList()))
|
||||
.orElse(null));
|
||||
out
|
||||
.setContactperson(
|
||||
Optional
|
||||
.ofNullable(ir.getContactperson())
|
||||
.map(value -> value.stream().map(cp -> cp.getValue()).collect(Collectors.toList()))
|
||||
.orElse(null));
|
||||
out
|
||||
.setTool(
|
||||
Optional
|
||||
.ofNullable(ir.getTool())
|
||||
.map(value -> value.stream().map(t -> t.getValue()).collect(Collectors.toList()))
|
||||
.orElse(null));
|
||||
|
||||
out.setType(ModelConstants.ORP_DEFAULT_RESULTTYPE.getClassname());
|
||||
out.setType(ModelConstants.ORP_DEFAULT_RESULTTYPE.getClassname());
|
||||
|
||||
break;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
Optional
|
||||
.ofNullable(input.getAuthor())
|
||||
.ifPresent(ats -> out.setAuthor(ats.stream().map(at -> getAuthor(at)).collect(Collectors.toList())));
|
||||
Optional
|
||||
.ofNullable(input.getAuthor())
|
||||
.ifPresent(ats -> out.setAuthor(ats.stream().map(at -> getAuthor(at)).collect(Collectors.toList())));
|
||||
|
||||
// I do not map Access Right UNKNOWN or OTHER
|
||||
// I do not map Access Right UNKNOWN or OTHER
|
||||
|
||||
Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> oar = Optional.ofNullable(input.getBestaccessright());
|
||||
if (oar.isPresent()) {
|
||||
if (Constants.accessRightsCoarMap.containsKey(oar.get().getClassid())) {
|
||||
String code = Constants.accessRightsCoarMap.get(oar.get().getClassid());
|
||||
out
|
||||
.setBestaccessright(
|
||||
AccessRight
|
||||
.newInstance(
|
||||
code,
|
||||
Constants.coarCodeLabelMap.get(code),
|
||||
Constants.COAR_ACCESS_RIGHT_SCHEMA));
|
||||
}
|
||||
}
|
||||
Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> oar = Optional.ofNullable(input.getBestaccessright());
|
||||
if (oar.isPresent()) {
|
||||
if (Constants.accessRightsCoarMap.containsKey(oar.get().getClassid())) {
|
||||
String code = Constants.accessRightsCoarMap.get(oar.get().getClassid());
|
||||
out
|
||||
.setBestaccessright(
|
||||
AccessRight
|
||||
.newInstance(
|
||||
code,
|
||||
Constants.coarCodeLabelMap.get(code),
|
||||
Constants.COAR_ACCESS_RIGHT_SCHEMA));
|
||||
}
|
||||
}
|
||||
|
||||
final List<String> contributorList = new ArrayList<>();
|
||||
Optional
|
||||
.ofNullable(input.getContributor())
|
||||
.ifPresent(value -> value.stream().forEach(c -> contributorList.add(c.getValue())));
|
||||
out.setContributor(contributorList);
|
||||
final List<String> contributorList = new ArrayList<>();
|
||||
Optional
|
||||
.ofNullable(input.getContributor())
|
||||
.ifPresent(value -> value.stream().forEach(c -> contributorList.add(c.getValue())));
|
||||
out.setContributor(contributorList);
|
||||
|
||||
Optional
|
||||
.ofNullable(input.getCountry())
|
||||
.ifPresent(
|
||||
value -> out
|
||||
.setCountry(
|
||||
value
|
||||
.stream()
|
||||
.map(
|
||||
c -> {
|
||||
if (c.getClassid().equals((ModelConstants.UNKNOWN))) {
|
||||
return null;
|
||||
}
|
||||
Country country = new Country();
|
||||
country.setCode(c.getClassid());
|
||||
country.setLabel(c.getClassname());
|
||||
Optional
|
||||
.ofNullable(c.getDataInfo())
|
||||
.ifPresent(
|
||||
provenance -> country
|
||||
.setProvenance(
|
||||
Provenance
|
||||
.newInstance(
|
||||
provenance
|
||||
.getProvenanceaction()
|
||||
.getClassname(),
|
||||
c.getDataInfo().getTrust())));
|
||||
return country;
|
||||
})
|
||||
.filter(Objects::nonNull)
|
||||
.collect(Collectors.toList())));
|
||||
Optional
|
||||
.ofNullable(input.getCountry())
|
||||
.ifPresent(
|
||||
value -> out
|
||||
.setCountry(
|
||||
value
|
||||
.stream()
|
||||
.map(
|
||||
c -> {
|
||||
if (c.getClassid().equals((ModelConstants.UNKNOWN))) {
|
||||
return null;
|
||||
}
|
||||
Country country = new Country();
|
||||
country.setCode(c.getClassid());
|
||||
country.setLabel(c.getClassname());
|
||||
Optional
|
||||
.ofNullable(c.getDataInfo())
|
||||
.ifPresent(
|
||||
provenance -> country
|
||||
.setProvenance(
|
||||
Provenance
|
||||
.newInstance(
|
||||
provenance
|
||||
.getProvenanceaction()
|
||||
.getClassname(),
|
||||
c.getDataInfo().getTrust())));
|
||||
return country;
|
||||
})
|
||||
.filter(Objects::nonNull)
|
||||
.collect(Collectors.toList())));
|
||||
|
||||
final List<String> coverageList = new ArrayList<>();
|
||||
Optional
|
||||
.ofNullable(input.getCoverage())
|
||||
.ifPresent(value -> value.stream().forEach(c -> coverageList.add(c.getValue())));
|
||||
out.setCoverage(coverageList);
|
||||
final List<String> coverageList = new ArrayList<>();
|
||||
Optional
|
||||
.ofNullable(input.getCoverage())
|
||||
.ifPresent(value -> value.stream().forEach(c -> coverageList.add(c.getValue())));
|
||||
out.setCoverage(coverageList);
|
||||
|
||||
out.setDateofcollection(input.getDateofcollection());
|
||||
out.setDateofcollection(input.getDateofcollection());
|
||||
|
||||
final List<String> descriptionList = new ArrayList<>();
|
||||
Optional
|
||||
.ofNullable(input.getDescription())
|
||||
.ifPresent(value -> value.forEach(d -> descriptionList.add(d.getValue())));
|
||||
out.setDescription(descriptionList);
|
||||
Optional<Field<String>> oStr = Optional.ofNullable(input.getEmbargoenddate());
|
||||
if (oStr.isPresent()) {
|
||||
out.setEmbargoenddate(oStr.get().getValue());
|
||||
}
|
||||
final List<String> descriptionList = new ArrayList<>();
|
||||
Optional
|
||||
.ofNullable(input.getDescription())
|
||||
.ifPresent(value -> value.forEach(d -> descriptionList.add(d.getValue())));
|
||||
out.setDescription(descriptionList);
|
||||
Optional<Field<String>> oStr = Optional.ofNullable(input.getEmbargoenddate());
|
||||
if (oStr.isPresent()) {
|
||||
out.setEmbargoenddate(oStr.get().getValue());
|
||||
}
|
||||
|
||||
final List<String> formatList = new ArrayList<>();
|
||||
Optional
|
||||
.ofNullable(input.getFormat())
|
||||
.ifPresent(value -> value.stream().forEach(f -> formatList.add(f.getValue())));
|
||||
out.setFormat(formatList);
|
||||
out.setId(input.getId());
|
||||
out.setOriginalId(input.getOriginalId());
|
||||
final List<String> formatList = new ArrayList<>();
|
||||
Optional
|
||||
.ofNullable(input.getFormat())
|
||||
.ifPresent(value -> value.stream().forEach(f -> formatList.add(f.getValue())));
|
||||
out.setFormat(formatList);
|
||||
out.setId(input.getId());
|
||||
out.setOriginalId(input.getOriginalId());
|
||||
|
||||
Optional<List<eu.dnetlib.dhp.schema.oaf.Instance>> oInst = Optional
|
||||
.ofNullable(input.getInstance());
|
||||
Optional<List<eu.dnetlib.dhp.schema.oaf.Instance>> oInst = Optional
|
||||
.ofNullable(input.getInstance());
|
||||
|
||||
if (oInst.isPresent()) {
|
||||
out
|
||||
.setInstance(
|
||||
oInst.get().stream().map(i -> getInstance(i)).collect(Collectors.toList()));
|
||||
if (oInst.isPresent()) {
|
||||
out
|
||||
.setInstance(
|
||||
oInst.get().stream().map(i -> getInstance(i)).collect(Collectors.toList()));
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> oL = Optional.ofNullable(input.getLanguage());
|
||||
if (oL.isPresent()) {
|
||||
eu.dnetlib.dhp.schema.oaf.Qualifier language = oL.get();
|
||||
out.setLanguage(Qualifier.newInstance(language.getClassid(), language.getClassname()));
|
||||
}
|
||||
Optional<Long> oLong = Optional.ofNullable(input.getLastupdatetimestamp());
|
||||
if (oLong.isPresent()) {
|
||||
out.setLastupdatetimestamp(oLong.get());
|
||||
}
|
||||
Optional<List<StructuredProperty>> otitle = Optional.ofNullable(input.getTitle());
|
||||
if (otitle.isPresent()) {
|
||||
List<StructuredProperty> iTitle = otitle
|
||||
.get()
|
||||
.stream()
|
||||
.filter(t -> t.getQualifier().getClassid().equalsIgnoreCase("main title"))
|
||||
.collect(Collectors.toList());
|
||||
if (iTitle.size() > 0) {
|
||||
out.setMaintitle(iTitle.get(0).getValue());
|
||||
}
|
||||
Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> oL = Optional.ofNullable(input.getLanguage());
|
||||
if (oL.isPresent()) {
|
||||
eu.dnetlib.dhp.schema.oaf.Qualifier language = oL.get();
|
||||
out.setLanguage(Qualifier.newInstance(language.getClassid(), language.getClassname()));
|
||||
}
|
||||
Optional<Long> oLong = Optional.ofNullable(input.getLastupdatetimestamp());
|
||||
if (oLong.isPresent()) {
|
||||
out.setLastupdatetimestamp(oLong.get());
|
||||
}
|
||||
Optional<List<StructuredProperty>> otitle = Optional.ofNullable(input.getTitle());
|
||||
if (otitle.isPresent()) {
|
||||
List<StructuredProperty> iTitle = otitle
|
||||
.get()
|
||||
.stream()
|
||||
.filter(t -> t.getQualifier().getClassid().equalsIgnoreCase("main title"))
|
||||
.collect(Collectors.toList());
|
||||
if (iTitle.size() > 0) {
|
||||
out.setMaintitle(iTitle.get(0).getValue());
|
||||
}
|
||||
|
||||
iTitle = otitle
|
||||
.get()
|
||||
.stream()
|
||||
.filter(t -> t.getQualifier().getClassid().equalsIgnoreCase("subtitle"))
|
||||
.collect(Collectors.toList());
|
||||
if (iTitle.size() > 0) {
|
||||
out.setSubtitle(iTitle.get(0).getValue());
|
||||
}
|
||||
iTitle = otitle
|
||||
.get()
|
||||
.stream()
|
||||
.filter(t -> t.getQualifier().getClassid().equalsIgnoreCase("subtitle"))
|
||||
.collect(Collectors.toList());
|
||||
if (iTitle.size() > 0) {
|
||||
out.setSubtitle(iTitle.get(0).getValue());
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
List<ControlledField> pids = new ArrayList<>();
|
||||
Optional
|
||||
.ofNullable(input.getPid())
|
||||
.ifPresent(
|
||||
value -> value
|
||||
.stream()
|
||||
.forEach(
|
||||
p -> pids
|
||||
.add(
|
||||
ControlledField
|
||||
.newInstance(p.getQualifier().getClassid(), p.getValue()))));
|
||||
out.setPid(pids);
|
||||
oStr = Optional.ofNullable(input.getDateofacceptance());
|
||||
if (oStr.isPresent()) {
|
||||
out.setPublicationdate(oStr.get().getValue());
|
||||
}
|
||||
oStr = Optional.ofNullable(input.getPublisher());
|
||||
if (oStr.isPresent()) {
|
||||
out.setPublisher(oStr.get().getValue());
|
||||
}
|
||||
List<ControlledField> pids = new ArrayList<>();
|
||||
Optional
|
||||
.ofNullable(input.getPid())
|
||||
.ifPresent(
|
||||
value -> value
|
||||
.stream()
|
||||
.forEach(
|
||||
p -> pids
|
||||
.add(
|
||||
ControlledField
|
||||
.newInstance(p.getQualifier().getClassid(), p.getValue()))));
|
||||
out.setPid(pids);
|
||||
oStr = Optional.ofNullable(input.getDateofacceptance());
|
||||
if (oStr.isPresent()) {
|
||||
out.setPublicationdate(oStr.get().getValue());
|
||||
}
|
||||
oStr = Optional.ofNullable(input.getPublisher());
|
||||
if (oStr.isPresent()) {
|
||||
out.setPublisher(oStr.get().getValue());
|
||||
}
|
||||
|
||||
List<String> sourceList = new ArrayList<>();
|
||||
Optional
|
||||
.ofNullable(input.getSource())
|
||||
.ifPresent(value -> value.stream().forEach(s -> sourceList.add(s.getValue())));
|
||||
// out.setSource(input.getSource().stream().map(s -> s.getValue()).collect(Collectors.toList()));
|
||||
List<Subject> subjectList = new ArrayList<>();
|
||||
Optional
|
||||
.ofNullable(input.getSubject())
|
||||
.ifPresent(
|
||||
value -> value
|
||||
.forEach(s -> subjectList.add(getSubject(s))));
|
||||
List<String> sourceList = new ArrayList<>();
|
||||
Optional
|
||||
.ofNullable(input.getSource())
|
||||
.ifPresent(value -> value.stream().forEach(s -> sourceList.add(s.getValue())));
|
||||
// out.setSource(input.getSource().stream().map(s -> s.getValue()).collect(Collectors.toList()));
|
||||
List<Subject> subjectList = new ArrayList<>();
|
||||
Optional
|
||||
.ofNullable(input.getSubject())
|
||||
.ifPresent(
|
||||
value -> value
|
||||
.forEach(s -> subjectList.add(getSubject(s))));
|
||||
|
||||
out.setSubjects(subjectList);
|
||||
out.setSubjects(subjectList);
|
||||
|
||||
out.setType(input.getResulttype().getClassid());
|
||||
}
|
||||
out.setType(input.getResulttype().getClassid());
|
||||
}
|
||||
|
||||
out
|
||||
.setCollectedfrom(
|
||||
input
|
||||
.getCollectedfrom()
|
||||
.stream()
|
||||
.map(cf -> KeyValue.newInstance(cf.getKey(), cf.getValue()))
|
||||
.collect(Collectors.toList()));
|
||||
out
|
||||
.setCollectedfrom(
|
||||
input
|
||||
.getCollectedfrom()
|
||||
.stream()
|
||||
.map(cf -> KeyValue.newInstance(cf.getKey(), cf.getValue()))
|
||||
.collect(Collectors.toList()));
|
||||
|
||||
return out;
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
}
|
||||
private static CommunityInstance getInstance(eu.dnetlib.dhp.schema.oaf.Instance i) {
|
||||
CommunityInstance instance = new CommunityInstance();
|
||||
|
||||
setCommonValue(i, instance);
|
||||
|
||||
private static CommunityInstance getInstance(eu.dnetlib.dhp.schema.oaf.Instance i) {
|
||||
CommunityInstance instance = new CommunityInstance();
|
||||
instance
|
||||
.setCollectedfrom(
|
||||
KeyValue
|
||||
.newInstance(i.getCollectedfrom().getKey(), i.getCollectedfrom().getValue()));
|
||||
|
||||
setCommonValue(i, instance);
|
||||
instance
|
||||
.setHostedby(
|
||||
KeyValue.newInstance(i.getHostedby().getKey(), i.getHostedby().getValue()));
|
||||
|
||||
instance
|
||||
.setCollectedfrom(
|
||||
KeyValue
|
||||
.newInstance(i.getCollectedfrom().getKey(), i.getCollectedfrom().getValue()));
|
||||
return instance;
|
||||
|
||||
instance
|
||||
.setHostedby(
|
||||
KeyValue.newInstance(i.getHostedby().getKey(), i.getHostedby().getValue()));
|
||||
}
|
||||
|
||||
return instance;
|
||||
private static <I extends Instance> void setCommonValue(eu.dnetlib.dhp.schema.oaf.Instance i, I instance) {
|
||||
Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> opAr = Optional
|
||||
.ofNullable(i.getAccessright());
|
||||
if (opAr.isPresent()) {
|
||||
if (Constants.accessRightsCoarMap.containsKey(opAr.get().getClassid())) {
|
||||
String code = Constants.accessRightsCoarMap.get(opAr.get().getClassid());
|
||||
instance
|
||||
.setAccessright(
|
||||
AccessRight
|
||||
.newInstance(
|
||||
code,
|
||||
Constants.coarCodeLabelMap.get(code),
|
||||
Constants.COAR_ACCESS_RIGHT_SCHEMA));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
Optional
|
||||
.ofNullable(i.getLicense())
|
||||
.ifPresent(value -> instance.setLicense(value.getValue()));
|
||||
Optional
|
||||
.ofNullable(i.getDateofacceptance())
|
||||
.ifPresent(value -> instance.setPublicationdate(value.getValue()));
|
||||
Optional
|
||||
.ofNullable(i.getRefereed())
|
||||
.ifPresent(value -> instance.setRefereed(value.getClassname()));
|
||||
Optional
|
||||
.ofNullable(i.getInstancetype())
|
||||
.ifPresent(value -> instance.setType(value.getClassname()));
|
||||
Optional.ofNullable(i.getUrl()).ifPresent(value -> instance.setUrl(value));
|
||||
|
||||
private static <I extends Instance> void setCommonValue(eu.dnetlib.dhp.schema.oaf.Instance i, I instance) {
|
||||
Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> opAr = Optional
|
||||
.ofNullable(i.getAccessright());
|
||||
if (opAr.isPresent()) {
|
||||
if (Constants.accessRightsCoarMap.containsKey(opAr.get().getClassid())) {
|
||||
String code = Constants.accessRightsCoarMap.get(opAr.get().getClassid());
|
||||
instance
|
||||
.setAccessright(
|
||||
AccessRight
|
||||
.newInstance(
|
||||
code,
|
||||
Constants.coarCodeLabelMap.get(code),
|
||||
Constants.COAR_ACCESS_RIGHT_SCHEMA));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Optional
|
||||
.ofNullable(i.getLicense())
|
||||
.ifPresent(value -> instance.setLicense(value.getValue()));
|
||||
Optional
|
||||
.ofNullable(i.getDateofacceptance())
|
||||
.ifPresent(value -> instance.setPublicationdate(value.getValue()));
|
||||
Optional
|
||||
.ofNullable(i.getRefereed())
|
||||
.ifPresent(value -> instance.setRefereed(value.getClassname()));
|
||||
Optional
|
||||
.ofNullable(i.getInstancetype())
|
||||
.ifPresent(value -> instance.setType(value.getClassname()));
|
||||
Optional.ofNullable(i.getUrl()).ifPresent(value -> instance.setUrl(value));
|
||||
private static Subject getSubject(StructuredProperty s) {
|
||||
Subject subject = new Subject();
|
||||
subject.setSubject(ControlledField.newInstance(s.getQualifier().getClassid(), s.getValue()));
|
||||
Optional<DataInfo> di = Optional.ofNullable(s.getDataInfo());
|
||||
if (di.isPresent()) {
|
||||
Provenance p = new Provenance();
|
||||
p.setProvenance(di.get().getProvenanceaction().getClassname());
|
||||
p.setTrust(di.get().getTrust());
|
||||
subject.setProvenance(p);
|
||||
}
|
||||
|
||||
}
|
||||
return subject;
|
||||
}
|
||||
|
||||
private static Author getAuthor(eu.dnetlib.dhp.schema.oaf.Author oa) {
|
||||
Author a = new Author();
|
||||
a.setFullname(oa.getFullname());
|
||||
a.setName(oa.getName());
|
||||
a.setSurname(oa.getSurname());
|
||||
a.setRank(oa.getRank());
|
||||
|
||||
private static Subject getSubject(StructuredProperty s) {
|
||||
Subject subject = new Subject();
|
||||
subject.setSubject(ControlledField.newInstance(s.getQualifier().getClassid(), s.getValue()));
|
||||
Optional<DataInfo> di = Optional.ofNullable(s.getDataInfo());
|
||||
if (di.isPresent()) {
|
||||
Provenance p = new Provenance();
|
||||
p.setProvenance(di.get().getProvenanceaction().getClassname());
|
||||
p.setTrust(di.get().getTrust());
|
||||
subject.setProvenance(p);
|
||||
}
|
||||
Optional<List<StructuredProperty>> oPids = Optional
|
||||
.ofNullable(oa.getPid());
|
||||
if (oPids.isPresent()) {
|
||||
Pid pid = getOrcid(oPids.get());
|
||||
if (pid != null) {
|
||||
a.setPid(pid);
|
||||
}
|
||||
}
|
||||
|
||||
return subject;
|
||||
}
|
||||
return a;
|
||||
}
|
||||
|
||||
private static Author getAuthor(eu.dnetlib.dhp.schema.oaf.Author oa) {
|
||||
Author a = new Author();
|
||||
a.setFullname(oa.getFullname());
|
||||
a.setName(oa.getName());
|
||||
a.setSurname(oa.getSurname());
|
||||
a.setRank(oa.getRank());
|
||||
private static Pid getOrcid(List<StructuredProperty> p) {
|
||||
for (StructuredProperty pid : p) {
|
||||
if (pid.getQualifier().getClassid().equals(ModelConstants.ORCID)) {
|
||||
Optional<DataInfo> di = Optional.ofNullable(pid.getDataInfo());
|
||||
if (di.isPresent()) {
|
||||
return Pid
|
||||
.newInstance(
|
||||
ControlledField
|
||||
.newInstance(
|
||||
pid.getQualifier().getClassid(),
|
||||
pid.getValue()),
|
||||
Provenance
|
||||
.newInstance(
|
||||
di.get().getProvenanceaction().getClassname(),
|
||||
di.get().getTrust()));
|
||||
} else {
|
||||
return Pid
|
||||
.newInstance(
|
||||
ControlledField
|
||||
.newInstance(
|
||||
pid.getQualifier().getClassid(),
|
||||
pid.getValue())
|
||||
|
||||
Optional<List<StructuredProperty>> oPids = Optional
|
||||
.ofNullable(oa.getPid());
|
||||
if (oPids.isPresent()) {
|
||||
Pid pid = getOrcid(oPids.get());
|
||||
if (pid != null) {
|
||||
a.setPid(pid);
|
||||
}
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
return a;
|
||||
}
|
||||
|
||||
private static Pid getOrcid(List<StructuredProperty> p) {
|
||||
for (StructuredProperty pid : p) {
|
||||
if (pid.getQualifier().getClassid().equals(ModelConstants.ORCID)) {
|
||||
Optional<DataInfo> di = Optional.ofNullable(pid.getDataInfo());
|
||||
if (di.isPresent()) {
|
||||
return Pid
|
||||
.newInstance(
|
||||
ControlledField
|
||||
.newInstance(
|
||||
pid.getQualifier().getClassid(),
|
||||
pid.getValue()),
|
||||
Provenance
|
||||
.newInstance(
|
||||
di.get().getProvenanceaction().getClassname(),
|
||||
di.get().getTrust()));
|
||||
} else {
|
||||
return Pid
|
||||
.newInstance(
|
||||
ControlledField
|
||||
.newInstance(
|
||||
pid.getQualifier().getClassid(),
|
||||
pid.getValue())
|
||||
|
||||
);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp</artifactId>
|
||||
<version>1.2.4-SNAPSHOT</version>
|
||||
<relativePath>../</relativePath>
|
||||
<relativePath>../pom.xml</relativePath>
|
||||
</parent>
|
||||
|
||||
<artifactId>dhp-schemas</artifactId>
|
||||
|
|
|
@ -2,8 +2,12 @@
|
|||
package eu.dnetlib.dhp.schema.oaf;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
public abstract class Oaf implements Serializable {
|
||||
|
||||
|
@ -40,9 +44,34 @@ public abstract class Oaf implements Serializable {
|
|||
this.lastupdatetimestamp = lastupdatetimestamp;
|
||||
}
|
||||
|
||||
public void mergeOAFDataInfo(Oaf e) {
|
||||
if (e.getDataInfo() != null && compareTrust(this, e) < 0)
|
||||
dataInfo = e.getDataInfo();
|
||||
public void mergeFrom(Oaf o) {
|
||||
if (Objects.isNull(o)) {
|
||||
return;
|
||||
}
|
||||
setCollectedfrom(
|
||||
Stream
|
||||
.concat(
|
||||
Optional
|
||||
.ofNullable(getCollectedfrom())
|
||||
.map(Collection::stream)
|
||||
.orElse(Stream.empty()),
|
||||
Optional
|
||||
.ofNullable(o.getCollectedfrom())
|
||||
.map(Collection::stream)
|
||||
.orElse(Stream.empty()))
|
||||
.distinct() // relies on KeyValue.equals
|
||||
.collect(Collectors.toList()));
|
||||
|
||||
setLastupdatetimestamp(
|
||||
Math
|
||||
.max(
|
||||
Optional.ofNullable(getLastupdatetimestamp()).orElse(0L),
|
||||
Optional.ofNullable(o.getLastupdatetimestamp()).orElse(0L)));
|
||||
}
|
||||
|
||||
public void mergeOAFDataInfo(Oaf o) {
|
||||
if (o.getDataInfo() != null && compareTrust(this, o) < 0)
|
||||
dataInfo = o.getDataInfo();
|
||||
}
|
||||
|
||||
protected String extractTrust(Oaf e) {
|
||||
|
@ -62,7 +91,7 @@ public abstract class Oaf implements Serializable {
|
|||
if (o == null || getClass() != o.getClass())
|
||||
return false;
|
||||
Oaf oaf = (Oaf) o;
|
||||
return Objects.equals(dataInfo, oaf.dataInfo)
|
||||
return Objects.equals(getDataInfo(), oaf.getDataInfo())
|
||||
&& Objects.equals(lastupdatetimestamp, oaf.lastupdatetimestamp);
|
||||
}
|
||||
|
||||
|
|
|
@ -78,14 +78,10 @@ public abstract class OafEntity extends Oaf implements Serializable {
|
|||
}
|
||||
|
||||
public void mergeFrom(OafEntity e) {
|
||||
|
||||
if (e == null)
|
||||
return;
|
||||
super.mergeFrom(e);
|
||||
|
||||
originalId = mergeLists(originalId, e.getOriginalId());
|
||||
|
||||
collectedfrom = mergeLists(collectedfrom, e.getCollectedfrom());
|
||||
|
||||
pid = mergeLists(pid, e.getPid());
|
||||
|
||||
if (e.getDateofcollection() != null && compareTrust(this, e) < 0)
|
||||
|
|
|
@ -351,8 +351,6 @@ public class Project extends OafEntity implements Serializable {
|
|||
? p.getFundedamount()
|
||||
: fundedamount;
|
||||
|
||||
// programme = mergeLists(programme, p.getProgramme());
|
||||
|
||||
h2020classification = mergeLists(h2020classification, p.getH2020classification());
|
||||
|
||||
mergeOAFDataInfo(e);
|
||||
|
|
|
@ -130,19 +130,7 @@ public class Relation extends Oaf {
|
|||
Objects.equals(getSubRelType(), r.getSubRelType()), "subRelType(s) must be equal");
|
||||
checkArgument(Objects.equals(getRelClass(), r.getRelClass()), "relClass(es) must be equal");
|
||||
|
||||
setCollectedfrom(
|
||||
Stream
|
||||
.concat(
|
||||
Optional
|
||||
.ofNullable(getCollectedfrom())
|
||||
.map(Collection::stream)
|
||||
.orElse(Stream.empty()),
|
||||
Optional
|
||||
.ofNullable(r.getCollectedfrom())
|
||||
.map(Collection::stream)
|
||||
.orElse(Stream.empty()))
|
||||
.distinct() // relies on KeyValue.equals
|
||||
.collect(Collectors.toList()));
|
||||
super.mergeFrom(r);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -0,0 +1,25 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.orcid;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
public class OrcidDOI {
|
||||
private String doi;
|
||||
private List<AuthorData> authors;
|
||||
|
||||
public String getDoi() {
|
||||
return doi;
|
||||
}
|
||||
|
||||
public void setDoi(String doi) {
|
||||
this.doi = doi;
|
||||
}
|
||||
|
||||
public List<AuthorData> getAuthors() {
|
||||
return authors;
|
||||
}
|
||||
|
||||
public void setAuthors(List<AuthorData> authors) {
|
||||
this.authors = authors;
|
||||
}
|
||||
}
|
|
@ -32,15 +32,15 @@ public class CheckDuplictedIdsJob {
|
|||
IOUtils
|
||||
.toString(
|
||||
CheckDuplictedIdsJob.class
|
||||
.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json")));
|
||||
.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/check_duplicates.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
|
||||
final String eventsPath = parser.get("workingPath") + "/events";
|
||||
final String eventsPath = parser.get("outputDir") + "/events";
|
||||
log.info("eventsPath: {}", eventsPath);
|
||||
|
||||
final String countPath = parser.get("workingPath") + "/counts";
|
||||
final String countPath = parser.get("outputDir") + "/counts";
|
||||
log.info("countPath: {}", countPath);
|
||||
|
||||
final SparkSession spark = SparkSession.builder().config(conf).getOrCreate();
|
||||
|
@ -59,6 +59,7 @@ public class CheckDuplictedIdsJob {
|
|||
.map(o -> ClusterUtils.incrementAccumulator(o, total), Encoders.tuple(Encoders.STRING(), Encoders.LONG()))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(countPath);
|
||||
;
|
||||
|
||||
|
|
|
@ -44,10 +44,10 @@ public class GenerateEventsJob {
|
|||
.orElse(Boolean.TRUE);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String workingPath = parser.get("workingPath");
|
||||
log.info("workingPath: {}", workingPath);
|
||||
final String workingDir = parser.get("workingDir");
|
||||
log.info("workingDir: {}", workingDir);
|
||||
|
||||
final String eventsPath = workingPath + "/events";
|
||||
final String eventsPath = parser.get("outputDir") + "/events";
|
||||
log.info("eventsPath: {}", eventsPath);
|
||||
|
||||
final Set<String> dsIdWhitelist = ClusterUtils.parseParamAsList(parser, "datasourceIdWhitelist");
|
||||
|
@ -59,6 +59,9 @@ public class GenerateEventsJob {
|
|||
final Set<String> dsIdBlacklist = ClusterUtils.parseParamAsList(parser, "datasourceIdBlacklist");
|
||||
log.info("datasourceIdBlacklist: {}", StringUtils.join(dsIdBlacklist, ","));
|
||||
|
||||
final Set<String> topicWhitelist = ClusterUtils.parseParamAsList(parser, "topicWhitelist");
|
||||
log.info("topicWhitelist: {}", StringUtils.join(topicWhitelist, ","));
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
|
||||
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
|
||||
|
@ -70,12 +73,12 @@ public class GenerateEventsJob {
|
|||
final LongAccumulator total = spark.sparkContext().longAccumulator("total_events");
|
||||
|
||||
final Dataset<ResultGroup> groups = ClusterUtils
|
||||
.readPath(spark, workingPath + "/duplicates", ResultGroup.class);
|
||||
.readPath(spark, workingDir + "/duplicates", ResultGroup.class);
|
||||
|
||||
final Dataset<Event> dataset = groups
|
||||
.map(
|
||||
g -> EventFinder
|
||||
.generateEvents(g, dsIdWhitelist, dsIdBlacklist, dsTypeWhitelist, accumulators),
|
||||
.generateEvents(g, dsIdWhitelist, dsIdBlacklist, dsTypeWhitelist, topicWhitelist, accumulators),
|
||||
Encoders
|
||||
.bean(EventGroup.class))
|
||||
.flatMap(g -> g.getData().iterator(), Encoders.bean(Event.class));
|
||||
|
|
|
@ -46,7 +46,7 @@ public class GenerateStatsJob {
|
|||
|
||||
final SparkConf conf = new SparkConf();
|
||||
|
||||
final String eventsPath = parser.get("workingPath") + "/events";
|
||||
final String eventsPath = parser.get("outputDir") + "/events";
|
||||
log.info("eventsPath: {}", eventsPath);
|
||||
|
||||
final String dbUrl = parser.get("dbUrl");
|
||||
|
|
|
@ -46,7 +46,7 @@ public class IndexEventSubsetJob {
|
|||
|
||||
final SparkConf conf = new SparkConf();
|
||||
|
||||
final String eventsPath = parser.get("workingPath") + "/events";
|
||||
final String eventsPath = parser.get("outputDir") + "/events";
|
||||
log.info("eventsPath: {}", eventsPath);
|
||||
|
||||
final String index = parser.get("index");
|
||||
|
@ -55,6 +55,18 @@ public class IndexEventSubsetJob {
|
|||
final String indexHost = parser.get("esHost");
|
||||
log.info("indexHost: {}", indexHost);
|
||||
|
||||
final String esBatchWriteRetryCount = parser.get("esBatchWriteRetryCount");
|
||||
log.info("esBatchWriteRetryCount: {}", esBatchWriteRetryCount);
|
||||
|
||||
final String esBatchWriteRetryWait = parser.get("esBatchWriteRetryWait");
|
||||
log.info("esBatchWriteRetryWait: {}", esBatchWriteRetryWait);
|
||||
|
||||
final String esBatchSizeEntries = parser.get("esBatchSizeEntries");
|
||||
log.info("esBatchSizeEntries: {}", esBatchSizeEntries);
|
||||
|
||||
final String esNodesWanOnly = parser.get("esNodesWanOnly");
|
||||
log.info("esNodesWanOnly: {}", esNodesWanOnly);
|
||||
|
||||
final int maxEventsForTopic = NumberUtils.toInt(parser.get("maxEventsForTopic"));
|
||||
log.info("maxEventsForTopic: {}", maxEventsForTopic);
|
||||
|
||||
|
@ -86,10 +98,10 @@ public class IndexEventSubsetJob {
|
|||
esCfg.put("es.index.auto.create", "false");
|
||||
esCfg.put("es.nodes", indexHost);
|
||||
esCfg.put("es.mapping.id", "eventId"); // THE PRIMARY KEY
|
||||
esCfg.put("es.batch.write.retry.count", "8");
|
||||
esCfg.put("es.batch.write.retry.wait", "60s");
|
||||
esCfg.put("es.batch.size.entries", "200");
|
||||
esCfg.put("es.nodes.wan.only", "true");
|
||||
esCfg.put("es.batch.write.retry.count", esBatchWriteRetryCount);
|
||||
esCfg.put("es.batch.write.retry.wait", esBatchWriteRetryWait);
|
||||
esCfg.put("es.batch.size.entries", esBatchSizeEntries);
|
||||
esCfg.put("es.nodes.wan.only", esNodesWanOnly);
|
||||
|
||||
log.info("*** Start indexing");
|
||||
JavaEsSpark.saveJsonToEs(inputRdd, index, esCfg);
|
||||
|
|
|
@ -54,7 +54,7 @@ public class IndexNotificationsJob {
|
|||
|
||||
final SparkConf conf = new SparkConf();
|
||||
|
||||
final String eventsPath = parser.get("workingPath") + "/events";
|
||||
final String eventsPath = parser.get("outputDir") + "/events";
|
||||
log.info("eventsPath: {}", eventsPath);
|
||||
|
||||
final String index = parser.get("index");
|
||||
|
@ -63,6 +63,18 @@ public class IndexNotificationsJob {
|
|||
final String indexHost = parser.get("esHost");
|
||||
log.info("indexHost: {}", indexHost);
|
||||
|
||||
final String esBatchWriteRetryCount = parser.get("esBatchWriteRetryCount");
|
||||
log.info("esBatchWriteRetryCount: {}", esBatchWriteRetryCount);
|
||||
|
||||
final String esBatchWriteRetryWait = parser.get("esBatchWriteRetryWait");
|
||||
log.info("esBatchWriteRetryWait: {}", esBatchWriteRetryWait);
|
||||
|
||||
final String esBatchSizeEntries = parser.get("esBatchSizeEntries");
|
||||
log.info("esBatchSizeEntries: {}", esBatchSizeEntries);
|
||||
|
||||
final String esNodesWanOnly = parser.get("esNodesWanOnly");
|
||||
log.info("esNodesWanOnly: {}", esNodesWanOnly);
|
||||
|
||||
final String brokerApiBaseUrl = parser.get("brokerApiBaseUrl");
|
||||
log.info("brokerApiBaseUrl: {}", brokerApiBaseUrl);
|
||||
|
||||
|
@ -92,10 +104,10 @@ public class IndexNotificationsJob {
|
|||
esCfg.put("es.index.auto.create", "false");
|
||||
esCfg.put("es.nodes", indexHost);
|
||||
esCfg.put("es.mapping.id", "notificationId"); // THE PRIMARY KEY
|
||||
esCfg.put("es.batch.write.retry.count", "8");
|
||||
esCfg.put("es.batch.write.retry.wait", "60s");
|
||||
esCfg.put("es.batch.size.entries", "200");
|
||||
esCfg.put("es.nodes.wan.only", "true");
|
||||
esCfg.put("es.batch.write.retry.count", esBatchWriteRetryCount);
|
||||
esCfg.put("es.batch.write.retry.wait", esBatchWriteRetryWait);
|
||||
esCfg.put("es.batch.size.entries", esBatchSizeEntries);
|
||||
esCfg.put("es.nodes.wan.only", esNodesWanOnly);
|
||||
|
||||
log.info("*** Start indexing");
|
||||
JavaEsSpark.saveJsonToEs(inputRdd, index, esCfg);
|
||||
|
|
|
@ -36,7 +36,7 @@ public class IndexOnESJob {
|
|||
|
||||
final SparkConf conf = new SparkConf();
|
||||
|
||||
final String eventsPath = parser.get("workingPath") + "/events";
|
||||
final String eventsPath = parser.get("outputDir") + "/events";
|
||||
log.info("eventsPath: {}", eventsPath);
|
||||
|
||||
final String index = parser.get("index");
|
||||
|
@ -45,6 +45,18 @@ public class IndexOnESJob {
|
|||
final String indexHost = parser.get("esHost");
|
||||
log.info("indexHost: {}", indexHost);
|
||||
|
||||
final String esBatchWriteRetryCount = parser.get("esBatchWriteRetryCount");
|
||||
log.info("esBatchWriteRetryCount: {}", esBatchWriteRetryCount);
|
||||
|
||||
final String esBatchWriteRetryWait = parser.get("esBatchWriteRetryWait");
|
||||
log.info("esBatchWriteRetryWait: {}", esBatchWriteRetryWait);
|
||||
|
||||
final String esBatchSizeEntries = parser.get("esBatchSizeEntries");
|
||||
log.info("esBatchSizeEntries: {}", esBatchSizeEntries);
|
||||
|
||||
final String esNodesWanOnly = parser.get("esNodesWanOnly");
|
||||
log.info("esNodesWanOnly: {}", esNodesWanOnly);
|
||||
|
||||
final SparkSession spark = SparkSession.builder().config(conf).getOrCreate();
|
||||
|
||||
final JavaRDD<String> inputRdd = ClusterUtils
|
||||
|
@ -53,15 +65,13 @@ public class IndexOnESJob {
|
|||
.javaRDD();
|
||||
|
||||
final Map<String, String> esCfg = new HashMap<>();
|
||||
// esCfg.put("es.nodes", "10.19.65.51, 10.19.65.52, 10.19.65.53, 10.19.65.54");
|
||||
|
||||
esCfg.put("es.index.auto.create", "false");
|
||||
esCfg.put("es.nodes", indexHost);
|
||||
esCfg.put("es.mapping.id", "eventId"); // THE PRIMARY KEY
|
||||
esCfg.put("es.batch.write.retry.count", "8");
|
||||
esCfg.put("es.batch.write.retry.wait", "60s");
|
||||
esCfg.put("es.batch.size.entries", "200");
|
||||
esCfg.put("es.nodes.wan.only", "true");
|
||||
esCfg.put("es.batch.write.retry.count", esBatchWriteRetryCount);
|
||||
esCfg.put("es.batch.write.retry.wait", esBatchWriteRetryWait);
|
||||
esCfg.put("es.batch.size.entries", esBatchSizeEntries);
|
||||
esCfg.put("es.nodes.wan.only", esNodesWanOnly);
|
||||
|
||||
JavaEsSpark.saveJsonToEs(inputRdd, index, esCfg);
|
||||
}
|
||||
|
|
|
@ -42,10 +42,10 @@ public class JoinStep0Job {
|
|||
final String graphPath = parser.get("graphPath");
|
||||
log.info("graphPath: {}", graphPath);
|
||||
|
||||
final String workingPath = parser.get("workingPath");
|
||||
log.info("workingPath: {}", workingPath);
|
||||
final String workingDir = parser.get("workingDir");
|
||||
log.info("workingDir: {}", workingDir);
|
||||
|
||||
final String joinedEntitiesPath = workingPath + "/joinedEntities_step0";
|
||||
final String joinedEntitiesPath = workingDir + "/joinedEntities_step0";
|
||||
log.info("joinedEntitiesPath: {}", joinedEntitiesPath);
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
|
@ -57,10 +57,10 @@ public class JoinStep0Job {
|
|||
final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities");
|
||||
|
||||
final Dataset<OaBrokerMainEntity> sources = ClusterUtils
|
||||
.readPath(spark, workingPath + "/simpleEntities", OaBrokerMainEntity.class);
|
||||
.readPath(spark, workingDir + "/simpleEntities", OaBrokerMainEntity.class);
|
||||
|
||||
final Dataset<RelatedDatasource> typedRels = ClusterUtils
|
||||
.readPath(spark, workingPath + "/relatedDatasources", RelatedDatasource.class);
|
||||
.readPath(spark, workingDir + "/relatedDatasources", RelatedDatasource.class);
|
||||
|
||||
final TypedColumn<Tuple2<OaBrokerMainEntity, RelatedDatasource>, OaBrokerMainEntity> aggr = new RelatedDatasourceAggregator()
|
||||
.toColumn();
|
||||
|
|
|
@ -40,10 +40,10 @@ public class JoinStep1Job {
|
|||
.orElse(Boolean.TRUE);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String workingPath = parser.get("workingPath");
|
||||
log.info("workingPath: {}", workingPath);
|
||||
final String workingDir = parser.get("workingDir");
|
||||
log.info("workingDir: {}", workingDir);
|
||||
|
||||
final String joinedEntitiesPath = workingPath + "/joinedEntities_step1";
|
||||
final String joinedEntitiesPath = workingDir + "/joinedEntities_step1";
|
||||
log.info("joinedEntitiesPath: {}", joinedEntitiesPath);
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
|
@ -55,10 +55,10 @@ public class JoinStep1Job {
|
|||
final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities");
|
||||
|
||||
final Dataset<OaBrokerMainEntity> sources = ClusterUtils
|
||||
.readPath(spark, workingPath + "/joinedEntities_step0", OaBrokerMainEntity.class);
|
||||
.readPath(spark, workingDir + "/joinedEntities_step0", OaBrokerMainEntity.class);
|
||||
|
||||
final Dataset<RelatedProject> typedRels = ClusterUtils
|
||||
.readPath(spark, workingPath + "/relatedProjects", RelatedProject.class);
|
||||
.readPath(spark, workingDir + "/relatedProjects", RelatedProject.class);
|
||||
|
||||
final TypedColumn<Tuple2<OaBrokerMainEntity, RelatedProject>, OaBrokerMainEntity> aggr = new RelatedProjectAggregator()
|
||||
.toColumn();
|
||||
|
|
|
@ -39,10 +39,10 @@ public class JoinStep2Job {
|
|||
.orElse(Boolean.TRUE);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String workingPath = parser.get("workingPath");
|
||||
log.info("workingPath: {}", workingPath);
|
||||
final String workingDir = parser.get("workingDir");
|
||||
log.info("workingDir: {}", workingDir);
|
||||
|
||||
final String joinedEntitiesPath = workingPath + "/joinedEntities_step2";
|
||||
final String joinedEntitiesPath = workingDir + "/joinedEntities_step2";
|
||||
log.info("joinedEntitiesPath: {}", joinedEntitiesPath);
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
|
@ -54,10 +54,10 @@ public class JoinStep2Job {
|
|||
final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities");
|
||||
|
||||
final Dataset<OaBrokerMainEntity> sources = ClusterUtils
|
||||
.readPath(spark, workingPath + "/joinedEntities_step1", OaBrokerMainEntity.class);
|
||||
.readPath(spark, workingDir + "/joinedEntities_step1", OaBrokerMainEntity.class);
|
||||
|
||||
final Dataset<RelatedSoftware> typedRels = ClusterUtils
|
||||
.readPath(spark, workingPath + "/relatedSoftwares", RelatedSoftware.class);
|
||||
.readPath(spark, workingDir + "/relatedSoftwares", RelatedSoftware.class);
|
||||
|
||||
final TypedColumn<Tuple2<OaBrokerMainEntity, RelatedSoftware>, OaBrokerMainEntity> aggr = new RelatedSoftwareAggregator()
|
||||
.toColumn();
|
||||
|
|
|
@ -40,10 +40,10 @@ public class JoinStep3Job {
|
|||
.orElse(Boolean.TRUE);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String workingPath = parser.get("workingPath");
|
||||
log.info("workingPath: {}", workingPath);
|
||||
final String workingDir = parser.get("workingDir");
|
||||
log.info("workingDir: {}", workingDir);
|
||||
|
||||
final String joinedEntitiesPath = workingPath + "/joinedEntities_step3";
|
||||
final String joinedEntitiesPath = workingDir + "/joinedEntities_step3";
|
||||
log.info("joinedEntitiesPath: {}", joinedEntitiesPath);
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
|
@ -55,10 +55,10 @@ public class JoinStep3Job {
|
|||
final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities");
|
||||
|
||||
final Dataset<OaBrokerMainEntity> sources = ClusterUtils
|
||||
.readPath(spark, workingPath + "/joinedEntities_step2", OaBrokerMainEntity.class);
|
||||
.readPath(spark, workingDir + "/joinedEntities_step2", OaBrokerMainEntity.class);
|
||||
|
||||
final Dataset<RelatedDataset> typedRels = ClusterUtils
|
||||
.readPath(spark, workingPath + "/relatedDatasets", RelatedDataset.class);
|
||||
.readPath(spark, workingDir + "/relatedDatasets", RelatedDataset.class);
|
||||
|
||||
final TypedColumn<Tuple2<OaBrokerMainEntity, RelatedDataset>, OaBrokerMainEntity> aggr = new RelatedDatasetAggregator()
|
||||
.toColumn();
|
||||
|
|
|
@ -40,10 +40,10 @@ public class JoinStep4Job {
|
|||
.orElse(Boolean.TRUE);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String workingPath = parser.get("workingPath");
|
||||
log.info("workingPath: {}", workingPath);
|
||||
final String workingDir = parser.get("workingDir");
|
||||
log.info("workingDir: {}", workingDir);
|
||||
|
||||
final String joinedEntitiesPath = workingPath + "/joinedEntities_step4";
|
||||
final String joinedEntitiesPath = workingDir + "/joinedEntities_step4";
|
||||
log.info("joinedEntitiesPath: {}", joinedEntitiesPath);
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
|
@ -55,10 +55,10 @@ public class JoinStep4Job {
|
|||
final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities");
|
||||
|
||||
final Dataset<OaBrokerMainEntity> sources = ClusterUtils
|
||||
.readPath(spark, workingPath + "/joinedEntities_step3", OaBrokerMainEntity.class);
|
||||
.readPath(spark, workingDir + "/joinedEntities_step3", OaBrokerMainEntity.class);
|
||||
|
||||
final Dataset<RelatedPublication> typedRels = ClusterUtils
|
||||
.readPath(spark, workingPath + "/relatedPublications", RelatedPublication.class);
|
||||
.readPath(spark, workingDir + "/relatedPublications", RelatedPublication.class);
|
||||
|
||||
final TypedColumn<Tuple2<OaBrokerMainEntity, RelatedPublication>, OaBrokerMainEntity> aggr = new RelatedPublicationAggregator()
|
||||
.toColumn();
|
||||
|
|
|
@ -4,8 +4,13 @@ package eu.dnetlib.dhp.broker.oa;
|
|||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.codec.digest.DigestUtils;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
|
@ -13,6 +18,8 @@ import org.apache.hadoop.fs.FileStatus;
|
|||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.FilterFunction;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.slf4j.Logger;
|
||||
|
@ -29,7 +36,7 @@ import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;
|
|||
public class PartitionEventsByDsIdJob {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(PartitionEventsByDsIdJob.class);
|
||||
private static final String OPENDOAR_NSPREFIX = "opendoar____::";
|
||||
private static final String OPENDOAR_NSPREFIX = "10|opendoar____::";
|
||||
|
||||
public static void main(final String[] args) throws Exception {
|
||||
|
||||
|
@ -37,7 +44,7 @@ public class PartitionEventsByDsIdJob {
|
|||
IOUtils
|
||||
.toString(
|
||||
PartitionEventsByDsIdJob.class
|
||||
.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json")));
|
||||
.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/od_partitions_params.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
final Boolean isSparkSessionManaged = Optional
|
||||
|
@ -48,24 +55,43 @@ public class PartitionEventsByDsIdJob {
|
|||
|
||||
final SparkConf conf = new SparkConf();
|
||||
|
||||
final String eventsPath = parser.get("workingPath") + "/events";
|
||||
final String eventsPath = parser.get("outputDir") + "/events";
|
||||
log.info("eventsPath: {}", eventsPath);
|
||||
|
||||
final String partitionPath = parser.get("workingPath") + "/eventsByOpendoarId";
|
||||
final String partitionPath = parser.get("outputDir") + "/eventsByOpendoarId";
|
||||
log.info("partitionPath: {}", partitionPath);
|
||||
|
||||
final String opendoarIds = parser.get("opendoarIds");
|
||||
log.info("opendoarIds: {}", opendoarIds);
|
||||
|
||||
final Set<String> validOpendoarIds = new HashSet<>();
|
||||
if (!opendoarIds.trim().equals("-")) {
|
||||
validOpendoarIds
|
||||
.addAll(
|
||||
Arrays
|
||||
.stream(opendoarIds.split(","))
|
||||
.map(String::trim)
|
||||
.filter(StringUtils::isNotBlank)
|
||||
.map(s -> OPENDOAR_NSPREFIX + DigestUtils.md5Hex(s))
|
||||
.collect(Collectors.toSet()));
|
||||
}
|
||||
log.info("validOpendoarIds: {}", validOpendoarIds);
|
||||
|
||||
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
|
||||
|
||||
ClusterUtils
|
||||
.readPath(spark, eventsPath, Event.class)
|
||||
.filter(e -> StringUtils.isNotBlank(e.getMap().getTargetDatasourceId()))
|
||||
.filter(e -> e.getMap().getTargetDatasourceId().contains(OPENDOAR_NSPREFIX))
|
||||
.limit(10000)
|
||||
.map(e -> messageFromNotification(e), Encoders.bean(ShortEventMessageWithGroupId.class))
|
||||
.filter((FilterFunction<Event>) e -> StringUtils.isNotBlank(e.getMap().getTargetDatasourceId()))
|
||||
.filter((FilterFunction<Event>) e -> e.getMap().getTargetDatasourceId().startsWith(OPENDOAR_NSPREFIX))
|
||||
.filter((FilterFunction<Event>) e -> validOpendoarIds.contains(e.getMap().getTargetDatasourceId()))
|
||||
.map(
|
||||
(MapFunction<Event, ShortEventMessageWithGroupId>) e -> messageFromNotification(e),
|
||||
Encoders.bean(ShortEventMessageWithGroupId.class))
|
||||
.coalesce(1)
|
||||
.write()
|
||||
.partitionBy("group")
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(partitionPath);
|
||||
|
||||
});
|
||||
|
|
|
@ -45,10 +45,10 @@ public class PrepareGroupsJob {
|
|||
final String graphPath = parser.get("graphPath");
|
||||
log.info("graphPath: {}", graphPath);
|
||||
|
||||
final String workingPath = parser.get("workingPath");
|
||||
log.info("workingPath: {}", workingPath);
|
||||
final String workingDir = parser.get("workingDir");
|
||||
log.info("workingDir: {}", workingDir);
|
||||
|
||||
final String groupsPath = workingPath + "/duplicates";
|
||||
final String groupsPath = workingDir + "/duplicates";
|
||||
log.info("groupsPath: {}", groupsPath);
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
|
@ -60,10 +60,10 @@ public class PrepareGroupsJob {
|
|||
final LongAccumulator total = spark.sparkContext().longAccumulator("total_groups");
|
||||
|
||||
final Dataset<OaBrokerMainEntity> results = ClusterUtils
|
||||
.readPath(spark, workingPath + "/joinedEntities_step4", OaBrokerMainEntity.class);
|
||||
.readPath(spark, workingDir + "/joinedEntities_step4", OaBrokerMainEntity.class);
|
||||
|
||||
final Dataset<Relation> mergedRels = ClusterUtils
|
||||
.readPath(spark, graphPath + "/relation", Relation.class)
|
||||
.loadRelations(graphPath, spark)
|
||||
.filter(r -> r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS));
|
||||
|
||||
final TypedColumn<Tuple2<OaBrokerMainEntity, Relation>, ResultGroup> aggr = new ResultAggregator()
|
||||
|
|
|
@ -42,10 +42,10 @@ public class PrepareRelatedDatasetsJob {
|
|||
final String graphPath = parser.get("graphPath");
|
||||
log.info("graphPath: {}", graphPath);
|
||||
|
||||
final String workingPath = parser.get("workingPath");
|
||||
log.info("workingPath: {}", workingPath);
|
||||
final String workingDir = parser.get("workingDir");
|
||||
log.info("workingDir: {}", workingDir);
|
||||
|
||||
final String relsPath = workingPath + "/relatedDatasets";
|
||||
final String relsPath = workingDir + "/relatedDatasets";
|
||||
log.info("relsPath: {}", relsPath);
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
|
@ -62,7 +62,7 @@ public class PrepareRelatedDatasetsJob {
|
|||
.map(ConversionUtils::oafDatasetToBrokerDataset, Encoders.bean(OaBrokerRelatedDataset.class));
|
||||
|
||||
final Dataset<Relation> rels = ClusterUtils
|
||||
.readPath(spark, graphPath + "/relation", Relation.class)
|
||||
.loadRelations(graphPath, spark)
|
||||
.filter(r -> r.getDataInfo().getDeletedbyinference())
|
||||
.filter(r -> r.getRelType().equals(ModelConstants.RESULT_RESULT))
|
||||
.filter(r -> ClusterUtils.isValidResultResultClass(r.getRelClass()))
|
||||
|
@ -72,7 +72,8 @@ public class PrepareRelatedDatasetsJob {
|
|||
final Dataset<RelatedDataset> dataset = rels
|
||||
.joinWith(datasets, datasets.col("openaireId").equalTo(rels.col("target")), "inner")
|
||||
.map(t -> {
|
||||
final RelatedDataset rel = new RelatedDataset(t._1.getSource(), t._2);
|
||||
final RelatedDataset rel = new RelatedDataset(t._1.getSource(),
|
||||
t._2);
|
||||
rel.getRelDataset().setRelType(t._1.getRelClass());
|
||||
return rel;
|
||||
}, Encoders.bean(RelatedDataset.class));
|
||||
|
|
|
@ -48,10 +48,10 @@ public class PrepareRelatedDatasourcesJob {
|
|||
final String graphPath = parser.get("graphPath");
|
||||
log.info("graphPath: {}", graphPath);
|
||||
|
||||
final String workingPath = parser.get("workingPath");
|
||||
log.info("workingPath: {}", workingPath);
|
||||
final String workingDir = parser.get("workingDir");
|
||||
log.info("workingDir: {}", workingDir);
|
||||
|
||||
final String relsPath = workingPath + "/relatedDatasources";
|
||||
final String relsPath = workingDir + "/relatedDatasources";
|
||||
log.info("relsPath: {}", relsPath);
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
|
|
|
@ -44,10 +44,10 @@ public class PrepareRelatedProjectsJob {
|
|||
final String graphPath = parser.get("graphPath");
|
||||
log.info("graphPath: {}", graphPath);
|
||||
|
||||
final String workingPath = parser.get("workingPath");
|
||||
log.info("workingPath: {}", workingPath);
|
||||
final String workingDir = parser.get("workingDir");
|
||||
log.info("workingDir: {}", workingDir);
|
||||
|
||||
final String relsPath = workingPath + "/relatedProjects";
|
||||
final String relsPath = workingDir + "/relatedProjects";
|
||||
log.info("relsPath: {}", relsPath);
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
|
@ -64,7 +64,7 @@ public class PrepareRelatedProjectsJob {
|
|||
.map(ConversionUtils::oafProjectToBrokerProject, Encoders.bean(OaBrokerProject.class));
|
||||
|
||||
final Dataset<Relation> rels = ClusterUtils
|
||||
.readPath(spark, graphPath + "/relation", Relation.class)
|
||||
.loadRelations(graphPath, spark)
|
||||
.filter(r -> r.getDataInfo().getDeletedbyinference())
|
||||
.filter(r -> r.getRelType().equals(ModelConstants.RESULT_PROJECT))
|
||||
.filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS))
|
||||
|
|
|
@ -43,10 +43,10 @@ public class PrepareRelatedPublicationsJob {
|
|||
final String graphPath = parser.get("graphPath");
|
||||
log.info("graphPath: {}", graphPath);
|
||||
|
||||
final String workingPath = parser.get("workingPath");
|
||||
log.info("workingPath: {}", workingPath);
|
||||
final String workingDir = parser.get("workingDir");
|
||||
log.info("workingDir: {}", workingDir);
|
||||
|
||||
final String relsPath = workingPath + "/relatedPublications";
|
||||
final String relsPath = workingDir + "/relatedPublications";
|
||||
log.info("relsPath: {}", relsPath);
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
|
@ -65,7 +65,7 @@ public class PrepareRelatedPublicationsJob {
|
|||
Encoders.bean(OaBrokerRelatedPublication.class));
|
||||
|
||||
final Dataset<Relation> rels = ClusterUtils
|
||||
.readPath(spark, graphPath + "/relation", Relation.class)
|
||||
.loadRelations(graphPath, spark)
|
||||
.filter(r -> r.getDataInfo().getDeletedbyinference())
|
||||
.filter(r -> r.getRelType().equals(ModelConstants.RESULT_RESULT))
|
||||
.filter(r -> ClusterUtils.isValidResultResultClass(r.getRelClass()))
|
||||
|
@ -75,7 +75,8 @@ public class PrepareRelatedPublicationsJob {
|
|||
final Dataset<RelatedPublication> dataset = rels
|
||||
.joinWith(pubs, pubs.col("openaireId").equalTo(rels.col("target")), "inner")
|
||||
.map(t -> {
|
||||
final RelatedPublication rel = new RelatedPublication(t._1.getSource(), t._2);
|
||||
final RelatedPublication rel = new RelatedPublication(
|
||||
t._1.getSource(), t._2);
|
||||
rel.getRelPublication().setRelType(t._1.getRelClass());
|
||||
return rel;
|
||||
}, Encoders.bean(RelatedPublication.class));
|
||||
|
|
|
@ -44,10 +44,10 @@ public class PrepareRelatedSoftwaresJob {
|
|||
final String graphPath = parser.get("graphPath");
|
||||
log.info("graphPath: {}", graphPath);
|
||||
|
||||
final String workingPath = parser.get("workingPath");
|
||||
log.info("workingPath: {}", workingPath);
|
||||
final String workingDir = parser.get("workingDir");
|
||||
log.info("workingDir: {}", workingDir);
|
||||
|
||||
final String relsPath = workingPath + "/relatedSoftwares";
|
||||
final String relsPath = workingDir + "/relatedSoftwares";
|
||||
log.info("relsPath: {}", relsPath);
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
|
@ -64,7 +64,7 @@ public class PrepareRelatedSoftwaresJob {
|
|||
.map(ConversionUtils::oafSoftwareToBrokerSoftware, Encoders.bean(OaBrokerRelatedSoftware.class));
|
||||
|
||||
final Dataset<Relation> rels = ClusterUtils
|
||||
.readPath(spark, graphPath + "/relation", Relation.class)
|
||||
.loadRelations(graphPath, spark)
|
||||
.filter(r -> r.getDataInfo().getDeletedbyinference())
|
||||
.filter(r -> r.getRelType().equals(ModelConstants.RESULT_RESULT))
|
||||
.filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS))
|
||||
|
|
|
@ -44,10 +44,10 @@ public class PrepareSimpleEntititiesJob {
|
|||
final String graphPath = parser.get("graphPath");
|
||||
log.info("graphPath: {}", graphPath);
|
||||
|
||||
final String workingPath = parser.get("workingPath");
|
||||
log.info("workingPath: {}", workingPath);
|
||||
final String workingDir = parser.get("workingDir");
|
||||
log.info("workingDir: {}", workingDir);
|
||||
|
||||
final String simpleEntitiesPath = workingPath + "/simpleEntities";
|
||||
final String simpleEntitiesPath = workingDir + "/simpleEntities";
|
||||
log.info("simpleEntitiesPath: {}", simpleEntitiesPath);
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
|
|
|
@ -16,7 +16,24 @@ public class EnrichMissingSubject extends UpdateMatcher<OaBrokerTypedValue> {
|
|||
|
||||
public EnrichMissingSubject() {
|
||||
super(20,
|
||||
s -> Topic.fromPath("ENRICH/MISSING/SUBJECT/" + s.getType()),
|
||||
s -> {
|
||||
switch (s.getType().toLowerCase()) {
|
||||
case "acm":
|
||||
return Topic.ENRICH_MISSING_SUBJECT_ACM;
|
||||
case "arxiv":
|
||||
return Topic.ENRICH_MISSING_SUBJECT_ARXIV;
|
||||
case "ddc":
|
||||
return Topic.ENRICH_MISSING_SUBJECT_DDC;
|
||||
case "jel":
|
||||
return Topic.ENRICH_MISSING_SUBJECT_JEL;
|
||||
case "mesh":
|
||||
return Topic.ENRICH_MISSING_SUBJECT_MESHEUROPMC;
|
||||
case "rvk":
|
||||
return Topic.ENRICH_MISSING_SUBJECT_RVK;
|
||||
default:
|
||||
return null;
|
||||
}
|
||||
},
|
||||
(p, s) -> p.getSubjects().add(s),
|
||||
s -> subjectAsString(s));
|
||||
}
|
||||
|
|
|
@ -16,7 +16,24 @@ public class EnrichMoreSubject extends UpdateMatcher<OaBrokerTypedValue> {
|
|||
|
||||
public EnrichMoreSubject() {
|
||||
super(20,
|
||||
s -> Topic.fromPath("ENRICH/MORE/SUBJECT/" + s.getType()),
|
||||
s -> {
|
||||
switch (s.getType().toLowerCase()) {
|
||||
case "acm":
|
||||
return Topic.ENRICH_MORE_SUBJECT_ACM;
|
||||
case "arxiv":
|
||||
return Topic.ENRICH_MORE_SUBJECT_ARXIV;
|
||||
case "ddc":
|
||||
return Topic.ENRICH_MORE_SUBJECT_DDC;
|
||||
case "jel":
|
||||
return Topic.ENRICH_MORE_SUBJECT_JEL;
|
||||
case "mesh":
|
||||
return Topic.ENRICH_MORE_SUBJECT_MESHEUROPMC;
|
||||
case "rvk":
|
||||
return Topic.ENRICH_MORE_SUBJECT_RVK;
|
||||
default:
|
||||
return null;
|
||||
}
|
||||
},
|
||||
(p, s) -> p.getSubjects().add(s),
|
||||
s -> subjectAsString(s));
|
||||
}
|
||||
|
|
|
@ -17,6 +17,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
|||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
|
||||
public class ClusterUtils {
|
||||
|
||||
|
@ -30,6 +31,16 @@ public class ClusterUtils {
|
|||
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
|
||||
}
|
||||
|
||||
public static Dataset<Relation> loadRelations(final String graphPath, final SparkSession spark) {
|
||||
return ClusterUtils
|
||||
.readPath(spark, graphPath + "/relation", Relation.class)
|
||||
.map(r -> {
|
||||
r.setSource(ConversionUtils.cleanOpenaireId(r.getSource()));
|
||||
r.setTarget(ConversionUtils.cleanOpenaireId(r.getTarget()));
|
||||
return r;
|
||||
}, Encoders.bean(Relation.class));
|
||||
}
|
||||
|
||||
public static <R> Dataset<R> readPath(
|
||||
final SparkSession spark,
|
||||
final String inputPath,
|
||||
|
@ -67,6 +78,7 @@ public class ClusterUtils {
|
|||
.map(o -> ClusterUtils.incrementAccumulator(o, acc), Encoders.bean(clazz))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(path);
|
||||
}
|
||||
|
||||
|
|
|
@ -74,7 +74,7 @@ public class ConversionUtils {
|
|||
}
|
||||
|
||||
final OaBrokerRelatedDataset res = new OaBrokerRelatedDataset();
|
||||
res.setOpenaireId(d.getId());
|
||||
res.setOpenaireId(cleanOpenaireId(d.getId()));
|
||||
res.setOriginalId(first(d.getOriginalId()));
|
||||
res.setTitle(structPropValue(d.getTitle()));
|
||||
res.setPids(mappedList(d.getPid(), ConversionUtils::oafPidToBrokerPid));
|
||||
|
@ -89,7 +89,7 @@ public class ConversionUtils {
|
|||
}
|
||||
|
||||
final OaBrokerRelatedPublication res = new OaBrokerRelatedPublication();
|
||||
res.setOpenaireId(p.getId());
|
||||
res.setOpenaireId(cleanOpenaireId(p.getId()));
|
||||
res.setOriginalId(first(p.getOriginalId()));
|
||||
res.setTitle(structPropValue(p.getTitle()));
|
||||
res.setPids(mappedList(p.getPid(), ConversionUtils::oafPidToBrokerPid));
|
||||
|
@ -106,7 +106,7 @@ public class ConversionUtils {
|
|||
|
||||
final OaBrokerMainEntity res = new OaBrokerMainEntity();
|
||||
|
||||
res.setOpenaireId(result.getId());
|
||||
res.setOpenaireId(cleanOpenaireId(result.getId()));
|
||||
res.setOriginalId(first(result.getOriginalId()));
|
||||
res.setTypology(classId(result.getResulttype()));
|
||||
res.setTitles(structPropList(result.getTitle()));
|
||||
|
@ -129,6 +129,10 @@ public class ConversionUtils {
|
|||
return res;
|
||||
}
|
||||
|
||||
public static String cleanOpenaireId(final String id) {
|
||||
return id.contains("|") ? StringUtils.substringAfter(id, "|") : id;
|
||||
}
|
||||
|
||||
private static OaBrokerAuthor oafAuthorToBrokerAuthor(final Author author) {
|
||||
if (author == null) {
|
||||
return null;
|
||||
|
@ -188,7 +192,7 @@ public class ConversionUtils {
|
|||
}
|
||||
|
||||
final OaBrokerProject res = new OaBrokerProject();
|
||||
res.setOpenaireId(p.getId());
|
||||
res.setOpenaireId(cleanOpenaireId(p.getId()));
|
||||
res.setTitle(fieldValue(p.getTitle()));
|
||||
res.setAcronym(fieldValue(p.getAcronym()));
|
||||
res.setCode(fieldValue(p.getCode()));
|
||||
|
@ -214,7 +218,7 @@ public class ConversionUtils {
|
|||
}
|
||||
|
||||
final OaBrokerRelatedSoftware res = new OaBrokerRelatedSoftware();
|
||||
res.setOpenaireId(sw.getId());
|
||||
res.setOpenaireId(cleanOpenaireId(sw.getId()));
|
||||
res.setName(structPropValue(sw.getTitle()));
|
||||
res.setDescription(fieldValue(sw.getDescription()));
|
||||
res.setRepository(fieldValue(sw.getCodeRepositoryUrl()));
|
||||
|
@ -230,7 +234,7 @@ public class ConversionUtils {
|
|||
|
||||
final OaBrokerRelatedDatasource res = new OaBrokerRelatedDatasource();
|
||||
res.setName(StringUtils.defaultIfBlank(fieldValue(ds.getOfficialname()), fieldValue(ds.getEnglishname())));
|
||||
res.setOpenaireId(ds.getId());
|
||||
res.setOpenaireId(cleanOpenaireId(ds.getId()));
|
||||
res.setType(classId(ds.getDatasourcetype()));
|
||||
return res;
|
||||
}
|
||||
|
|
|
@ -59,9 +59,18 @@ public class DatasourceRelationsAccumulator implements Serializable {
|
|||
final DatasourceRelationsAccumulator res = new DatasourceRelationsAccumulator();
|
||||
collectedFromSet
|
||||
.stream()
|
||||
.map(s -> new Tuple3<>(r.getId(), s, BrokerConstants.COLLECTED_FROM_REL))
|
||||
.map(
|
||||
s -> new Tuple3<>(ConversionUtils.cleanOpenaireId(r.getId()), ConversionUtils.cleanOpenaireId(s),
|
||||
BrokerConstants.COLLECTED_FROM_REL))
|
||||
.forEach(res::addTuple);
|
||||
hostedBySet.stream().map(s -> new Tuple3<>(r.getId(), s, BrokerConstants.HOSTED_BY_REL)).forEach(res::addTuple);
|
||||
|
||||
hostedBySet
|
||||
.stream()
|
||||
.map(
|
||||
s -> new Tuple3<>(ConversionUtils.cleanOpenaireId(r.getId()), ConversionUtils.cleanOpenaireId(s),
|
||||
BrokerConstants.HOSTED_BY_REL))
|
||||
.forEach(res::addTuple);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
|
|
|
@ -76,6 +76,7 @@ public class EventFinder {
|
|||
final Set<String> dsIdWhitelist,
|
||||
final Set<String> dsIdBlacklist,
|
||||
final Set<String> dsTypeWhitelist,
|
||||
final Set<String> topicWhitelist,
|
||||
final Map<String, LongAccumulator> accumulators) {
|
||||
|
||||
final List<UpdateInfo<?>> list = new ArrayList<>();
|
||||
|
@ -84,7 +85,13 @@ public class EventFinder {
|
|||
for (final OaBrokerRelatedDatasource targetDs : target.getDatasources()) {
|
||||
if (verifyTarget(targetDs, dsIdWhitelist, dsIdBlacklist, dsTypeWhitelist)) {
|
||||
for (final UpdateMatcher<?> matcher : matchers) {
|
||||
list.addAll(matcher.searchUpdatesForRecord(target, targetDs, results.getData(), accumulators));
|
||||
for (final UpdateInfo<?> info : matcher
|
||||
.searchUpdatesForRecord(target, targetDs, results.getData(), accumulators)) {
|
||||
if (topicWhitelist == null || topicWhitelist.isEmpty()
|
||||
|| topicWhitelist.contains(info.getTopic().getPath())) {
|
||||
list.add(info);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,9 @@
|
|||
[
|
||||
|
||||
{
|
||||
"paramName": "o",
|
||||
"paramLongName": "outputDir",
|
||||
"paramDescription": "the path where the data are stored",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
|
@ -7,7 +7,7 @@
|
|||
},
|
||||
{
|
||||
"paramName": "o",
|
||||
"paramLongName": "workingPath",
|
||||
"paramLongName": "workingDir",
|
||||
"paramDescription": "the path where the temporary data will be stored",
|
||||
"paramRequired": true
|
||||
}
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
<description>the path where the graph is stored</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>workingPath</name>
|
||||
<name>outputDir</name>
|
||||
<description>the path where the the generated data will be stored</description>
|
||||
</property>
|
||||
<property>
|
||||
|
@ -24,6 +24,11 @@
|
|||
<value>-</value>
|
||||
<description>a black list (comma separeted, - for empty list) of datasource ids</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>topicWhitelist</name>
|
||||
<value>*</value>
|
||||
<description>a white list (comma separeted, * for all) of topics</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>esEventIndexName</name>
|
||||
<description>the elasticsearch index name for events</description>
|
||||
|
@ -36,6 +41,26 @@
|
|||
<name>esIndexHost</name>
|
||||
<description>the elasticsearch host</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>esBatchWriteRetryCount</name>
|
||||
<value>8</value>
|
||||
<description>an ES configuration property</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>esBatchWriteRetryWait</name>
|
||||
<value>60s</value>
|
||||
<description>an ES configuration property</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>esBatchSizeEntries</name>
|
||||
<value>200</value>
|
||||
<description>an ES configuration property</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>esNodesWanOnly</name>
|
||||
<value>true</value>
|
||||
<description>an ES configuration property</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>maxIndexedEventsForDsAndTopic</name>
|
||||
<description>the max number of events for each couple (ds/topic)</description>
|
||||
|
@ -111,15 +136,15 @@
|
|||
</configuration>
|
||||
</global>
|
||||
|
||||
<start to="ensure_working_path"/>
|
||||
<start to="ensure_output_dir"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="ensure_working_path">
|
||||
<action name="ensure_output_dir">
|
||||
<fs>
|
||||
<mkdir path='${workingPath}'/>
|
||||
<mkdir path='${outputDir}'/>
|
||||
</fs>
|
||||
<ok to="start_entities_and_rels"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -152,7 +177,7 @@
|
|||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||
</spark>
|
||||
<ok to="wait_entities_and_rels"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -176,7 +201,7 @@
|
|||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||
</spark>
|
||||
<ok to="wait_entities_and_rels"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -201,7 +226,7 @@
|
|||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||
</spark>
|
||||
<ok to="wait_entities_and_rels"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -225,7 +250,7 @@
|
|||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||
</spark>
|
||||
<ok to="wait_entities_and_rels"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -249,7 +274,7 @@
|
|||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||
</spark>
|
||||
<ok to="wait_entities_and_rels"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -273,7 +298,7 @@
|
|||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||
</spark>
|
||||
<ok to="wait_entities_and_rels"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -299,7 +324,7 @@
|
|||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||
</spark>
|
||||
<ok to="join_entities_step1"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -323,7 +348,7 @@
|
|||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||
</spark>
|
||||
<ok to="join_entities_step2"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -347,7 +372,7 @@
|
|||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||
</spark>
|
||||
<ok to="join_entities_step3"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -371,7 +396,7 @@
|
|||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||
</spark>
|
||||
<ok to="join_entities_step4"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -395,7 +420,7 @@
|
|||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||
</spark>
|
||||
<ok to="prepare_groups"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -419,7 +444,7 @@
|
|||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||
</spark>
|
||||
<ok to="generate_events"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -442,10 +467,12 @@
|
|||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||
<arg>--outputDir</arg><arg>${outputDir}</arg>
|
||||
<arg>--datasourceIdWhitelist</arg><arg>${datasourceIdWhitelist}</arg>
|
||||
<arg>--datasourceTypeWhitelist</arg><arg>${datasourceTypeWhitelist}</arg>
|
||||
<arg>--datasourceIdBlacklist</arg><arg>${datasourceIdBlacklist}</arg>
|
||||
<arg>--topicWhitelist</arg><arg>${topicWhitelist}</arg>
|
||||
</spark>
|
||||
<ok to="index_event_subset"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -468,9 +495,13 @@
|
|||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||
<arg>--outputDir</arg><arg>${outputDir}</arg>
|
||||
<arg>--index</arg><arg>${esEventIndexName}</arg>
|
||||
<arg>--esHost</arg><arg>${esIndexHost}</arg>
|
||||
<arg>--esBatchWriteRetryCount</arg><arg>${esBatchWriteRetryCount}</arg>
|
||||
<arg>--esBatchWriteRetryWait</arg><arg>${esBatchWriteRetryWait}</arg>
|
||||
<arg>--esBatchSizeEntries</arg><arg>${esBatchSizeEntries}</arg>
|
||||
<arg>--esNodesWanOnly</arg><arg>${esNodesWanOnly}</arg>
|
||||
<arg>--maxEventsForTopic</arg><arg>${maxIndexedEventsForDsAndTopic}</arg>
|
||||
<arg>--brokerApiBaseUrl</arg><arg>${brokerApiBaseUrl}</arg>
|
||||
</spark>
|
||||
|
@ -495,9 +526,13 @@
|
|||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||
<arg>--outputDir</arg><arg>${outputDir}</arg>
|
||||
<arg>--index</arg><arg>${esNotificationsIndexName}</arg>
|
||||
<arg>--esHost</arg><arg>${esIndexHost}</arg>
|
||||
<arg>--esBatchWriteRetryCount</arg><arg>${esBatchWriteRetryCount}</arg>
|
||||
<arg>--esBatchWriteRetryWait</arg><arg>${esBatchWriteRetryWait}</arg>
|
||||
<arg>--esBatchSizeEntries</arg><arg>${esBatchSizeEntries}</arg>
|
||||
<arg>--esNodesWanOnly</arg><arg>${esNodesWanOnly}</arg>
|
||||
<arg>--brokerApiBaseUrl</arg><arg>${brokerApiBaseUrl}</arg>
|
||||
</spark>
|
||||
<ok to="stats"/>
|
||||
|
@ -521,7 +556,7 @@
|
|||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||
<arg>--outputDir</arg><arg>${outputDir}</arg>
|
||||
<arg>--dbUrl</arg><arg>${brokerDbUrl}</arg>
|
||||
<arg>--dbUser</arg><arg>${brokerDbUser}</arg>
|
||||
<arg>--dbPassword</arg><arg>${brokerDbPassword}</arg>
|
||||
|
|
|
@ -1,7 +1,13 @@
|
|||
[
|
||||
{
|
||||
"paramName": "wp",
|
||||
"paramLongName": "workingDir",
|
||||
"paramDescription": "the path where the temporary data are stored",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "o",
|
||||
"paramLongName": "workingPath",
|
||||
"paramLongName": "outputDir",
|
||||
"paramDescription": "the path where the generated events will be stored",
|
||||
"paramRequired": true
|
||||
},
|
||||
|
@ -22,5 +28,11 @@
|
|||
"paramLongName": "datasourceIdBlacklist",
|
||||
"paramDescription": "a black list (comma separeted, - for empty list) of datasource ids",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "topicWhitelist",
|
||||
"paramLongName": "topicWhitelist",
|
||||
"paramDescription": "a white list (comma separeted, * for all) of topics",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
[
|
||||
{
|
||||
"paramName": "o",
|
||||
"paramLongName": "workingPath",
|
||||
"paramDescription": "the workinh path",
|
||||
"paramLongName": "outputDir",
|
||||
"paramDescription": "the data path",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
|
@ -16,5 +16,29 @@
|
|||
"paramLongName": "esHost",
|
||||
"paramDescription": "the ES host",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "esBatchWriteRetryCount",
|
||||
"paramLongName": "esBatchWriteRetryCount",
|
||||
"paramDescription": "an ES configuration property",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "esBatchWriteRetryWait",
|
||||
"paramLongName": "esBatchWriteRetryWait",
|
||||
"paramDescription": "an ES configuration property",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "esBatchSizeEntries",
|
||||
"paramLongName": "esBatchSizeEntries",
|
||||
"paramDescription": "an ES configuration property",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "esNodesWanOnly",
|
||||
"paramLongName": "esNodesWanOnly",
|
||||
"paramDescription": "an ES configuration property",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
[
|
||||
{
|
||||
"paramName": "o",
|
||||
"paramLongName": "workingPath",
|
||||
"paramDescription": "the workinh path",
|
||||
"paramLongName": "outputDir",
|
||||
"paramDescription": "the path where the generated data are stored",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
|
@ -16,7 +16,31 @@
|
|||
"paramLongName": "esHost",
|
||||
"paramDescription": "the ES host",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "esBatchWriteRetryCount",
|
||||
"paramLongName": "esBatchWriteRetryCount",
|
||||
"paramDescription": "an ES configuration property",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "esBatchWriteRetryWait",
|
||||
"paramLongName": "esBatchWriteRetryWait",
|
||||
"paramDescription": "an ES configuration property",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "esBatchSizeEntries",
|
||||
"paramLongName": "esBatchSizeEntries",
|
||||
"paramDescription": "an ES configuration property",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "esNodesWanOnly",
|
||||
"paramLongName": "esNodesWanOnly",
|
||||
"paramDescription": "an ES configuration property",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "n",
|
||||
"paramLongName": "maxEventsForTopic",
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
[
|
||||
{
|
||||
"paramName": "o",
|
||||
"paramLongName": "workingPath",
|
||||
"paramDescription": "the workinh path",
|
||||
"paramLongName": "outputDir",
|
||||
"paramDescription": "the dir that contains the events folder",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
|
@ -17,6 +17,30 @@
|
|||
"paramDescription": "the ES host",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "esBatchWriteRetryCount",
|
||||
"paramLongName": "esBatchWriteRetryCount",
|
||||
"paramDescription": "an ES configuration property",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "esBatchWriteRetryWait",
|
||||
"paramLongName": "esBatchWriteRetryWait",
|
||||
"paramDescription": "an ES configuration property",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "esBatchSizeEntries",
|
||||
"paramLongName": "esBatchSizeEntries",
|
||||
"paramDescription": "an ES configuration property",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "esNodesWanOnly",
|
||||
"paramLongName": "esNodesWanOnly",
|
||||
"paramDescription": "an ES configuration property",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "broker",
|
||||
"paramLongName": "brokerApiBaseUrl",
|
||||
|
|
|
@ -6,8 +6,8 @@
|
|||
<description>the path where the graph is stored</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>workingPath</name>
|
||||
<description>the path where the the generated data will be stored</description>
|
||||
<name>outputDir</name>
|
||||
<description>the path where the the generated data are stored</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>datasourceIdWhitelist</name>
|
||||
|
@ -36,6 +36,26 @@
|
|||
<name>esIndexHost</name>
|
||||
<description>the elasticsearch host</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>esBatchWriteRetryCount</name>
|
||||
<value>8</value>
|
||||
<description>an ES configuration property</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>esBatchWriteRetryWait</name>
|
||||
<value>60s</value>
|
||||
<description>an ES configuration property</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>esBatchSizeEntries</name>
|
||||
<value>200</value>
|
||||
<description>an ES configuration property</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>esNodesWanOnly</name>
|
||||
<value>true</value>
|
||||
<description>an ES configuration property</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>maxIndexedEventsForDsAndTopic</name>
|
||||
<description>the max number of events for each couple (ds/topic)</description>
|
||||
|
@ -122,9 +142,13 @@
|
|||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||
<arg>--outputDir</arg><arg>${outputDir}</arg>
|
||||
<arg>--index</arg><arg>${esNotificationsIndexName}</arg>
|
||||
<arg>--esHost</arg><arg>${esIndexHost}</arg>
|
||||
<arg>--esBatchWriteRetryCount</arg><arg>${esBatchWriteRetryCount}</arg>
|
||||
<arg>--esBatchWriteRetryWait</arg><arg>${esBatchWriteRetryWait}</arg>
|
||||
<arg>--esBatchSizeEntries</arg><arg>${esBatchSizeEntries}</arg>
|
||||
<arg>--esNodesWanOnly</arg><arg>${esNodesWanOnly}</arg>
|
||||
<arg>--brokerApiBaseUrl</arg><arg>${brokerApiBaseUrl}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
|
|
|
@ -0,0 +1,14 @@
|
|||
[
|
||||
{
|
||||
"paramName": "o",
|
||||
"paramLongName": "outputDir",
|
||||
"paramDescription": "the path where the data will be stored",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "list",
|
||||
"paramLongName": "opendoarIds",
|
||||
"paramDescription": "the opendoar IDs whitelist (comma separated)",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
|
@ -0,0 +1,99 @@
|
|||
<workflow-app name="partitionEventsByOpendoarIds" xmlns="uri:oozie:workflow:0.5">
|
||||
|
||||
<parameters>
|
||||
<property>
|
||||
<name>opendoarIds</name>
|
||||
<description>the opendoar IDs whitelist (comma separated)</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>outputDir</name>
|
||||
<description>the path where the the generated data will be stored</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores used by single executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozieActionShareLibForSpark2</name>
|
||||
<description>oozie action sharelib for spark 2.*</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2ExtraListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
|
||||
<description>spark 2.* extra listeners classname</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2SqlQueryExecutionListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
|
||||
<description>spark 2.* sql query execution listeners classname</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2YarnHistoryServerAddress</name>
|
||||
<description>spark 2.* yarn history server address</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2EventLogDir</name>
|
||||
<description>spark 2.* event log dir location</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<global>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>mapreduce.job.queuename</name>
|
||||
<value>${queueName}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapred.job.queue.name</name>
|
||||
<value>${oozieLauncherQueueName}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>${oozieActionShareLibForSpark2}</value>
|
||||
</property>
|
||||
</configuration>
|
||||
</global>
|
||||
|
||||
<start to="opendoarPartition"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="opendoarPartition">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>PartitionEventsByDsIdJob</name>
|
||||
<class>eu.dnetlib.dhp.broker.oa.PartitionEventsByDsIdJob</class>
|
||||
<jar>dhp-broker-events-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||
<arg>--opendoarIds</arg><arg>${opendoarIds}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
|
||||
</workflow-app>
|
|
@ -0,0 +1,18 @@
|
|||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -1,41 +1,38 @@
|
|||
<workflow-app name="create broker events - partial" xmlns="uri:oozie:workflow:0.5">
|
||||
<workflow-app name="reindex_events" xmlns="uri:oozie:workflow:0.5">
|
||||
|
||||
<parameters>
|
||||
<property>
|
||||
<name>graphInputPath</name>
|
||||
<description>the path where the graph is stored</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>workingPath</name>
|
||||
<name>outputDir</name>
|
||||
<description>the path where the the generated data will be stored</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>datasourceIdWhitelist</name>
|
||||
<value>-</value>
|
||||
<description>a white list (comma separeted, - for empty list) of datasource ids</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>datasourceTypeWhitelist</name>
|
||||
<value>-</value>
|
||||
<description>a white list (comma separeted, - for empty list) of datasource types</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>datasourceIdBlacklist</name>
|
||||
<value>-</value>
|
||||
<description>a black list (comma separeted, - for empty list) of datasource ids</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>esEventIndexName</name>
|
||||
<description>the elasticsearch index name for events</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>esNotificationsIndexName</name>
|
||||
<description>the elasticsearch index name for notifications</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>esIndexHost</name>
|
||||
<description>the elasticsearch host</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>esBatchWriteRetryCount</name>
|
||||
<value>8</value>
|
||||
<description>an ES configuration property</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>esBatchWriteRetryWait</name>
|
||||
<value>60s</value>
|
||||
<description>an ES configuration property</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>esBatchSizeEntries</name>
|
||||
<value>200</value>
|
||||
<description>an ES configuration property</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>esNodesWanOnly</name>
|
||||
<value>true</value>
|
||||
<description>an ES configuration property</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>maxIndexedEventsForDsAndTopic</name>
|
||||
<description>the max number of events for each couple (ds/topic)</description>
|
||||
|
@ -44,18 +41,6 @@
|
|||
<name>brokerApiBaseUrl</name>
|
||||
<description>the url of the broker service api</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>brokerDbUrl</name>
|
||||
<description>the url of the broker database</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>brokerDbUser</name>
|
||||
<description>the user of the broker database</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>brokerDbPassword</name>
|
||||
<description>the password of the broker database</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
|
@ -111,36 +96,45 @@
|
|||
</configuration>
|
||||
</global>
|
||||
|
||||
<start to="partition"/>
|
||||
<start to="index_event_subset"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="partition">
|
||||
|
||||
<action name="index_event_subset">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>PartitionEventsByDsIdJob</name>
|
||||
<class>eu.dnetlib.dhp.broker.oa.PartitionEventsByDsIdJob</class>
|
||||
<name>IndexEventSubsetOnESJob</name>
|
||||
<class>eu.dnetlib.dhp.broker.oa.IndexEventSubsetJob</class>
|
||||
<jar>dhp-broker-events-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.dynamicAllocation.maxExecutors="8"
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||
<arg>--outputDir</arg><arg>${outputDir}</arg>
|
||||
<arg>--index</arg><arg>${esEventIndexName}</arg>
|
||||
<arg>--esHost</arg><arg>${esIndexHost}</arg>
|
||||
<arg>--esBatchWriteRetryCount</arg><arg>${esBatchWriteRetryCount}</arg>
|
||||
<arg>--esBatchWriteRetryWait</arg><arg>${esBatchWriteRetryWait}</arg>
|
||||
<arg>--esBatchSizeEntries</arg><arg>${esBatchSizeEntries}</arg>
|
||||
<arg>--esNodesWanOnly</arg><arg>${esNodesWanOnly}</arg>
|
||||
<arg>--maxEventsForTopic</arg><arg>${maxIndexedEventsForDsAndTopic}</arg>
|
||||
<arg>--brokerApiBaseUrl</arg><arg>${brokerApiBaseUrl}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
|
||||
|
||||
<end name="End"/>
|
||||
|
||||
</workflow-app>
|
|
@ -1,8 +1,8 @@
|
|||
[
|
||||
{
|
||||
"paramName": "wp",
|
||||
"paramLongName": "workingPath",
|
||||
"paramDescription": "the working path",
|
||||
"paramName": "o",
|
||||
"paramLongName": "outputDir",
|
||||
"paramDescription": "the path where generated data are stored",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
|
|
|
@ -35,7 +35,8 @@ object DoiBoostMappingUtil {
|
|||
//STATIC STRING
|
||||
val MAG = "microsoft"
|
||||
val MAG_NAME = "Microsoft Academic Graph"
|
||||
val ORCID = "ORCID"
|
||||
val ORCID = "orcid"
|
||||
val ORCID_PENDING = "orcid_pending"
|
||||
val CROSSREF = "Crossref"
|
||||
val UNPAYWALL = "UnpayWall"
|
||||
val GRID_AC = "grid.ac"
|
||||
|
|
|
@ -62,7 +62,7 @@ object SparkGenerateDoiBoost {
|
|||
val orcidPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/orcidPublication").as[Publication].map(p => (p.getId, p))
|
||||
fj.joinWith(orcidPublication, fj("_1").equalTo(orcidPublication("_1")), "left").map(applyMerge).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/secondJoin")
|
||||
|
||||
logger.info("Phase 3) Join Result with MAG")
|
||||
logger.info("Phase 4) Join Result with MAG")
|
||||
val sj: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/secondJoin").as[Publication].map(p => (p.getId, p))
|
||||
|
||||
val magPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/magPublication").as[Publication].map(p => (p.getId, p))
|
||||
|
|
|
@ -200,7 +200,7 @@ case object Crossref2Oaf {
|
|||
a.setSurname(family)
|
||||
a.setFullname(s"$given $family")
|
||||
if (StringUtils.isNotBlank(orcid))
|
||||
a.setPid(List(createSP(orcid, ORCID, PID_TYPES)).asJava)
|
||||
a.setPid(List(createSP(orcid, ORCID_PENDING, PID_TYPES, generateDataInfo())).asJava)
|
||||
|
||||
a
|
||||
}
|
||||
|
@ -248,7 +248,7 @@ case object Crossref2Oaf {
|
|||
|
||||
|
||||
def snsfRule(award:String): String = {
|
||||
var tmp1 = StringUtils.substringAfter(award,"_")
|
||||
val tmp1 = StringUtils.substringAfter(award,"_")
|
||||
val tmp2 = StringUtils.substringBefore(tmp1,"/")
|
||||
logger.debug(s"From $award to $tmp2")
|
||||
tmp2
|
||||
|
@ -294,7 +294,7 @@ case object Crossref2Oaf {
|
|||
}
|
||||
|
||||
def getProjectId (nsPrefix:String, targetId:String):String = {
|
||||
"40|$nsPrefix::$targetId"
|
||||
s"40|$nsPrefix::$targetId"
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -2,6 +2,7 @@ package eu.dnetlib.doiboost.crossref
|
|||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import org.apache.commons.io.IOUtils
|
||||
import org.apache.hadoop.io.{IntWritable, Text}
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.sql.expressions.Aggregator
|
||||
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
||||
|
@ -12,21 +13,23 @@ import org.slf4j.{Logger, LoggerFactory}
|
|||
|
||||
object CrossrefDataset {
|
||||
|
||||
val logger: Logger = LoggerFactory.getLogger(SparkMapDumpIntoOAF.getClass)
|
||||
|
||||
def extractTimestamp(input:String): Long = {
|
||||
|
||||
def to_item(input:String):CrossrefDT = {
|
||||
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
lazy val json: json4s.JValue = parse(input)
|
||||
|
||||
(json\"indexed"\"timestamp").extractOrElse[Long](0)
|
||||
val ts:Long = (json \ "indexed" \ "timestamp").extract[Long]
|
||||
val doi:String = (json \ "DOI").extract[String]
|
||||
CrossrefDT(doi, input, ts)
|
||||
|
||||
}
|
||||
|
||||
|
||||
def main(args: Array[String]): Unit = {
|
||||
|
||||
|
||||
val logger: Logger = LoggerFactory.getLogger(SparkMapDumpIntoOAF.getClass)
|
||||
|
||||
val conf: SparkConf = new SparkConf()
|
||||
val parser = new ArgumentApplicationParser(IOUtils.toString(CrossrefDataset.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/crossref_to_dataset_params.json")))
|
||||
parser.parseArgument(args)
|
||||
|
@ -49,9 +52,8 @@ object CrossrefDataset {
|
|||
if (a == null)
|
||||
return b
|
||||
|
||||
val tb = extractTimestamp(b.json)
|
||||
val ta = extractTimestamp(a.json)
|
||||
if(ta >tb) {
|
||||
|
||||
if(a.timestamp >b.timestamp) {
|
||||
return a
|
||||
}
|
||||
b
|
||||
|
@ -63,9 +65,7 @@ object CrossrefDataset {
|
|||
if (a == null)
|
||||
return b
|
||||
|
||||
val tb = extractTimestamp(b.json)
|
||||
val ta = extractTimestamp(a.json)
|
||||
if(ta >tb) {
|
||||
if(a.timestamp >b.timestamp) {
|
||||
return a
|
||||
}
|
||||
b
|
||||
|
@ -78,15 +78,21 @@ object CrossrefDataset {
|
|||
override def finish(reduction: CrossrefDT): CrossrefDT = reduction
|
||||
}
|
||||
|
||||
val sourcePath:String = parser.get("sourcePath")
|
||||
val targetPath:String = parser.get("targetPath")
|
||||
val workingPath:String = parser.get("workingPath")
|
||||
|
||||
val ds:Dataset[CrossrefDT] = spark.read.load(sourcePath).as[CrossrefDT]
|
||||
|
||||
ds.groupByKey(_.doi)
|
||||
val main_ds:Dataset[CrossrefDT] = spark.read.load(s"$workingPath/crossref_ds").as[CrossrefDT]
|
||||
|
||||
|
||||
val update =
|
||||
spark.createDataset(spark.sparkContext.sequenceFile(s"$workingPath/index_update", classOf[IntWritable], classOf[Text])
|
||||
.map(i =>CrossrefImporter.decompressBlob(i._2.toString))
|
||||
.map(i =>to_item(i)))
|
||||
|
||||
main_ds.union(update).groupByKey(_.doi)
|
||||
.agg(crossrefAggregator.toColumn)
|
||||
.map(s=>s._2)
|
||||
.write.mode(SaveMode.Overwrite).save(targetPath)
|
||||
.write.mode(SaveMode.Overwrite).save(s"$workingPath/crossref_ds_updated")
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -34,85 +34,21 @@ object SparkMapDumpIntoOAF {
|
|||
implicit val mapEncoderRelatons: Encoder[Relation] = Encoders.kryo[Relation]
|
||||
implicit val mapEncoderDatasets: Encoder[oaf.Dataset] = Encoders.kryo[OafDataset]
|
||||
|
||||
val sc = spark.sparkContext
|
||||
val targetPath = parser.get("targetPath")
|
||||
import spark.implicits._
|
||||
|
||||
|
||||
spark.read.load(parser.get("sourcePath")).as[CrossrefDT]
|
||||
.flatMap(k => Crossref2Oaf.convert(k.json))
|
||||
.filter(o => o != null)
|
||||
.write.mode(SaveMode.Overwrite).save(s"$targetPath/mixObject")
|
||||
|
||||
|
||||
val ds:Dataset[Oaf] = spark.read.load(s"$targetPath/mixObject").as[Oaf]
|
||||
|
||||
ds.filter(o => o.isInstanceOf[Publication]).map(o => o.asInstanceOf[Publication]).write.save(s"$targetPath/publication")
|
||||
ds.filter(o => o.isInstanceOf[Publication]).map(o => o.asInstanceOf[Publication]).write.mode(SaveMode.Overwrite).save(s"$targetPath/crossrefPublication")
|
||||
|
||||
ds.filter(o => o.isInstanceOf[Relation]).map(o => o.asInstanceOf[Relation]).write.save(s"$targetPath/relation")
|
||||
ds.filter(o => o.isInstanceOf[Relation]).map(o => o.asInstanceOf[Relation]).write.mode(SaveMode.Overwrite).save(s"$targetPath/crossrefRelation")
|
||||
|
||||
ds.filter(o => o.isInstanceOf[OafDataset]).map(o => o.asInstanceOf[OafDataset]).write.save(s"$targetPath/dataset")
|
||||
|
||||
|
||||
|
||||
//
|
||||
//
|
||||
//
|
||||
// sc.sequenceFile(parser.get("sourcePath"), classOf[IntWritable], classOf[Text])
|
||||
// .map(k => k._2.toString).map(CrossrefImporter.decompressBlob)
|
||||
// .flatMap(k => Crossref2Oaf.convert(k)).saveAsObjectFile(s"${targetPath}/mixObject")
|
||||
//
|
||||
// val inputRDD = sc.objectFile[Oaf](s"${targetPath}/mixObject").filter(p=> p!= null)
|
||||
//
|
||||
// val distinctPubs:RDD[Publication] = inputRDD.filter(k => k != null && k.isInstanceOf[Publication])
|
||||
// .map(k => k.asInstanceOf[Publication]).map { p: Publication => Tuple2(p.getId, p) }.reduceByKey { case (p1: Publication, p2: Publication) =>
|
||||
// var r = if (p1 == null) p2 else p1
|
||||
// if (p1 != null && p2 != null) {
|
||||
// if (p1.getLastupdatetimestamp != null && p2.getLastupdatetimestamp != null) {
|
||||
// if (p1.getLastupdatetimestamp < p2.getLastupdatetimestamp)
|
||||
// r = p2
|
||||
// else
|
||||
// r = p1
|
||||
// } else {
|
||||
// r = if (p1.getLastupdatetimestamp == null) p2 else p1
|
||||
// }
|
||||
// }
|
||||
// r
|
||||
// }.map(_._2)
|
||||
//
|
||||
// val pubs:Dataset[Publication] = spark.createDataset(distinctPubs)
|
||||
// pubs.write.mode(SaveMode.Overwrite).save(s"${targetPath}/publication")
|
||||
//
|
||||
//
|
||||
// val distincDatasets:RDD[OafDataset] = inputRDD.filter(k => k != null && k.isInstanceOf[OafDataset])
|
||||
// .map(k => k.asInstanceOf[OafDataset]).map(p => Tuple2(p.getId, p)).reduceByKey { case (p1: OafDataset, p2: OafDataset) =>
|
||||
// var r = if (p1 == null) p2 else p1
|
||||
// if (p1 != null && p2 != null) {
|
||||
// if (p1.getLastupdatetimestamp != null && p2.getLastupdatetimestamp != null) {
|
||||
// if (p1.getLastupdatetimestamp < p2.getLastupdatetimestamp)
|
||||
// r = p2
|
||||
// else
|
||||
// r = p1
|
||||
// } else {
|
||||
// r = if (p1.getLastupdatetimestamp == null) p2 else p1
|
||||
// }
|
||||
// }
|
||||
// r
|
||||
// }.map(_._2)
|
||||
//
|
||||
// spark.createDataset(distincDatasets).write.mode(SaveMode.Overwrite).save(s"${targetPath}/dataset")
|
||||
//
|
||||
//
|
||||
//
|
||||
// val distinctRels =inputRDD.filter(k => k != null && k.isInstanceOf[Relation])
|
||||
// .map(k => k.asInstanceOf[Relation]).map(r=> (s"${r.getSource}::${r.getTarget}",r))
|
||||
// .reduceByKey { case (p1: Relation, p2: Relation) =>
|
||||
// if (p1 == null) p2 else p1
|
||||
// }.map(_._2)
|
||||
//
|
||||
// val rels: Dataset[Relation] = spark.createDataset(distinctRels)
|
||||
//
|
||||
// rels.write.mode(SaveMode.Overwrite).save(s"${targetPath}/relations")
|
||||
ds.filter(o => o.isInstanceOf[OafDataset]).map(o => o.asInstanceOf[OafDataset]).write.mode(SaveMode.Overwrite).save(s"$targetPath/crossrefDataset")
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -21,15 +21,17 @@ object SparkImportMagIntoDataset {
|
|||
|
||||
|
||||
val stream = Map(
|
||||
"Affiliations" -> Tuple2("mag/Affiliations.txt", Seq("AffiliationId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "GridId:string", "OfficialPage:string", "WikiPage:string", "PaperCount:long", "CitationCount:long", "Latitude:float?", "Longitude:float?", "CreatedDate:DateTime")),
|
||||
"Authors" -> Tuple2("mag/Authors.txt", Seq("AuthorId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "LastKnownAffiliationId:long?", "PaperCount:long", "CitationCount:long", "CreatedDate:DateTime")),
|
||||
"ConferenceInstances" -> Tuple2("mag/ConferenceInstances.txt", Seq("ConferenceInstanceId:long", "NormalizedName:string", "DisplayName:string", "ConferenceSeriesId:long", "Location:string", "OfficialUrl:string", "StartDate:DateTime?", "EndDate:DateTime?", "AbstractRegistrationDate:DateTime?", "SubmissionDeadlineDate:DateTime?", "NotificationDueDate:DateTime?", "FinalVersionDueDate:DateTime?", "PaperCount:long", "CitationCount:long", "Latitude:float?", "Longitude:float?", "CreatedDate:DateTime")),
|
||||
"Affiliations" -> Tuple2("mag/Affiliations.txt", Seq("AffiliationId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "GridId:string", "OfficialPage:string", "WikiPage:string", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "Iso3166Code:string", "Latitude:float?", "Longitude:float?", "CreatedDate:DateTime")),
|
||||
"AuthorExtendedAttributes" -> Tuple2("mag/AuthorExtendedAttributes.txt", Seq("AuthorId:long", "AttributeType:int", "AttributeValue:string")),
|
||||
"Authors" -> Tuple2("mag/Authors.txt", Seq("AuthorId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "LastKnownAffiliationId:long?", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")),
|
||||
"ConferenceInstances" -> Tuple2("mag/ConferenceInstances.txt", Seq("ConferenceInstanceId:long", "NormalizedName:string", "DisplayName:string", "ConferenceSeriesId:long", "Location:string", "OfficialUrl:string", "StartDate:DateTime?", "EndDate:DateTime?", "AbstractRegistrationDate:DateTime?", "SubmissionDeadlineDate:DateTime?", "NotificationDueDate:DateTime?", "FinalVersionDueDate:DateTime?", "PaperCount:long", "PaperFamilyCount:long" ,"CitationCount:long", "Latitude:float?", "Longitude:float?", "CreatedDate:DateTime")),
|
||||
"ConferenceSeries" -> Tuple2("mag/ConferenceSeries.txt", Seq("ConferenceSeriesId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "PaperCount:long", "CitationCount:long", "CreatedDate:DateTime")),
|
||||
"EntityRelatedEntities" -> Tuple2("advanced/EntityRelatedEntities.txt", Seq("EntityId:long", "EntityType:string", "RelatedEntityId:long", "RelatedEntityType:string", "RelatedType:int", "Score:float")),
|
||||
"FieldOfStudyChildren" -> Tuple2("advanced/FieldOfStudyChildren.txt", Seq("FieldOfStudyId:long", "ChildFieldOfStudyId:long")),
|
||||
"FieldOfStudyExtendedAttributes" -> Tuple2("advanced/FieldOfStudyExtendedAttributes.txt", Seq("FieldOfStudyId:long", "AttributeType:int", "AttributeValue:string")),
|
||||
"FieldsOfStudy" -> Tuple2("advanced/FieldsOfStudy.txt", Seq("FieldOfStudyId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "MainType:string", "Level:int", "PaperCount:long", "CitationCount:long", "CreatedDate:DateTime")),
|
||||
"Journals" -> Tuple2("mag/Journals.txt", Seq("JournalId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "Issn:string", "Publisher:string", "Webpage:string", "PaperCount:long", "CitationCount:long", "CreatedDate:DateTime")),
|
||||
// ['FieldOfStudyId:long', 'Rank:uint', 'NormalizedName:string', 'DisplayName:string', 'MainType:string', 'Level:int', 'PaperCount:long', 'PaperFamilyCount:long', 'CitationCount:long', 'CreatedDate:DateTime']
|
||||
"FieldsOfStudy" -> Tuple2("advanced/FieldsOfStudy.txt", Seq("FieldOfStudyId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "MainType:string", "Level:int", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")),
|
||||
"Journals" -> Tuple2("mag/Journals.txt", Seq("JournalId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "Issn:string", "Publisher:string", "Webpage:string", "PaperCount:long", "PaperFamilyCount:long" ,"CitationCount:long", "CreatedDate:DateTime")),
|
||||
"PaperAbstractsInvertedIndex" -> Tuple2("nlp/PaperAbstractsInvertedIndex.txt.*", Seq("PaperId:long", "IndexedAbstract:string")),
|
||||
"PaperAuthorAffiliations" -> Tuple2("mag/PaperAuthorAffiliations.txt", Seq("PaperId:long", "AuthorId:long", "AffiliationId:long?", "AuthorSequenceNumber:uint", "OriginalAuthor:string", "OriginalAffiliation:string")),
|
||||
"PaperCitationContexts" -> Tuple2("nlp/PaperCitationContexts.txt", Seq("PaperId:long", "PaperReferenceId:long", "CitationContext:string")),
|
||||
|
@ -39,7 +41,7 @@ object SparkImportMagIntoDataset {
|
|||
"PaperReferences" -> Tuple2("mag/PaperReferences.txt", Seq("PaperId:long", "PaperReferenceId:long")),
|
||||
"PaperResources" -> Tuple2("mag/PaperResources.txt", Seq("PaperId:long", "ResourceType:int", "ResourceUrl:string", "SourceUrl:string", "RelationshipType:int")),
|
||||
"PaperUrls" -> Tuple2("mag/PaperUrls.txt", Seq("PaperId:long", "SourceType:int?", "SourceUrl:string", "LanguageCode:string")),
|
||||
"Papers" -> Tuple2("mag/Papers.txt", Seq("PaperId:long", "Rank:uint", "Doi:string", "DocType:string", "PaperTitle:string", "OriginalTitle:string", "BookTitle:string", "Year:int?", "Date:DateTime?", "Publisher:string", "JournalId:long?", "ConferenceSeriesId:long?", "ConferenceInstanceId:long?", "Volume:string", "Issue:string", "FirstPage:string", "LastPage:string", "ReferenceCount:long", "CitationCount:long", "EstimatedCitation:long", "OriginalVenue:string", "FamilyId:long?", "CreatedDate:DateTime")),
|
||||
"Papers" -> Tuple2("mag/Papers.txt", Seq("PaperId:long", "Rank:uint", "Doi:string", "DocType:string", "PaperTitle:string", "OriginalTitle:string", "BookTitle:string", "Year:int?", "Date:DateTime?", "OnlineDate:DateTime?", "Publisher:string", "JournalId:long?", "ConferenceSeriesId:long?", "ConferenceInstanceId:long?", "Volume:string", "Issue:string", "FirstPage:string", "LastPage:string", "ReferenceCount:long", "CitationCount:long", "EstimatedCitation:long", "OriginalVenue:string", "FamilyId:long?", "FamilyRank:uint?", "CreatedDate:DateTime")),
|
||||
"RelatedFieldOfStudy" -> Tuple2("advanced/RelatedFieldOfStudy.txt", Seq("FieldOfStudyId1:long", "Type1:string", "FieldOfStudyId2:long", "Type2:string", "Rank:float"))
|
||||
)
|
||||
|
||||
|
|
|
@ -26,12 +26,15 @@ object SparkPreProcessMAG {
|
|||
.master(parser.get("master")).getOrCreate()
|
||||
|
||||
val sourcePath = parser.get("sourcePath")
|
||||
val workingPath = parser.get("workingPath")
|
||||
val targetPath = parser.get("targetPath")
|
||||
|
||||
import spark.implicits._
|
||||
implicit val mapEncoderPubs: Encoder[Publication] = org.apache.spark.sql.Encoders.kryo[Publication]
|
||||
implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPubs)
|
||||
|
||||
logger.info("Phase 1) make uninque DOI in Papers:")
|
||||
val d: Dataset[MagPapers] = spark.read.load(s"${parser.get("sourcePath")}/Papers").as[MagPapers]
|
||||
val d: Dataset[MagPapers] = spark.read.load(s"$sourcePath/Papers").as[MagPapers]
|
||||
|
||||
// Filtering Papers with DOI, and since for the same DOI we have multiple version of item with different PapersId we get the last one
|
||||
val result: RDD[MagPapers] = d.where(col("Doi").isNotNull)
|
||||
|
@ -41,11 +44,12 @@ object SparkPreProcessMAG {
|
|||
.map(_._2)
|
||||
|
||||
val distinctPaper: Dataset[MagPapers] = spark.createDataset(result)
|
||||
distinctPaper.write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/Papers_distinct")
|
||||
|
||||
distinctPaper.write.mode(SaveMode.Overwrite).save(s"$workingPath/Papers_distinct")
|
||||
|
||||
logger.info("Phase 0) Enrich Publication with description")
|
||||
val pa = spark.read.load(s"${parser.get("sourcePath")}/PaperAbstractsInvertedIndex").as[MagPaperAbstract]
|
||||
pa.map(ConversionUtil.transformPaperAbstract).write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/PaperAbstract")
|
||||
val pa = spark.read.load(s"$sourcePath/PaperAbstractsInvertedIndex").as[MagPaperAbstract]
|
||||
pa.map(ConversionUtil.transformPaperAbstract).write.mode(SaveMode.Overwrite).save(s"$workingPath/PaperAbstract")
|
||||
|
||||
logger.info("Phase 3) Group Author by PaperId")
|
||||
val authors = spark.read.load(s"$sourcePath/Authors").as[MagAuthor]
|
||||
|
@ -64,24 +68,24 @@ object SparkPreProcessMAG {
|
|||
} else
|
||||
mpa
|
||||
}).groupBy("PaperId").agg(collect_list(struct($"author", $"affiliation")).as("authors"))
|
||||
.write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/merge_step_1_paper_authors")
|
||||
.write.mode(SaveMode.Overwrite).save(s"$workingPath/merge_step_1_paper_authors")
|
||||
|
||||
logger.info("Phase 4) create First Version of publication Entity with Paper Journal and Authors")
|
||||
|
||||
val journals = spark.read.load(s"$sourcePath/Journals").as[MagJournal]
|
||||
|
||||
val papers = spark.read.load((s"${parser.get("targetPath")}/Papers_distinct")).as[MagPapers]
|
||||
val papers = spark.read.load((s"$workingPath/Papers_distinct")).as[MagPapers]
|
||||
|
||||
val paperWithAuthors = spark.read.load(s"${parser.get("targetPath")}/merge_step_1_paper_authors").as[MagPaperWithAuthorList]
|
||||
val paperWithAuthors = spark.read.load(s"$workingPath/merge_step_1_paper_authors").as[MagPaperWithAuthorList]
|
||||
|
||||
val firstJoin = papers.joinWith(journals, papers("JournalId").equalTo(journals("JournalId")), "left")
|
||||
firstJoin.joinWith(paperWithAuthors, firstJoin("_1.PaperId").equalTo(paperWithAuthors("PaperId")), "left")
|
||||
.map { a => ConversionUtil.createOAFFromJournalAuthorPaper(a) }
|
||||
.write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/merge_step_2")
|
||||
.write.mode(SaveMode.Overwrite).save(s"$workingPath/merge_step_2")
|
||||
|
||||
|
||||
var magPubs: Dataset[(String, Publication)] =
|
||||
spark.read.load(s"${parser.get("targetPath")}/merge_step_2").as[Publication]
|
||||
spark.read.load(s"$workingPath/merge_step_2").as[Publication]
|
||||
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)]
|
||||
|
||||
|
||||
|
@ -95,10 +99,10 @@ object SparkPreProcessMAG {
|
|||
.map(item => ConversionUtil.updatePubsWithConferenceInfo(item))
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"${parser.get("targetPath")}/merge_step_2_conference")
|
||||
.save(s"$workingPath/merge_step_2_conference")
|
||||
|
||||
|
||||
magPubs= spark.read.load(s"${parser.get("targetPath")}/merge_step_2_conference").as[Publication]
|
||||
magPubs= spark.read.load(s"$workingPath/merge_step_2_conference").as[Publication]
|
||||
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)]
|
||||
|
||||
val paperUrlDataset = spark.read.load(s"$sourcePath/PaperUrls").as[MagPaperUrl].groupBy("PaperId").agg(collect_list(struct("sourceUrl")).as("instances")).as[MagUrl]
|
||||
|
@ -108,27 +112,27 @@ object SparkPreProcessMAG {
|
|||
magPubs.joinWith(paperUrlDataset, col("_1").equalTo(paperUrlDataset("PaperId")), "left")
|
||||
.map { a: ((String, Publication), MagUrl) => ConversionUtil.addInstances((a._1._2, a._2)) }
|
||||
.write.mode(SaveMode.Overwrite)
|
||||
.save(s"${parser.get("targetPath")}/merge_step_3")
|
||||
.save(s"$workingPath/merge_step_3")
|
||||
|
||||
|
||||
// logger.info("Phase 6) Enrich Publication with description")
|
||||
// val pa = spark.read.load(s"${parser.get("sourcePath")}/PaperAbstractsInvertedIndex").as[MagPaperAbstract]
|
||||
// pa.map(ConversionUtil.transformPaperAbstract).write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/PaperAbstract")
|
||||
|
||||
val paperAbstract = spark.read.load((s"${parser.get("targetPath")}/PaperAbstract")).as[MagPaperAbstract]
|
||||
val paperAbstract = spark.read.load((s"$workingPath/PaperAbstract")).as[MagPaperAbstract]
|
||||
|
||||
|
||||
magPubs = spark.read.load(s"${parser.get("targetPath")}/merge_step_3").as[Publication]
|
||||
magPubs = spark.read.load(s"$workingPath/merge_step_3").as[Publication]
|
||||
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)]
|
||||
|
||||
magPubs.joinWith(paperAbstract, col("_1").equalTo(paperAbstract("PaperId")), "left")
|
||||
.map(item => ConversionUtil.updatePubsWithDescription(item)
|
||||
).write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/merge_step_4")
|
||||
).write.mode(SaveMode.Overwrite).save(s"$workingPath/merge_step_4")
|
||||
|
||||
|
||||
logger.info("Phase 7) Enrich Publication with FieldOfStudy")
|
||||
|
||||
magPubs = spark.read.load(s"${parser.get("targetPath")}/merge_step_4").as[Publication]
|
||||
magPubs = spark.read.load(s"$workingPath/merge_step_4").as[Publication]
|
||||
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)]
|
||||
|
||||
val fos = spark.read.load(s"$sourcePath/FieldsOfStudy").select($"FieldOfStudyId".alias("fos"), $"DisplayName", $"MainType")
|
||||
|
@ -144,14 +148,14 @@ object SparkPreProcessMAG {
|
|||
.equalTo(paperField("PaperId")), "left")
|
||||
.map(item => ConversionUtil.updatePubsWithSubject(item))
|
||||
.write.mode(SaveMode.Overwrite)
|
||||
.save(s"${parser.get("targetPath")}/mag_publication")
|
||||
.save(s"$workingPath/mag_publication")
|
||||
|
||||
|
||||
val s:RDD[Publication] = spark.read.load(s"${parser.get("targetPath")}/mag_publication").as[Publication]
|
||||
val s:RDD[Publication] = spark.read.load(s"$workingPath/mag_publication").as[Publication]
|
||||
.map(p=>Tuple2(p.getId, p)).rdd.reduceByKey((a:Publication, b:Publication) => ConversionUtil.mergePublication(a,b))
|
||||
.map(_._2)
|
||||
|
||||
spark.createDataset(s).as[Publication].write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/mag_publication_u")
|
||||
spark.createDataset(s).as[Publication].write.mode(SaveMode.Overwrite).save(s"$targetPath/magPublication")
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
package eu.dnetlib.doiboost.orcid
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.{Author, Publication}
|
||||
import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Publication}
|
||||
import eu.dnetlib.dhp.schema.orcid.OrcidDOI
|
||||
import eu.dnetlib.doiboost.DoiBoostMappingUtil
|
||||
import eu.dnetlib.doiboost.DoiBoostMappingUtil.{ORCID, PID_TYPES, createSP, generateDataInfo, generateIdentifier}
|
||||
import org.apache.commons.lang.StringUtils
|
||||
|
@ -43,16 +44,19 @@ object ORCIDToOAF {
|
|||
}
|
||||
|
||||
|
||||
def convertTOOAF(input:ORCIDElement) :Publication = {
|
||||
val doi = input.doi
|
||||
def convertTOOAF(input:OrcidDOI) :Publication = {
|
||||
val doi = input.getDoi
|
||||
val pub:Publication = new Publication
|
||||
pub.setPid(List(createSP(doi, "doi", PID_TYPES)).asJava)
|
||||
pub.setPid(List(createSP(doi.toLowerCase, "doi", PID_TYPES)).asJava)
|
||||
pub.setDataInfo(generateDataInfo())
|
||||
pub.setId(generateIdentifier(pub, doi.toLowerCase))
|
||||
try{
|
||||
pub.setAuthor(input.authors.map(a=> {
|
||||
generateAuthor(a.name, a.surname, a.creditName, a.oid)
|
||||
}).asJava)
|
||||
|
||||
val l:List[Author]= input.getAuthors.asScala.map(a=> {
|
||||
generateAuthor(a.getName, a.getSurname, a.getCreditName, a.getOid)
|
||||
})(collection.breakOut)
|
||||
|
||||
pub.setAuthor(l.asJava)
|
||||
pub.setCollectedfrom(List(DoiBoostMappingUtil.createORIDCollectedFrom()).asJava)
|
||||
pub.setDataInfo(DoiBoostMappingUtil.generateDataInfo())
|
||||
pub
|
||||
|
@ -63,6 +67,13 @@ object ORCIDToOAF {
|
|||
}
|
||||
}
|
||||
|
||||
def generateOricPIDDatainfo():DataInfo = {
|
||||
val di =DoiBoostMappingUtil.generateDataInfo("0.91")
|
||||
di.getProvenanceaction.setClassid("sysimport:crosswalk:entityregistry")
|
||||
di.getProvenanceaction.setClassname("Harvested")
|
||||
di
|
||||
}
|
||||
|
||||
def generateAuthor(given: String, family: String, fullName:String, orcid: String): Author = {
|
||||
val a = new Author
|
||||
a.setName(given)
|
||||
|
@ -72,7 +83,7 @@ object ORCIDToOAF {
|
|||
else
|
||||
a.setFullname(s"$given $family")
|
||||
if (StringUtils.isNotBlank(orcid))
|
||||
a.setPid(List(createSP(orcid, ORCID, PID_TYPES)).asJava)
|
||||
a.setPid(List(createSP(orcid, ORCID, PID_TYPES, generateOricPIDDatainfo())).asJava)
|
||||
|
||||
a
|
||||
}
|
||||
|
|
|
@ -1,21 +1,72 @@
|
|||
package eu.dnetlib.doiboost.orcid
|
||||
|
||||
import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper}
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.oa.merge.AuthorMerger
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication
|
||||
import eu.dnetlib.dhp.schema.orcid.OrcidDOI
|
||||
import eu.dnetlib.doiboost.mag.ConversionUtil
|
||||
import org.apache.commons.io.IOUtils
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.rdd.RDD
|
||||
import org.apache.spark.sql.expressions.Aggregator
|
||||
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
object SparkConvertORCIDToOAF {
|
||||
val logger: Logger = LoggerFactory.getLogger(SparkConvertORCIDToOAF.getClass)
|
||||
|
||||
def getPublicationAggregator(): Aggregator[(String, Publication), Publication, Publication] = new Aggregator[(String, Publication), Publication, Publication]{
|
||||
|
||||
override def zero: Publication = new Publication()
|
||||
|
||||
override def reduce(b: Publication, a: (String, Publication)): Publication = {
|
||||
b.mergeFrom(a._2)
|
||||
b.setAuthor(AuthorMerger.mergeAuthor(a._2.getAuthor, b.getAuthor))
|
||||
if (b.getId == null)
|
||||
b.setId(a._2.getId)
|
||||
b
|
||||
}
|
||||
|
||||
|
||||
override def merge(wx: Publication, wy: Publication): Publication = {
|
||||
wx.mergeFrom(wy)
|
||||
wx.setAuthor(AuthorMerger.mergeAuthor(wy.getAuthor, wx.getAuthor))
|
||||
if(wx.getId == null && wy.getId.nonEmpty)
|
||||
wx.setId(wy.getId)
|
||||
wx
|
||||
}
|
||||
override def finish(reduction: Publication): Publication = reduction
|
||||
|
||||
override def bufferEncoder: Encoder[Publication] =
|
||||
Encoders.kryo(classOf[Publication])
|
||||
|
||||
override def outputEncoder: Encoder[Publication] =
|
||||
Encoders.kryo(classOf[Publication])
|
||||
}
|
||||
|
||||
def run(spark:SparkSession,sourcePath:String, targetPath:String):Unit = {
|
||||
implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
|
||||
implicit val mapOrcid: Encoder[OrcidDOI] = Encoders.kryo[OrcidDOI]
|
||||
implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPubs)
|
||||
|
||||
val mapper = new ObjectMapper()
|
||||
mapper.getDeserializationConfig.withFeatures(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES)
|
||||
|
||||
val dataset:Dataset[OrcidDOI] = spark.createDataset(spark.sparkContext.textFile(sourcePath).map(s => mapper.readValue(s,classOf[OrcidDOI])))
|
||||
|
||||
logger.info("Converting ORCID to OAF")
|
||||
dataset.map(o => ORCIDToOAF.convertTOOAF(o)).filter(p=>p!=null)
|
||||
.map(d => (d.getId, d))
|
||||
.groupByKey(_._1)(Encoders.STRING)
|
||||
.agg(getPublicationAggregator().toColumn)
|
||||
.map(p => p._2)
|
||||
.write.mode(SaveMode.Overwrite).save(targetPath)
|
||||
}
|
||||
|
||||
def main(args: Array[String]): Unit = {
|
||||
|
||||
val logger: Logger = LoggerFactory.getLogger(SparkConvertORCIDToOAF.getClass)
|
||||
|
||||
val conf: SparkConf = new SparkConf()
|
||||
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkConvertORCIDToOAF.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/convert_map_to_oaf_params.json")))
|
||||
parser.parseArgument(args)
|
||||
|
@ -26,19 +77,12 @@ object SparkConvertORCIDToOAF {
|
|||
.appName(getClass.getSimpleName)
|
||||
.master(parser.get("master")).getOrCreate()
|
||||
|
||||
implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
|
||||
implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPubs)
|
||||
import spark.implicits._
|
||||
|
||||
|
||||
val sourcePath = parser.get("sourcePath")
|
||||
val targetPath = parser.get("targetPath")
|
||||
val dataset:Dataset[ORCIDElement] = spark.read.json(sourcePath).as[ORCIDElement]
|
||||
run(spark, sourcePath, targetPath)
|
||||
|
||||
|
||||
logger.info("Converting ORCID to OAF")
|
||||
val d:RDD[Publication] = dataset.map(o => ORCIDToOAF.convertTOOAF(o)).filter(p=>p!=null).map(p=>(p.getId,p)).rdd.reduceByKey(ConversionUtil.mergePublication)
|
||||
.map(_._2)
|
||||
|
||||
spark.createDataset(d).as[Publication].write.mode(SaveMode.Overwrite).save(targetPath)
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -16,88 +16,86 @@
|
|||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores used by single executor</description>
|
||||
</property>
|
||||
<!-- <property>-->
|
||||
<!-- <name>timestamp</name>-->
|
||||
<!-- <description>Timestamp for incremental Harvesting</description>-->
|
||||
<!-- </property>-->
|
||||
<property>
|
||||
<name>timestamp</name>
|
||||
<description>Timestamp for incremental Harvesting</description>
|
||||
</property>
|
||||
|
||||
</parameters>
|
||||
|
||||
<start to="ExtractCrossrefToOAF"/>
|
||||
<start to="ImportCrossRef"/>
|
||||
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<!-- <action name="ResetWorkingPath">-->
|
||||
<!-- <fs>-->
|
||||
<!-- <delete path='${workingPath}/input/crossref/index_dump'/>-->
|
||||
<!--<!– <mkdir path='${workingPath}/input/crossref'/>–>-->
|
||||
<!-- </fs>-->
|
||||
<!-- <ok to="ImportCrossRef"/>-->
|
||||
<!-- <error to="Kill"/>-->
|
||||
<!-- </action>-->
|
||||
<action name="ImportCrossRef">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.doiboost.crossref.CrossrefImporter</main-class>
|
||||
<arg>-t</arg><arg>${workingPath}/input/crossref/index_update</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-ts</arg><arg>${timestamp}</arg>
|
||||
</java>
|
||||
<ok to="GenerateDataset"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="GenerateDataset">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>ExtractCrossrefToOAF</name>
|
||||
<class>eu.dnetlib.doiboost.crossref.CrossrefDataset</class>
|
||||
<jar>dhp-doiboost-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
${sparkExtraOPT}
|
||||
</spark-opts>
|
||||
<arg>--workingPath</arg><arg>/data/doiboost/input/crossref</arg>
|
||||
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||
</spark>
|
||||
<ok to="RenameDataset"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="RenameDataset">
|
||||
<fs>
|
||||
<delete path='${workingPath}/input/crossref/crossref_ds'/>
|
||||
<move source="${workingPath}/input/crossref/crossref_ds_updated"
|
||||
target="${workingPath}/input/crossref/crossref_ds"/>
|
||||
</fs>
|
||||
<ok to="ConvertCrossrefToOAF"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
|
||||
<!-- <action name="ImportCrossRef">-->
|
||||
<!-- <java>-->
|
||||
<!-- <job-tracker>${jobTracker}</job-tracker>-->
|
||||
<!-- <name-node>${nameNode}</name-node>-->
|
||||
<!-- <main-class>eu.dnetlib.doiboost.crossref.CrossrefImporter</main-class>-->
|
||||
<!-- <arg>-t</arg><arg>${workingPath}/input/crossref/index_dump_1</arg>-->
|
||||
<!-- <arg>-n</arg><arg>${nameNode}</arg>-->
|
||||
<!-- <arg>-ts</arg><arg>${timestamp}</arg>-->
|
||||
<!-- </java>-->
|
||||
<!-- <ok to="End"/>-->
|
||||
<!-- <error to="Kill"/>-->
|
||||
<!-- </action>-->
|
||||
|
||||
|
||||
<action name="ExtractCrossrefToOAF">
|
||||
<action name="ConvertCrossrefToOAF">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>ExtractCrossrefToOAF</name>
|
||||
<name>ConvertCrossrefToOAF</name>
|
||||
<class>eu.dnetlib.doiboost.crossref.SparkMapDumpIntoOAF</class>
|
||||
<jar>dhp-doiboost-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
${sparkExtraOPT}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${workingPath}/input/crossref/crossref_ds</arg>
|
||||
<arg>--targetPath</arg><arg>${workingPath}/input/crossref</arg>
|
||||
<arg>--targetPath</arg><arg>${workingPath}/process/</arg>
|
||||
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
|
||||
<!-- <action name="GenerateDataset">-->
|
||||
<!-- <spark xmlns="uri:oozie:spark-action:0.2">-->
|
||||
<!-- <master>yarn-cluster</master>-->
|
||||
<!-- <mode>cluster</mode>-->
|
||||
<!-- <name>ExtractCrossrefToOAF</name>-->
|
||||
<!-- <class>eu.dnetlib.doiboost.crossref.CrossrefDataset</class>-->
|
||||
<!-- <jar>dhp-doiboost-${projectVersion}.jar</jar>-->
|
||||
<!-- <spark-opts>-->
|
||||
<!-- --executor-memory=${sparkExecutorMemory}-->
|
||||
<!-- --executor-cores=${sparkExecutorCores}-->
|
||||
<!-- --driver-memory=${sparkDriverMemory}-->
|
||||
<!-- ${sparkExtraOPT}-->
|
||||
<!-- </spark-opts>-->
|
||||
<!-- <arg>--sourcePath</arg><arg>/data/doiboost/crossref/cr_dataset</arg>-->
|
||||
<!-- <arg>--targetPath</arg><arg>/data/doiboost/crossref/crossrefDataset</arg>-->
|
||||
<!-- <arg>--master</arg><arg>yarn-cluster</arg>-->
|
||||
<!-- </spark>-->
|
||||
<!-- <ok to="End"/>-->
|
||||
<!-- <error to="Kill"/>-->
|
||||
<!-- </action>-->
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -1,6 +1,5 @@
|
|||
[
|
||||
{"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the path of the sequencial file to read", "paramRequired": true},
|
||||
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the working dir path", "paramRequired": true},
|
||||
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the working dir path", "paramRequired": true},
|
||||
{"paramName":"m", "paramLongName":"master", "paramDescription": "the master name", "paramRequired": true}
|
||||
|
||||
]
|
|
@ -39,14 +39,7 @@
|
|||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="ResetWorkingPath">
|
||||
<fs>
|
||||
<delete path='${workingDirPath}'/>
|
||||
<mkdir path='${workingDirPath}'/>
|
||||
</fs>
|
||||
<ok to="CreateDOIBoost"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="CreateDOIBoost">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
|
|
|
@ -8,6 +8,10 @@
|
|||
<name>targetPath</name>
|
||||
<description>the working dir base path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>workingPath</name>
|
||||
<description>the working dir base path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
|
@ -31,10 +35,10 @@
|
|||
|
||||
<action name="ResetWorkingPath">
|
||||
<fs>
|
||||
<delete path='${targetPath}'/>
|
||||
<mkdir path='${targetPath}'/>
|
||||
<delete path='${workingPath}'/>
|
||||
<mkdir path='${workingPath}'/>
|
||||
</fs>
|
||||
<ok to="PreprocessMag"/>
|
||||
<ok to="ConvertMagToDataset"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
@ -52,10 +56,10 @@
|
|||
${sparkExtraOPT}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||
<arg>--targetPath</arg><arg>${targetPath}</arg>
|
||||
<arg>--targetPath</arg><arg>${workingPath}</arg>
|
||||
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<ok to="PreprocessMag"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
@ -65,7 +69,7 @@
|
|||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Convert Mag to Dataset</name>
|
||||
<name>Convert Mag to OAF Dataset</name>
|
||||
<class>eu.dnetlib.doiboost.mag.SparkPreProcessMAG</class>
|
||||
<jar>dhp-doiboost-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
|
@ -75,7 +79,8 @@
|
|||
--conf spark.sql.shuffle.partitions=3840
|
||||
${sparkExtraOPT}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||
<arg>--sourcePath</arg><arg>${workingPath}</arg>
|
||||
<arg>--workingPath</arg><arg>${workingPath}/process</arg>
|
||||
<arg>--targetPath</arg><arg>${targetPath}</arg>
|
||||
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||
</spark>
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
[
|
||||
{"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the base path of MAG input", "paramRequired": true},
|
||||
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the working dir path", "paramRequired": true},
|
||||
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the target dir path", "paramRequired": true},
|
||||
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the working dir path", "paramRequired": true},
|
||||
{"paramName":"m", "paramLongName":"master", "paramDescription": "the master name", "paramRequired": true}
|
||||
|
||||
]
|
|
@ -1,5 +1,8 @@
|
|||
package eu.dnetlib.doiboost.orcid
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication
|
||||
import eu.dnetlib.doiboost.orcid.SparkConvertORCIDToOAF.getClass
|
||||
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
|
||||
import org.codehaus.jackson.map.ObjectMapper
|
||||
import org.junit.jupiter.api.Assertions._
|
||||
import org.junit.jupiter.api.Test
|
||||
|
@ -21,6 +24,30 @@ class MappingORCIDToOAFTest {
|
|||
})
|
||||
}
|
||||
|
||||
// @Test
|
||||
// def testOAFConvert():Unit ={
|
||||
//
|
||||
// val spark: SparkSession =
|
||||
// SparkSession
|
||||
// .builder()
|
||||
// .appName(getClass.getSimpleName)
|
||||
// .master("local[*]").getOrCreate()
|
||||
//
|
||||
//
|
||||
// SparkConvertORCIDToOAF.run( spark,"/Users/sandro/Downloads/orcid", "/Users/sandro/Downloads/orcid_oaf")
|
||||
// implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
|
||||
//
|
||||
// val df = spark.read.load("/Users/sandro/Downloads/orcid_oaf").as[Publication]
|
||||
// println(df.first.getId)
|
||||
// println(mapper.writeValueAsString(df.first()))
|
||||
//
|
||||
//
|
||||
//
|
||||
//
|
||||
// }
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -108,7 +108,7 @@ public class SparkResultToCommunityFromOrganizationJob {
|
|||
.stream()
|
||||
.map(con -> con.getId())
|
||||
.collect(Collectors.toList());
|
||||
Result res = new Result();
|
||||
R res = (R) ret.getClass().newInstance();
|
||||
res.setId(ret.getId());
|
||||
List<Context> propagatedContexts = new ArrayList<>();
|
||||
for (String cId : communitySet) {
|
||||
|
|
|
@ -130,7 +130,7 @@ public class SparkResultToCommunityThroughSemRelJob {
|
|||
})
|
||||
.filter(Objects::nonNull)
|
||||
.collect(Collectors.toList());
|
||||
Result r = new Result();
|
||||
R r = (R) ret.getClass().newInstance();
|
||||
r.setId(ret.getId());
|
||||
r.setContext(contextList);
|
||||
ret.mergeFrom(r);
|
||||
|
|
|
@ -24,7 +24,6 @@ import org.slf4j.LoggerFactory;
|
|||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.orcidtoresultfromsemrel.OrcidPropagationJobTest;
|
||||
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
||||
|
||||
public class ResultToCommunityJobTest {
|
||||
|
@ -66,7 +65,7 @@ public class ResultToCommunityJobTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
public void test1() throws Exception {
|
||||
public void testSparkResultToCommunityThroughSemRelJob() throws Exception {
|
||||
SparkResultToCommunityThroughSemRelJob
|
||||
.main(
|
||||
new String[] {
|
||||
|
|
|
@ -190,15 +190,6 @@ public class CleaningFunctions {
|
|||
}
|
||||
}
|
||||
|
||||
final Set<String> collectedFrom = Optional
|
||||
.ofNullable(r.getCollectedfrom())
|
||||
.map(
|
||||
c -> c
|
||||
.stream()
|
||||
.map(KeyValue::getKey)
|
||||
.collect(Collectors.toCollection(HashSet::new)))
|
||||
.orElse(new HashSet<>());
|
||||
|
||||
for (Author a : r.getAuthor()) {
|
||||
if (Objects.isNull(a.getPid())) {
|
||||
a.setPid(Lists.newArrayList());
|
||||
|
|
|
@ -75,6 +75,8 @@ public abstract class AbstractMdRecordToOafMapper {
|
|||
protected static final Qualifier MAG_PID_TYPE = qualifier(
|
||||
"MAGIdentifier", "Microsoft Academic Graph Identifier", DNET_PID_TYPES, DNET_PID_TYPES);
|
||||
|
||||
protected static final String DEFAULT_TRUST_FOR_VALIDATED_RELS = "0.999";
|
||||
|
||||
protected static final Map<String, String> nsContext = new HashMap<>();
|
||||
|
||||
static {
|
||||
|
@ -244,25 +246,54 @@ public abstract class AbstractMdRecordToOafMapper {
|
|||
|
||||
final String originalId = ((Node) o).getText();
|
||||
|
||||
final String validationdDate = ((Node) o).valueOf("@validationDate");
|
||||
|
||||
if (StringUtils.isNotBlank(originalId)) {
|
||||
final String projectId = createOpenaireId(40, originalId, true);
|
||||
|
||||
res
|
||||
.add(
|
||||
getRelation(
|
||||
getRelationWithValidationDate(
|
||||
docId, projectId, RESULT_PROJECT, OUTCOME, IS_PRODUCED_BY, collectedFrom, info,
|
||||
lastUpdateTimestamp));
|
||||
lastUpdateTimestamp, validationdDate));
|
||||
res
|
||||
.add(
|
||||
getRelation(
|
||||
getRelationWithValidationDate(
|
||||
projectId, docId, RESULT_PROJECT, OUTCOME, PRODUCES, collectedFrom, info,
|
||||
lastUpdateTimestamp));
|
||||
lastUpdateTimestamp, validationdDate));
|
||||
}
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
protected Relation getRelationWithValidationDate(final String source,
|
||||
final String target,
|
||||
final String relType,
|
||||
final String subRelType,
|
||||
final String relClass,
|
||||
final KeyValue collectedFrom,
|
||||
final DataInfo info,
|
||||
final long lastUpdateTimestamp,
|
||||
final String validationDate) {
|
||||
|
||||
final Relation r = getRelation(
|
||||
source, target, relType, subRelType, relClass, collectedFrom, info, lastUpdateTimestamp);
|
||||
r.setValidated(StringUtils.isNotBlank(validationDate));
|
||||
r.setValidationDate(StringUtils.isNotBlank(validationDate) ? validationDate : null);
|
||||
|
||||
if (StringUtils.isNotBlank(validationDate)) {
|
||||
r.setValidated(true);
|
||||
r.setValidationDate(validationDate);
|
||||
r.getDataInfo().setTrust(DEFAULT_TRUST_FOR_VALIDATED_RELS);
|
||||
} else {
|
||||
r.setValidated(false);
|
||||
r.setValidationDate(null);
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
protected Relation getRelation(final String source,
|
||||
final String target,
|
||||
final String relType,
|
||||
|
|
|
@ -23,7 +23,15 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_PROJECT;
|
|||
import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_RESULT;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.SOFTWARE_DEFAULT_RESULTTYPE;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.USER_CLAIM;
|
||||
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.*;
|
||||
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.asString;
|
||||
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.createOpenaireId;
|
||||
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.dataInfo;
|
||||
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.field;
|
||||
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.journal;
|
||||
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.listFields;
|
||||
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.listKeyValues;
|
||||
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.qualifier;
|
||||
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.structuredProperty;
|
||||
|
||||
import java.io.Closeable;
|
||||
import java.io.IOException;
|
||||
|
@ -462,44 +470,48 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
|
|||
|
||||
return Arrays.asList(r);
|
||||
} else {
|
||||
final String validationDate = rs.getString("curation_date");
|
||||
|
||||
final String sourceId = createOpenaireId(rs.getString(SOURCE_TYPE), rs.getString("source_id"), false);
|
||||
final String targetId = createOpenaireId(rs.getString(TARGET_TYPE), rs.getString("target_id"), false);
|
||||
|
||||
final Relation r1 = new Relation();
|
||||
final Relation r2 = new Relation();
|
||||
|
||||
if (rs.getString(SOURCE_TYPE).equals("project")) {
|
||||
r1.setCollectedfrom(collectedFrom);
|
||||
r1.setRelType(RESULT_PROJECT);
|
||||
r1.setSubRelType(OUTCOME);
|
||||
r1.setRelClass(PRODUCES);
|
||||
|
||||
r2.setCollectedfrom(collectedFrom);
|
||||
r2.setRelType(RESULT_PROJECT);
|
||||
r2.setSubRelType(OUTCOME);
|
||||
r2.setRelClass(IS_PRODUCED_BY);
|
||||
} else {
|
||||
r1.setCollectedfrom(collectedFrom);
|
||||
r1.setRelType(RESULT_RESULT);
|
||||
r1.setSubRelType(RELATIONSHIP);
|
||||
r1.setRelClass(IS_RELATED_TO);
|
||||
|
||||
r2.setCollectedfrom(collectedFrom);
|
||||
r2.setRelType(RESULT_RESULT);
|
||||
r2.setSubRelType(RELATIONSHIP);
|
||||
r2.setRelClass(IS_RELATED_TO);
|
||||
}
|
||||
|
||||
r1.setValidated(true);
|
||||
r1.setValidationDate(validationDate);
|
||||
r1.setCollectedfrom(collectedFrom);
|
||||
r1.setSource(sourceId);
|
||||
r1.setTarget(targetId);
|
||||
r1.setDataInfo(info);
|
||||
r1.setLastupdatetimestamp(lastUpdateTimestamp);
|
||||
|
||||
r2.setValidationDate(validationDate);
|
||||
r2.setValidated(true);
|
||||
r2.setCollectedfrom(collectedFrom);
|
||||
r2.setSource(targetId);
|
||||
r2.setTarget(sourceId);
|
||||
r2.setDataInfo(info);
|
||||
r2.setLastupdatetimestamp(lastUpdateTimestamp);
|
||||
|
||||
if (rs.getString(SOURCE_TYPE).equals("project")) {
|
||||
r1.setRelType(RESULT_PROJECT);
|
||||
r1.setSubRelType(OUTCOME);
|
||||
r1.setRelClass(PRODUCES);
|
||||
|
||||
r2.setRelType(RESULT_PROJECT);
|
||||
r2.setSubRelType(OUTCOME);
|
||||
r2.setRelClass(IS_PRODUCED_BY);
|
||||
} else {
|
||||
r1.setRelType(RESULT_RESULT);
|
||||
r1.setSubRelType(RELATIONSHIP);
|
||||
r1.setRelClass(IS_RELATED_TO);
|
||||
|
||||
r2.setRelType(RESULT_RESULT);
|
||||
r2.setSubRelType(RELATIONSHIP);
|
||||
r2.setRelClass(IS_RELATED_TO);
|
||||
}
|
||||
|
||||
return Arrays.asList(r1, r2);
|
||||
}
|
||||
|
||||
|
|
|
@ -1 +1 @@
|
|||
SELECT source_type, source_id, target_type, target_id, semantics FROM claim WHERE approved=TRUE;
|
||||
SELECT source_type, source_id, target_type, target_id, semantics, curation_date::text FROM claim WHERE approved=TRUE;
|
|
@ -141,7 +141,10 @@ public class MappersTest {
|
|||
assertTrue(StringUtils.isNotBlank(r2.getRelClass()));
|
||||
assertTrue(StringUtils.isNotBlank(r1.getRelType()));
|
||||
assertTrue(StringUtils.isNotBlank(r2.getRelType()));
|
||||
|
||||
assertTrue(r1.getValidated());
|
||||
assertTrue(r2.getValidated());
|
||||
assertEquals(r1.getValidationDate(), "2020-01-01");
|
||||
assertEquals(r2.getValidationDate(), "2020-01-01");
|
||||
// System.out.println(new ObjectMapper().writeValueAsString(p));
|
||||
// System.out.println(new ObjectMapper().writeValueAsString(r1));
|
||||
// System.out.println(new ObjectMapper().writeValueAsString(r2));
|
||||
|
@ -246,6 +249,10 @@ public class MappersTest {
|
|||
assertTrue(StringUtils.isNotBlank(r2.getRelClass()));
|
||||
assertTrue(StringUtils.isNotBlank(r1.getRelType()));
|
||||
assertTrue(StringUtils.isNotBlank(r2.getRelType()));
|
||||
assertTrue(r1.getValidated());
|
||||
assertTrue(r2.getValidated());
|
||||
assertEquals(r1.getValidationDate(), "2020-01-01");
|
||||
assertEquals(r2.getValidationDate(), "2020-01-01");
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -355,7 +362,21 @@ public class MappersTest {
|
|||
assertValidId(p.getId());
|
||||
assertValidId(p.getCollectedfrom().get(0).getKey());
|
||||
assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue()));
|
||||
System.out.println(p.getTitle().get(0).getValue());
|
||||
assertEquals(1, p.getAuthor().size());
|
||||
assertEquals("OPEN", p.getBestaccessright().getClassid());
|
||||
assertTrue(StringUtils.isNotBlank(p.getPid().get(0).getValue()));
|
||||
assertTrue(StringUtils.isNotBlank(p.getPid().get(0).getQualifier().getClassid()));
|
||||
assertEquals("dataset", p.getResulttype().getClassname());
|
||||
assertEquals(1, p.getInstance().size());
|
||||
assertEquals("OPEN", p.getInstance().get(0).getAccessright().getClassid());
|
||||
assertValidId(p.getInstance().get(0).getCollectedfrom().getKey());
|
||||
assertValidId(p.getInstance().get(0).getHostedby().getKey());
|
||||
assertEquals(
|
||||
"http://creativecommons.org/licenses/by/3.0/de/legalcode", p.getInstance().get(0).getLicense().getValue());
|
||||
assertEquals(1, p.getInstance().get(0).getUrl().size());
|
||||
// System.out.println(p.getInstance().get(0).getUrl().get(0));
|
||||
// System.out.println(p.getInstance().get(0).getHostedby().getValue());
|
||||
System.out.println(p.getPid().get(0).getValue());
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
|
@ -51,7 +51,7 @@
|
|||
<!--<dr:CobjCategory type="publication">0001</dr:CobjCategory>-->
|
||||
<dr:CobjCategory>0001</dr:CobjCategory>
|
||||
<oaf:dateAccepted>2017-01-01</oaf:dateAccepted>
|
||||
<oaf:projectid>corda_______::226852</oaf:projectid>
|
||||
<oaf:projectid validationDate="2020-01-01">corda_______::226852</oaf:projectid>
|
||||
<oaf:accessrights>OPEN</oaf:accessrights>
|
||||
<oaf:hostedBy id="openaire____::issn226852" name="One Ecosystem"/>
|
||||
<oaf:collectedFrom
|
||||
|
|
|
@ -89,7 +89,7 @@
|
|||
<oaf:language>und</oaf:language>
|
||||
<oaf:concept id="https://zenodo.org/communities/epfl"/>
|
||||
<oaf:hostedBy id="re3data_____::r3d100010468" name="Zenodo"/>
|
||||
<oaf:projectid>corda_______::226852</oaf:projectid>
|
||||
<oaf:projectid validationDate="2020-01-01">corda_______::226852</oaf:projectid>
|
||||
<oaf:collectedFrom id="re3data_____::r3d100010468" name="Zenodo"/>
|
||||
<oaf:refereed>0001</oaf:refereed>s
|
||||
</metadata>
|
||||
|
|
|
@ -54,6 +54,13 @@
|
|||
<artifactId>spark-solr</artifactId>
|
||||
</dependency>
|
||||
|
||||
<!-- the solr-test-framework requires the old junit:junit test framework -->
|
||||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
||||
<version>4.12</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.solr</groupId>
|
||||
<artifactId>solr-test-framework</artifactId>
|
||||
|
@ -140,6 +147,12 @@
|
|||
<dependency>
|
||||
<groupId>org.apache.zookeeper</groupId>
|
||||
<artifactId>zookeeper</artifactId>
|
||||
<exclusions>
|
||||
<exclusion>
|
||||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
||||
</exclusion>
|
||||
</exclusions>
|
||||
</dependency>
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,52 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.provision.utils;
|
||||
|
||||
import java.util.Comparator;
|
||||
import java.util.Optional;
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
|
||||
public class AuthorPidTypeComparator implements Comparator<StructuredProperty> {
|
||||
|
||||
@Override
|
||||
public int compare(StructuredProperty left, StructuredProperty right) {
|
||||
|
||||
String lClass = Optional
|
||||
.ofNullable(left)
|
||||
.map(StructuredProperty::getQualifier)
|
||||
.map(Qualifier::getClassid)
|
||||
.orElse(null);
|
||||
|
||||
String rClass = Optional
|
||||
.ofNullable(right)
|
||||
.map(StructuredProperty::getQualifier)
|
||||
.map(Qualifier::getClassid)
|
||||
.orElse(null);
|
||||
|
||||
if (lClass == null && rClass == null)
|
||||
return 0;
|
||||
if (lClass == null)
|
||||
return 1;
|
||||
if (rClass == null)
|
||||
return -1;
|
||||
|
||||
if (lClass.equals(rClass))
|
||||
return 0;
|
||||
|
||||
if (lClass.equals(ModelConstants.ORCID))
|
||||
return -1;
|
||||
if (rClass.equals(ModelConstants.ORCID))
|
||||
return 1;
|
||||
|
||||
if (lClass.equals(ModelConstants.ORCID_PENDING))
|
||||
return -1;
|
||||
if (rClass.equals(ModelConstants.ORCID_PENDING))
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
}
|
|
@ -254,6 +254,18 @@ public class XmlRecordFactory implements Serializable {
|
|||
p -> p,
|
||||
(p1, p2) -> p1))
|
||||
.values()
|
||||
.stream()
|
||||
.collect(
|
||||
Collectors
|
||||
.groupingBy(
|
||||
p -> p.getValue(),
|
||||
Collectors
|
||||
.mapping(
|
||||
p -> p,
|
||||
Collectors.minBy(new AuthorPidTypeComparator()))))
|
||||
.values()
|
||||
.stream()
|
||||
.map(op -> op.get())
|
||||
.forEach(
|
||||
sp -> {
|
||||
String pidType = getAuthorPidType(sp.getQualifier().getClassid());
|
||||
|
|
|
@ -6,15 +6,13 @@ import org.apache.solr.client.solrj.response.UpdateResponse;
|
|||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import junit.framework.Assert;
|
||||
|
||||
public class SolrAdminApplicationTest extends SolrTest {
|
||||
|
||||
@Test
|
||||
public void testPing() throws Exception {
|
||||
SolrPingResponse pingResponse = miniCluster.getSolrClient().ping();
|
||||
log.info("pingResponse: '{}'", pingResponse.getStatus());
|
||||
Assert.assertTrue(pingResponse.getStatus() == 0);
|
||||
Assertions.assertTrue(pingResponse.getStatus() == 0);
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
|
@ -5,71 +5,42 @@ import static org.junit.jupiter.api.Assertions.*;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.DocumentException;
|
||||
import org.dom4j.io.SAXReader;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.mockito.Mock;
|
||||
|
||||
import com.fasterxml.jackson.databind.DeserializationFeature;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.oa.provision.model.JoinedEntity;
|
||||
import eu.dnetlib.dhp.oa.provision.utils.ContextMapper;
|
||||
import eu.dnetlib.dhp.oa.provision.utils.XmlRecordFactory;
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
||||
import eu.dnetlib.dhp.schema.oaf.OafMapperUtils;
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
|
||||
//TODO to enable it we need to update the joined_entity.json test file
|
||||
//@Disabled
|
||||
public class XmlRecordFactoryTest {
|
||||
|
||||
private static final String otherDsTypeId = "scholarcomminfra,infospace,pubsrepository::mock,entityregistry,entityregistry::projects,entityregistry::repositories,websource";
|
||||
|
||||
private static ObjectMapper OBJECT_MAPPER = new ObjectMapper()
|
||||
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
||||
|
||||
@Test
|
||||
public void testXMLRecordFactory() throws IOException, DocumentException {
|
||||
|
||||
String json = IOUtils.toString(getClass().getResourceAsStream("joined_entity.json"));
|
||||
|
||||
assertNotNull(json);
|
||||
JoinedEntity je = new ObjectMapper().readValue(json, JoinedEntity.class);
|
||||
assertNotNull(je);
|
||||
|
||||
Document doc = buildXml(je);
|
||||
//// TODO specific test assertion on doc
|
||||
}
|
||||
|
||||
@Test
|
||||
void testBologna() throws IOException, DocumentException {
|
||||
final String json = IOUtils.toString(getClass().getResourceAsStream("oaf-bologna.json"));
|
||||
Publication oaf = new ObjectMapper().readValue(json, Publication.class);
|
||||
assertNotNull(oaf);
|
||||
JoinedEntity je = new JoinedEntity();
|
||||
je.setEntity(oaf);
|
||||
assertNotNull(je);
|
||||
|
||||
Document doc = buildXml(je);
|
||||
// TODO specific test assertion on doc
|
||||
|
||||
System.out.println(doc.asXML());
|
||||
|
||||
}
|
||||
|
||||
private Document buildXml(JoinedEntity je) throws DocumentException {
|
||||
ContextMapper contextMapper = new ContextMapper();
|
||||
|
||||
XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false, XmlConverterJob.schemaLocation,
|
||||
otherDsTypeId);
|
||||
|
||||
String xml = xmlRecordFactory.build(je);
|
||||
Publication p = OBJECT_MAPPER
|
||||
.readValue(IOUtils.toString(getClass().getResourceAsStream("publication.json")), Publication.class);
|
||||
|
||||
String xml = xmlRecordFactory.build(new JoinedEntity<>(p));
|
||||
|
||||
assertNotNull(xml);
|
||||
|
||||
|
@ -77,8 +48,14 @@ public class XmlRecordFactoryTest {
|
|||
|
||||
assertNotNull(doc);
|
||||
|
||||
// TODO add assertions based of values extracted from the XML record
|
||||
System.out.println(doc.asXML());
|
||||
|
||||
return doc;
|
||||
Assertions.assertEquals("0000-0001-9613-6638", doc.valueOf("//creator[@rank = '1']/@orcid"));
|
||||
Assertions.assertEquals("0000-0001-9613-6639", doc.valueOf("//creator[@rank = '1']/@orcid_pending"));
|
||||
|
||||
Assertions.assertEquals("0000-0001-9613-9956", doc.valueOf("//creator[@rank = '2']/@orcid"));
|
||||
Assertions.assertEquals("", doc.valueOf("//creator[@rank = '2']/@orcid_pending"));
|
||||
|
||||
// TODO add assertions based of values extracted from the XML record
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,820 @@
|
|||
{
|
||||
"author": [
|
||||
{
|
||||
"affiliation": [],
|
||||
"fullname": "Lee, Jaehyun",
|
||||
"name": "Jaehyun",
|
||||
"pid": [
|
||||
{
|
||||
"qualifier": {
|
||||
"classid": "orcid",
|
||||
"classname": "Open Researcher and Contributor ID",
|
||||
"schemeid": "dnet:pid_types",
|
||||
"schemename": "dnet:pid_types"
|
||||
},
|
||||
"value": "0000-0001-9613-6638"
|
||||
},
|
||||
{
|
||||
"qualifier": {
|
||||
"classid": "orcid_pending",
|
||||
"classname": "Open Researcher and Contributor ID",
|
||||
"schemeid": "dnet:pid_types",
|
||||
"schemename": "dnet:pid_types"
|
||||
},
|
||||
"value": "0000-0001-9613-6639"
|
||||
}
|
||||
],
|
||||
"rank": 1,
|
||||
"surname": "Lee"
|
||||
},
|
||||
{
|
||||
"affiliation": [],
|
||||
"fullname": "Berrada, Salim",
|
||||
"name": "Salim",
|
||||
"pid": [
|
||||
{
|
||||
"qualifier": {
|
||||
"classid": "orcid",
|
||||
"classname": "Open Researcher and Contributor ID",
|
||||
"schemeid": "dnet:pid_types",
|
||||
"schemename": "dnet:pid_types"
|
||||
},
|
||||
"value": "0000-0001-9613-9956"
|
||||
},
|
||||
{
|
||||
"qualifier": {
|
||||
"classid": "orcid_pending",
|
||||
"classname": "Open Researcher and Contributor ID",
|
||||
"schemeid": "dnet:pid_types",
|
||||
"schemename": "dnet:pid_types"
|
||||
},
|
||||
"value": "0000-0001-9613-9956"
|
||||
}
|
||||
],
|
||||
"rank": 2,
|
||||
"surname": "Berrada"
|
||||
},
|
||||
{
|
||||
"affiliation": [],
|
||||
"fullname": "Adamu-Lema, Fikru",
|
||||
"name": "Fikru",
|
||||
"pid": [],
|
||||
"rank": 3,
|
||||
"surname": "Adamu-Lema"
|
||||
},
|
||||
{
|
||||
"affiliation": [],
|
||||
"fullname": "Nagy, Nicole",
|
||||
"name": "Nicole",
|
||||
"pid": [],
|
||||
"rank": 4,
|
||||
"surname": "Nagy"
|
||||
},
|
||||
{
|
||||
"affiliation": [],
|
||||
"fullname": "Georgiev, Vihar P.",
|
||||
"name": "Vihar P.",
|
||||
"pid": [],
|
||||
"rank": 5,
|
||||
"surname": "Georgiev"
|
||||
},
|
||||
{
|
||||
"affiliation": [],
|
||||
"fullname": "Sadi, Toufik",
|
||||
"name": "Toufik",
|
||||
"pid": [],
|
||||
"rank": 6,
|
||||
"surname": "Sadi"
|
||||
},
|
||||
{
|
||||
"affiliation": [],
|
||||
"fullname": "Liang, Jie",
|
||||
"name": "Jie",
|
||||
"pid": [],
|
||||
"rank": 7,
|
||||
"surname": "Liang"
|
||||
},
|
||||
{
|
||||
"affiliation": [],
|
||||
"fullname": "Ramos, Raphael",
|
||||
"name": "Raphael",
|
||||
"pid": [],
|
||||
"rank": 8,
|
||||
"surname": "Ramos"
|
||||
},
|
||||
{
|
||||
"affiliation": [],
|
||||
"fullname": "Carrillo-Nunez, Hamilton",
|
||||
"name": "Hamilton",
|
||||
"pid": [],
|
||||
"rank": 9,
|
||||
"surname": "Carrillo-Nunez"
|
||||
},
|
||||
{
|
||||
"affiliation": [],
|
||||
"fullname": "Kalita, Dipankar",
|
||||
"name": "Dipankar",
|
||||
"pid": [],
|
||||
"rank": 10,
|
||||
"surname": "Kalita"
|
||||
},
|
||||
{
|
||||
"affiliation": [],
|
||||
"fullname": "Lilienthal, Katharina",
|
||||
"name": "Katharina",
|
||||
"pid": [],
|
||||
"rank": 11,
|
||||
"surname": "Lilienthal"
|
||||
},
|
||||
{
|
||||
"affiliation": [],
|
||||
"fullname": "Wislicenus, Marcus",
|
||||
"name": "Marcus",
|
||||
"pid": [],
|
||||
"rank": 12,
|
||||
"surname": "Wislicenus"
|
||||
},
|
||||
{
|
||||
"affiliation": [],
|
||||
"fullname": "Pandey, Reeturaj",
|
||||
"name": "Reeturaj",
|
||||
"pid": [],
|
||||
"rank": 13,
|
||||
"surname": "Pandey"
|
||||
},
|
||||
{
|
||||
"affiliation": [],
|
||||
"fullname": "Chen, Bingan",
|
||||
"name": "Bingan",
|
||||
"pid": [],
|
||||
"rank": 14,
|
||||
"surname": "Chen"
|
||||
},
|
||||
{
|
||||
"affiliation": [],
|
||||
"fullname": "Teo, Kenneth B.K.",
|
||||
"name": "Kenneth B. K.",
|
||||
"pid": [],
|
||||
"rank": 15,
|
||||
"surname": "Teo"
|
||||
},
|
||||
{
|
||||
"affiliation": [],
|
||||
"fullname": "Goncalves, Goncalo",
|
||||
"name": "Goncalo",
|
||||
"pid": [],
|
||||
"rank": 16,
|
||||
"surname": "Goncalves"
|
||||
},
|
||||
{
|
||||
"affiliation": [],
|
||||
"fullname": "Okuno, Hanako",
|
||||
"name": "Hanako",
|
||||
"pid": [],
|
||||
"rank": 17,
|
||||
"surname": "Okuno"
|
||||
},
|
||||
{
|
||||
"affiliation": [],
|
||||
"fullname": "Uhlig, Benjamin",
|
||||
"name": "Benjamin",
|
||||
"pid": [],
|
||||
"rank": 18,
|
||||
"surname": "Uhlig"
|
||||
},
|
||||
{
|
||||
"affiliation": [],
|
||||
"fullname": "Todri-Sanial, Aida",
|
||||
"name": "Aida",
|
||||
"pid": [],
|
||||
"rank": 19,
|
||||
"surname": "Todri-Sanial"
|
||||
},
|
||||
{
|
||||
"affiliation": [],
|
||||
"fullname": "Dijon",
|
||||
"name": "",
|
||||
"pid": [],
|
||||
"rank": 20,
|
||||
"surname": ""
|
||||
},
|
||||
{
|
||||
"affiliation": [],
|
||||
"fullname": "Jean",
|
||||
"name": "",
|
||||
"pid": [],
|
||||
"rank": 21,
|
||||
"surname": ""
|
||||
}
|
||||
],
|
||||
"collectedfrom": [
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "",
|
||||
"classname": "",
|
||||
"schemeid": "",
|
||||
"schemename": ""
|
||||
},
|
||||
"trust": ""
|
||||
},
|
||||
"key": "10|CSC_________::a2b9ce8435390bcbfc05f3cae3948747",
|
||||
"value": "VIRTA"
|
||||
}
|
||||
],
|
||||
"context": [],
|
||||
"contributor": [],
|
||||
"country": [],
|
||||
"coverage": [],
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "sysimport:crosswalk:datasetarchive",
|
||||
"classname": "sysimport:crosswalk:datasetarchive",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"trust": "0.9"
|
||||
},
|
||||
"dateofacceptance": {
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "",
|
||||
"classname": "",
|
||||
"schemeid": "",
|
||||
"schemename": ""
|
||||
},
|
||||
"trust": ""
|
||||
},
|
||||
"value": "2018-01-01"
|
||||
},
|
||||
"dateofcollection": "2020-01-27T11:32:33.729Z",
|
||||
"dateoftransformation": "2020-01-27T12:03:59.662Z",
|
||||
"description": [],
|
||||
"embargoenddate": {
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "",
|
||||
"classname": "",
|
||||
"schemeid": "",
|
||||
"schemename": ""
|
||||
},
|
||||
"trust": ""
|
||||
},
|
||||
"value": ""
|
||||
},
|
||||
"extraInfo": [],
|
||||
"format": [],
|
||||
"fulltext": [],
|
||||
"id": "50|CSC_________::0000ec4dd9df012feaafa77e71a0fb4c",
|
||||
"instance": [
|
||||
{
|
||||
"accessright": {
|
||||
"classid": "OPEN",
|
||||
"classname": "Open Access",
|
||||
"schemeid": "dnet:access_modes",
|
||||
"schemename": "dnet:access_modes"
|
||||
},
|
||||
"collectedfrom": {
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "",
|
||||
"classname": "",
|
||||
"schemeid": "",
|
||||
"schemename": ""
|
||||
},
|
||||
"trust": ""
|
||||
},
|
||||
"key": "10|CSC_________::a2b9ce8435390bcbfc05f3cae3948747",
|
||||
"value": "VIRTA"
|
||||
},
|
||||
"dateofacceptance": {
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "",
|
||||
"classname": "",
|
||||
"schemeid": "",
|
||||
"schemename": ""
|
||||
},
|
||||
"trust": ""
|
||||
},
|
||||
"value": "2018-01-01"
|
||||
},
|
||||
"distributionlocation": "",
|
||||
"hostedby": {
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "",
|
||||
"classname": "",
|
||||
"schemeid": "",
|
||||
"schemename": ""
|
||||
},
|
||||
"trust": ""
|
||||
},
|
||||
"key": "10|CSC_________::a2b9ce8435390bcbfc05f3cae3948747",
|
||||
"value": "VIRTA"
|
||||
},
|
||||
"instancetype": {
|
||||
"classid": "0001",
|
||||
"classname": "Article",
|
||||
"schemeid": "dnet:dataCite_resource",
|
||||
"schemename": "dnet:dataCite_resource"
|
||||
},
|
||||
"license": {
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "",
|
||||
"classname": "",
|
||||
"schemeid": "",
|
||||
"schemename": ""
|
||||
},
|
||||
"trust": ""
|
||||
},
|
||||
"value": ""
|
||||
},
|
||||
"url": [
|
||||
"http://juuli.fi/Record/0331473718",
|
||||
"http://dx.doi.org/10.1109/TED.2018.2853550"
|
||||
]
|
||||
}
|
||||
],
|
||||
"journal": {
|
||||
"conferencedate": "",
|
||||
"conferenceplace": "",
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "",
|
||||
"classname": "",
|
||||
"schemeid": "",
|
||||
"schemename": ""
|
||||
},
|
||||
"trust": ""
|
||||
},
|
||||
"edition": "",
|
||||
"ep": "3892",
|
||||
"iss": "9",
|
||||
"issnLinking": "",
|
||||
"issnOnline": "",
|
||||
"issnPrinted": "0018-9383",
|
||||
"name": "IEEE Transactions on Electron Devices",
|
||||
"sp": "3884",
|
||||
"vol": "65"
|
||||
},
|
||||
"language": {
|
||||
"classid": "en",
|
||||
"classname": "en",
|
||||
"schemeid": "dnet:languages",
|
||||
"schemename": "dnet:languages"
|
||||
},
|
||||
"lastupdatetimestamp": 0,
|
||||
"originalId": [
|
||||
"0331473718",
|
||||
"10.1109/TED.2018.2853550",
|
||||
"http://juuli.fi/Record/0331473718"
|
||||
],
|
||||
"pid": [
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "",
|
||||
"classname": "",
|
||||
"schemeid": "",
|
||||
"schemename": ""
|
||||
},
|
||||
"trust": ""
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "doi",
|
||||
"classname": "doi",
|
||||
"schemeid": "dnet:pid_types",
|
||||
"schemename": "dnet:pid_types"
|
||||
},
|
||||
"value": "10.1109/TED.2018.2853550"
|
||||
}
|
||||
],
|
||||
"publisher": {
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "",
|
||||
"classname": "",
|
||||
"schemeid": "",
|
||||
"schemename": ""
|
||||
},
|
||||
"trust": ""
|
||||
},
|
||||
"value": ""
|
||||
},
|
||||
"relevantdate": [],
|
||||
"resourcetype": {
|
||||
"classid": "0001",
|
||||
"classname": "Article",
|
||||
"schemeid": "dnet:dataCite_resource",
|
||||
"schemename": "dnet:dataCite_resource"
|
||||
},
|
||||
"resulttype": {
|
||||
"classid": "publication",
|
||||
"classname": "publication",
|
||||
"schemeid": "dnet:result_typologies",
|
||||
"schemename": "dnet:result_typologies"
|
||||
},
|
||||
"source": [],
|
||||
"subject": [
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "",
|
||||
"classname": "",
|
||||
"schemeid": "",
|
||||
"schemename": ""
|
||||
},
|
||||
"trust": ""
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "http://finto.fi/okm-tieteenala/en/",
|
||||
"classname": "finto",
|
||||
"schemeid": "dnet:subject_classification_typologies",
|
||||
"schemename": "dnet:subject_classification_typologies"
|
||||
},
|
||||
"value": "ta114"
|
||||
},
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "",
|
||||
"classname": "",
|
||||
"schemeid": "",
|
||||
"schemename": ""
|
||||
},
|
||||
"trust": ""
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "keyword",
|
||||
"classname": "keyword",
|
||||
"schemeid": "dnet:subject_classification_typologies",
|
||||
"schemename": "dnet:subject_classification_typologies"
|
||||
},
|
||||
"value": "Conductivity"
|
||||
},
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "",
|
||||
"classname": "",
|
||||
"schemeid": "",
|
||||
"schemename": ""
|
||||
},
|
||||
"trust": ""
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "keyword",
|
||||
"classname": "keyword",
|
||||
"schemeid": "dnet:subject_classification_typologies",
|
||||
"schemename": "dnet:subject_classification_typologies"
|
||||
},
|
||||
"value": "Contacts"
|
||||
},
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "",
|
||||
"classname": "",
|
||||
"schemeid": "",
|
||||
"schemename": ""
|
||||
},
|
||||
"trust": ""
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "keyword",
|
||||
"classname": "keyword",
|
||||
"schemeid": "dnet:subject_classification_typologies",
|
||||
"schemename": "dnet:subject_classification_typologies"
|
||||
},
|
||||
"value": "Cu-carbon nanotubes (CNT) composites"
|
||||
},
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "",
|
||||
"classname": "",
|
||||
"schemeid": "",
|
||||
"schemename": ""
|
||||
},
|
||||
"trust": ""
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "keyword",
|
||||
"classname": "keyword",
|
||||
"schemeid": "dnet:subject_classification_typologies",
|
||||
"schemename": "dnet:subject_classification_typologies"
|
||||
},
|
||||
"value": "density functional theory (DFT)"
|
||||
},
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "",
|
||||
"classname": "",
|
||||
"schemeid": "",
|
||||
"schemename": ""
|
||||
},
|
||||
"trust": ""
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "keyword",
|
||||
"classname": "keyword",
|
||||
"schemeid": "dnet:subject_classification_typologies",
|
||||
"schemename": "dnet:subject_classification_typologies"
|
||||
},
|
||||
"value": "Discrete Fourier transforms"
|
||||
},
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "",
|
||||
"classname": "",
|
||||
"schemeid": "",
|
||||
"schemename": ""
|
||||
},
|
||||
"trust": ""
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "keyword",
|
||||
"classname": "keyword",
|
||||
"schemeid": "dnet:subject_classification_typologies",
|
||||
"schemename": "dnet:subject_classification_typologies"
|
||||
},
|
||||
"value": "Electromigration"
|
||||
},
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "",
|
||||
"classname": "",
|
||||
"schemeid": "",
|
||||
"schemename": ""
|
||||
},
|
||||
"trust": ""
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "keyword",
|
||||
"classname": "keyword",
|
||||
"schemeid": "dnet:subject_classification_typologies",
|
||||
"schemename": "dnet:subject_classification_typologies"
|
||||
},
|
||||
"value": "electromigration (EM)"
|
||||
},
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "",
|
||||
"classname": "",
|
||||
"schemeid": "",
|
||||
"schemename": ""
|
||||
},
|
||||
"trust": ""
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "keyword",
|
||||
"classname": "keyword",
|
||||
"schemeid": "dnet:subject_classification_typologies",
|
||||
"schemename": "dnet:subject_classification_typologies"
|
||||
},
|
||||
"value": "electrothermal"
|
||||
},
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "",
|
||||
"classname": "",
|
||||
"schemeid": "",
|
||||
"schemename": ""
|
||||
},
|
||||
"trust": ""
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "keyword",
|
||||
"classname": "keyword",
|
||||
"schemeid": "dnet:subject_classification_typologies",
|
||||
"schemename": "dnet:subject_classification_typologies"
|
||||
},
|
||||
"value": "interconnects"
|
||||
},
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "",
|
||||
"classname": "",
|
||||
"schemeid": "",
|
||||
"schemename": ""
|
||||
},
|
||||
"trust": ""
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "keyword",
|
||||
"classname": "keyword",
|
||||
"schemeid": "dnet:subject_classification_typologies",
|
||||
"schemename": "dnet:subject_classification_typologies"
|
||||
},
|
||||
"value": "Lattices"
|
||||
},
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "",
|
||||
"classname": "",
|
||||
"schemeid": "",
|
||||
"schemename": ""
|
||||
},
|
||||
"trust": ""
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "keyword",
|
||||
"classname": "keyword",
|
||||
"schemeid": "dnet:subject_classification_typologies",
|
||||
"schemename": "dnet:subject_classification_typologies"
|
||||
},
|
||||
"value": "multiscale simulation"
|
||||
},
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "",
|
||||
"classname": "",
|
||||
"schemeid": "",
|
||||
"schemename": ""
|
||||
},
|
||||
"trust": ""
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "keyword",
|
||||
"classname": "keyword",
|
||||
"schemeid": "dnet:subject_classification_typologies",
|
||||
"schemename": "dnet:subject_classification_typologies"
|
||||
},
|
||||
"value": "Resistance"
|
||||
},
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "",
|
||||
"classname": "",
|
||||
"schemeid": "",
|
||||
"schemename": ""
|
||||
},
|
||||
"trust": ""
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "keyword",
|
||||
"classname": "keyword",
|
||||
"schemeid": "dnet:subject_classification_typologies",
|
||||
"schemename": "dnet:subject_classification_typologies"
|
||||
},
|
||||
"value": "self-heating."
|
||||
},
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "",
|
||||
"classname": "",
|
||||
"schemeid": "",
|
||||
"schemename": ""
|
||||
},
|
||||
"trust": ""
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "keyword",
|
||||
"classname": "keyword",
|
||||
"schemeid": "dnet:subject_classification_typologies",
|
||||
"schemename": "dnet:subject_classification_typologies"
|
||||
},
|
||||
"value": "Thermal conductivity"
|
||||
}
|
||||
],
|
||||
"title": [
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "",
|
||||
"classname": "",
|
||||
"schemeid": "",
|
||||
"schemename": ""
|
||||
},
|
||||
"trust": ""
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "main title",
|
||||
"classname": "main title",
|
||||
"schemeid": "dnet:dataCite_title",
|
||||
"schemename": "dnet:dataCite_title"
|
||||
},
|
||||
"value": "Understanding Electromigration in Cu-CNT Composite Interconnects A Multiscale Electrothermal Simulation Study"
|
||||
}
|
||||
]
|
||||
}
|
|
@ -0,0 +1,32 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<parent>
|
||||
<artifactId>dhp-workflows</artifactId>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<version>1.2.4-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<artifactId>dhp-stats-promote</artifactId>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-core_2.11</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-sql_2.11</artifactId>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>pl.project13.maven</groupId>
|
||||
<artifactId>git-commit-id-plugin</artifactId>
|
||||
<version>2.1.11</version>
|
||||
<configuration>
|
||||
<failOnNoGitDirectory>false</failOnNoGitDirectory>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
</project>
|
|
@ -0,0 +1,34 @@
|
|||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>${jobTracker}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>${nameNode}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hive_metastore_uris</name>
|
||||
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hive_jdbc_url</name>
|
||||
<value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.wf.workflow.notification.url</name>
|
||||
<value>{serviceUrl}/v1/oozieNotification/jobUpdate?jobId=$jobId%26status=$status</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>stats_tool_api_url</name>
|
||||
<value>${stats_tool_api_url}</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -0,0 +1,18 @@
|
|||
export PYTHON_EGG_CACHE=/home/$(whoami)/.python-eggs
|
||||
export link_folder=/tmp/impala-shell-python-egg-cache-$(whoami)
|
||||
if ! [ -L $link_folder ]
|
||||
then
|
||||
rm -Rf "$link_folder"
|
||||
ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
|
||||
fi
|
||||
|
||||
echo "Getting file from " $3
|
||||
hdfs dfs -copyToLocal $3
|
||||
|
||||
echo "Running impala shell make the new database visible"
|
||||
impala-shell -q "INVALIDATE METADATA;"
|
||||
|
||||
echo "Running impala shell to compute new table stats"
|
||||
impala-shell -d $1 -f $2
|
||||
echo "Impala shell finished"
|
||||
rm $2
|
|
@ -0,0 +1,4 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
curl --request GET $1/cache/promoteCache
|
||||
|
|
@ -0,0 +1,8 @@
|
|||
------------------------------------------------------
|
||||
------------------------------------------------------
|
||||
-- Impala table statistics - Needed to make the tables
|
||||
-- visible for impala
|
||||
------------------------------------------------------
|
||||
------------------------------------------------------
|
||||
|
||||
INVALIDATE METADATA ${stats_db_name};
|
|
@ -0,0 +1,207 @@
|
|||
------------------------------------------------------
|
||||
------------------------------------------------------
|
||||
-- Shadow schema table exchange
|
||||
------------------------------------------------------
|
||||
------------------------------------------------------
|
||||
|
||||
-- Dropping old views
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.category;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.concept;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.context;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.country;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.countrygdp;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.creation_date;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.dataset;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_citations;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_classifications;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_concepts;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_datasources;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_languages;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_licenses;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_oids;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_pids;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_refereed;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_sources;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.dataset_topics;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.datasource;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.datasource_languages;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.datasource_oids;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.datasource_organizations;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.datasource_results;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.datasource_sources;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.funder;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.fundref;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.numbers_country;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.organization;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.organization_datasources;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.organization_pids;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.organization_projects;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.organization_sources;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_citations;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_classifications;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_concepts;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_datasources;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_languages;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_licenses;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_oids;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_pids;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_refereed;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_sources;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.otherresearchproduct_topics;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.project;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.project_oids;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.project_organizations;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.project_results;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.project_resultcount;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.project_results_publication;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.publication;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.publication_citations;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.publication_classifications;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.publication_concepts;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.publication_datasources;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.publication_languages;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.publication_licenses;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.publication_oids;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.publication_pids;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.publication_refereed;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.publication_sources;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.publication_topics;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.result;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.result_affiliated_country;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.result_citations;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.result_classifications;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.result_concepts;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.result_datasources;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.result_deposited_country;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.result_fundercount;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.result_gold;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.result_greenoa;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.result_languages;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.result_licenses;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.result_oids;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.result_organization;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.result_peerreviewed;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.result_pids;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.result_projectcount;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.result_projects;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.result_refereed;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.result_sources;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.result_topics;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.rndexpediture;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.roarmap;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.software;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.software_citations;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.software_classifications;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.software_concepts;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.software_datasources;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.software_languages;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.software_licenses;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.software_oids;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.software_pids;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.software_refereed;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.software_sources;
|
||||
DROP VIEW IF EXISTS ${stats_db_production_name}.software_topics;
|
||||
|
||||
|
||||
-- Creating the shadow database, in case it doesn't exist
|
||||
CREATE database IF NOT EXISTS ${stats_db_production_name};
|
||||
|
||||
-- Creating new views
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.category AS SELECT * FROM ${stats_db_name}.category;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.concept AS SELECT * FROM ${stats_db_name}.concept;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.context AS SELECT * FROM ${stats_db_name}.context;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.country AS SELECT * FROM ${stats_db_name}.country;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.countrygdp AS SELECT * FROM ${stats_db_name}.countrygdp;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.creation_date AS SELECT * FROM ${stats_db_name}.creation_date;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset AS SELECT * FROM ${stats_db_name}.dataset;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_citations AS SELECT * FROM ${stats_db_name}.dataset_citations;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_classifications AS SELECT * FROM ${stats_db_name}.dataset_classifications;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_concepts AS SELECT * FROM ${stats_db_name}.dataset_concepts;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_datasources AS SELECT * FROM ${stats_db_name}.dataset_datasources;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_languages AS SELECT * FROM ${stats_db_name}.dataset_languages;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_licenses AS SELECT * FROM ${stats_db_name}.dataset_licenses;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_oids AS SELECT * FROM ${stats_db_name}.dataset_oids;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_pids AS SELECT * FROM ${stats_db_name}.dataset_pids;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_refereed AS SELECT * FROM ${stats_db_name}.dataset_refereed;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_sources AS SELECT * FROM ${stats_db_name}.dataset_sources;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.dataset_topics AS SELECT * FROM ${stats_db_name}.dataset_topics;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.datasource AS SELECT * FROM ${stats_db_name}.datasource;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.datasource_languages AS SELECT * FROM ${stats_db_name}.datasource_languages;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.datasource_oids AS SELECT * FROM ${stats_db_name}.datasource_oids;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.datasource_organizations AS SELECT * FROM ${stats_db_name}.datasource_organizations;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.datasource_results AS SELECT * FROM ${stats_db_name}.datasource_results;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.datasource_sources AS SELECT * FROM ${stats_db_name}.datasource_sources;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.funder AS SELECT * FROM ${stats_db_name}.funder;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.fundref AS SELECT * FROM ${stats_db_name}.fundref;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.numbers_country AS SELECT * FROM ${stats_db_name}.numbers_country;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.organization AS SELECT * FROM ${stats_db_name}.organization;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.organization_datasources AS SELECT * FROM ${stats_db_name}.organization_datasources;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.organization_pids AS SELECT * FROM ${stats_db_name}.organization_pids;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.organization_projects AS SELECT * FROM ${stats_db_name}.organization_projects;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.organization_sources AS SELECT * FROM ${stats_db_name}.organization_sources;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct AS SELECT * FROM ${stats_db_name}.otherresearchproduct;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_citations AS SELECT * FROM ${stats_db_name}.otherresearchproduct_citations;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_classifications AS SELECT * FROM ${stats_db_name}.otherresearchproduct_classifications;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_concepts AS SELECT * FROM ${stats_db_name}.otherresearchproduct_concepts;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_datasources AS SELECT * FROM ${stats_db_name}.otherresearchproduct_datasources;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_languages AS SELECT * FROM ${stats_db_name}.otherresearchproduct_languages;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_licenses AS SELECT * FROM ${stats_db_name}.otherresearchproduct_licenses;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_oids AS SELECT * FROM ${stats_db_name}.otherresearchproduct_oids;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_pids AS SELECT * FROM ${stats_db_name}.otherresearchproduct_pids;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_refereed AS SELECT * FROM ${stats_db_name}.otherresearchproduct_refereed;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_sources AS SELECT * FROM ${stats_db_name}.otherresearchproduct_sources;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.otherresearchproduct_topics AS SELECT * FROM ${stats_db_name}.otherresearchproduct_topics;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.project AS SELECT * FROM ${stats_db_name}.project;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.project_oids AS SELECT * FROM ${stats_db_name}.project_oids;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.project_organizations AS SELECT * FROM ${stats_db_name}.project_organizations;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.project_results AS SELECT * FROM ${stats_db_name}.project_results;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.project_resultcount AS SELECT * FROM ${stats_db_name}.project_resultcount;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.project_results_publication AS SELECT * FROM ${stats_db_name}.project_results_publication;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication AS SELECT * FROM ${stats_db_name}.publication;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_citations AS SELECT * FROM ${stats_db_name}.publication_citations;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_classifications AS SELECT * FROM ${stats_db_name}.publication_classifications;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_concepts AS SELECT * FROM ${stats_db_name}.publication_concepts;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_datasources AS SELECT * FROM ${stats_db_name}.publication_datasources;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_languages AS SELECT * FROM ${stats_db_name}.publication_languages;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_licenses AS SELECT * FROM ${stats_db_name}.publication_licenses;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_oids AS SELECT * FROM ${stats_db_name}.publication_oids;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_pids AS SELECT * FROM ${stats_db_name}.publication_pids;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_refereed AS SELECT * FROM ${stats_db_name}.publication_refereed;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_sources AS SELECT * FROM ${stats_db_name}.publication_sources;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.publication_topics AS SELECT * FROM ${stats_db_name}.publication_topics;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result AS SELECT * FROM ${stats_db_name}.result;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_affiliated_country AS SELECT * FROM ${stats_db_name}.result_affiliated_country;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_citations AS SELECT * FROM ${stats_db_name}.result_citations;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_classifications AS SELECT * FROM ${stats_db_name}.result_classifications;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_concepts AS SELECT * FROM ${stats_db_name}.result_concepts;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_datasources AS SELECT * FROM ${stats_db_name}.result_datasources;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_deposited_country AS SELECT * FROM ${stats_db_name}.result_deposited_country;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_fundercount AS SELECT * FROM ${stats_db_name}.result_fundercount;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_gold AS SELECT * FROM ${stats_db_name}.result_gold;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_greenoa AS SELECT * FROM ${stats_db_name}.result_greenoa;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_languages AS SELECT * FROM ${stats_db_name}.result_languages;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_licenses AS SELECT * FROM ${stats_db_name}.result_licenses;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_oids AS SELECT * FROM ${stats_db_name}.result_oids;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_organization AS SELECT * FROM ${stats_db_name}.result_organization;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_peerreviewed AS SELECT * FROM ${stats_db_name}.result_peerreviewed;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_pids AS SELECT * FROM ${stats_db_name}.result_pids;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_projectcount AS SELECT * FROM ${stats_db_name}.result_projectcount;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_projects AS SELECT * FROM ${stats_db_name}.result_projects;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_refereed AS SELECT * FROM ${stats_db_name}.result_refereed;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_sources AS SELECT * FROM ${stats_db_name}.result_sources;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.result_topics AS SELECT * FROM ${stats_db_name}.result_topics;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.rndexpediture AS SELECT * FROM ${stats_db_name}.rndexpediture;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.roarmap AS SELECT * FROM ${stats_db_name}.roarmap;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software AS SELECT * FROM ${stats_db_name}.software;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_citations AS SELECT * FROM ${stats_db_name}.software_citations;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_classifications AS SELECT * FROM ${stats_db_name}.software_classifications;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_concepts AS SELECT * FROM ${stats_db_name}.software_concepts;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_datasources AS SELECT * FROM ${stats_db_name}.software_datasources;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_languages AS SELECT * FROM ${stats_db_name}.software_languages;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_licenses AS SELECT * FROM ${stats_db_name}.software_licenses;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_oids AS SELECT * FROM ${stats_db_name}.software_oids;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_pids AS SELECT * FROM ${stats_db_name}.software_pids;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_refereed AS SELECT * FROM ${stats_db_name}.software_refereed;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_sources AS SELECT * FROM ${stats_db_name}.software_sources;
|
||||
CREATE VIEW IF NOT EXISTS ${stats_db_production_name}.software_topics AS SELECT * FROM ${stats_db_name}.software_topics;
|
|
@ -0,0 +1,87 @@
|
|||
<workflow-app name="Graph Stats" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>stats_db_name</name>
|
||||
<description>the target stats database name</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>stats_db_production_name</name>
|
||||
<description>the name of the production schema</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>stats_tool_api_url</name>
|
||||
<description>The url of the API of the stats tool. Is used to trigger the cache promote.</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>hive_metastore_uris</name>
|
||||
<description>hive server metastore URIs</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>hive_jdbc_url</name>
|
||||
<description>hive server jdbc url</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>hive_timeout</name>
|
||||
<description>the time period, in seconds, after which Hive fails a transaction if a Hive client has not sent a hearbeat. The default value is 300 seconds.</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<global>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>hive.metastore.uris</name>
|
||||
<value>${hive_metastore_uris}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hive.txn.timeout</name>
|
||||
<value>${hive_timeout}</value>
|
||||
</property>
|
||||
</configuration>
|
||||
</global>
|
||||
|
||||
<start to="updateProductionViews"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="updateProductionViews">
|
||||
<hive2 xmlns="uri:oozie:hive2-action:0.1">
|
||||
<jdbc-url>${hive_jdbc_url}</jdbc-url>
|
||||
<script>scripts/updateProductionViews.sql</script>
|
||||
<param>stats_db_name=${stats_db_name}</param>
|
||||
<param>stats_db_production_name=${stats_db_production_name}</param>
|
||||
</hive2>
|
||||
<ok to="computeProductionStats"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="computeProductionStats">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>impala-shell.sh</exec>
|
||||
<argument>${stats_db_production_name}</argument>
|
||||
<argument>computeProductionStats.sql</argument>
|
||||
<argument>${wf:appPath()}/scripts/computeProductionStats.sql</argument>
|
||||
<file>impala-shell.sh</file>
|
||||
</shell>
|
||||
<ok to="promoteCache"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="promoteCache">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>promoteCache.sh</exec>
|
||||
<argument>${stats_tool_api_url}</argument>
|
||||
<file>promoteCache.sh</file>
|
||||
</shell>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -27,4 +27,8 @@
|
|||
<name>oozie.wf.workflow.notification.url</name>
|
||||
<value>{serviceUrl}/v1/oozieNotification/jobUpdate?jobId=$jobId%26status=$status</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>stats_tool_api_url</name>
|
||||
<value>${stats_tool_api_url}</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -11,7 +11,7 @@ CREATE TABLE ${stats_db_name}.project_oids AS SELECT substr(p.id, 4) AS id, oids
|
|||
|
||||
-- Project_organizations Table
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.project_organizations;
|
||||
CREATE TABLE ${stats_db_name}.project_organizations AS SELECT substr(r.source, 4) AS id, substr(r.target, 4) AS organization from ${openaire_db_name}.relation r WHERE r.reltype='projectOrganization';
|
||||
CREATE TABLE ${stats_db_name}.project_organizations AS SELECT substr(r.source, 4) AS id, substr(r.target, 4) AS organization from ${openaire_db_name}.relation r WHERE r.reltype='projectOrganization' and r.datainfo.deletedbyinference=false;
|
||||
|
||||
-- Project_results Table
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.project_results;
|
||||
|
|
|
@ -25,7 +25,7 @@ CREATE OR REPLACE VIEW ${stats_db_name}.result_pids AS SELECT * FROM ${stats_db_
|
|||
CREATE OR REPLACE VIEW ${stats_db_name}.result_topics AS SELECT * FROM ${stats_db_name}.publication_topics UNION ALL SELECT * FROM ${stats_db_name}.software_topics UNION ALL SELECT * FROM ${stats_db_name}.dataset_topics UNION ALL SELECT * FROM ${stats_db_name}.otherresearchproduct_topics;
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.result_organization;
|
||||
CREATE TABLE ${stats_db_name}.result_organization AS SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization FROM ${openaire_db_name}.relation r WHERE r.reltype='resultOrganization';
|
||||
CREATE TABLE ${stats_db_name}.result_organization AS SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization FROM ${openaire_db_name}.relation r WHERE r.reltype='resultOrganization' and r.datainfo.deletedbyinference=false;
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.result_projects;
|
||||
CREATE TABLE ${stats_db_name}.result_projects AS select pr.result AS id, pr.id AS project, datediff(p.enddate, p.startdate) AS daysfromend FROM ${stats_db_name}.result r JOIN ${stats_db_name}.project_results pr ON r.id=pr.result JOIN ${stats_db_name}.project_tmp p ON p.id=pr.id;
|
||||
|
|
|
@ -47,7 +47,7 @@ DROP TABLE IF EXISTS ${stats_db_name}.datasource_oids;
|
|||
CREATE TABLE ${stats_db_name}.datasource_oids AS SELECT substr(d.id, 4) AS id, oids.ids AS oid FROM ${openaire_db_name}.datasource d LATERAL VIEW explode(d.originalid) oids AS ids;
|
||||
|
||||
DROP TABLE IF EXISTS ${stats_db_name}.datasource_organizations;
|
||||
CREATE TABLE ${stats_db_name}.datasource_organizations AS SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization FROM ${openaire_db_name}.relation r WHERE r.reltype='datasourceOrganization';
|
||||
CREATE TABLE ${stats_db_name}.datasource_organizations AS SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization FROM ${openaire_db_name}.relation r WHERE r.reltype='datasourceOrganization' and r.datainfo.deletedbyinference=false;
|
||||
|
||||
-- datasource sources:
|
||||
-- where the datasource info have been collected from.
|
||||
|
|
|
@ -0,0 +1,4 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
curl --request GET $1/cache/updateCache
|
||||
|
|
@ -17,6 +17,10 @@
|
|||
<name>stats_db_shadow_name</name>
|
||||
<description>the name of the shadow schema</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>stats_tool_api_url</name>
|
||||
<description>The url of the API of the stats tool. Is used to trigger the cache update.</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>hive_metastore_uris</name>
|
||||
<description>hive server metastore URIs</description>
|
||||
|
@ -255,7 +259,7 @@
|
|||
<action name="Step17">
|
||||
<hive2 xmlns="uri:oozie:hive2-action:0.1">
|
||||
<jdbc-url>${hive_jdbc_url}</jdbc-url>
|
||||
<script>scripts/step17.sql</script>
|
||||
<script>scripts/updateProductionViews.sql</script>
|
||||
<param>stats_db_name=${stats_db_name}</param>
|
||||
<param>stats_db_shadow_name=${stats_db_shadow_name}</param>
|
||||
</hive2>
|
||||
|
@ -283,10 +287,22 @@
|
|||
<name-node>${nameNode}</name-node>
|
||||
<exec>impala-shell.sh</exec>
|
||||
<argument>${stats_db_shadow_name}</argument>
|
||||
<argument>step19.sql</argument>
|
||||
<argument>${wf:appPath()}/scripts/step19.sql</argument>
|
||||
<argument>computeProductionStats.sql</argument>
|
||||
<argument>${wf:appPath()}/scripts/computeProductionStats.sql</argument>
|
||||
<file>impala-shell.sh</file>
|
||||
</shell>
|
||||
<ok to="Step20"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="Step20">
|
||||
<shell xmlns="uri:oozie:shell-action:0.1">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<exec>updateCache.sh</exec>
|
||||
<argument>${stats_tool_api_url}</argument>
|
||||
<file>updateCache.sh</file>
|
||||
</shell>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
|
|
@ -7,7 +7,7 @@
|
|||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp</artifactId>
|
||||
<version>1.2.4-SNAPSHOT</version>
|
||||
<relativePath>../</relativePath>
|
||||
<relativePath>../pom.xml</relativePath>
|
||||
</parent>
|
||||
|
||||
<artifactId>dhp-workflows</artifactId>
|
||||
|
|
12
pom.xml
12
pom.xml
|
@ -278,12 +278,12 @@
|
|||
<dependency>
|
||||
<groupId>org.apache.httpcomponents</groupId>
|
||||
<artifactId>httpclient</artifactId>
|
||||
<version>4.5.3</version>
|
||||
<version>${org.apache.httpcomponents.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.httpcomponents</groupId>
|
||||
<artifactId>httpmime</artifactId>
|
||||
<version>4.5.3</version>
|
||||
<version>${org.apache.httpcomponents.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.noggit</groupId>
|
||||
|
@ -484,12 +484,6 @@
|
|||
<version>${common.text.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.httpcomponents</groupId>
|
||||
<artifactId>httpclient</artifactId>
|
||||
<version>${org.apache.httpcomponents.version}</version>
|
||||
</dependency>
|
||||
|
||||
</dependencies>
|
||||
</dependencyManagement>
|
||||
|
||||
|
@ -719,7 +713,7 @@
|
|||
<common.csv.version>1.8</common.csv.version>
|
||||
<apache.poi.version>4.1.2</apache.poi.version>
|
||||
<common.text.version>1.8</common.text.version>
|
||||
<org.apache.httpcomponents.version>4.3.4</org.apache.httpcomponents.version>
|
||||
<org.apache.httpcomponents.version>4.5.3</org.apache.httpcomponents.version>
|
||||
<net.alchim31.maven.version>4.0.1</net.alchim31.maven.version>
|
||||
</properties>
|
||||
</project>
|
||||
|
|
Loading…
Reference in New Issue