forked from D-Net/dnet-hadoop
Merge branch 'master' of https://code-repo.d4science.org/D-Net/dnet-hadoop into orcid-no-doi
This commit is contained in:
commit
c238561001
|
@ -0,0 +1,30 @@
|
|||
|
||||
package eu.dnetlib.dhp.common;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import com.google.common.collect.Maps;
|
||||
|
||||
public class Constants {
|
||||
|
||||
public static final Map<String, String> accessRightsCoarMap = Maps.newHashMap();
|
||||
public static final Map<String, String> coarCodeLabelMap = Maps.newHashMap();
|
||||
|
||||
public static String COAR_ACCESS_RIGHT_SCHEMA = "http://vocabularies.coar-repositories.org/documentation/access_rights/";
|
||||
|
||||
static {
|
||||
accessRightsCoarMap.put("OPEN", "c_abf2");
|
||||
accessRightsCoarMap.put("RESTRICTED", "c_16ec");
|
||||
accessRightsCoarMap.put("OPEN SOURCE", "c_abf2");
|
||||
accessRightsCoarMap.put("CLOSED", "c_14cb");
|
||||
accessRightsCoarMap.put("EMBARGO", "c_f1cf");
|
||||
}
|
||||
|
||||
static {
|
||||
coarCodeLabelMap.put("c_abf2", "OPEN");
|
||||
coarCodeLabelMap.put("c_16ec", "RESTRICTED");
|
||||
coarCodeLabelMap.put("c_14cb", "CLOSED");
|
||||
coarCodeLabelMap.put("c_f1cf", "EMBARGO");
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,412 @@
|
|||
|
||||
package eu.dnetlib.dhp.common;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.dump.oaf.*;
|
||||
import eu.dnetlib.dhp.schema.dump.oaf.community.CommunityInstance;
|
||||
import eu.dnetlib.dhp.schema.dump.oaf.community.CommunityResult;
|
||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Field;
|
||||
import eu.dnetlib.dhp.schema.oaf.Journal;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
|
||||
public class GraphResultMapper implements Serializable {
|
||||
|
||||
public static <E extends eu.dnetlib.dhp.schema.oaf.OafEntity> Result map(
|
||||
E in) {
|
||||
|
||||
CommunityResult out = new CommunityResult();
|
||||
|
||||
eu.dnetlib.dhp.schema.oaf.Result input = (eu.dnetlib.dhp.schema.oaf.Result) in;
|
||||
Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> ort = Optional.ofNullable(input.getResulttype());
|
||||
if (ort.isPresent()) {
|
||||
switch (ort.get().getClassid()) {
|
||||
case "publication":
|
||||
Optional<Journal> journal = Optional
|
||||
.ofNullable(((eu.dnetlib.dhp.schema.oaf.Publication) input).getJournal());
|
||||
if (journal.isPresent()) {
|
||||
Journal j = journal.get();
|
||||
Container c = new Container();
|
||||
c.setConferencedate(j.getConferencedate());
|
||||
c.setConferenceplace(j.getConferenceplace());
|
||||
c.setEdition(j.getEdition());
|
||||
c.setEp(j.getEp());
|
||||
c.setIss(j.getIss());
|
||||
c.setIssnLinking(j.getIssnLinking());
|
||||
c.setIssnOnline(j.getIssnOnline());
|
||||
c.setIssnPrinted(j.getIssnPrinted());
|
||||
c.setName(j.getName());
|
||||
c.setSp(j.getSp());
|
||||
c.setVol(j.getVol());
|
||||
out.setContainer(c);
|
||||
out.setType(ModelConstants.PUBLICATION_DEFAULT_RESULTTYPE.getClassname());
|
||||
}
|
||||
break;
|
||||
case "dataset":
|
||||
eu.dnetlib.dhp.schema.oaf.Dataset id = (eu.dnetlib.dhp.schema.oaf.Dataset) input;
|
||||
Optional.ofNullable(id.getSize()).ifPresent(v -> out.setSize(v.getValue()));
|
||||
Optional.ofNullable(id.getVersion()).ifPresent(v -> out.setVersion(v.getValue()));
|
||||
|
||||
out
|
||||
.setGeolocation(
|
||||
Optional
|
||||
.ofNullable(id.getGeolocation())
|
||||
.map(
|
||||
igl -> igl
|
||||
.stream()
|
||||
.filter(Objects::nonNull)
|
||||
.map(gli -> {
|
||||
GeoLocation gl = new GeoLocation();
|
||||
gl.setBox(gli.getBox());
|
||||
gl.setPlace(gli.getPlace());
|
||||
gl.setPoint(gli.getPoint());
|
||||
return gl;
|
||||
})
|
||||
.collect(Collectors.toList()))
|
||||
.orElse(null));
|
||||
|
||||
out.setType(ModelConstants.DATASET_DEFAULT_RESULTTYPE.getClassname());
|
||||
break;
|
||||
case "software":
|
||||
|
||||
eu.dnetlib.dhp.schema.oaf.Software is = (eu.dnetlib.dhp.schema.oaf.Software) input;
|
||||
Optional
|
||||
.ofNullable(is.getCodeRepositoryUrl())
|
||||
.ifPresent(value -> out.setCodeRepositoryUrl(value.getValue()));
|
||||
Optional
|
||||
.ofNullable(is.getDocumentationUrl())
|
||||
.ifPresent(
|
||||
value -> out
|
||||
.setDocumentationUrl(
|
||||
value
|
||||
.stream()
|
||||
.map(v -> v.getValue())
|
||||
.collect(Collectors.toList())));
|
||||
|
||||
Optional
|
||||
.ofNullable(is.getProgrammingLanguage())
|
||||
.ifPresent(value -> out.setProgrammingLanguage(value.getClassid()));
|
||||
|
||||
out.setType(ModelConstants.SOFTWARE_DEFAULT_RESULTTYPE.getClassname());
|
||||
break;
|
||||
case "other":
|
||||
|
||||
eu.dnetlib.dhp.schema.oaf.OtherResearchProduct ir = (eu.dnetlib.dhp.schema.oaf.OtherResearchProduct) input;
|
||||
out
|
||||
.setContactgroup(
|
||||
Optional
|
||||
.ofNullable(ir.getContactgroup())
|
||||
.map(value -> value.stream().map(cg -> cg.getValue()).collect(Collectors.toList()))
|
||||
.orElse(null));
|
||||
|
||||
out
|
||||
.setContactperson(
|
||||
Optional
|
||||
.ofNullable(ir.getContactperson())
|
||||
.map(value -> value.stream().map(cp -> cp.getValue()).collect(Collectors.toList()))
|
||||
.orElse(null));
|
||||
out
|
||||
.setTool(
|
||||
Optional
|
||||
.ofNullable(ir.getTool())
|
||||
.map(value -> value.stream().map(t -> t.getValue()).collect(Collectors.toList()))
|
||||
.orElse(null));
|
||||
|
||||
out.setType(ModelConstants.ORP_DEFAULT_RESULTTYPE.getClassname());
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
Optional
|
||||
.ofNullable(input.getAuthor())
|
||||
.ifPresent(ats -> out.setAuthor(ats.stream().map(at -> getAuthor(at)).collect(Collectors.toList())));
|
||||
|
||||
// I do not map Access Right UNKNOWN or OTHER
|
||||
|
||||
Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> oar = Optional.ofNullable(input.getBestaccessright());
|
||||
if (oar.isPresent()) {
|
||||
if (Constants.accessRightsCoarMap.containsKey(oar.get().getClassid())) {
|
||||
String code = Constants.accessRightsCoarMap.get(oar.get().getClassid());
|
||||
out
|
||||
.setBestaccessright(
|
||||
AccessRight
|
||||
.newInstance(
|
||||
code,
|
||||
Constants.coarCodeLabelMap.get(code),
|
||||
Constants.COAR_ACCESS_RIGHT_SCHEMA));
|
||||
}
|
||||
}
|
||||
|
||||
final List<String> contributorList = new ArrayList<>();
|
||||
Optional
|
||||
.ofNullable(input.getContributor())
|
||||
.ifPresent(value -> value.stream().forEach(c -> contributorList.add(c.getValue())));
|
||||
out.setContributor(contributorList);
|
||||
|
||||
Optional
|
||||
.ofNullable(input.getCountry())
|
||||
.ifPresent(
|
||||
value -> out
|
||||
.setCountry(
|
||||
value
|
||||
.stream()
|
||||
.map(
|
||||
c -> {
|
||||
if (c.getClassid().equals((ModelConstants.UNKNOWN))) {
|
||||
return null;
|
||||
}
|
||||
Country country = new Country();
|
||||
country.setCode(c.getClassid());
|
||||
country.setLabel(c.getClassname());
|
||||
Optional
|
||||
.ofNullable(c.getDataInfo())
|
||||
.ifPresent(
|
||||
provenance -> country
|
||||
.setProvenance(
|
||||
Provenance
|
||||
.newInstance(
|
||||
provenance
|
||||
.getProvenanceaction()
|
||||
.getClassname(),
|
||||
c.getDataInfo().getTrust())));
|
||||
return country;
|
||||
})
|
||||
.filter(Objects::nonNull)
|
||||
.collect(Collectors.toList())));
|
||||
|
||||
final List<String> coverageList = new ArrayList<>();
|
||||
Optional
|
||||
.ofNullable(input.getCoverage())
|
||||
.ifPresent(value -> value.stream().forEach(c -> coverageList.add(c.getValue())));
|
||||
out.setCoverage(coverageList);
|
||||
|
||||
out.setDateofcollection(input.getDateofcollection());
|
||||
|
||||
final List<String> descriptionList = new ArrayList<>();
|
||||
Optional
|
||||
.ofNullable(input.getDescription())
|
||||
.ifPresent(value -> value.forEach(d -> descriptionList.add(d.getValue())));
|
||||
out.setDescription(descriptionList);
|
||||
Optional<Field<String>> oStr = Optional.ofNullable(input.getEmbargoenddate());
|
||||
if (oStr.isPresent()) {
|
||||
out.setEmbargoenddate(oStr.get().getValue());
|
||||
}
|
||||
|
||||
final List<String> formatList = new ArrayList<>();
|
||||
Optional
|
||||
.ofNullable(input.getFormat())
|
||||
.ifPresent(value -> value.stream().forEach(f -> formatList.add(f.getValue())));
|
||||
out.setFormat(formatList);
|
||||
out.setId(input.getId());
|
||||
out.setOriginalId(input.getOriginalId());
|
||||
|
||||
Optional<List<eu.dnetlib.dhp.schema.oaf.Instance>> oInst = Optional
|
||||
.ofNullable(input.getInstance());
|
||||
|
||||
if (oInst.isPresent()) {
|
||||
out
|
||||
.setInstance(
|
||||
oInst.get().stream().map(i -> getInstance(i)).collect(Collectors.toList()));
|
||||
|
||||
}
|
||||
|
||||
Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> oL = Optional.ofNullable(input.getLanguage());
|
||||
if (oL.isPresent()) {
|
||||
eu.dnetlib.dhp.schema.oaf.Qualifier language = oL.get();
|
||||
out.setLanguage(Qualifier.newInstance(language.getClassid(), language.getClassname()));
|
||||
}
|
||||
Optional<Long> oLong = Optional.ofNullable(input.getLastupdatetimestamp());
|
||||
if (oLong.isPresent()) {
|
||||
out.setLastupdatetimestamp(oLong.get());
|
||||
}
|
||||
Optional<List<StructuredProperty>> otitle = Optional.ofNullable(input.getTitle());
|
||||
if (otitle.isPresent()) {
|
||||
List<StructuredProperty> iTitle = otitle
|
||||
.get()
|
||||
.stream()
|
||||
.filter(t -> t.getQualifier().getClassid().equalsIgnoreCase("main title"))
|
||||
.collect(Collectors.toList());
|
||||
if (iTitle.size() > 0) {
|
||||
out.setMaintitle(iTitle.get(0).getValue());
|
||||
}
|
||||
|
||||
iTitle = otitle
|
||||
.get()
|
||||
.stream()
|
||||
.filter(t -> t.getQualifier().getClassid().equalsIgnoreCase("subtitle"))
|
||||
.collect(Collectors.toList());
|
||||
if (iTitle.size() > 0) {
|
||||
out.setSubtitle(iTitle.get(0).getValue());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
List<ControlledField> pids = new ArrayList<>();
|
||||
Optional
|
||||
.ofNullable(input.getPid())
|
||||
.ifPresent(
|
||||
value -> value
|
||||
.stream()
|
||||
.forEach(
|
||||
p -> pids
|
||||
.add(
|
||||
ControlledField
|
||||
.newInstance(p.getQualifier().getClassid(), p.getValue()))));
|
||||
out.setPid(pids);
|
||||
oStr = Optional.ofNullable(input.getDateofacceptance());
|
||||
if (oStr.isPresent()) {
|
||||
out.setPublicationdate(oStr.get().getValue());
|
||||
}
|
||||
oStr = Optional.ofNullable(input.getPublisher());
|
||||
if (oStr.isPresent()) {
|
||||
out.setPublisher(oStr.get().getValue());
|
||||
}
|
||||
|
||||
List<String> sourceList = new ArrayList<>();
|
||||
Optional
|
||||
.ofNullable(input.getSource())
|
||||
.ifPresent(value -> value.stream().forEach(s -> sourceList.add(s.getValue())));
|
||||
// out.setSource(input.getSource().stream().map(s -> s.getValue()).collect(Collectors.toList()));
|
||||
List<Subject> subjectList = new ArrayList<>();
|
||||
Optional
|
||||
.ofNullable(input.getSubject())
|
||||
.ifPresent(
|
||||
value -> value
|
||||
.forEach(s -> subjectList.add(getSubject(s))));
|
||||
|
||||
out.setSubjects(subjectList);
|
||||
|
||||
out.setType(input.getResulttype().getClassid());
|
||||
}
|
||||
|
||||
out
|
||||
.setCollectedfrom(
|
||||
input
|
||||
.getCollectedfrom()
|
||||
.stream()
|
||||
.map(cf -> KeyValue.newInstance(cf.getKey(), cf.getValue()))
|
||||
.collect(Collectors.toList()));
|
||||
|
||||
return out;
|
||||
|
||||
}
|
||||
|
||||
private static CommunityInstance getInstance(eu.dnetlib.dhp.schema.oaf.Instance i) {
|
||||
CommunityInstance instance = new CommunityInstance();
|
||||
|
||||
setCommonValue(i, instance);
|
||||
|
||||
instance
|
||||
.setCollectedfrom(
|
||||
KeyValue
|
||||
.newInstance(i.getCollectedfrom().getKey(), i.getCollectedfrom().getValue()));
|
||||
|
||||
instance
|
||||
.setHostedby(
|
||||
KeyValue.newInstance(i.getHostedby().getKey(), i.getHostedby().getValue()));
|
||||
|
||||
return instance;
|
||||
|
||||
}
|
||||
|
||||
private static <I extends Instance> void setCommonValue(eu.dnetlib.dhp.schema.oaf.Instance i, I instance) {
|
||||
Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> opAr = Optional
|
||||
.ofNullable(i.getAccessright());
|
||||
if (opAr.isPresent()) {
|
||||
if (Constants.accessRightsCoarMap.containsKey(opAr.get().getClassid())) {
|
||||
String code = Constants.accessRightsCoarMap.get(opAr.get().getClassid());
|
||||
instance
|
||||
.setAccessright(
|
||||
AccessRight
|
||||
.newInstance(
|
||||
code,
|
||||
Constants.coarCodeLabelMap.get(code),
|
||||
Constants.COAR_ACCESS_RIGHT_SCHEMA));
|
||||
}
|
||||
}
|
||||
|
||||
Optional
|
||||
.ofNullable(i.getLicense())
|
||||
.ifPresent(value -> instance.setLicense(value.getValue()));
|
||||
Optional
|
||||
.ofNullable(i.getDateofacceptance())
|
||||
.ifPresent(value -> instance.setPublicationdate(value.getValue()));
|
||||
Optional
|
||||
.ofNullable(i.getRefereed())
|
||||
.ifPresent(value -> instance.setRefereed(value.getClassname()));
|
||||
Optional
|
||||
.ofNullable(i.getInstancetype())
|
||||
.ifPresent(value -> instance.setType(value.getClassname()));
|
||||
Optional.ofNullable(i.getUrl()).ifPresent(value -> instance.setUrl(value));
|
||||
|
||||
}
|
||||
|
||||
private static Subject getSubject(StructuredProperty s) {
|
||||
Subject subject = new Subject();
|
||||
subject.setSubject(ControlledField.newInstance(s.getQualifier().getClassid(), s.getValue()));
|
||||
Optional<DataInfo> di = Optional.ofNullable(s.getDataInfo());
|
||||
if (di.isPresent()) {
|
||||
Provenance p = new Provenance();
|
||||
p.setProvenance(di.get().getProvenanceaction().getClassname());
|
||||
p.setTrust(di.get().getTrust());
|
||||
subject.setProvenance(p);
|
||||
}
|
||||
|
||||
return subject;
|
||||
}
|
||||
|
||||
private static Author getAuthor(eu.dnetlib.dhp.schema.oaf.Author oa) {
|
||||
Author a = new Author();
|
||||
a.setFullname(oa.getFullname());
|
||||
a.setName(oa.getName());
|
||||
a.setSurname(oa.getSurname());
|
||||
a.setRank(oa.getRank());
|
||||
|
||||
Optional<List<StructuredProperty>> oPids = Optional
|
||||
.ofNullable(oa.getPid());
|
||||
if (oPids.isPresent()) {
|
||||
Pid pid = getOrcid(oPids.get());
|
||||
if (pid != null) {
|
||||
a.setPid(pid);
|
||||
}
|
||||
}
|
||||
|
||||
return a;
|
||||
}
|
||||
|
||||
private static Pid getOrcid(List<StructuredProperty> p) {
|
||||
for (StructuredProperty pid : p) {
|
||||
if (pid.getQualifier().getClassid().equals(ModelConstants.ORCID)) {
|
||||
Optional<DataInfo> di = Optional.ofNullable(pid.getDataInfo());
|
||||
if (di.isPresent()) {
|
||||
return Pid
|
||||
.newInstance(
|
||||
ControlledField
|
||||
.newInstance(
|
||||
pid.getQualifier().getClassid(),
|
||||
pid.getValue()),
|
||||
Provenance
|
||||
.newInstance(
|
||||
di.get().getProvenanceaction().getClassname(),
|
||||
di.get().getTrust()));
|
||||
} else {
|
||||
return Pid
|
||||
.newInstance(
|
||||
ControlledField
|
||||
.newInstance(
|
||||
pid.getQualifier().getClassid(),
|
||||
pid.getValue())
|
||||
|
||||
);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
}
|
|
@ -90,9 +90,6 @@ public class MakeTarArchive implements Serializable {
|
|||
String p_string = p.toString();
|
||||
if (!p_string.endsWith("_SUCCESS")) {
|
||||
String name = p_string.substring(p_string.lastIndexOf("/") + 1);
|
||||
if (name.trim().equalsIgnoreCase("communities_infrastructures")) {
|
||||
name = "communities_infrastructures.json";
|
||||
}
|
||||
TarArchiveEntry entry = new TarArchiveEntry(dir_name + "/" + name);
|
||||
entry.setSize(fileStatus.getLen());
|
||||
current_size += fileStatus.getLen();
|
||||
|
|
|
@ -4,6 +4,7 @@ package eu.dnetlib.dhp.oa.merge;
|
|||
import java.text.Normalizer;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
|
@ -32,27 +33,33 @@ public class AuthorMerger {
|
|||
|
||||
}
|
||||
|
||||
public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b) {
|
||||
public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b, Double threshold) {
|
||||
int pa = countAuthorsPids(a);
|
||||
int pb = countAuthorsPids(b);
|
||||
List<Author> base, enrich;
|
||||
int sa = authorsSize(a);
|
||||
int sb = authorsSize(b);
|
||||
|
||||
if (pa == pb) {
|
||||
base = sa > sb ? a : b;
|
||||
enrich = sa > sb ? b : a;
|
||||
} else {
|
||||
if (sa == sb) {
|
||||
base = pa > pb ? a : b;
|
||||
enrich = pa > pb ? b : a;
|
||||
} else {
|
||||
base = sa > sb ? a : b;
|
||||
enrich = sa > sb ? b : a;
|
||||
}
|
||||
enrichPidFromList(base, enrich);
|
||||
enrichPidFromList(base, enrich, threshold);
|
||||
return base;
|
||||
}
|
||||
|
||||
private static void enrichPidFromList(List<Author> base, List<Author> enrich) {
|
||||
public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b) {
|
||||
return mergeAuthor(a, b, THRESHOLD);
|
||||
}
|
||||
|
||||
private static void enrichPidFromList(List<Author> base, List<Author> enrich, Double threshold) {
|
||||
if (base == null || enrich == null)
|
||||
return;
|
||||
|
||||
// <pidComparableString, Author> (if an Author has more than 1 pid, it appears 2 times in the list)
|
||||
final Map<String, Author> basePidAuthorMap = base
|
||||
.stream()
|
||||
.filter(a -> a.getPid() != null && a.getPid().size() > 0)
|
||||
|
@ -63,6 +70,7 @@ public class AuthorMerger {
|
|||
.map(p -> new Tuple2<>(pidToComparableString(p), a)))
|
||||
.collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1));
|
||||
|
||||
// <pid, Author> (list of pid that are missing in the other list)
|
||||
final List<Tuple2<StructuredProperty, Author>> pidToEnrich = enrich
|
||||
.stream()
|
||||
.filter(a -> a.getPid() != null && a.getPid().size() > 0)
|
||||
|
@ -83,10 +91,10 @@ public class AuthorMerger {
|
|||
.max(Comparator.comparing(Tuple2::_1));
|
||||
|
||||
if (simAuthor.isPresent()) {
|
||||
double th = THRESHOLD;
|
||||
double th = threshold;
|
||||
// increase the threshold if the surname is too short
|
||||
if (simAuthor.get()._2().getSurname() != null
|
||||
&& simAuthor.get()._2().getSurname().length() <= 3)
|
||||
&& simAuthor.get()._2().getSurname().length() <= 3 && threshold > 0.0)
|
||||
th = 0.99;
|
||||
|
||||
if (simAuthor.get()._1() > th) {
|
||||
|
@ -156,7 +164,7 @@ public class AuthorMerger {
|
|||
}
|
||||
|
||||
private static String normalize(final String s) {
|
||||
return nfd(s)
|
||||
String[] normalized = nfd(s)
|
||||
.toLowerCase()
|
||||
// do not compact the regexes in a single expression, would cause StackOverflowError
|
||||
// in case
|
||||
|
@ -166,7 +174,12 @@ public class AuthorMerger {
|
|||
.replaceAll("(\\p{Punct})+", " ")
|
||||
.replaceAll("(\\d)+", " ")
|
||||
.replaceAll("(\\n)+", " ")
|
||||
.trim();
|
||||
.trim()
|
||||
.split(" ");
|
||||
|
||||
Arrays.sort(normalized);
|
||||
|
||||
return String.join(" ", normalized);
|
||||
}
|
||||
|
||||
private static String nfd(final String s) {
|
||||
|
|
|
@ -0,0 +1,100 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.merge;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.FileReader;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
import eu.dnetlib.pace.util.MapDocumentUtil;
|
||||
import scala.Tuple2;
|
||||
|
||||
public class AuthorMergerTest {
|
||||
|
||||
private String publicationsBasePath;
|
||||
|
||||
private List<List<Author>> authors;
|
||||
|
||||
@BeforeEach
|
||||
public void setUp() throws Exception {
|
||||
|
||||
publicationsBasePath = Paths
|
||||
.get(AuthorMergerTest.class.getResource("/eu/dnetlib/dhp/oa/merge").toURI())
|
||||
.toFile()
|
||||
.getAbsolutePath();
|
||||
|
||||
authors = readSample(publicationsBasePath + "/publications_with_authors.json", Publication.class)
|
||||
.stream()
|
||||
.map(p -> p._2().getAuthor())
|
||||
.collect(Collectors.toList());
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void mergeTest() { // used in the dedup: threshold set to 0.95
|
||||
|
||||
for (List<Author> authors1 : authors) {
|
||||
System.out.println("List " + (authors.indexOf(authors1) + 1));
|
||||
for (Author author : authors1) {
|
||||
System.out.println(authorToString(author));
|
||||
}
|
||||
}
|
||||
|
||||
List<Author> merge = AuthorMerger.merge(authors);
|
||||
|
||||
System.out.println("Merge ");
|
||||
for (Author author : merge) {
|
||||
System.out.println(authorToString(author));
|
||||
}
|
||||
|
||||
Assertions.assertEquals(7, merge.size());
|
||||
|
||||
}
|
||||
|
||||
public <T> List<Tuple2<String, T>> readSample(String path, Class<T> clazz) {
|
||||
List<Tuple2<String, T>> res = new ArrayList<>();
|
||||
BufferedReader reader;
|
||||
try {
|
||||
reader = new BufferedReader(new FileReader(path));
|
||||
String line = reader.readLine();
|
||||
while (line != null) {
|
||||
res
|
||||
.add(
|
||||
new Tuple2<>(
|
||||
MapDocumentUtil.getJPathString("$.id", line),
|
||||
new ObjectMapper().readValue(line, clazz)));
|
||||
// read next line
|
||||
line = reader.readLine();
|
||||
}
|
||||
reader.close();
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
public String authorToString(Author a) {
|
||||
|
||||
String print = "Fullname = ";
|
||||
print += a.getFullname() + " pid = [";
|
||||
if (a.getPid() != null)
|
||||
for (StructuredProperty sp : a.getPid()) {
|
||||
print += sp.toComparableString() + " ";
|
||||
}
|
||||
print += "]";
|
||||
return print;
|
||||
}
|
||||
}
|
File diff suppressed because one or more lines are too long
|
@ -105,6 +105,8 @@ public class ModelConstants {
|
|||
public static final KeyValue UNKNOWN_REPOSITORY = keyValue(
|
||||
"10|openaire____::55045bd2a65019fd8e6741a755395c8c", "Unknown Repository");
|
||||
|
||||
public static final Qualifier UNKNOWN_COUNTRY = qualifier(UNKNOWN, "Unknown", DNET_COUNTRY_TYPE, DNET_COUNTRY_TYPE);
|
||||
|
||||
private static Qualifier qualifier(
|
||||
final String classid,
|
||||
final String classname,
|
||||
|
|
|
@ -2,8 +2,12 @@
|
|||
package eu.dnetlib.dhp.schema.oaf;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
public abstract class Oaf implements Serializable {
|
||||
|
||||
|
@ -40,9 +44,34 @@ public abstract class Oaf implements Serializable {
|
|||
this.lastupdatetimestamp = lastupdatetimestamp;
|
||||
}
|
||||
|
||||
public void mergeOAFDataInfo(Oaf e) {
|
||||
if (e.getDataInfo() != null && compareTrust(this, e) < 0)
|
||||
dataInfo = e.getDataInfo();
|
||||
public void mergeFrom(Oaf o) {
|
||||
if (Objects.isNull(o)) {
|
||||
return;
|
||||
}
|
||||
setCollectedfrom(
|
||||
Stream
|
||||
.concat(
|
||||
Optional
|
||||
.ofNullable(getCollectedfrom())
|
||||
.map(Collection::stream)
|
||||
.orElse(Stream.empty()),
|
||||
Optional
|
||||
.ofNullable(o.getCollectedfrom())
|
||||
.map(Collection::stream)
|
||||
.orElse(Stream.empty()))
|
||||
.distinct() // relies on KeyValue.equals
|
||||
.collect(Collectors.toList()));
|
||||
|
||||
setLastupdatetimestamp(
|
||||
Math
|
||||
.max(
|
||||
Optional.ofNullable(getLastupdatetimestamp()).orElse(0L),
|
||||
Optional.ofNullable(o.getLastupdatetimestamp()).orElse(0L)));
|
||||
}
|
||||
|
||||
public void mergeOAFDataInfo(Oaf o) {
|
||||
if (o.getDataInfo() != null && compareTrust(this, o) < 0)
|
||||
dataInfo = o.getDataInfo();
|
||||
}
|
||||
|
||||
protected String extractTrust(Oaf e) {
|
||||
|
@ -62,7 +91,7 @@ public abstract class Oaf implements Serializable {
|
|||
if (o == null || getClass() != o.getClass())
|
||||
return false;
|
||||
Oaf oaf = (Oaf) o;
|
||||
return Objects.equals(dataInfo, oaf.dataInfo)
|
||||
return Objects.equals(getDataInfo(), oaf.getDataInfo())
|
||||
&& Objects.equals(lastupdatetimestamp, oaf.lastupdatetimestamp);
|
||||
}
|
||||
|
||||
|
|
|
@ -78,14 +78,10 @@ public abstract class OafEntity extends Oaf implements Serializable {
|
|||
}
|
||||
|
||||
public void mergeFrom(OafEntity e) {
|
||||
|
||||
if (e == null)
|
||||
return;
|
||||
super.mergeFrom(e);
|
||||
|
||||
originalId = mergeLists(originalId, e.getOriginalId());
|
||||
|
||||
collectedfrom = mergeLists(collectedfrom, e.getCollectedfrom());
|
||||
|
||||
pid = mergeLists(pid, e.getPid());
|
||||
|
||||
if (e.getDateofcollection() != null && compareTrust(this, e) < 0)
|
||||
|
|
|
@ -351,8 +351,6 @@ public class Project extends OafEntity implements Serializable {
|
|||
? p.getFundedamount()
|
||||
: fundedamount;
|
||||
|
||||
// programme = mergeLists(programme, p.getProgramme());
|
||||
|
||||
h2020classification = mergeLists(h2020classification, p.getH2020classification());
|
||||
|
||||
mergeOAFDataInfo(e);
|
||||
|
|
|
@ -130,19 +130,7 @@ public class Relation extends Oaf {
|
|||
Objects.equals(getSubRelType(), r.getSubRelType()), "subRelType(s) must be equal");
|
||||
checkArgument(Objects.equals(getRelClass(), r.getRelClass()), "relClass(es) must be equal");
|
||||
|
||||
setCollectedfrom(
|
||||
Stream
|
||||
.concat(
|
||||
Optional
|
||||
.ofNullable(getCollectedfrom())
|
||||
.map(Collection::stream)
|
||||
.orElse(Stream.empty()),
|
||||
Optional
|
||||
.ofNullable(r.getCollectedfrom())
|
||||
.map(Collection::stream)
|
||||
.orElse(Stream.empty()))
|
||||
.distinct() // relies on KeyValue.equals
|
||||
.collect(Collectors.toList()));
|
||||
super.mergeFrom(r);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -243,7 +243,7 @@ public class Result extends OafEntity implements Serializable {
|
|||
|
||||
Result r = (Result) e;
|
||||
|
||||
// TODO consider merging also Measures
|
||||
measures = mergeLists(measures, r.getMeasures());
|
||||
|
||||
instance = mergeLists(instance, r.getInstance());
|
||||
|
||||
|
@ -323,13 +323,13 @@ public class Result extends OafEntity implements Serializable {
|
|||
if (a.size() == b.size()) {
|
||||
int msa = a
|
||||
.stream()
|
||||
.filter(i -> i.getValue() != null)
|
||||
.filter(i -> i != null && i.getValue() != null)
|
||||
.map(i -> i.getValue().length())
|
||||
.max(Comparator.naturalOrder())
|
||||
.orElse(0);
|
||||
int msb = b
|
||||
.stream()
|
||||
.filter(i -> i.getValue() != null)
|
||||
.filter(i -> i != null && i.getValue() != null)
|
||||
.map(i -> i.getValue().length())
|
||||
.max(Comparator.naturalOrder())
|
||||
.orElse(0);
|
||||
|
|
|
@ -0,0 +1,28 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.bipfinder;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Class that maps the model of the bipFinder! input data.
|
||||
* Only needed for deserialization purposes
|
||||
*/
|
||||
|
||||
public class BipDeserialize extends HashMap<String, List<Score>> implements Serializable {
|
||||
|
||||
public BipDeserialize() {
|
||||
super();
|
||||
}
|
||||
|
||||
public List<Score> get(String key) {
|
||||
|
||||
if (super.get(key) == null) {
|
||||
return new ArrayList<>();
|
||||
}
|
||||
return super.get(key);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,30 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.bipfinder;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Rewriting of the bipFinder input data by extracting the identifier of the result (doi)
|
||||
*/
|
||||
|
||||
public class BipScore implements Serializable {
|
||||
private String id; // doi
|
||||
private List<Score> scoreList; // unit as given in the inputfile
|
||||
|
||||
public String getId() {
|
||||
return id;
|
||||
}
|
||||
|
||||
public void setId(String id) {
|
||||
this.id = id;
|
||||
}
|
||||
|
||||
public List<Score> getScoreList() {
|
||||
return scoreList;
|
||||
}
|
||||
|
||||
public void setScoreList(List<Score> scoreList) {
|
||||
this.scoreList = scoreList;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,85 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.bipfinder;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
|
||||
/**
|
||||
* Just collects all the atomic actions produced for the different results and saves them in
|
||||
* outputpath for the ActionSet
|
||||
*/
|
||||
public class CollectAndSave implements Serializable {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(CollectAndSave.class);
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
||||
public static <I extends Result> void main(String[] args) throws Exception {
|
||||
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
CollectAndSave.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/actionmanager/bipfinder/input_actionset_parameter.json"));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
|
||||
parser.parseArgument(args);
|
||||
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String inputPath = parser.get("inputPath");
|
||||
log.info("inputPath {}: ", inputPath);
|
||||
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath {}: ", outputPath);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
removeOutputDir(spark, outputPath);
|
||||
collectAndSave(spark, inputPath, outputPath);
|
||||
});
|
||||
}
|
||||
|
||||
private static void collectAndSave(SparkSession spark, String inputPath, String outputPath) {
|
||||
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
sc
|
||||
.sequenceFile(inputPath + "/publication", Text.class, Text.class)
|
||||
.union(sc.sequenceFile(inputPath + "/dataset", Text.class, Text.class))
|
||||
.union(sc.sequenceFile(inputPath + "/otherresearchproduct", Text.class, Text.class))
|
||||
.union(sc.sequenceFile(inputPath + "/software", Text.class, Text.class))
|
||||
.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
|
||||
;
|
||||
}
|
||||
|
||||
private static void removeOutputDir(SparkSession spark, String path) {
|
||||
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,26 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.bipfinder;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
public class KeyValue implements Serializable {
|
||||
|
||||
private String key;
|
||||
private String value;
|
||||
|
||||
public String getKey() {
|
||||
return key;
|
||||
}
|
||||
|
||||
public void setKey(String key) {
|
||||
this.key = key;
|
||||
}
|
||||
|
||||
public String getValue() {
|
||||
return value;
|
||||
}
|
||||
|
||||
public void setValue(String value) {
|
||||
this.value = value;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,28 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.bipfinder;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
|
||||
* Subset of the information of the generic results that are needed to create the atomic action
|
||||
*/
|
||||
public class PreparedResult implements Serializable {
|
||||
private String id; // openaire id
|
||||
private String value; // doi
|
||||
|
||||
public String getId() {
|
||||
return id;
|
||||
}
|
||||
|
||||
public void setId(String id) {
|
||||
this.id = id;
|
||||
}
|
||||
|
||||
public String getValue() {
|
||||
return value;
|
||||
}
|
||||
|
||||
public void setValue(String value) {
|
||||
this.value = value;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,30 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.bipfinder;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* represents the score in the input file
|
||||
*/
|
||||
public class Score implements Serializable {
|
||||
|
||||
private String id;
|
||||
private List<KeyValue> unit;
|
||||
|
||||
public String getId() {
|
||||
return id;
|
||||
}
|
||||
|
||||
public void setId(String id) {
|
||||
this.id = id;
|
||||
}
|
||||
|
||||
public List<KeyValue> getUnit() {
|
||||
return unit;
|
||||
}
|
||||
|
||||
public void setUnit(List<KeyValue> unit) {
|
||||
this.unit = unit;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,200 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.bipfinder;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.api.java.function.MapGroupsFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||
import eu.dnetlib.dhp.schema.action.AtomicAction;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||
import scala.Tuple2;
|
||||
|
||||
/**
|
||||
* created the Atomic Action for each tipe of results
|
||||
*/
|
||||
public class SparkAtomicActionScoreJob implements Serializable {
|
||||
|
||||
private static String DOI = "doi";
|
||||
private static final Logger log = LoggerFactory.getLogger(SparkAtomicActionScoreJob.class);
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
||||
public static <I extends Result> void main(String[] args) throws Exception {
|
||||
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
SparkAtomicActionScoreJob.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/actionmanager/bipfinder/input_parameters.json"));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
|
||||
parser.parseArgument(args);
|
||||
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String inputPath = parser.get("inputPath");
|
||||
log.info("inputPath {}: ", inputPath);
|
||||
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath {}: ", outputPath);
|
||||
|
||||
final String bipScorePath = parser.get("bipScorePath");
|
||||
log.info("bipScorePath: {}", bipScorePath);
|
||||
|
||||
final String resultClassName = parser.get("resultTableName");
|
||||
log.info("resultTableName: {}", resultClassName);
|
||||
|
||||
Class<I> inputClazz = (Class<I>) Class.forName(resultClassName);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
removeOutputDir(spark, outputPath);
|
||||
prepareResults(spark, inputPath, outputPath, bipScorePath, inputClazz);
|
||||
});
|
||||
}
|
||||
|
||||
private static <I extends Result> void prepareResults(SparkSession spark, String inputPath, String outputPath,
|
||||
String bipScorePath, Class<I> inputClazz) {
|
||||
|
||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<BipDeserialize> bipDeserializeJavaRDD = sc
|
||||
.textFile(bipScorePath)
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, BipDeserialize.class));
|
||||
|
||||
Dataset<BipScore> bipScores = spark
|
||||
.createDataset(bipDeserializeJavaRDD.flatMap(entry -> entry.keySet().stream().map(key -> {
|
||||
BipScore bs = new BipScore();
|
||||
bs.setId(key);
|
||||
bs.setScoreList(entry.get(key));
|
||||
return bs;
|
||||
}).collect(Collectors.toList()).iterator()).rdd(), Encoders.bean(BipScore.class));
|
||||
|
||||
System.out.println(bipScores.count());
|
||||
|
||||
Dataset<I> results = readPath(spark, inputPath, inputClazz);
|
||||
|
||||
results.createOrReplaceTempView("result");
|
||||
|
||||
Dataset<PreparedResult> preparedResult = spark
|
||||
.sql(
|
||||
"select pIde.value value, id " +
|
||||
"from result " +
|
||||
"lateral view explode (pid) p as pIde " +
|
||||
"where dataInfo.deletedbyinference = false and pIde.qualifier.classid = '" + DOI + "'")
|
||||
.as(Encoders.bean(PreparedResult.class));
|
||||
|
||||
bipScores
|
||||
.joinWith(
|
||||
preparedResult, bipScores.col("id").equalTo(preparedResult.col("value")),
|
||||
"inner")
|
||||
.map((MapFunction<Tuple2<BipScore, PreparedResult>, BipScore>) value -> {
|
||||
BipScore ret = value._1();
|
||||
ret.setId(value._2().getId());
|
||||
return ret;
|
||||
}, Encoders.bean(BipScore.class))
|
||||
.groupByKey((MapFunction<BipScore, String>) value -> value.getId(), Encoders.STRING())
|
||||
.mapGroups((MapGroupsFunction<String, BipScore, Result>) (k, it) -> {
|
||||
Result ret = new Result();
|
||||
ret.setDataInfo(getDataInfo());
|
||||
BipScore first = it.next();
|
||||
ret.setId(first.getId());
|
||||
|
||||
ret.setMeasures(getMeasure(first));
|
||||
it.forEachRemaining(value -> ret.getMeasures().addAll(getMeasure(value)));
|
||||
|
||||
return ret;
|
||||
}, Encoders.bean(Result.class))
|
||||
.toJavaRDD()
|
||||
.map(p -> new AtomicAction(inputClazz, p))
|
||||
.mapToPair(
|
||||
aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
|
||||
new Text(OBJECT_MAPPER.writeValueAsString(aa))))
|
||||
.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
|
||||
|
||||
}
|
||||
|
||||
private static List<Measure> getMeasure(BipScore value) {
|
||||
return value
|
||||
.getScoreList()
|
||||
.stream()
|
||||
.map(score -> {
|
||||
Measure m = new Measure();
|
||||
m.setId(score.getId());
|
||||
m
|
||||
.setUnit(
|
||||
score
|
||||
.getUnit()
|
||||
.stream()
|
||||
.map(unit -> {
|
||||
KeyValue kv = new KeyValue();
|
||||
kv.setValue(unit.getValue());
|
||||
kv.setKey(unit.getKey());
|
||||
kv.setDataInfo(getDataInfo());
|
||||
return kv;
|
||||
})
|
||||
.collect(Collectors.toList()));
|
||||
return m;
|
||||
})
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
private static DataInfo getDataInfo() {
|
||||
DataInfo di = new DataInfo();
|
||||
di.setInferred(false);
|
||||
di.setInvisible(false);
|
||||
di.setDeletedbyinference(false);
|
||||
di.setTrust("");
|
||||
Qualifier qualifier = new Qualifier();
|
||||
qualifier.setClassid("sysimport:actionset");
|
||||
qualifier.setClassname("Harvested");
|
||||
qualifier.setSchemename("dnet:provenanceActions");
|
||||
qualifier.setSchemeid("dnet:provenanceActions");
|
||||
di.setProvenanceaction(qualifier);
|
||||
return di;
|
||||
}
|
||||
|
||||
private static void removeOutputDir(SparkSession spark, String path) {
|
||||
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
|
||||
}
|
||||
|
||||
public static <R> Dataset<R> readPath(
|
||||
SparkSession spark, String inputPath, Class<R> clazz) {
|
||||
return spark
|
||||
.read()
|
||||
.textFile(inputPath)
|
||||
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,20 @@
|
|||
[
|
||||
{
|
||||
"paramName": "issm",
|
||||
"paramLongName": "isSparkSessionManaged",
|
||||
"paramDescription": "when true will stop SparkSession after job execution",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "ip",
|
||||
"paramLongName": "inputPath",
|
||||
"paramDescription": "the URL from where to get the programme file",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "o",
|
||||
"paramLongName": "outputPath",
|
||||
"paramDescription": "the path of the new ActionSet",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
|
@ -0,0 +1,32 @@
|
|||
[
|
||||
{
|
||||
"paramName": "issm",
|
||||
"paramLongName": "isSparkSessionManaged",
|
||||
"paramDescription": "when true will stop SparkSession after job execution",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "ip",
|
||||
"paramLongName": "inputPath",
|
||||
"paramDescription": "the URL from where to get the programme file",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "o",
|
||||
"paramLongName": "outputPath",
|
||||
"paramDescription": "the path of the new ActionSet",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "rtn",
|
||||
"paramLongName": "resultTableName",
|
||||
"paramDescription": "the path of the new ActionSet",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "bsp",
|
||||
"paramLongName": "bipScorePath",
|
||||
"paramDescription": "the path of the new ActionSet",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
|
@ -0,0 +1,171 @@
|
|||
<workflow-app name="BipFinderScore" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>inputPath</name>
|
||||
<description>the input path of the resources to be extended</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>bipScorePath</name>
|
||||
<description>the path where to find the bipFinder scores</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>outputPath</name>
|
||||
<description>the path where to store the actionset</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="deleteoutputpath"/>
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
<action name="deleteoutputpath">
|
||||
<fs>
|
||||
<delete path="${outputPath}"/>
|
||||
<mkdir path="${outputPath}"/>
|
||||
<delete path="${workingDir}"/>
|
||||
<mkdir path="${workingDir}"/>
|
||||
</fs>
|
||||
<ok to="atomicactions"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<fork name="atomicactions">
|
||||
<path start="atomicactions_publication"/>
|
||||
<path start="atomicactions_dataset"/>
|
||||
<path start="atomicactions_orp"/>
|
||||
<path start="atomicactions_software"/>
|
||||
</fork>
|
||||
|
||||
<action name="atomicactions_publication">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Produces the atomic action with the bip finder scores for publications</name>
|
||||
<class>eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${inputPath}/publication</arg>
|
||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/publication</arg>
|
||||
<arg>--bipScorePath</arg><arg>${bipScorePath}</arg>
|
||||
</spark>
|
||||
<ok to="join_aa"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="atomicactions_dataset">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Produces the atomic action with the bip finder scores for datasets</name>
|
||||
<class>eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${inputPath}/dataset</arg>
|
||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/dataset</arg>
|
||||
<arg>--bipScorePath</arg><arg>${bipScorePath}</arg>
|
||||
</spark>
|
||||
<ok to="join_aa"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="atomicactions_orp">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Produces the atomic action with the bip finder scores for orp</name>
|
||||
<class>eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${inputPath}/otherresearchproduct</arg>
|
||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/otherresearchproduct</arg>
|
||||
<arg>--bipScorePath</arg><arg>${bipScorePath}</arg>
|
||||
</spark>
|
||||
<ok to="join_aa"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="atomicactions_software">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Produces the atomic action with the bip finder scores for software</name>
|
||||
<class>eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${inputPath}/software</arg>
|
||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/software</arg>
|
||||
<arg>--bipScorePath</arg><arg>${bipScorePath}</arg>
|
||||
</spark>
|
||||
<ok to="join_aa"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<join name="join_aa" to="collectandsave"/>
|
||||
|
||||
<action name="collectandsave">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>saves all the aa produced for the several types of results in the as output path</name>
|
||||
<class>eu.dnetlib.dhp.actionmanager.bipfinder.CollectAndSave</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${workingDir}</arg>
|
||||
<arg>--outputPath</arg><arg>${outputPath}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -0,0 +1,323 @@
|
|||
|
||||
package eu.dnetlib.dhp.actionmanager.bipfinder;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.junit.jupiter.api.AfterAll;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.schema.action.AtomicAction;
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||
|
||||
public class SparkAtomicActionScoreJobTest {
|
||||
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
||||
private static SparkSession spark;
|
||||
|
||||
private static Path workingDir;
|
||||
private static final Logger log = LoggerFactory
|
||||
.getLogger(SparkAtomicActionScoreJobTest.class);
|
||||
|
||||
@BeforeAll
|
||||
public static void beforeAll() throws IOException {
|
||||
workingDir = Files
|
||||
.createTempDirectory(SparkAtomicActionScoreJobTest.class.getSimpleName());
|
||||
log.info("using work dir {}", workingDir);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
conf.setAppName(SparkAtomicActionScoreJobTest.class.getSimpleName());
|
||||
|
||||
conf.setMaster("local[*]");
|
||||
conf.set("spark.driver.host", "localhost");
|
||||
conf.set("hive.metastore.local", "true");
|
||||
conf.set("spark.ui.enabled", "false");
|
||||
conf.set("spark.sql.warehouse.dir", workingDir.toString());
|
||||
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
|
||||
|
||||
spark = SparkSession
|
||||
.builder()
|
||||
.appName(SparkAtomicActionScoreJobTest.class.getSimpleName())
|
||||
.config(conf)
|
||||
.getOrCreate();
|
||||
}
|
||||
|
||||
@AfterAll
|
||||
public static void afterAll() throws IOException {
|
||||
FileUtils.deleteDirectory(workingDir.toFile());
|
||||
spark.stop();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void matchOne() throws Exception {
|
||||
String bipScoresPath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/actionmanager/bipfinder/bip_scores.json")
|
||||
.getPath();
|
||||
String inputPath = getClass()
|
||||
.getResource(
|
||||
"/eu/dnetlib/dhp/actionmanager/bipfinder/publication.json")
|
||||
.getPath();
|
||||
|
||||
SparkAtomicActionScoreJob
|
||||
.main(
|
||||
new String[] {
|
||||
"-isSparkSessionManaged",
|
||||
Boolean.FALSE.toString(),
|
||||
"-inputPath",
|
||||
inputPath,
|
||||
"-bipScorePath",
|
||||
bipScoresPath,
|
||||
"-resultTableName",
|
||||
"eu.dnetlib.dhp.schema.oaf.Publication",
|
||||
"-outputPath",
|
||||
workingDir.toString() + "/actionSet"
|
||||
});
|
||||
|
||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<Publication> tmp = sc
|
||||
.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
|
||||
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
|
||||
.map(aa -> ((Publication) aa.getPayload()));
|
||||
|
||||
Assertions.assertTrue(tmp.count() == 1);
|
||||
|
||||
Dataset<Publication> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(Publication.class));
|
||||
verificationDataset.createOrReplaceTempView("publication");
|
||||
|
||||
Dataset<Row> execVerification = spark
|
||||
.sql(
|
||||
"Select p.id oaid, mes.id, mUnit.value from publication p " +
|
||||
"lateral view explode(measures) m as mes " +
|
||||
"lateral view explode(mes.unit) u as mUnit ");
|
||||
|
||||
Assertions.assertEquals(2, execVerification.count());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
"50|355e65625b88::ffa5bad14f4adc0c9a15c00efbbccddb",
|
||||
execVerification.select("oaid").collectAsList().get(0).getString(0));
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
"1.47565045883e-08",
|
||||
execVerification.filter("id = 'influence'").select("value").collectAsList().get(0).getString(0));
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
"0.227515392",
|
||||
execVerification.filter("id = 'popularity'").select("value").collectAsList().get(0).getString(0));
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void matchOneWithTwo() throws Exception {
|
||||
String bipScoresPath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/actionmanager/bipfinder/bip_scores.json")
|
||||
.getPath();
|
||||
String inputPath = getClass()
|
||||
.getResource(
|
||||
"/eu/dnetlib/dhp/actionmanager/bipfinder/publication_2.json")
|
||||
.getPath();
|
||||
|
||||
SparkAtomicActionScoreJob
|
||||
.main(
|
||||
new String[] {
|
||||
"-isSparkSessionManaged",
|
||||
Boolean.FALSE.toString(),
|
||||
"-inputPath",
|
||||
inputPath,
|
||||
"-bipScorePath",
|
||||
bipScoresPath,
|
||||
"-resultTableName",
|
||||
"eu.dnetlib.dhp.schema.oaf.Publication",
|
||||
"-outputPath",
|
||||
workingDir.toString() + "/actionSet"
|
||||
});
|
||||
|
||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<Publication> tmp = sc
|
||||
.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
|
||||
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
|
||||
.map(aa -> ((Publication) aa.getPayload()));
|
||||
|
||||
Assertions.assertTrue(tmp.count() == 1);
|
||||
|
||||
Dataset<Publication> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(Publication.class));
|
||||
verificationDataset.createOrReplaceTempView("publication");
|
||||
|
||||
Dataset<Row> execVerification = spark
|
||||
.sql(
|
||||
"Select p.id oaid, mes.id, mUnit.value from publication p " +
|
||||
"lateral view explode(measures) m as mes " +
|
||||
"lateral view explode(mes.unit) u as mUnit ");
|
||||
|
||||
Assertions.assertEquals(4, execVerification.count());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
"50|355e65625b88::ffa5bad14f4adc0c9a15c00efbbccddb",
|
||||
execVerification.select("oaid").collectAsList().get(0).getString(0));
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
2,
|
||||
execVerification.filter("id = 'influence'").count());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
2,
|
||||
execVerification.filter("id = 'popularity'").count());
|
||||
|
||||
List<Row> tmp_ds = execVerification.filter("id = 'influence'").select("value").collectAsList();
|
||||
String tmp_influence = tmp_ds.get(0).getString(0);
|
||||
Assertions
|
||||
.assertTrue(
|
||||
"1.47565045883e-08".equals(tmp_influence) ||
|
||||
"1.98956540239e-08".equals(tmp_influence));
|
||||
|
||||
tmp_influence = tmp_ds.get(1).getString(0);
|
||||
Assertions
|
||||
.assertTrue(
|
||||
"1.47565045883e-08".equals(tmp_influence) ||
|
||||
"1.98956540239e-08".equals(tmp_influence));
|
||||
|
||||
Assertions.assertTrue(!tmp_ds.get(0).getString(0).equals(tmp_ds.get(1).getString(0)));
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void matchTwo() throws Exception {
|
||||
String bipScoresPath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/actionmanager/bipfinder/bip_scores.json")
|
||||
.getPath();
|
||||
String inputPath = getClass()
|
||||
.getResource(
|
||||
"/eu/dnetlib/dhp/actionmanager/bipfinder/publication_3.json")
|
||||
.getPath();
|
||||
|
||||
SparkAtomicActionScoreJob
|
||||
.main(
|
||||
new String[] {
|
||||
"-isSparkSessionManaged",
|
||||
Boolean.FALSE.toString(),
|
||||
"-inputPath",
|
||||
inputPath,
|
||||
"-bipScorePath",
|
||||
bipScoresPath,
|
||||
"-resultTableName",
|
||||
"eu.dnetlib.dhp.schema.oaf.Publication",
|
||||
"-outputPath",
|
||||
workingDir.toString() + "/actionSet"
|
||||
});
|
||||
|
||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<Publication> tmp = sc
|
||||
.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
|
||||
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
|
||||
.map(aa -> ((Publication) aa.getPayload()));
|
||||
|
||||
Assertions.assertTrue(tmp.count() == 2);
|
||||
|
||||
Dataset<Publication> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(Publication.class));
|
||||
verificationDataset.createOrReplaceTempView("publication");
|
||||
|
||||
Dataset<Row> execVerification = spark
|
||||
.sql(
|
||||
"Select p.id oaid, mes.id, mUnit.value from publication p " +
|
||||
"lateral view explode(measures) m as mes " +
|
||||
"lateral view explode(mes.unit) u as mUnit ");
|
||||
|
||||
Assertions.assertEquals(4, execVerification.count());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
2,
|
||||
execVerification.filter("oaid = '50|355e65625b88::ffa5bad14f4adc0c9a15c00efbbccddb'").count());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
2,
|
||||
execVerification.filter("oaid = '50|acm_________::faed5b7a1bd8f51118d13ed29cfaee09'").count());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
2,
|
||||
execVerification.filter("id = 'influence'").count());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
2,
|
||||
execVerification.filter("id = 'popularity'").count());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
"1.47565045883e-08",
|
||||
execVerification
|
||||
.filter(
|
||||
"oaid = '50|355e65625b88::ffa5bad14f4adc0c9a15c00efbbccddb' " +
|
||||
"and id = 'influence'")
|
||||
.select("value")
|
||||
.collectAsList()
|
||||
.get(0)
|
||||
.getString(0));
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
"1.98956540239e-08",
|
||||
execVerification
|
||||
.filter(
|
||||
"oaid = '50|acm_________::faed5b7a1bd8f51118d13ed29cfaee09' " +
|
||||
"and id = 'influence'")
|
||||
.select("value")
|
||||
.collectAsList()
|
||||
.get(0)
|
||||
.getString(0));
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
"0.282046161584",
|
||||
execVerification
|
||||
.filter(
|
||||
"oaid = '50|acm_________::faed5b7a1bd8f51118d13ed29cfaee09' " +
|
||||
"and id = 'popularity'")
|
||||
.select("value")
|
||||
.collectAsList()
|
||||
.get(0)
|
||||
.getString(0));
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
"0.227515392",
|
||||
execVerification
|
||||
.filter(
|
||||
"oaid = '50|355e65625b88::ffa5bad14f4adc0c9a15c00efbbccddb' " +
|
||||
"and id = 'popularity'")
|
||||
.select("value")
|
||||
.collectAsList()
|
||||
.get(0)
|
||||
.getString(0));
|
||||
|
||||
}
|
||||
|
||||
}
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
@ -32,15 +32,15 @@ public class CheckDuplictedIdsJob {
|
|||
IOUtils
|
||||
.toString(
|
||||
CheckDuplictedIdsJob.class
|
||||
.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json")));
|
||||
.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/check_duplicates.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
|
||||
final String eventsPath = parser.get("workingPath") + "/events";
|
||||
final String eventsPath = parser.get("outputDir") + "/events";
|
||||
log.info("eventsPath: {}", eventsPath);
|
||||
|
||||
final String countPath = parser.get("workingPath") + "/counts";
|
||||
final String countPath = parser.get("outputDir") + "/counts";
|
||||
log.info("countPath: {}", countPath);
|
||||
|
||||
final SparkSession spark = SparkSession.builder().config(conf).getOrCreate();
|
||||
|
@ -59,6 +59,7 @@ public class CheckDuplictedIdsJob {
|
|||
.map(o -> ClusterUtils.incrementAccumulator(o, total), Encoders.tuple(Encoders.STRING(), Encoders.LONG()))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(countPath);
|
||||
;
|
||||
|
||||
|
|
|
@ -44,10 +44,10 @@ public class GenerateEventsJob {
|
|||
.orElse(Boolean.TRUE);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String workingPath = parser.get("workingPath");
|
||||
log.info("workingPath: {}", workingPath);
|
||||
final String workingDir = parser.get("workingDir");
|
||||
log.info("workingDir: {}", workingDir);
|
||||
|
||||
final String eventsPath = workingPath + "/events";
|
||||
final String eventsPath = parser.get("outputDir") + "/events";
|
||||
log.info("eventsPath: {}", eventsPath);
|
||||
|
||||
final Set<String> dsIdWhitelist = ClusterUtils.parseParamAsList(parser, "datasourceIdWhitelist");
|
||||
|
@ -59,6 +59,9 @@ public class GenerateEventsJob {
|
|||
final Set<String> dsIdBlacklist = ClusterUtils.parseParamAsList(parser, "datasourceIdBlacklist");
|
||||
log.info("datasourceIdBlacklist: {}", StringUtils.join(dsIdBlacklist, ","));
|
||||
|
||||
final Set<String> topicWhitelist = ClusterUtils.parseParamAsList(parser, "topicWhitelist");
|
||||
log.info("topicWhitelist: {}", StringUtils.join(topicWhitelist, ","));
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
|
||||
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
|
||||
|
@ -70,12 +73,12 @@ public class GenerateEventsJob {
|
|||
final LongAccumulator total = spark.sparkContext().longAccumulator("total_events");
|
||||
|
||||
final Dataset<ResultGroup> groups = ClusterUtils
|
||||
.readPath(spark, workingPath + "/duplicates", ResultGroup.class);
|
||||
.readPath(spark, workingDir + "/duplicates", ResultGroup.class);
|
||||
|
||||
final Dataset<Event> dataset = groups
|
||||
.map(
|
||||
g -> EventFinder
|
||||
.generateEvents(g, dsIdWhitelist, dsIdBlacklist, dsTypeWhitelist, accumulators),
|
||||
.generateEvents(g, dsIdWhitelist, dsIdBlacklist, dsTypeWhitelist, topicWhitelist, accumulators),
|
||||
Encoders
|
||||
.bean(EventGroup.class))
|
||||
.flatMap(g -> g.getData().iterator(), Encoders.bean(Event.class));
|
||||
|
|
|
@ -46,7 +46,7 @@ public class GenerateStatsJob {
|
|||
|
||||
final SparkConf conf = new SparkConf();
|
||||
|
||||
final String eventsPath = parser.get("workingPath") + "/events";
|
||||
final String eventsPath = parser.get("outputDir") + "/events";
|
||||
log.info("eventsPath: {}", eventsPath);
|
||||
|
||||
final String dbUrl = parser.get("dbUrl");
|
||||
|
|
|
@ -46,7 +46,7 @@ public class IndexEventSubsetJob {
|
|||
|
||||
final SparkConf conf = new SparkConf();
|
||||
|
||||
final String eventsPath = parser.get("workingPath") + "/events";
|
||||
final String eventsPath = parser.get("outputDir") + "/events";
|
||||
log.info("eventsPath: {}", eventsPath);
|
||||
|
||||
final String index = parser.get("index");
|
||||
|
@ -55,6 +55,18 @@ public class IndexEventSubsetJob {
|
|||
final String indexHost = parser.get("esHost");
|
||||
log.info("indexHost: {}", indexHost);
|
||||
|
||||
final String esBatchWriteRetryCount = parser.get("esBatchWriteRetryCount");
|
||||
log.info("esBatchWriteRetryCount: {}", esBatchWriteRetryCount);
|
||||
|
||||
final String esBatchWriteRetryWait = parser.get("esBatchWriteRetryWait");
|
||||
log.info("esBatchWriteRetryWait: {}", esBatchWriteRetryWait);
|
||||
|
||||
final String esBatchSizeEntries = parser.get("esBatchSizeEntries");
|
||||
log.info("esBatchSizeEntries: {}", esBatchSizeEntries);
|
||||
|
||||
final String esNodesWanOnly = parser.get("esNodesWanOnly");
|
||||
log.info("esNodesWanOnly: {}", esNodesWanOnly);
|
||||
|
||||
final int maxEventsForTopic = NumberUtils.toInt(parser.get("maxEventsForTopic"));
|
||||
log.info("maxEventsForTopic: {}", maxEventsForTopic);
|
||||
|
||||
|
@ -86,10 +98,10 @@ public class IndexEventSubsetJob {
|
|||
esCfg.put("es.index.auto.create", "false");
|
||||
esCfg.put("es.nodes", indexHost);
|
||||
esCfg.put("es.mapping.id", "eventId"); // THE PRIMARY KEY
|
||||
esCfg.put("es.batch.write.retry.count", "8");
|
||||
esCfg.put("es.batch.write.retry.wait", "60s");
|
||||
esCfg.put("es.batch.size.entries", "200");
|
||||
esCfg.put("es.nodes.wan.only", "true");
|
||||
esCfg.put("es.batch.write.retry.count", esBatchWriteRetryCount);
|
||||
esCfg.put("es.batch.write.retry.wait", esBatchWriteRetryWait);
|
||||
esCfg.put("es.batch.size.entries", esBatchSizeEntries);
|
||||
esCfg.put("es.nodes.wan.only", esNodesWanOnly);
|
||||
|
||||
log.info("*** Start indexing");
|
||||
JavaEsSpark.saveJsonToEs(inputRdd, index, esCfg);
|
||||
|
|
|
@ -54,7 +54,7 @@ public class IndexNotificationsJob {
|
|||
|
||||
final SparkConf conf = new SparkConf();
|
||||
|
||||
final String eventsPath = parser.get("workingPath") + "/events";
|
||||
final String eventsPath = parser.get("outputDir") + "/events";
|
||||
log.info("eventsPath: {}", eventsPath);
|
||||
|
||||
final String index = parser.get("index");
|
||||
|
@ -63,6 +63,18 @@ public class IndexNotificationsJob {
|
|||
final String indexHost = parser.get("esHost");
|
||||
log.info("indexHost: {}", indexHost);
|
||||
|
||||
final String esBatchWriteRetryCount = parser.get("esBatchWriteRetryCount");
|
||||
log.info("esBatchWriteRetryCount: {}", esBatchWriteRetryCount);
|
||||
|
||||
final String esBatchWriteRetryWait = parser.get("esBatchWriteRetryWait");
|
||||
log.info("esBatchWriteRetryWait: {}", esBatchWriteRetryWait);
|
||||
|
||||
final String esBatchSizeEntries = parser.get("esBatchSizeEntries");
|
||||
log.info("esBatchSizeEntries: {}", esBatchSizeEntries);
|
||||
|
||||
final String esNodesWanOnly = parser.get("esNodesWanOnly");
|
||||
log.info("esNodesWanOnly: {}", esNodesWanOnly);
|
||||
|
||||
final String brokerApiBaseUrl = parser.get("brokerApiBaseUrl");
|
||||
log.info("brokerApiBaseUrl: {}", brokerApiBaseUrl);
|
||||
|
||||
|
@ -92,10 +104,10 @@ public class IndexNotificationsJob {
|
|||
esCfg.put("es.index.auto.create", "false");
|
||||
esCfg.put("es.nodes", indexHost);
|
||||
esCfg.put("es.mapping.id", "notificationId"); // THE PRIMARY KEY
|
||||
esCfg.put("es.batch.write.retry.count", "8");
|
||||
esCfg.put("es.batch.write.retry.wait", "60s");
|
||||
esCfg.put("es.batch.size.entries", "200");
|
||||
esCfg.put("es.nodes.wan.only", "true");
|
||||
esCfg.put("es.batch.write.retry.count", esBatchWriteRetryCount);
|
||||
esCfg.put("es.batch.write.retry.wait", esBatchWriteRetryWait);
|
||||
esCfg.put("es.batch.size.entries", esBatchSizeEntries);
|
||||
esCfg.put("es.nodes.wan.only", esNodesWanOnly);
|
||||
|
||||
log.info("*** Start indexing");
|
||||
JavaEsSpark.saveJsonToEs(inputRdd, index, esCfg);
|
||||
|
|
|
@ -36,7 +36,7 @@ public class IndexOnESJob {
|
|||
|
||||
final SparkConf conf = new SparkConf();
|
||||
|
||||
final String eventsPath = parser.get("workingPath") + "/events";
|
||||
final String eventsPath = parser.get("outputDir") + "/events";
|
||||
log.info("eventsPath: {}", eventsPath);
|
||||
|
||||
final String index = parser.get("index");
|
||||
|
@ -45,6 +45,18 @@ public class IndexOnESJob {
|
|||
final String indexHost = parser.get("esHost");
|
||||
log.info("indexHost: {}", indexHost);
|
||||
|
||||
final String esBatchWriteRetryCount = parser.get("esBatchWriteRetryCount");
|
||||
log.info("esBatchWriteRetryCount: {}", esBatchWriteRetryCount);
|
||||
|
||||
final String esBatchWriteRetryWait = parser.get("esBatchWriteRetryWait");
|
||||
log.info("esBatchWriteRetryWait: {}", esBatchWriteRetryWait);
|
||||
|
||||
final String esBatchSizeEntries = parser.get("esBatchSizeEntries");
|
||||
log.info("esBatchSizeEntries: {}", esBatchSizeEntries);
|
||||
|
||||
final String esNodesWanOnly = parser.get("esNodesWanOnly");
|
||||
log.info("esNodesWanOnly: {}", esNodesWanOnly);
|
||||
|
||||
final SparkSession spark = SparkSession.builder().config(conf).getOrCreate();
|
||||
|
||||
final JavaRDD<String> inputRdd = ClusterUtils
|
||||
|
@ -53,15 +65,13 @@ public class IndexOnESJob {
|
|||
.javaRDD();
|
||||
|
||||
final Map<String, String> esCfg = new HashMap<>();
|
||||
// esCfg.put("es.nodes", "10.19.65.51, 10.19.65.52, 10.19.65.53, 10.19.65.54");
|
||||
|
||||
esCfg.put("es.index.auto.create", "false");
|
||||
esCfg.put("es.nodes", indexHost);
|
||||
esCfg.put("es.mapping.id", "eventId"); // THE PRIMARY KEY
|
||||
esCfg.put("es.batch.write.retry.count", "8");
|
||||
esCfg.put("es.batch.write.retry.wait", "60s");
|
||||
esCfg.put("es.batch.size.entries", "200");
|
||||
esCfg.put("es.nodes.wan.only", "true");
|
||||
esCfg.put("es.batch.write.retry.count", esBatchWriteRetryCount);
|
||||
esCfg.put("es.batch.write.retry.wait", esBatchWriteRetryWait);
|
||||
esCfg.put("es.batch.size.entries", esBatchSizeEntries);
|
||||
esCfg.put("es.nodes.wan.only", esNodesWanOnly);
|
||||
|
||||
JavaEsSpark.saveJsonToEs(inputRdd, index, esCfg);
|
||||
}
|
||||
|
|
|
@ -42,10 +42,10 @@ public class JoinStep0Job {
|
|||
final String graphPath = parser.get("graphPath");
|
||||
log.info("graphPath: {}", graphPath);
|
||||
|
||||
final String workingPath = parser.get("workingPath");
|
||||
log.info("workingPath: {}", workingPath);
|
||||
final String workingDir = parser.get("workingDir");
|
||||
log.info("workingDir: {}", workingDir);
|
||||
|
||||
final String joinedEntitiesPath = workingPath + "/joinedEntities_step0";
|
||||
final String joinedEntitiesPath = workingDir + "/joinedEntities_step0";
|
||||
log.info("joinedEntitiesPath: {}", joinedEntitiesPath);
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
|
@ -57,10 +57,10 @@ public class JoinStep0Job {
|
|||
final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities");
|
||||
|
||||
final Dataset<OaBrokerMainEntity> sources = ClusterUtils
|
||||
.readPath(spark, workingPath + "/simpleEntities", OaBrokerMainEntity.class);
|
||||
.readPath(spark, workingDir + "/simpleEntities", OaBrokerMainEntity.class);
|
||||
|
||||
final Dataset<RelatedDatasource> typedRels = ClusterUtils
|
||||
.readPath(spark, workingPath + "/relatedDatasources", RelatedDatasource.class);
|
||||
.readPath(spark, workingDir + "/relatedDatasources", RelatedDatasource.class);
|
||||
|
||||
final TypedColumn<Tuple2<OaBrokerMainEntity, RelatedDatasource>, OaBrokerMainEntity> aggr = new RelatedDatasourceAggregator()
|
||||
.toColumn();
|
||||
|
|
|
@ -40,10 +40,10 @@ public class JoinStep1Job {
|
|||
.orElse(Boolean.TRUE);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String workingPath = parser.get("workingPath");
|
||||
log.info("workingPath: {}", workingPath);
|
||||
final String workingDir = parser.get("workingDir");
|
||||
log.info("workingDir: {}", workingDir);
|
||||
|
||||
final String joinedEntitiesPath = workingPath + "/joinedEntities_step1";
|
||||
final String joinedEntitiesPath = workingDir + "/joinedEntities_step1";
|
||||
log.info("joinedEntitiesPath: {}", joinedEntitiesPath);
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
|
@ -55,10 +55,10 @@ public class JoinStep1Job {
|
|||
final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities");
|
||||
|
||||
final Dataset<OaBrokerMainEntity> sources = ClusterUtils
|
||||
.readPath(spark, workingPath + "/joinedEntities_step0", OaBrokerMainEntity.class);
|
||||
.readPath(spark, workingDir + "/joinedEntities_step0", OaBrokerMainEntity.class);
|
||||
|
||||
final Dataset<RelatedProject> typedRels = ClusterUtils
|
||||
.readPath(spark, workingPath + "/relatedProjects", RelatedProject.class);
|
||||
.readPath(spark, workingDir + "/relatedProjects", RelatedProject.class);
|
||||
|
||||
final TypedColumn<Tuple2<OaBrokerMainEntity, RelatedProject>, OaBrokerMainEntity> aggr = new RelatedProjectAggregator()
|
||||
.toColumn();
|
||||
|
|
|
@ -39,10 +39,10 @@ public class JoinStep2Job {
|
|||
.orElse(Boolean.TRUE);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String workingPath = parser.get("workingPath");
|
||||
log.info("workingPath: {}", workingPath);
|
||||
final String workingDir = parser.get("workingDir");
|
||||
log.info("workingDir: {}", workingDir);
|
||||
|
||||
final String joinedEntitiesPath = workingPath + "/joinedEntities_step2";
|
||||
final String joinedEntitiesPath = workingDir + "/joinedEntities_step2";
|
||||
log.info("joinedEntitiesPath: {}", joinedEntitiesPath);
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
|
@ -54,10 +54,10 @@ public class JoinStep2Job {
|
|||
final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities");
|
||||
|
||||
final Dataset<OaBrokerMainEntity> sources = ClusterUtils
|
||||
.readPath(spark, workingPath + "/joinedEntities_step1", OaBrokerMainEntity.class);
|
||||
.readPath(spark, workingDir + "/joinedEntities_step1", OaBrokerMainEntity.class);
|
||||
|
||||
final Dataset<RelatedSoftware> typedRels = ClusterUtils
|
||||
.readPath(spark, workingPath + "/relatedSoftwares", RelatedSoftware.class);
|
||||
.readPath(spark, workingDir + "/relatedSoftwares", RelatedSoftware.class);
|
||||
|
||||
final TypedColumn<Tuple2<OaBrokerMainEntity, RelatedSoftware>, OaBrokerMainEntity> aggr = new RelatedSoftwareAggregator()
|
||||
.toColumn();
|
||||
|
|
|
@ -40,10 +40,10 @@ public class JoinStep3Job {
|
|||
.orElse(Boolean.TRUE);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String workingPath = parser.get("workingPath");
|
||||
log.info("workingPath: {}", workingPath);
|
||||
final String workingDir = parser.get("workingDir");
|
||||
log.info("workingDir: {}", workingDir);
|
||||
|
||||
final String joinedEntitiesPath = workingPath + "/joinedEntities_step3";
|
||||
final String joinedEntitiesPath = workingDir + "/joinedEntities_step3";
|
||||
log.info("joinedEntitiesPath: {}", joinedEntitiesPath);
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
|
@ -55,10 +55,10 @@ public class JoinStep3Job {
|
|||
final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities");
|
||||
|
||||
final Dataset<OaBrokerMainEntity> sources = ClusterUtils
|
||||
.readPath(spark, workingPath + "/joinedEntities_step2", OaBrokerMainEntity.class);
|
||||
.readPath(spark, workingDir + "/joinedEntities_step2", OaBrokerMainEntity.class);
|
||||
|
||||
final Dataset<RelatedDataset> typedRels = ClusterUtils
|
||||
.readPath(spark, workingPath + "/relatedDatasets", RelatedDataset.class);
|
||||
.readPath(spark, workingDir + "/relatedDatasets", RelatedDataset.class);
|
||||
|
||||
final TypedColumn<Tuple2<OaBrokerMainEntity, RelatedDataset>, OaBrokerMainEntity> aggr = new RelatedDatasetAggregator()
|
||||
.toColumn();
|
||||
|
|
|
@ -40,10 +40,10 @@ public class JoinStep4Job {
|
|||
.orElse(Boolean.TRUE);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String workingPath = parser.get("workingPath");
|
||||
log.info("workingPath: {}", workingPath);
|
||||
final String workingDir = parser.get("workingDir");
|
||||
log.info("workingDir: {}", workingDir);
|
||||
|
||||
final String joinedEntitiesPath = workingPath + "/joinedEntities_step4";
|
||||
final String joinedEntitiesPath = workingDir + "/joinedEntities_step4";
|
||||
log.info("joinedEntitiesPath: {}", joinedEntitiesPath);
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
|
@ -55,10 +55,10 @@ public class JoinStep4Job {
|
|||
final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities");
|
||||
|
||||
final Dataset<OaBrokerMainEntity> sources = ClusterUtils
|
||||
.readPath(spark, workingPath + "/joinedEntities_step3", OaBrokerMainEntity.class);
|
||||
.readPath(spark, workingDir + "/joinedEntities_step3", OaBrokerMainEntity.class);
|
||||
|
||||
final Dataset<RelatedPublication> typedRels = ClusterUtils
|
||||
.readPath(spark, workingPath + "/relatedPublications", RelatedPublication.class);
|
||||
.readPath(spark, workingDir + "/relatedPublications", RelatedPublication.class);
|
||||
|
||||
final TypedColumn<Tuple2<OaBrokerMainEntity, RelatedPublication>, OaBrokerMainEntity> aggr = new RelatedPublicationAggregator()
|
||||
.toColumn();
|
||||
|
|
|
@ -4,8 +4,13 @@ package eu.dnetlib.dhp.broker.oa;
|
|||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.codec.digest.DigestUtils;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
|
@ -13,6 +18,8 @@ import org.apache.hadoop.fs.FileStatus;
|
|||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.FilterFunction;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.slf4j.Logger;
|
||||
|
@ -37,7 +44,7 @@ public class PartitionEventsByDsIdJob {
|
|||
IOUtils
|
||||
.toString(
|
||||
PartitionEventsByDsIdJob.class
|
||||
.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json")));
|
||||
.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/od_partitions_params.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
final Boolean isSparkSessionManaged = Optional
|
||||
|
@ -48,24 +55,43 @@ public class PartitionEventsByDsIdJob {
|
|||
|
||||
final SparkConf conf = new SparkConf();
|
||||
|
||||
final String eventsPath = parser.get("workingPath") + "/events";
|
||||
final String eventsPath = parser.get("outputDir") + "/events";
|
||||
log.info("eventsPath: {}", eventsPath);
|
||||
|
||||
final String partitionPath = parser.get("workingPath") + "/eventsByOpendoarId";
|
||||
final String partitionPath = parser.get("outputDir") + "/eventsByOpendoarId";
|
||||
log.info("partitionPath: {}", partitionPath);
|
||||
|
||||
final String opendoarIds = parser.get("opendoarIds");
|
||||
log.info("opendoarIds: {}", opendoarIds);
|
||||
|
||||
final Set<String> validOpendoarIds = new HashSet<>();
|
||||
if (!opendoarIds.trim().equals("-")) {
|
||||
validOpendoarIds
|
||||
.addAll(
|
||||
Arrays
|
||||
.stream(opendoarIds.split(","))
|
||||
.map(String::trim)
|
||||
.filter(StringUtils::isNotBlank)
|
||||
.map(s -> OPENDOAR_NSPREFIX + DigestUtils.md5Hex(s))
|
||||
.collect(Collectors.toSet()));
|
||||
}
|
||||
log.info("validOpendoarIds: {}", validOpendoarIds);
|
||||
|
||||
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
|
||||
|
||||
ClusterUtils
|
||||
.readPath(spark, eventsPath, Event.class)
|
||||
.filter(e -> StringUtils.isNotBlank(e.getMap().getTargetDatasourceId()))
|
||||
.filter(e -> e.getMap().getTargetDatasourceId().contains(OPENDOAR_NSPREFIX))
|
||||
.limit(10000)
|
||||
.map(e -> messageFromNotification(e), Encoders.bean(ShortEventMessageWithGroupId.class))
|
||||
.filter((FilterFunction<Event>) e -> StringUtils.isNotBlank(e.getMap().getTargetDatasourceId()))
|
||||
.filter((FilterFunction<Event>) e -> e.getMap().getTargetDatasourceId().startsWith(OPENDOAR_NSPREFIX))
|
||||
.filter((FilterFunction<Event>) e -> validOpendoarIds.contains(e.getMap().getTargetDatasourceId()))
|
||||
.map(
|
||||
(MapFunction<Event, ShortEventMessageWithGroupId>) e -> messageFromNotification(e),
|
||||
Encoders.bean(ShortEventMessageWithGroupId.class))
|
||||
.coalesce(1)
|
||||
.write()
|
||||
.partitionBy("group")
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(partitionPath);
|
||||
|
||||
});
|
||||
|
@ -97,6 +123,7 @@ public class PartitionEventsByDsIdJob {
|
|||
|
||||
final ShortEventMessageWithGroupId res = new ShortEventMessageWithGroupId();
|
||||
|
||||
res.setEventId(e.getEventId());
|
||||
res.setOriginalId(payload.getResult().getOriginalId());
|
||||
res.setTitle(payload.getResult().getTitles().stream().filter(StringUtils::isNotBlank).findFirst().orElse(null));
|
||||
res.setTopic(e.getTopic());
|
||||
|
|
|
@ -45,10 +45,10 @@ public class PrepareGroupsJob {
|
|||
final String graphPath = parser.get("graphPath");
|
||||
log.info("graphPath: {}", graphPath);
|
||||
|
||||
final String workingPath = parser.get("workingPath");
|
||||
log.info("workingPath: {}", workingPath);
|
||||
final String workingDir = parser.get("workingDir");
|
||||
log.info("workingDir: {}", workingDir);
|
||||
|
||||
final String groupsPath = workingPath + "/duplicates";
|
||||
final String groupsPath = workingDir + "/duplicates";
|
||||
log.info("groupsPath: {}", groupsPath);
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
|
@ -60,10 +60,10 @@ public class PrepareGroupsJob {
|
|||
final LongAccumulator total = spark.sparkContext().longAccumulator("total_groups");
|
||||
|
||||
final Dataset<OaBrokerMainEntity> results = ClusterUtils
|
||||
.readPath(spark, workingPath + "/joinedEntities_step4", OaBrokerMainEntity.class);
|
||||
.readPath(spark, workingDir + "/joinedEntities_step4", OaBrokerMainEntity.class);
|
||||
|
||||
final Dataset<Relation> mergedRels = ClusterUtils
|
||||
.readPath(spark, graphPath + "/relation", Relation.class)
|
||||
.loadRelations(graphPath, spark)
|
||||
.filter(r -> r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS));
|
||||
|
||||
final TypedColumn<Tuple2<OaBrokerMainEntity, Relation>, ResultGroup> aggr = new ResultAggregator()
|
||||
|
|
|
@ -42,10 +42,10 @@ public class PrepareRelatedDatasetsJob {
|
|||
final String graphPath = parser.get("graphPath");
|
||||
log.info("graphPath: {}", graphPath);
|
||||
|
||||
final String workingPath = parser.get("workingPath");
|
||||
log.info("workingPath: {}", workingPath);
|
||||
final String workingDir = parser.get("workingDir");
|
||||
log.info("workingDir: {}", workingDir);
|
||||
|
||||
final String relsPath = workingPath + "/relatedDatasets";
|
||||
final String relsPath = workingDir + "/relatedDatasets";
|
||||
log.info("relsPath: {}", relsPath);
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
|
@ -62,7 +62,7 @@ public class PrepareRelatedDatasetsJob {
|
|||
.map(ConversionUtils::oafDatasetToBrokerDataset, Encoders.bean(OaBrokerRelatedDataset.class));
|
||||
|
||||
final Dataset<Relation> rels = ClusterUtils
|
||||
.readPath(spark, graphPath + "/relation", Relation.class)
|
||||
.loadRelations(graphPath, spark)
|
||||
.filter(r -> r.getDataInfo().getDeletedbyinference())
|
||||
.filter(r -> r.getRelType().equals(ModelConstants.RESULT_RESULT))
|
||||
.filter(r -> ClusterUtils.isValidResultResultClass(r.getRelClass()))
|
||||
|
@ -72,7 +72,8 @@ public class PrepareRelatedDatasetsJob {
|
|||
final Dataset<RelatedDataset> dataset = rels
|
||||
.joinWith(datasets, datasets.col("openaireId").equalTo(rels.col("target")), "inner")
|
||||
.map(t -> {
|
||||
final RelatedDataset rel = new RelatedDataset(t._1.getSource(), t._2);
|
||||
final RelatedDataset rel = new RelatedDataset(t._1.getSource(),
|
||||
t._2);
|
||||
rel.getRelDataset().setRelType(t._1.getRelClass());
|
||||
return rel;
|
||||
}, Encoders.bean(RelatedDataset.class));
|
||||
|
|
|
@ -48,10 +48,10 @@ public class PrepareRelatedDatasourcesJob {
|
|||
final String graphPath = parser.get("graphPath");
|
||||
log.info("graphPath: {}", graphPath);
|
||||
|
||||
final String workingPath = parser.get("workingPath");
|
||||
log.info("workingPath: {}", workingPath);
|
||||
final String workingDir = parser.get("workingDir");
|
||||
log.info("workingDir: {}", workingDir);
|
||||
|
||||
final String relsPath = workingPath + "/relatedDatasources";
|
||||
final String relsPath = workingDir + "/relatedDatasources";
|
||||
log.info("relsPath: {}", relsPath);
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
|
|
|
@ -44,10 +44,10 @@ public class PrepareRelatedProjectsJob {
|
|||
final String graphPath = parser.get("graphPath");
|
||||
log.info("graphPath: {}", graphPath);
|
||||
|
||||
final String workingPath = parser.get("workingPath");
|
||||
log.info("workingPath: {}", workingPath);
|
||||
final String workingDir = parser.get("workingDir");
|
||||
log.info("workingDir: {}", workingDir);
|
||||
|
||||
final String relsPath = workingPath + "/relatedProjects";
|
||||
final String relsPath = workingDir + "/relatedProjects";
|
||||
log.info("relsPath: {}", relsPath);
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
|
@ -64,7 +64,7 @@ public class PrepareRelatedProjectsJob {
|
|||
.map(ConversionUtils::oafProjectToBrokerProject, Encoders.bean(OaBrokerProject.class));
|
||||
|
||||
final Dataset<Relation> rels = ClusterUtils
|
||||
.readPath(spark, graphPath + "/relation", Relation.class)
|
||||
.loadRelations(graphPath, spark)
|
||||
.filter(r -> r.getDataInfo().getDeletedbyinference())
|
||||
.filter(r -> r.getRelType().equals(ModelConstants.RESULT_PROJECT))
|
||||
.filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS))
|
||||
|
|
|
@ -43,10 +43,10 @@ public class PrepareRelatedPublicationsJob {
|
|||
final String graphPath = parser.get("graphPath");
|
||||
log.info("graphPath: {}", graphPath);
|
||||
|
||||
final String workingPath = parser.get("workingPath");
|
||||
log.info("workingPath: {}", workingPath);
|
||||
final String workingDir = parser.get("workingDir");
|
||||
log.info("workingDir: {}", workingDir);
|
||||
|
||||
final String relsPath = workingPath + "/relatedPublications";
|
||||
final String relsPath = workingDir + "/relatedPublications";
|
||||
log.info("relsPath: {}", relsPath);
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
|
@ -65,7 +65,7 @@ public class PrepareRelatedPublicationsJob {
|
|||
Encoders.bean(OaBrokerRelatedPublication.class));
|
||||
|
||||
final Dataset<Relation> rels = ClusterUtils
|
||||
.readPath(spark, graphPath + "/relation", Relation.class)
|
||||
.loadRelations(graphPath, spark)
|
||||
.filter(r -> r.getDataInfo().getDeletedbyinference())
|
||||
.filter(r -> r.getRelType().equals(ModelConstants.RESULT_RESULT))
|
||||
.filter(r -> ClusterUtils.isValidResultResultClass(r.getRelClass()))
|
||||
|
@ -75,7 +75,8 @@ public class PrepareRelatedPublicationsJob {
|
|||
final Dataset<RelatedPublication> dataset = rels
|
||||
.joinWith(pubs, pubs.col("openaireId").equalTo(rels.col("target")), "inner")
|
||||
.map(t -> {
|
||||
final RelatedPublication rel = new RelatedPublication(t._1.getSource(), t._2);
|
||||
final RelatedPublication rel = new RelatedPublication(
|
||||
t._1.getSource(), t._2);
|
||||
rel.getRelPublication().setRelType(t._1.getRelClass());
|
||||
return rel;
|
||||
}, Encoders.bean(RelatedPublication.class));
|
||||
|
|
|
@ -44,10 +44,10 @@ public class PrepareRelatedSoftwaresJob {
|
|||
final String graphPath = parser.get("graphPath");
|
||||
log.info("graphPath: {}", graphPath);
|
||||
|
||||
final String workingPath = parser.get("workingPath");
|
||||
log.info("workingPath: {}", workingPath);
|
||||
final String workingDir = parser.get("workingDir");
|
||||
log.info("workingDir: {}", workingDir);
|
||||
|
||||
final String relsPath = workingPath + "/relatedSoftwares";
|
||||
final String relsPath = workingDir + "/relatedSoftwares";
|
||||
log.info("relsPath: {}", relsPath);
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
|
@ -64,7 +64,7 @@ public class PrepareRelatedSoftwaresJob {
|
|||
.map(ConversionUtils::oafSoftwareToBrokerSoftware, Encoders.bean(OaBrokerRelatedSoftware.class));
|
||||
|
||||
final Dataset<Relation> rels = ClusterUtils
|
||||
.readPath(spark, graphPath + "/relation", Relation.class)
|
||||
.loadRelations(graphPath, spark)
|
||||
.filter(r -> r.getDataInfo().getDeletedbyinference())
|
||||
.filter(r -> r.getRelType().equals(ModelConstants.RESULT_RESULT))
|
||||
.filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS))
|
||||
|
|
|
@ -44,10 +44,10 @@ public class PrepareSimpleEntititiesJob {
|
|||
final String graphPath = parser.get("graphPath");
|
||||
log.info("graphPath: {}", graphPath);
|
||||
|
||||
final String workingPath = parser.get("workingPath");
|
||||
log.info("workingPath: {}", workingPath);
|
||||
final String workingDir = parser.get("workingDir");
|
||||
log.info("workingDir: {}", workingDir);
|
||||
|
||||
final String simpleEntitiesPath = workingPath + "/simpleEntities";
|
||||
final String simpleEntitiesPath = workingDir + "/simpleEntities";
|
||||
log.info("simpleEntitiesPath: {}", simpleEntitiesPath);
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
|
|
|
@ -16,7 +16,24 @@ public class EnrichMissingSubject extends UpdateMatcher<OaBrokerTypedValue> {
|
|||
|
||||
public EnrichMissingSubject() {
|
||||
super(20,
|
||||
s -> Topic.fromPath("ENRICH/MISSING/SUBJECT/" + s.getType()),
|
||||
s -> {
|
||||
switch (s.getType().toLowerCase()) {
|
||||
case "acm":
|
||||
return Topic.ENRICH_MISSING_SUBJECT_ACM;
|
||||
case "arxiv":
|
||||
return Topic.ENRICH_MISSING_SUBJECT_ARXIV;
|
||||
case "ddc":
|
||||
return Topic.ENRICH_MISSING_SUBJECT_DDC;
|
||||
case "jel":
|
||||
return Topic.ENRICH_MISSING_SUBJECT_JEL;
|
||||
case "mesh":
|
||||
return Topic.ENRICH_MISSING_SUBJECT_MESHEUROPMC;
|
||||
case "rvk":
|
||||
return Topic.ENRICH_MISSING_SUBJECT_RVK;
|
||||
default:
|
||||
return null;
|
||||
}
|
||||
},
|
||||
(p, s) -> p.getSubjects().add(s),
|
||||
s -> subjectAsString(s));
|
||||
}
|
||||
|
|
|
@ -16,7 +16,24 @@ public class EnrichMoreSubject extends UpdateMatcher<OaBrokerTypedValue> {
|
|||
|
||||
public EnrichMoreSubject() {
|
||||
super(20,
|
||||
s -> Topic.fromPath("ENRICH/MORE/SUBJECT/" + s.getType()),
|
||||
s -> {
|
||||
switch (s.getType().toLowerCase()) {
|
||||
case "acm":
|
||||
return Topic.ENRICH_MORE_SUBJECT_ACM;
|
||||
case "arxiv":
|
||||
return Topic.ENRICH_MORE_SUBJECT_ARXIV;
|
||||
case "ddc":
|
||||
return Topic.ENRICH_MORE_SUBJECT_DDC;
|
||||
case "jel":
|
||||
return Topic.ENRICH_MORE_SUBJECT_JEL;
|
||||
case "mesh":
|
||||
return Topic.ENRICH_MORE_SUBJECT_MESHEUROPMC;
|
||||
case "rvk":
|
||||
return Topic.ENRICH_MORE_SUBJECT_RVK;
|
||||
default:
|
||||
return null;
|
||||
}
|
||||
},
|
||||
(p, s) -> p.getSubjects().add(s),
|
||||
s -> subjectAsString(s));
|
||||
}
|
||||
|
|
|
@ -17,6 +17,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
|||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
|
||||
public class ClusterUtils {
|
||||
|
||||
|
@ -30,6 +31,16 @@ public class ClusterUtils {
|
|||
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
|
||||
}
|
||||
|
||||
public static Dataset<Relation> loadRelations(final String graphPath, final SparkSession spark) {
|
||||
return ClusterUtils
|
||||
.readPath(spark, graphPath + "/relation", Relation.class)
|
||||
.map(r -> {
|
||||
r.setSource(ConversionUtils.cleanOpenaireId(r.getSource()));
|
||||
r.setTarget(ConversionUtils.cleanOpenaireId(r.getTarget()));
|
||||
return r;
|
||||
}, Encoders.bean(Relation.class));
|
||||
}
|
||||
|
||||
public static <R> Dataset<R> readPath(
|
||||
final SparkSession spark,
|
||||
final String inputPath,
|
||||
|
@ -67,6 +78,7 @@ public class ClusterUtils {
|
|||
.map(o -> ClusterUtils.incrementAccumulator(o, acc), Encoders.bean(clazz))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(path);
|
||||
}
|
||||
|
||||
|
|
|
@ -74,7 +74,7 @@ public class ConversionUtils {
|
|||
}
|
||||
|
||||
final OaBrokerRelatedDataset res = new OaBrokerRelatedDataset();
|
||||
res.setOpenaireId(d.getId());
|
||||
res.setOpenaireId(cleanOpenaireId(d.getId()));
|
||||
res.setOriginalId(first(d.getOriginalId()));
|
||||
res.setTitle(structPropValue(d.getTitle()));
|
||||
res.setPids(mappedList(d.getPid(), ConversionUtils::oafPidToBrokerPid));
|
||||
|
@ -89,7 +89,7 @@ public class ConversionUtils {
|
|||
}
|
||||
|
||||
final OaBrokerRelatedPublication res = new OaBrokerRelatedPublication();
|
||||
res.setOpenaireId(p.getId());
|
||||
res.setOpenaireId(cleanOpenaireId(p.getId()));
|
||||
res.setOriginalId(first(p.getOriginalId()));
|
||||
res.setTitle(structPropValue(p.getTitle()));
|
||||
res.setPids(mappedList(p.getPid(), ConversionUtils::oafPidToBrokerPid));
|
||||
|
@ -106,7 +106,7 @@ public class ConversionUtils {
|
|||
|
||||
final OaBrokerMainEntity res = new OaBrokerMainEntity();
|
||||
|
||||
res.setOpenaireId(result.getId());
|
||||
res.setOpenaireId(cleanOpenaireId(result.getId()));
|
||||
res.setOriginalId(first(result.getOriginalId()));
|
||||
res.setTypology(classId(result.getResulttype()));
|
||||
res.setTitles(structPropList(result.getTitle()));
|
||||
|
@ -129,6 +129,10 @@ public class ConversionUtils {
|
|||
return res;
|
||||
}
|
||||
|
||||
public static String cleanOpenaireId(final String id) {
|
||||
return id.contains("|") ? StringUtils.substringAfter(id, "|") : id;
|
||||
}
|
||||
|
||||
private static OaBrokerAuthor oafAuthorToBrokerAuthor(final Author author) {
|
||||
if (author == null) {
|
||||
return null;
|
||||
|
@ -188,7 +192,7 @@ public class ConversionUtils {
|
|||
}
|
||||
|
||||
final OaBrokerProject res = new OaBrokerProject();
|
||||
res.setOpenaireId(p.getId());
|
||||
res.setOpenaireId(cleanOpenaireId(p.getId()));
|
||||
res.setTitle(fieldValue(p.getTitle()));
|
||||
res.setAcronym(fieldValue(p.getAcronym()));
|
||||
res.setCode(fieldValue(p.getCode()));
|
||||
|
@ -214,7 +218,7 @@ public class ConversionUtils {
|
|||
}
|
||||
|
||||
final OaBrokerRelatedSoftware res = new OaBrokerRelatedSoftware();
|
||||
res.setOpenaireId(sw.getId());
|
||||
res.setOpenaireId(cleanOpenaireId(sw.getId()));
|
||||
res.setName(structPropValue(sw.getTitle()));
|
||||
res.setDescription(fieldValue(sw.getDescription()));
|
||||
res.setRepository(fieldValue(sw.getCodeRepositoryUrl()));
|
||||
|
@ -230,7 +234,7 @@ public class ConversionUtils {
|
|||
|
||||
final OaBrokerRelatedDatasource res = new OaBrokerRelatedDatasource();
|
||||
res.setName(StringUtils.defaultIfBlank(fieldValue(ds.getOfficialname()), fieldValue(ds.getEnglishname())));
|
||||
res.setOpenaireId(ds.getId());
|
||||
res.setOpenaireId(cleanOpenaireId(ds.getId()));
|
||||
res.setType(classId(ds.getDatasourcetype()));
|
||||
return res;
|
||||
}
|
||||
|
|
|
@ -59,9 +59,18 @@ public class DatasourceRelationsAccumulator implements Serializable {
|
|||
final DatasourceRelationsAccumulator res = new DatasourceRelationsAccumulator();
|
||||
collectedFromSet
|
||||
.stream()
|
||||
.map(s -> new Tuple3<>(r.getId(), s, BrokerConstants.COLLECTED_FROM_REL))
|
||||
.map(
|
||||
s -> new Tuple3<>(ConversionUtils.cleanOpenaireId(r.getId()), ConversionUtils.cleanOpenaireId(s),
|
||||
BrokerConstants.COLLECTED_FROM_REL))
|
||||
.forEach(res::addTuple);
|
||||
hostedBySet.stream().map(s -> new Tuple3<>(r.getId(), s, BrokerConstants.HOSTED_BY_REL)).forEach(res::addTuple);
|
||||
|
||||
hostedBySet
|
||||
.stream()
|
||||
.map(
|
||||
s -> new Tuple3<>(ConversionUtils.cleanOpenaireId(r.getId()), ConversionUtils.cleanOpenaireId(s),
|
||||
BrokerConstants.HOSTED_BY_REL))
|
||||
.forEach(res::addTuple);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
|
|
|
@ -76,6 +76,7 @@ public class EventFinder {
|
|||
final Set<String> dsIdWhitelist,
|
||||
final Set<String> dsIdBlacklist,
|
||||
final Set<String> dsTypeWhitelist,
|
||||
final Set<String> topicWhitelist,
|
||||
final Map<String, LongAccumulator> accumulators) {
|
||||
|
||||
final List<UpdateInfo<?>> list = new ArrayList<>();
|
||||
|
@ -84,7 +85,13 @@ public class EventFinder {
|
|||
for (final OaBrokerRelatedDatasource targetDs : target.getDatasources()) {
|
||||
if (verifyTarget(targetDs, dsIdWhitelist, dsIdBlacklist, dsTypeWhitelist)) {
|
||||
for (final UpdateMatcher<?> matcher : matchers) {
|
||||
list.addAll(matcher.searchUpdatesForRecord(target, targetDs, results.getData(), accumulators));
|
||||
for (final UpdateInfo<?> info : matcher
|
||||
.searchUpdatesForRecord(target, targetDs, results.getData(), accumulators)) {
|
||||
if (topicWhitelist == null || topicWhitelist.isEmpty()
|
||||
|| topicWhitelist.contains(info.getTopic().getPath())) {
|
||||
list.add(info);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,9 @@
|
|||
[
|
||||
|
||||
{
|
||||
"paramName": "o",
|
||||
"paramLongName": "outputDir",
|
||||
"paramDescription": "the path where the data are stored",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
|
@ -7,7 +7,7 @@
|
|||
},
|
||||
{
|
||||
"paramName": "o",
|
||||
"paramLongName": "workingPath",
|
||||
"paramLongName": "workingDir",
|
||||
"paramDescription": "the path where the temporary data will be stored",
|
||||
"paramRequired": true
|
||||
}
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
<description>the path where the graph is stored</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>workingPath</name>
|
||||
<name>outputDir</name>
|
||||
<description>the path where the the generated data will be stored</description>
|
||||
</property>
|
||||
<property>
|
||||
|
@ -24,6 +24,11 @@
|
|||
<value>-</value>
|
||||
<description>a black list (comma separeted, - for empty list) of datasource ids</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>topicWhitelist</name>
|
||||
<value>*</value>
|
||||
<description>a white list (comma separeted, * for all) of topics</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>esEventIndexName</name>
|
||||
<description>the elasticsearch index name for events</description>
|
||||
|
@ -36,6 +41,26 @@
|
|||
<name>esIndexHost</name>
|
||||
<description>the elasticsearch host</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>esBatchWriteRetryCount</name>
|
||||
<value>8</value>
|
||||
<description>an ES configuration property</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>esBatchWriteRetryWait</name>
|
||||
<value>60s</value>
|
||||
<description>an ES configuration property</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>esBatchSizeEntries</name>
|
||||
<value>200</value>
|
||||
<description>an ES configuration property</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>esNodesWanOnly</name>
|
||||
<value>true</value>
|
||||
<description>an ES configuration property</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>maxIndexedEventsForDsAndTopic</name>
|
||||
<description>the max number of events for each couple (ds/topic)</description>
|
||||
|
@ -111,15 +136,15 @@
|
|||
</configuration>
|
||||
</global>
|
||||
|
||||
<start to="ensure_working_path"/>
|
||||
<start to="ensure_output_dir"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="ensure_working_path">
|
||||
<action name="ensure_output_dir">
|
||||
<fs>
|
||||
<mkdir path='${workingPath}'/>
|
||||
<mkdir path='${outputDir}'/>
|
||||
</fs>
|
||||
<ok to="start_entities_and_rels"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -152,7 +177,7 @@
|
|||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||
</spark>
|
||||
<ok to="wait_entities_and_rels"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -176,7 +201,7 @@
|
|||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||
</spark>
|
||||
<ok to="wait_entities_and_rels"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -201,7 +226,7 @@
|
|||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||
</spark>
|
||||
<ok to="wait_entities_and_rels"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -225,7 +250,7 @@
|
|||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||
</spark>
|
||||
<ok to="wait_entities_and_rels"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -249,7 +274,7 @@
|
|||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||
</spark>
|
||||
<ok to="wait_entities_and_rels"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -273,7 +298,7 @@
|
|||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||
</spark>
|
||||
<ok to="wait_entities_and_rels"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -299,7 +324,7 @@
|
|||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||
</spark>
|
||||
<ok to="join_entities_step1"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -323,7 +348,7 @@
|
|||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||
</spark>
|
||||
<ok to="join_entities_step2"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -347,7 +372,7 @@
|
|||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||
</spark>
|
||||
<ok to="join_entities_step3"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -371,7 +396,7 @@
|
|||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||
</spark>
|
||||
<ok to="join_entities_step4"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -395,7 +420,7 @@
|
|||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||
</spark>
|
||||
<ok to="prepare_groups"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -419,7 +444,7 @@
|
|||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||
</spark>
|
||||
<ok to="generate_events"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -442,10 +467,12 @@
|
|||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||
<arg>--outputDir</arg><arg>${outputDir}</arg>
|
||||
<arg>--datasourceIdWhitelist</arg><arg>${datasourceIdWhitelist}</arg>
|
||||
<arg>--datasourceTypeWhitelist</arg><arg>${datasourceTypeWhitelist}</arg>
|
||||
<arg>--datasourceIdBlacklist</arg><arg>${datasourceIdBlacklist}</arg>
|
||||
<arg>--topicWhitelist</arg><arg>${topicWhitelist}</arg>
|
||||
</spark>
|
||||
<ok to="index_event_subset"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -468,9 +495,13 @@
|
|||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||
<arg>--outputDir</arg><arg>${outputDir}</arg>
|
||||
<arg>--index</arg><arg>${esEventIndexName}</arg>
|
||||
<arg>--esHost</arg><arg>${esIndexHost}</arg>
|
||||
<arg>--esBatchWriteRetryCount</arg><arg>${esBatchWriteRetryCount}</arg>
|
||||
<arg>--esBatchWriteRetryWait</arg><arg>${esBatchWriteRetryWait}</arg>
|
||||
<arg>--esBatchSizeEntries</arg><arg>${esBatchSizeEntries}</arg>
|
||||
<arg>--esNodesWanOnly</arg><arg>${esNodesWanOnly}</arg>
|
||||
<arg>--maxEventsForTopic</arg><arg>${maxIndexedEventsForDsAndTopic}</arg>
|
||||
<arg>--brokerApiBaseUrl</arg><arg>${brokerApiBaseUrl}</arg>
|
||||
</spark>
|
||||
|
@ -495,9 +526,13 @@
|
|||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||
<arg>--outputDir</arg><arg>${outputDir}</arg>
|
||||
<arg>--index</arg><arg>${esNotificationsIndexName}</arg>
|
||||
<arg>--esHost</arg><arg>${esIndexHost}</arg>
|
||||
<arg>--esBatchWriteRetryCount</arg><arg>${esBatchWriteRetryCount}</arg>
|
||||
<arg>--esBatchWriteRetryWait</arg><arg>${esBatchWriteRetryWait}</arg>
|
||||
<arg>--esBatchSizeEntries</arg><arg>${esBatchSizeEntries}</arg>
|
||||
<arg>--esNodesWanOnly</arg><arg>${esNodesWanOnly}</arg>
|
||||
<arg>--brokerApiBaseUrl</arg><arg>${brokerApiBaseUrl}</arg>
|
||||
</spark>
|
||||
<ok to="stats"/>
|
||||
|
@ -521,7 +556,7 @@
|
|||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||
<arg>--outputDir</arg><arg>${outputDir}</arg>
|
||||
<arg>--dbUrl</arg><arg>${brokerDbUrl}</arg>
|
||||
<arg>--dbUser</arg><arg>${brokerDbUser}</arg>
|
||||
<arg>--dbPassword</arg><arg>${brokerDbPassword}</arg>
|
||||
|
|
|
@ -1,7 +1,13 @@
|
|||
[
|
||||
{
|
||||
"paramName": "wp",
|
||||
"paramLongName": "workingDir",
|
||||
"paramDescription": "the path where the temporary data are stored",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "o",
|
||||
"paramLongName": "workingPath",
|
||||
"paramLongName": "outputDir",
|
||||
"paramDescription": "the path where the generated events will be stored",
|
||||
"paramRequired": true
|
||||
},
|
||||
|
@ -22,5 +28,11 @@
|
|||
"paramLongName": "datasourceIdBlacklist",
|
||||
"paramDescription": "a black list (comma separeted, - for empty list) of datasource ids",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "topicWhitelist",
|
||||
"paramLongName": "topicWhitelist",
|
||||
"paramDescription": "a white list (comma separeted, * for all) of topics",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
[
|
||||
{
|
||||
"paramName": "o",
|
||||
"paramLongName": "workingPath",
|
||||
"paramDescription": "the workinh path",
|
||||
"paramLongName": "outputDir",
|
||||
"paramDescription": "the data path",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
|
@ -16,5 +16,29 @@
|
|||
"paramLongName": "esHost",
|
||||
"paramDescription": "the ES host",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "esBatchWriteRetryCount",
|
||||
"paramLongName": "esBatchWriteRetryCount",
|
||||
"paramDescription": "an ES configuration property",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "esBatchWriteRetryWait",
|
||||
"paramLongName": "esBatchWriteRetryWait",
|
||||
"paramDescription": "an ES configuration property",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "esBatchSizeEntries",
|
||||
"paramLongName": "esBatchSizeEntries",
|
||||
"paramDescription": "an ES configuration property",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "esNodesWanOnly",
|
||||
"paramLongName": "esNodesWanOnly",
|
||||
"paramDescription": "an ES configuration property",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
[
|
||||
{
|
||||
"paramName": "o",
|
||||
"paramLongName": "workingPath",
|
||||
"paramDescription": "the workinh path",
|
||||
"paramLongName": "outputDir",
|
||||
"paramDescription": "the path where the generated data are stored",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
|
@ -17,6 +17,30 @@
|
|||
"paramDescription": "the ES host",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "esBatchWriteRetryCount",
|
||||
"paramLongName": "esBatchWriteRetryCount",
|
||||
"paramDescription": "an ES configuration property",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "esBatchWriteRetryWait",
|
||||
"paramLongName": "esBatchWriteRetryWait",
|
||||
"paramDescription": "an ES configuration property",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "esBatchSizeEntries",
|
||||
"paramLongName": "esBatchSizeEntries",
|
||||
"paramDescription": "an ES configuration property",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "esNodesWanOnly",
|
||||
"paramLongName": "esNodesWanOnly",
|
||||
"paramDescription": "an ES configuration property",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "n",
|
||||
"paramLongName": "maxEventsForTopic",
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
[
|
||||
{
|
||||
"paramName": "o",
|
||||
"paramLongName": "workingPath",
|
||||
"paramDescription": "the workinh path",
|
||||
"paramLongName": "outputDir",
|
||||
"paramDescription": "the dir that contains the events folder",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
|
@ -17,6 +17,30 @@
|
|||
"paramDescription": "the ES host",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "esBatchWriteRetryCount",
|
||||
"paramLongName": "esBatchWriteRetryCount",
|
||||
"paramDescription": "an ES configuration property",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "esBatchWriteRetryWait",
|
||||
"paramLongName": "esBatchWriteRetryWait",
|
||||
"paramDescription": "an ES configuration property",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "esBatchSizeEntries",
|
||||
"paramLongName": "esBatchSizeEntries",
|
||||
"paramDescription": "an ES configuration property",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "esNodesWanOnly",
|
||||
"paramLongName": "esNodesWanOnly",
|
||||
"paramDescription": "an ES configuration property",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "broker",
|
||||
"paramLongName": "brokerApiBaseUrl",
|
||||
|
|
|
@ -6,8 +6,8 @@
|
|||
<description>the path where the graph is stored</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>workingPath</name>
|
||||
<description>the path where the the generated data will be stored</description>
|
||||
<name>outputDir</name>
|
||||
<description>the path where the the generated data are stored</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>datasourceIdWhitelist</name>
|
||||
|
@ -36,6 +36,26 @@
|
|||
<name>esIndexHost</name>
|
||||
<description>the elasticsearch host</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>esBatchWriteRetryCount</name>
|
||||
<value>8</value>
|
||||
<description>an ES configuration property</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>esBatchWriteRetryWait</name>
|
||||
<value>60s</value>
|
||||
<description>an ES configuration property</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>esBatchSizeEntries</name>
|
||||
<value>200</value>
|
||||
<description>an ES configuration property</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>esNodesWanOnly</name>
|
||||
<value>true</value>
|
||||
<description>an ES configuration property</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>maxIndexedEventsForDsAndTopic</name>
|
||||
<description>the max number of events for each couple (ds/topic)</description>
|
||||
|
@ -122,9 +142,13 @@
|
|||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||
<arg>--outputDir</arg><arg>${outputDir}</arg>
|
||||
<arg>--index</arg><arg>${esNotificationsIndexName}</arg>
|
||||
<arg>--esHost</arg><arg>${esIndexHost}</arg>
|
||||
<arg>--esBatchWriteRetryCount</arg><arg>${esBatchWriteRetryCount}</arg>
|
||||
<arg>--esBatchWriteRetryWait</arg><arg>${esBatchWriteRetryWait}</arg>
|
||||
<arg>--esBatchSizeEntries</arg><arg>${esBatchSizeEntries}</arg>
|
||||
<arg>--esNodesWanOnly</arg><arg>${esNodesWanOnly}</arg>
|
||||
<arg>--brokerApiBaseUrl</arg><arg>${brokerApiBaseUrl}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
|
|
|
@ -0,0 +1,14 @@
|
|||
[
|
||||
{
|
||||
"paramName": "o",
|
||||
"paramLongName": "outputDir",
|
||||
"paramDescription": "the path where the data will be stored",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "list",
|
||||
"paramLongName": "opendoarIds",
|
||||
"paramDescription": "the opendoar IDs whitelist (comma separated)",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
|
@ -0,0 +1,99 @@
|
|||
<workflow-app name="partitionEventsByOpendoarIds" xmlns="uri:oozie:workflow:0.5">
|
||||
|
||||
<parameters>
|
||||
<property>
|
||||
<name>opendoarIds</name>
|
||||
<description>the opendoar IDs whitelist (comma separated)</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>outputDir</name>
|
||||
<description>the path where the the generated data will be stored</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores used by single executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozieActionShareLibForSpark2</name>
|
||||
<description>oozie action sharelib for spark 2.*</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2ExtraListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
|
||||
<description>spark 2.* extra listeners classname</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2SqlQueryExecutionListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
|
||||
<description>spark 2.* sql query execution listeners classname</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2YarnHistoryServerAddress</name>
|
||||
<description>spark 2.* yarn history server address</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2EventLogDir</name>
|
||||
<description>spark 2.* event log dir location</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<global>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>mapreduce.job.queuename</name>
|
||||
<value>${queueName}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapred.job.queue.name</name>
|
||||
<value>${oozieLauncherQueueName}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>${oozieActionShareLibForSpark2}</value>
|
||||
</property>
|
||||
</configuration>
|
||||
</global>
|
||||
|
||||
<start to="opendoarPartition"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="opendoarPartition">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>PartitionEventsByDsIdJob</name>
|
||||
<class>eu.dnetlib.dhp.broker.oa.PartitionEventsByDsIdJob</class>
|
||||
<jar>dhp-broker-events-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--outputDir</arg><arg>${outputDir}</arg>
|
||||
<arg>--opendoarIds</arg><arg>${opendoarIds}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
|
||||
</workflow-app>
|
|
@ -0,0 +1,18 @@
|
|||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -1,41 +1,38 @@
|
|||
<workflow-app name="create broker events - partial" xmlns="uri:oozie:workflow:0.5">
|
||||
<workflow-app name="reindex_events" xmlns="uri:oozie:workflow:0.5">
|
||||
|
||||
<parameters>
|
||||
<property>
|
||||
<name>graphInputPath</name>
|
||||
<description>the path where the graph is stored</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>workingPath</name>
|
||||
<name>outputDir</name>
|
||||
<description>the path where the the generated data will be stored</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>datasourceIdWhitelist</name>
|
||||
<value>-</value>
|
||||
<description>a white list (comma separeted, - for empty list) of datasource ids</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>datasourceTypeWhitelist</name>
|
||||
<value>-</value>
|
||||
<description>a white list (comma separeted, - for empty list) of datasource types</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>datasourceIdBlacklist</name>
|
||||
<value>-</value>
|
||||
<description>a black list (comma separeted, - for empty list) of datasource ids</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>esEventIndexName</name>
|
||||
<description>the elasticsearch index name for events</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>esNotificationsIndexName</name>
|
||||
<description>the elasticsearch index name for notifications</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>esIndexHost</name>
|
||||
<description>the elasticsearch host</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>esBatchWriteRetryCount</name>
|
||||
<value>8</value>
|
||||
<description>an ES configuration property</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>esBatchWriteRetryWait</name>
|
||||
<value>60s</value>
|
||||
<description>an ES configuration property</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>esBatchSizeEntries</name>
|
||||
<value>200</value>
|
||||
<description>an ES configuration property</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>esNodesWanOnly</name>
|
||||
<value>true</value>
|
||||
<description>an ES configuration property</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>maxIndexedEventsForDsAndTopic</name>
|
||||
<description>the max number of events for each couple (ds/topic)</description>
|
||||
|
@ -44,18 +41,6 @@
|
|||
<name>brokerApiBaseUrl</name>
|
||||
<description>the url of the broker service api</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>brokerDbUrl</name>
|
||||
<description>the url of the broker database</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>brokerDbUser</name>
|
||||
<description>the user of the broker database</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>brokerDbPassword</name>
|
||||
<description>the password of the broker database</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
|
@ -111,36 +96,45 @@
|
|||
</configuration>
|
||||
</global>
|
||||
|
||||
<start to="partition"/>
|
||||
<start to="index_event_subset"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="partition">
|
||||
<action name="index_event_subset">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>PartitionEventsByDsIdJob</name>
|
||||
<class>eu.dnetlib.dhp.broker.oa.PartitionEventsByDsIdJob</class>
|
||||
<name>IndexEventSubsetOnESJob</name>
|
||||
<class>eu.dnetlib.dhp.broker.oa.IndexEventSubsetJob</class>
|
||||
<jar>dhp-broker-events-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.dynamicAllocation.maxExecutors="8"
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||
<arg>--outputDir</arg><arg>${outputDir}</arg>
|
||||
<arg>--index</arg><arg>${esEventIndexName}</arg>
|
||||
<arg>--esHost</arg><arg>${esIndexHost}</arg>
|
||||
<arg>--esBatchWriteRetryCount</arg><arg>${esBatchWriteRetryCount}</arg>
|
||||
<arg>--esBatchWriteRetryWait</arg><arg>${esBatchWriteRetryWait}</arg>
|
||||
<arg>--esBatchSizeEntries</arg><arg>${esBatchSizeEntries}</arg>
|
||||
<arg>--esNodesWanOnly</arg><arg>${esNodesWanOnly}</arg>
|
||||
<arg>--maxEventsForTopic</arg><arg>${maxIndexedEventsForDsAndTopic}</arg>
|
||||
<arg>--brokerApiBaseUrl</arg><arg>${brokerApiBaseUrl}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
|
||||
<end name="End"/>
|
||||
|
||||
</workflow-app>
|
|
@ -1,8 +1,8 @@
|
|||
[
|
||||
{
|
||||
"paramName": "wp",
|
||||
"paramLongName": "workingPath",
|
||||
"paramDescription": "the working path",
|
||||
"paramName": "o",
|
||||
"paramLongName": "outputDir",
|
||||
"paramDescription": "the path where generated data are stored",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
|
|
|
@ -10,10 +10,11 @@ import java.io.Serializable;
|
|||
import java.nio.file.Paths;
|
||||
import java.util.*;
|
||||
|
||||
import org.codehaus.jackson.map.ObjectMapper;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.oa.merge.AuthorMerger;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.pace.util.MapDocumentUtil;
|
||||
|
@ -100,8 +101,8 @@ public class EntityMergerTest implements Serializable {
|
|||
assertEquals(pub_merged.getDateofacceptance().getValue(), "2018-09-30");
|
||||
|
||||
// verify authors
|
||||
assertEquals(pub_merged.getAuthor().size(), 9);
|
||||
assertEquals(AuthorMerger.countAuthorsPids(pub_merged.getAuthor()), 4);
|
||||
assertEquals(13, pub_merged.getAuthor().size());
|
||||
assertEquals(4, AuthorMerger.countAuthorsPids(pub_merged.getAuthor()));
|
||||
|
||||
// verify title
|
||||
int count = 0;
|
||||
|
|
|
@ -7,7 +7,6 @@ import java.util.List;
|
|||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.api.java.JavaPairRDD;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.api.java.function.FlatMapFunction;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.api.java.function.PairFunction;
|
||||
|
@ -16,8 +15,8 @@ import org.apache.spark.rdd.RDD;
|
|||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.codehaus.jackson.map.ObjectMapper;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.google.common.hash.Hashing;
|
||||
|
||||
import eu.dnetlib.dedup.graph.ConnectedComponent;
|
||||
|
|
|
@ -10,7 +10,8 @@ import org.apache.spark.api.java.JavaSparkContext;
|
|||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.codehaus.jackson.map.ObjectMapper;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||
|
|
|
@ -4,14 +4,13 @@ import eu.dnetlib.dhp.schema.action.AtomicAction
|
|||
import eu.dnetlib.dhp.schema.oaf.{DataInfo, Dataset, Field, Instance, KeyValue, Oaf, Organization, Publication, Qualifier, Relation, Result, StructuredProperty}
|
||||
import eu.dnetlib.dhp.utils.DHPUtils
|
||||
import org.apache.commons.lang3.StringUtils
|
||||
import org.codehaus.jackson.map.ObjectMapper
|
||||
import com.fasterxml.jackson.databind.ObjectMapper
|
||||
import org.json4s
|
||||
import org.json4s.DefaultFormats
|
||||
import org.json4s.jackson.JsonMethods.parse
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
import scala.collection.JavaConverters._
|
||||
import scala.io.Source
|
||||
|
||||
|
||||
case class HostedByItemType(id: String, officialname: String, issn: String, eissn: String, lissn: String, openAccess: Boolean) {}
|
||||
|
@ -19,23 +18,18 @@ case class HostedByItemType(id: String, officialname: String, issn: String, eiss
|
|||
case class DoiBoostAffiliation(PaperId:Long, AffiliationId:Long, GridId:Option[String], OfficialPage:Option[String], DisplayName:Option[String]){}
|
||||
|
||||
object DoiBoostMappingUtil {
|
||||
def getUnknownCountry(): Qualifier = {
|
||||
createQualifier("UNKNOWN","UNKNOWN","dnet:countries","dnet:countries")
|
||||
}
|
||||
|
||||
|
||||
|
||||
def generateMAGAffiliationId(affId: String): String = {
|
||||
s"20|microsoft___$SEPARATOR${DHPUtils.md5(affId)}"
|
||||
}
|
||||
|
||||
|
||||
val logger: Logger = LoggerFactory.getLogger(getClass)
|
||||
|
||||
//STATIC STRING
|
||||
val MAG = "microsoft"
|
||||
val MAG_NAME = "Microsoft Academic Graph"
|
||||
val ORCID = "ORCID"
|
||||
val ORCID = "orcid"
|
||||
val ORCID_PENDING = "orcid_pending"
|
||||
val CROSSREF = "Crossref"
|
||||
val UNPAYWALL = "UnpayWall"
|
||||
val GRID_AC = "grid.ac"
|
||||
|
|
|
@ -39,33 +39,38 @@ object SparkGenerateDOIBoostActionSet {
|
|||
val dbaffiliationRelationPath = parser.get("dbaffiliationRelationPath")
|
||||
val dbOrganizationPath = parser.get("dbOrganizationPath")
|
||||
val workingDirPath = parser.get("targetPath")
|
||||
val sequenceFilePath = parser.get("sFilePath")
|
||||
|
||||
spark.read.load(dbDatasetPath).as[OafDataset]
|
||||
val asDataset = spark.read.load(dbDatasetPath).as[OafDataset]
|
||||
.map(d =>DoiBoostMappingUtil.fixResult(d))
|
||||
.map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
|
||||
.write.mode(SaveMode.Overwrite).save(s"$workingDirPath/actionSet")
|
||||
// .write.mode(SaveMode.Overwrite).save(s"$workingDirPath/actionSet")
|
||||
|
||||
spark.read.load(dbPublicationPath).as[Publication]
|
||||
val asPublication =spark.read.load(dbPublicationPath).as[Publication]
|
||||
.map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
|
||||
.write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
|
||||
// .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
|
||||
|
||||
spark.read.load(dbOrganizationPath).as[Organization]
|
||||
val asOrganization = spark.read.load(dbOrganizationPath).as[Organization]
|
||||
.map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
|
||||
.write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
|
||||
// .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
|
||||
|
||||
|
||||
spark.read.load(crossRefRelation).as[Relation]
|
||||
val asCRelation = spark.read.load(crossRefRelation).as[Relation]
|
||||
.map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
|
||||
.write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
|
||||
// .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
|
||||
|
||||
spark.read.load(dbaffiliationRelationPath).as[Relation]
|
||||
val asRelAffiliation = spark.read.load(dbaffiliationRelationPath).as[Relation]
|
||||
.map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
|
||||
.write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
|
||||
// .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
|
||||
|
||||
|
||||
val d: Dataset[(String, String)] =spark.read.load(s"$workingDirPath/actionSet").as[(String,String)]
|
||||
|
||||
d.rdd.map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$workingDirPath/rawset", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text,Text]], classOf[GzipCodec])
|
||||
|
||||
val d: Dataset[(String, String)] = asDataset.union(asPublication).union(asOrganization).union(asCRelation).union(asRelAffiliation)
|
||||
|
||||
// spark.read.load(s"$workingDirPath/actionSet").as[(String,String)]
|
||||
|
||||
d.rdd.repartition(6000).map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$sequenceFilePath", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text,Text]], classOf[GzipCodec])
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -2,6 +2,7 @@ package eu.dnetlib.doiboost
|
|||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.oa.merge.AuthorMerger
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants
|
||||
import eu.dnetlib.dhp.schema.oaf.{Organization, Publication, Relation, Dataset => OafDataset}
|
||||
import eu.dnetlib.doiboost.mag.ConversionUtil
|
||||
import org.apache.commons.io.IOUtils
|
||||
|
@ -30,7 +31,7 @@ object SparkGenerateDoiBoost {
|
|||
import spark.implicits._
|
||||
|
||||
val hostedByMapPath = parser.get("hostedByMapPath")
|
||||
val workingDirPath = parser.get("workingDirPath")
|
||||
val workingDirPath = parser.get("workingPath")
|
||||
|
||||
|
||||
implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
|
||||
|
@ -62,7 +63,7 @@ object SparkGenerateDoiBoost {
|
|||
val orcidPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/orcidPublication").as[Publication].map(p => (p.getId, p))
|
||||
fj.joinWith(orcidPublication, fj("_1").equalTo(orcidPublication("_1")), "left").map(applyMerge).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/secondJoin")
|
||||
|
||||
logger.info("Phase 3) Join Result with MAG")
|
||||
logger.info("Phase 4) Join Result with MAG")
|
||||
val sj: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/secondJoin").as[Publication].map(p => (p.getId, p))
|
||||
|
||||
val magPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/magPublication").as[Publication].map(p => (p.getId, p))
|
||||
|
@ -132,7 +133,7 @@ object SparkGenerateDoiBoost {
|
|||
o.setLegalname(DoiBoostMappingUtil.asField(affiliation.DisplayName.get))
|
||||
if (affiliation.OfficialPage.isDefined)
|
||||
o.setWebsiteurl(DoiBoostMappingUtil.asField(affiliation.OfficialPage.get))
|
||||
o.setCountry(DoiBoostMappingUtil.getUnknownCountry())
|
||||
o.setCountry(ModelConstants.UNKNOWN_COUNTRY)
|
||||
o
|
||||
}
|
||||
else
|
||||
|
|
|
@ -200,7 +200,7 @@ case object Crossref2Oaf {
|
|||
a.setSurname(family)
|
||||
a.setFullname(s"$given $family")
|
||||
if (StringUtils.isNotBlank(orcid))
|
||||
a.setPid(List(createSP(orcid, ORCID, PID_TYPES, generateDataInfo())).asJava)
|
||||
a.setPid(List(createSP(orcid, ORCID_PENDING, PID_TYPES, generateDataInfo())).asJava)
|
||||
|
||||
a
|
||||
}
|
||||
|
|
|
@ -2,18 +2,16 @@
|
|||
package eu.dnetlib.doiboost.crossref;
|
||||
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.util.Optional;
|
||||
import java.util.zip.Inflater;
|
||||
|
||||
import org.apache.commons.codec.binary.Base64;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.io.IntWritable;
|
||||
import org.apache.hadoop.io.SequenceFile;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
|
||||
|
@ -30,34 +28,45 @@ public class CrossrefImporter {
|
|||
|
||||
parser.parseArgument(args);
|
||||
|
||||
final String hdfsuri = parser.get("namenode");
|
||||
System.out.println("HDFS URI" + hdfsuri);
|
||||
Path hdfswritepath = new Path(parser.get("targetPath"));
|
||||
System.out.println("TargetPath: " + hdfsuri);
|
||||
final String namenode = parser.get("namenode");
|
||||
System.out.println("namenode: " + namenode);
|
||||
|
||||
final Long timestamp = StringUtils.isNotBlank(parser.get("timestamp"))
|
||||
? Long.parseLong(parser.get("timestamp"))
|
||||
: -1;
|
||||
Path targetPath = new Path(parser.get("targetPath"));
|
||||
System.out.println("targetPath: " + targetPath);
|
||||
|
||||
if (timestamp > 0)
|
||||
System.out.println("Timestamp added " + timestamp);
|
||||
final Long timestamp = Optional
|
||||
.ofNullable(parser.get("timestamp"))
|
||||
.map(s -> {
|
||||
try {
|
||||
return Long.parseLong(s);
|
||||
} catch (NumberFormatException e) {
|
||||
return -1L;
|
||||
}
|
||||
})
|
||||
.orElse(-1L);
|
||||
System.out.println("timestamp: " + timestamp);
|
||||
|
||||
final String esServer = parser.get("esServer");
|
||||
System.out.println("esServer: " + esServer);
|
||||
|
||||
final String esIndex = parser.get("esIndex");
|
||||
System.out.println("esIndex: " + esIndex);
|
||||
|
||||
// ====== Init HDFS File System Object
|
||||
Configuration conf = new Configuration();
|
||||
// Set FileSystem URI
|
||||
conf.set("fs.defaultFS", hdfsuri);
|
||||
conf.set("fs.defaultFS", namenode);
|
||||
// Because of Maven
|
||||
conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
|
||||
conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
|
||||
|
||||
ESClient client = timestamp > 0
|
||||
? new ESClient("ip-90-147-167-25.ct1.garrservices.it", "crossref", timestamp)
|
||||
: new ESClient("ip-90-147-167-25.ct1.garrservices.it", "crossref");
|
||||
// "ip-90-147-167-25.ct1.garrservices.it", "crossref"
|
||||
final ESClient client = new ESClient(esServer, esIndex, timestamp);
|
||||
|
||||
try (SequenceFile.Writer writer = SequenceFile
|
||||
.createWriter(
|
||||
conf,
|
||||
SequenceFile.Writer.file(hdfswritepath),
|
||||
SequenceFile.Writer.file(targetPath),
|
||||
SequenceFile.Writer.keyClass(IntWritable.class),
|
||||
SequenceFile.Writer.valueClass(Text.class))) {
|
||||
|
||||
|
@ -74,8 +83,7 @@ public class CrossrefImporter {
|
|||
end = System.currentTimeMillis();
|
||||
final float time = (end - start) / 1000.0F;
|
||||
System.out
|
||||
.println(
|
||||
String.format("Imported %d records last 100000 imported in %f seconds", i, time));
|
||||
.println(String.format("Imported %s records last 100000 imported in %s seconds", i, time));
|
||||
start = System.currentTimeMillis();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
|
||||
package eu.dnetlib.doiboost.crossref;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.http.HttpHeaders;
|
||||
import org.apache.http.client.methods.CloseableHttpResponse;
|
||||
import org.apache.http.client.methods.HttpPost;
|
||||
import org.apache.http.entity.StringEntity;
|
||||
|
@ -17,13 +17,17 @@ import org.slf4j.LoggerFactory;
|
|||
import com.jayway.jsonpath.JsonPath;
|
||||
|
||||
public class ESClient implements Iterator<String> {
|
||||
private static final Logger logger = LoggerFactory.getLogger(ESClient.class);
|
||||
|
||||
static final String blobPath = "$.hits[*].hits[*]._source.blob";
|
||||
static final String scrollIdPath = "$._scroll_id";
|
||||
static final String JSON_NO_TS = "{\"size\":1000}";
|
||||
static final String JSON_WITH_TS = "{\"size\":1000, \"query\":{\"range\":{\"timestamp\":{\"gte\":%d}}}}";
|
||||
static final String JSON_SCROLL = "{\"scroll_id\":\"%s\",\"scroll\" : \"1m\"}";
|
||||
private static final String BLOB_PATH = "$.hits.hits[*]._source.blob";
|
||||
private static final String SCROLL_ID_PATH = "$._scroll_id";
|
||||
private static final String JSON_NO_TS = "{\"size\":1000}";
|
||||
private static final String JSON_WITH_TS = "{\"size\":1000, \"query\":{\"range\":{\"timestamp\":{\"gte\":%d}}}}";
|
||||
private static final String JSON_SCROLL = "{\"scroll_id\":\"%s\",\"scroll\" : \"1m\"}";
|
||||
|
||||
public static final String APPLICATION_JSON = "application/json";
|
||||
|
||||
public static final String ES_SEARCH_URL = "http://%s:9200/%s/_search?scroll=1m";
|
||||
public static final String ES_SCROLL_URL = "http://%s:9200/_search/scroll";
|
||||
|
||||
private final String scrollId;
|
||||
|
||||
|
@ -31,47 +35,30 @@ public class ESClient implements Iterator<String> {
|
|||
|
||||
private final String esHost;
|
||||
|
||||
public ESClient(final String esHost, final String esIndex) throws IOException {
|
||||
|
||||
public ESClient(final String esHost, final String esIndex, final long timestamp) {
|
||||
this.esHost = esHost;
|
||||
final String body = getResponse(
|
||||
String.format("http://%s:9200/%s/_search?scroll=1m", esHost, esIndex), JSON_NO_TS);
|
||||
scrollId = getJPathString(scrollIdPath, body);
|
||||
buffer = getBlobs(body);
|
||||
}
|
||||
|
||||
public ESClient(final String esHost, final String esIndex, final long timestamp)
|
||||
throws IOException {
|
||||
this.esHost = esHost;
|
||||
final String body = getResponse(
|
||||
String.format("http://%s:9200/%s/_search?scroll=1m", esHost, esIndex),
|
||||
String.format(JSON_WITH_TS, timestamp));
|
||||
scrollId = getJPathString(scrollIdPath, body);
|
||||
final String body = timestamp > 0
|
||||
? getResponse(String.format(ES_SEARCH_URL, esHost, esIndex), String.format(JSON_WITH_TS, timestamp))
|
||||
: getResponse(String.format(ES_SEARCH_URL, esHost, esIndex), JSON_NO_TS);
|
||||
scrollId = getJPathString(SCROLL_ID_PATH, body);
|
||||
buffer = getBlobs(body);
|
||||
}
|
||||
|
||||
private String getResponse(final String url, final String json) {
|
||||
CloseableHttpClient client = HttpClients.createDefault();
|
||||
try {
|
||||
|
||||
try (CloseableHttpClient client = HttpClients.createDefault()) {
|
||||
HttpPost httpPost = new HttpPost(url);
|
||||
if (json != null) {
|
||||
StringEntity entity = new StringEntity(json);
|
||||
httpPost.setEntity(entity);
|
||||
httpPost.setHeader("Accept", "application/json");
|
||||
httpPost.setHeader("Content-type", "application/json");
|
||||
httpPost.setHeader(HttpHeaders.ACCEPT, APPLICATION_JSON);
|
||||
httpPost.setHeader(HttpHeaders.CONTENT_TYPE, APPLICATION_JSON);
|
||||
}
|
||||
try (CloseableHttpResponse response = client.execute(httpPost)) {
|
||||
return IOUtils.toString(response.getEntity().getContent());
|
||||
}
|
||||
CloseableHttpResponse response = client.execute(httpPost);
|
||||
|
||||
return IOUtils.toString(response.getEntity().getContent());
|
||||
} catch (Throwable e) {
|
||||
throw new RuntimeException("Error on executing request ", e);
|
||||
} finally {
|
||||
try {
|
||||
client.close();
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("Unable to close client ", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -87,7 +74,7 @@ public class ESClient implements Iterator<String> {
|
|||
}
|
||||
|
||||
private List<String> getBlobs(final String body) {
|
||||
final List<String> res = JsonPath.read(body, "$.hits.hits[*]._source.blob");
|
||||
final List<String> res = JsonPath.read(body, BLOB_PATH);
|
||||
return res;
|
||||
}
|
||||
|
||||
|
@ -102,11 +89,11 @@ public class ESClient implements Iterator<String> {
|
|||
if (buffer.isEmpty()) {
|
||||
|
||||
final String json_param = String.format(JSON_SCROLL, scrollId);
|
||||
final String body = getResponse(String.format("http://%s:9200/_search/scroll", esHost), json_param);
|
||||
final String body = getResponse(String.format(ES_SCROLL_URL, esHost), json_param);
|
||||
try {
|
||||
buffer = getBlobs(body);
|
||||
} catch (Throwable e) {
|
||||
logger.error("Error on get next page: body:" + body);
|
||||
System.out.println("Error on get next page: body:" + body);
|
||||
}
|
||||
}
|
||||
return nextItem;
|
||||
|
|
|
@ -21,15 +21,17 @@ object SparkImportMagIntoDataset {
|
|||
|
||||
|
||||
val stream = Map(
|
||||
"Affiliations" -> Tuple2("mag/Affiliations.txt", Seq("AffiliationId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "GridId:string", "OfficialPage:string", "WikiPage:string", "PaperCount:long", "CitationCount:long", "Latitude:float?", "Longitude:float?", "CreatedDate:DateTime")),
|
||||
"Authors" -> Tuple2("mag/Authors.txt", Seq("AuthorId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "LastKnownAffiliationId:long?", "PaperCount:long", "CitationCount:long", "CreatedDate:DateTime")),
|
||||
"ConferenceInstances" -> Tuple2("mag/ConferenceInstances.txt", Seq("ConferenceInstanceId:long", "NormalizedName:string", "DisplayName:string", "ConferenceSeriesId:long", "Location:string", "OfficialUrl:string", "StartDate:DateTime?", "EndDate:DateTime?", "AbstractRegistrationDate:DateTime?", "SubmissionDeadlineDate:DateTime?", "NotificationDueDate:DateTime?", "FinalVersionDueDate:DateTime?", "PaperCount:long", "CitationCount:long", "Latitude:float?", "Longitude:float?", "CreatedDate:DateTime")),
|
||||
"Affiliations" -> Tuple2("mag/Affiliations.txt", Seq("AffiliationId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "GridId:string", "OfficialPage:string", "WikiPage:string", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "Iso3166Code:string", "Latitude:float?", "Longitude:float?", "CreatedDate:DateTime")),
|
||||
"AuthorExtendedAttributes" -> Tuple2("mag/AuthorExtendedAttributes.txt", Seq("AuthorId:long", "AttributeType:int", "AttributeValue:string")),
|
||||
"Authors" -> Tuple2("mag/Authors.txt", Seq("AuthorId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "LastKnownAffiliationId:long?", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")),
|
||||
"ConferenceInstances" -> Tuple2("mag/ConferenceInstances.txt", Seq("ConferenceInstanceId:long", "NormalizedName:string", "DisplayName:string", "ConferenceSeriesId:long", "Location:string", "OfficialUrl:string", "StartDate:DateTime?", "EndDate:DateTime?", "AbstractRegistrationDate:DateTime?", "SubmissionDeadlineDate:DateTime?", "NotificationDueDate:DateTime?", "FinalVersionDueDate:DateTime?", "PaperCount:long", "PaperFamilyCount:long" ,"CitationCount:long", "Latitude:float?", "Longitude:float?", "CreatedDate:DateTime")),
|
||||
"ConferenceSeries" -> Tuple2("mag/ConferenceSeries.txt", Seq("ConferenceSeriesId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "PaperCount:long", "CitationCount:long", "CreatedDate:DateTime")),
|
||||
"EntityRelatedEntities" -> Tuple2("advanced/EntityRelatedEntities.txt", Seq("EntityId:long", "EntityType:string", "RelatedEntityId:long", "RelatedEntityType:string", "RelatedType:int", "Score:float")),
|
||||
"FieldOfStudyChildren" -> Tuple2("advanced/FieldOfStudyChildren.txt", Seq("FieldOfStudyId:long", "ChildFieldOfStudyId:long")),
|
||||
"FieldOfStudyExtendedAttributes" -> Tuple2("advanced/FieldOfStudyExtendedAttributes.txt", Seq("FieldOfStudyId:long", "AttributeType:int", "AttributeValue:string")),
|
||||
"FieldsOfStudy" -> Tuple2("advanced/FieldsOfStudy.txt", Seq("FieldOfStudyId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "MainType:string", "Level:int", "PaperCount:long", "CitationCount:long", "CreatedDate:DateTime")),
|
||||
"Journals" -> Tuple2("mag/Journals.txt", Seq("JournalId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "Issn:string", "Publisher:string", "Webpage:string", "PaperCount:long", "CitationCount:long", "CreatedDate:DateTime")),
|
||||
// ['FieldOfStudyId:long', 'Rank:uint', 'NormalizedName:string', 'DisplayName:string', 'MainType:string', 'Level:int', 'PaperCount:long', 'PaperFamilyCount:long', 'CitationCount:long', 'CreatedDate:DateTime']
|
||||
"FieldsOfStudy" -> Tuple2("advanced/FieldsOfStudy.txt", Seq("FieldOfStudyId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "MainType:string", "Level:int", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")),
|
||||
"Journals" -> Tuple2("mag/Journals.txt", Seq("JournalId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "Issn:string", "Publisher:string", "Webpage:string", "PaperCount:long", "PaperFamilyCount:long" ,"CitationCount:long", "CreatedDate:DateTime")),
|
||||
"PaperAbstractsInvertedIndex" -> Tuple2("nlp/PaperAbstractsInvertedIndex.txt.*", Seq("PaperId:long", "IndexedAbstract:string")),
|
||||
"PaperAuthorAffiliations" -> Tuple2("mag/PaperAuthorAffiliations.txt", Seq("PaperId:long", "AuthorId:long", "AffiliationId:long?", "AuthorSequenceNumber:uint", "OriginalAuthor:string", "OriginalAffiliation:string")),
|
||||
"PaperCitationContexts" -> Tuple2("nlp/PaperCitationContexts.txt", Seq("PaperId:long", "PaperReferenceId:long", "CitationContext:string")),
|
||||
|
@ -39,7 +41,7 @@ object SparkImportMagIntoDataset {
|
|||
"PaperReferences" -> Tuple2("mag/PaperReferences.txt", Seq("PaperId:long", "PaperReferenceId:long")),
|
||||
"PaperResources" -> Tuple2("mag/PaperResources.txt", Seq("PaperId:long", "ResourceType:int", "ResourceUrl:string", "SourceUrl:string", "RelationshipType:int")),
|
||||
"PaperUrls" -> Tuple2("mag/PaperUrls.txt", Seq("PaperId:long", "SourceType:int?", "SourceUrl:string", "LanguageCode:string")),
|
||||
"Papers" -> Tuple2("mag/Papers.txt", Seq("PaperId:long", "Rank:uint", "Doi:string", "DocType:string", "PaperTitle:string", "OriginalTitle:string", "BookTitle:string", "Year:int?", "Date:DateTime?", "Publisher:string", "JournalId:long?", "ConferenceSeriesId:long?", "ConferenceInstanceId:long?", "Volume:string", "Issue:string", "FirstPage:string", "LastPage:string", "ReferenceCount:long", "CitationCount:long", "EstimatedCitation:long", "OriginalVenue:string", "FamilyId:long?", "CreatedDate:DateTime")),
|
||||
"Papers" -> Tuple2("mag/Papers.txt", Seq("PaperId:long", "Rank:uint", "Doi:string", "DocType:string", "PaperTitle:string", "OriginalTitle:string", "BookTitle:string", "Year:int?", "Date:DateTime?", "OnlineDate:DateTime?", "Publisher:string", "JournalId:long?", "ConferenceSeriesId:long?", "ConferenceInstanceId:long?", "Volume:string", "Issue:string", "FirstPage:string", "LastPage:string", "ReferenceCount:long", "CitationCount:long", "EstimatedCitation:long", "OriginalVenue:string", "FamilyId:long?", "FamilyRank:uint?", "CreatedDate:DateTime")),
|
||||
"RelatedFieldOfStudy" -> Tuple2("advanced/RelatedFieldOfStudy.txt", Seq("FieldOfStudyId1:long", "Type1:string", "FieldOfStudyId2:long", "Type2:string", "Rank:float"))
|
||||
)
|
||||
|
||||
|
|
|
@ -11,7 +11,7 @@ import org.slf4j.{Logger, LoggerFactory}
|
|||
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
object SparkPreProcessMAG {
|
||||
object SparkProcessMAG {
|
||||
def main(args: Array[String]): Unit = {
|
||||
|
||||
val logger: Logger = LoggerFactory.getLogger(getClass)
|
||||
|
@ -26,12 +26,15 @@ object SparkPreProcessMAG {
|
|||
.master(parser.get("master")).getOrCreate()
|
||||
|
||||
val sourcePath = parser.get("sourcePath")
|
||||
val workingPath = parser.get("workingPath")
|
||||
val targetPath = parser.get("targetPath")
|
||||
|
||||
import spark.implicits._
|
||||
implicit val mapEncoderPubs: Encoder[Publication] = org.apache.spark.sql.Encoders.kryo[Publication]
|
||||
implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPubs)
|
||||
|
||||
logger.info("Phase 1) make uninque DOI in Papers:")
|
||||
val d: Dataset[MagPapers] = spark.read.load(s"${parser.get("sourcePath")}/Papers").as[MagPapers]
|
||||
val d: Dataset[MagPapers] = spark.read.load(s"$sourcePath/Papers").as[MagPapers]
|
||||
|
||||
// Filtering Papers with DOI, and since for the same DOI we have multiple version of item with different PapersId we get the last one
|
||||
val result: RDD[MagPapers] = d.where(col("Doi").isNotNull)
|
||||
|
@ -41,11 +44,12 @@ object SparkPreProcessMAG {
|
|||
.map(_._2)
|
||||
|
||||
val distinctPaper: Dataset[MagPapers] = spark.createDataset(result)
|
||||
distinctPaper.write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/Papers_distinct")
|
||||
|
||||
distinctPaper.write.mode(SaveMode.Overwrite).save(s"$workingPath/Papers_distinct")
|
||||
|
||||
logger.info("Phase 0) Enrich Publication with description")
|
||||
val pa = spark.read.load(s"${parser.get("sourcePath")}/PaperAbstractsInvertedIndex").as[MagPaperAbstract]
|
||||
pa.map(ConversionUtil.transformPaperAbstract).write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/PaperAbstract")
|
||||
val pa = spark.read.load(s"$sourcePath/PaperAbstractsInvertedIndex").as[MagPaperAbstract]
|
||||
pa.map(ConversionUtil.transformPaperAbstract).write.mode(SaveMode.Overwrite).save(s"$workingPath/PaperAbstract")
|
||||
|
||||
logger.info("Phase 3) Group Author by PaperId")
|
||||
val authors = spark.read.load(s"$sourcePath/Authors").as[MagAuthor]
|
||||
|
@ -64,24 +68,24 @@ object SparkPreProcessMAG {
|
|||
} else
|
||||
mpa
|
||||
}).groupBy("PaperId").agg(collect_list(struct($"author", $"affiliation")).as("authors"))
|
||||
.write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/merge_step_1_paper_authors")
|
||||
.write.mode(SaveMode.Overwrite).save(s"$workingPath/merge_step_1_paper_authors")
|
||||
|
||||
logger.info("Phase 4) create First Version of publication Entity with Paper Journal and Authors")
|
||||
|
||||
val journals = spark.read.load(s"$sourcePath/Journals").as[MagJournal]
|
||||
|
||||
val papers = spark.read.load((s"${parser.get("targetPath")}/Papers_distinct")).as[MagPapers]
|
||||
val papers = spark.read.load((s"$workingPath/Papers_distinct")).as[MagPapers]
|
||||
|
||||
val paperWithAuthors = spark.read.load(s"${parser.get("targetPath")}/merge_step_1_paper_authors").as[MagPaperWithAuthorList]
|
||||
val paperWithAuthors = spark.read.load(s"$workingPath/merge_step_1_paper_authors").as[MagPaperWithAuthorList]
|
||||
|
||||
val firstJoin = papers.joinWith(journals, papers("JournalId").equalTo(journals("JournalId")), "left")
|
||||
firstJoin.joinWith(paperWithAuthors, firstJoin("_1.PaperId").equalTo(paperWithAuthors("PaperId")), "left")
|
||||
.map { a => ConversionUtil.createOAFFromJournalAuthorPaper(a) }
|
||||
.write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/merge_step_2")
|
||||
.write.mode(SaveMode.Overwrite).save(s"$workingPath/merge_step_2")
|
||||
|
||||
|
||||
var magPubs: Dataset[(String, Publication)] =
|
||||
spark.read.load(s"${parser.get("targetPath")}/merge_step_2").as[Publication]
|
||||
spark.read.load(s"$workingPath/merge_step_2").as[Publication]
|
||||
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)]
|
||||
|
||||
|
||||
|
@ -95,10 +99,10 @@ object SparkPreProcessMAG {
|
|||
.map(item => ConversionUtil.updatePubsWithConferenceInfo(item))
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"${parser.get("targetPath")}/merge_step_2_conference")
|
||||
.save(s"$workingPath/merge_step_2_conference")
|
||||
|
||||
|
||||
magPubs= spark.read.load(s"${parser.get("targetPath")}/merge_step_2_conference").as[Publication]
|
||||
magPubs= spark.read.load(s"$workingPath/merge_step_2_conference").as[Publication]
|
||||
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)]
|
||||
|
||||
val paperUrlDataset = spark.read.load(s"$sourcePath/PaperUrls").as[MagPaperUrl].groupBy("PaperId").agg(collect_list(struct("sourceUrl")).as("instances")).as[MagUrl]
|
||||
|
@ -108,27 +112,27 @@ object SparkPreProcessMAG {
|
|||
magPubs.joinWith(paperUrlDataset, col("_1").equalTo(paperUrlDataset("PaperId")), "left")
|
||||
.map { a: ((String, Publication), MagUrl) => ConversionUtil.addInstances((a._1._2, a._2)) }
|
||||
.write.mode(SaveMode.Overwrite)
|
||||
.save(s"${parser.get("targetPath")}/merge_step_3")
|
||||
.save(s"$workingPath/merge_step_3")
|
||||
|
||||
|
||||
// logger.info("Phase 6) Enrich Publication with description")
|
||||
// val pa = spark.read.load(s"${parser.get("sourcePath")}/PaperAbstractsInvertedIndex").as[MagPaperAbstract]
|
||||
// pa.map(ConversionUtil.transformPaperAbstract).write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/PaperAbstract")
|
||||
|
||||
val paperAbstract = spark.read.load((s"${parser.get("targetPath")}/PaperAbstract")).as[MagPaperAbstract]
|
||||
val paperAbstract = spark.read.load((s"$workingPath/PaperAbstract")).as[MagPaperAbstract]
|
||||
|
||||
|
||||
magPubs = spark.read.load(s"${parser.get("targetPath")}/merge_step_3").as[Publication]
|
||||
magPubs = spark.read.load(s"$workingPath/merge_step_3").as[Publication]
|
||||
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)]
|
||||
|
||||
magPubs.joinWith(paperAbstract, col("_1").equalTo(paperAbstract("PaperId")), "left")
|
||||
.map(item => ConversionUtil.updatePubsWithDescription(item)
|
||||
).write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/merge_step_4")
|
||||
).write.mode(SaveMode.Overwrite).save(s"$workingPath/merge_step_4")
|
||||
|
||||
|
||||
logger.info("Phase 7) Enrich Publication with FieldOfStudy")
|
||||
|
||||
magPubs = spark.read.load(s"${parser.get("targetPath")}/merge_step_4").as[Publication]
|
||||
magPubs = spark.read.load(s"$workingPath/merge_step_4").as[Publication]
|
||||
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)]
|
||||
|
||||
val fos = spark.read.load(s"$sourcePath/FieldsOfStudy").select($"FieldOfStudyId".alias("fos"), $"DisplayName", $"MainType")
|
||||
|
@ -144,14 +148,14 @@ object SparkPreProcessMAG {
|
|||
.equalTo(paperField("PaperId")), "left")
|
||||
.map(item => ConversionUtil.updatePubsWithSubject(item))
|
||||
.write.mode(SaveMode.Overwrite)
|
||||
.save(s"${parser.get("targetPath")}/mag_publication")
|
||||
.save(s"$workingPath/mag_publication")
|
||||
|
||||
|
||||
val s:RDD[Publication] = spark.read.load(s"${parser.get("targetPath")}/mag_publication").as[Publication]
|
||||
val s:RDD[Publication] = spark.read.load(s"$workingPath/mag_publication").as[Publication]
|
||||
.map(p=>Tuple2(p.getId, p)).rdd.reduceByKey((a:Publication, b:Publication) => ConversionUtil.mergePublication(a,b))
|
||||
.map(_._2)
|
||||
|
||||
spark.createDataset(s).as[Publication].write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/mag_publication_u")
|
||||
spark.createDataset(s).as[Publication].write.mode(SaveMode.Overwrite).save(s"$targetPath/magPublication")
|
||||
|
||||
}
|
||||
}
|
|
@ -1,10 +1,11 @@
|
|||
package eu.dnetlib.doiboost.orcid
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.{Author, Publication}
|
||||
import com.fasterxml.jackson.databind.ObjectMapper
|
||||
import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Publication}
|
||||
import eu.dnetlib.dhp.schema.orcid.OrcidDOI
|
||||
import eu.dnetlib.doiboost.DoiBoostMappingUtil
|
||||
import eu.dnetlib.doiboost.DoiBoostMappingUtil.{ORCID, PID_TYPES, createSP, generateDataInfo, generateIdentifier}
|
||||
import org.apache.commons.lang.StringUtils
|
||||
import org.codehaus.jackson.map.ObjectMapper
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
import scala.collection.JavaConverters._
|
||||
|
@ -17,7 +18,7 @@ case class ORCIDItem(oid:String,name:String,surname:String,creditName:String,err
|
|||
case class ORCIDElement(doi:String, authors:List[ORCIDItem]) {}
|
||||
object ORCIDToOAF {
|
||||
val logger: Logger = LoggerFactory.getLogger(ORCIDToOAF.getClass)
|
||||
val mapper = new ObjectMapper
|
||||
val mapper = new ObjectMapper()
|
||||
|
||||
def isJsonValid(inputStr: String): Boolean = {
|
||||
import java.io.IOException
|
||||
|
@ -43,16 +44,19 @@ object ORCIDToOAF {
|
|||
}
|
||||
|
||||
|
||||
def convertTOOAF(input:ORCIDElement) :Publication = {
|
||||
val doi = input.doi
|
||||
def convertTOOAF(input:OrcidDOI) :Publication = {
|
||||
val doi = input.getDoi
|
||||
val pub:Publication = new Publication
|
||||
pub.setPid(List(createSP(doi, "doi", PID_TYPES)).asJava)
|
||||
pub.setPid(List(createSP(doi.toLowerCase, "doi", PID_TYPES)).asJava)
|
||||
pub.setDataInfo(generateDataInfo())
|
||||
pub.setId(generateIdentifier(pub, doi.toLowerCase))
|
||||
try{
|
||||
pub.setAuthor(input.authors.map(a=> {
|
||||
generateAuthor(a.name, a.surname, a.creditName, a.oid)
|
||||
}).asJava)
|
||||
|
||||
val l:List[Author]= input.getAuthors.asScala.map(a=> {
|
||||
generateAuthor(a.getName, a.getSurname, a.getCreditName, a.getOid)
|
||||
})(collection.breakOut)
|
||||
|
||||
pub.setAuthor(l.asJava)
|
||||
pub.setCollectedfrom(List(DoiBoostMappingUtil.createORIDCollectedFrom()).asJava)
|
||||
pub.setDataInfo(DoiBoostMappingUtil.generateDataInfo())
|
||||
pub
|
||||
|
@ -63,6 +67,13 @@ object ORCIDToOAF {
|
|||
}
|
||||
}
|
||||
|
||||
def generateOricPIDDatainfo():DataInfo = {
|
||||
val di =DoiBoostMappingUtil.generateDataInfo("0.91")
|
||||
di.getProvenanceaction.setClassid("sysimport:crosswalk:entityregistry")
|
||||
di.getProvenanceaction.setClassname("Harvested")
|
||||
di
|
||||
}
|
||||
|
||||
def generateAuthor(given: String, family: String, fullName:String, orcid: String): Author = {
|
||||
val a = new Author
|
||||
a.setName(given)
|
||||
|
@ -72,7 +83,7 @@ object ORCIDToOAF {
|
|||
else
|
||||
a.setFullname(s"$given $family")
|
||||
if (StringUtils.isNotBlank(orcid))
|
||||
a.setPid(List(createSP(orcid, ORCID, PID_TYPES)).asJava)
|
||||
a.setPid(List(createSP(orcid, ORCID, PID_TYPES, generateOricPIDDatainfo())).asJava)
|
||||
|
||||
a
|
||||
}
|
||||
|
|
|
@ -1,21 +1,72 @@
|
|||
package eu.dnetlib.doiboost.orcid
|
||||
|
||||
import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper}
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.oa.merge.AuthorMerger
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication
|
||||
import eu.dnetlib.dhp.schema.orcid.OrcidDOI
|
||||
import eu.dnetlib.doiboost.mag.ConversionUtil
|
||||
import org.apache.commons.io.IOUtils
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.rdd.RDD
|
||||
import org.apache.spark.sql.expressions.Aggregator
|
||||
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
object SparkConvertORCIDToOAF {
|
||||
val logger: Logger = LoggerFactory.getLogger(SparkConvertORCIDToOAF.getClass)
|
||||
|
||||
def getPublicationAggregator(): Aggregator[(String, Publication), Publication, Publication] = new Aggregator[(String, Publication), Publication, Publication]{
|
||||
|
||||
override def zero: Publication = new Publication()
|
||||
|
||||
override def reduce(b: Publication, a: (String, Publication)): Publication = {
|
||||
b.mergeFrom(a._2)
|
||||
b.setAuthor(AuthorMerger.mergeAuthor(a._2.getAuthor, b.getAuthor))
|
||||
if (b.getId == null)
|
||||
b.setId(a._2.getId)
|
||||
b
|
||||
}
|
||||
|
||||
|
||||
override def merge(wx: Publication, wy: Publication): Publication = {
|
||||
wx.mergeFrom(wy)
|
||||
wx.setAuthor(AuthorMerger.mergeAuthor(wy.getAuthor, wx.getAuthor))
|
||||
if(wx.getId == null && wy.getId.nonEmpty)
|
||||
wx.setId(wy.getId)
|
||||
wx
|
||||
}
|
||||
override def finish(reduction: Publication): Publication = reduction
|
||||
|
||||
override def bufferEncoder: Encoder[Publication] =
|
||||
Encoders.kryo(classOf[Publication])
|
||||
|
||||
override def outputEncoder: Encoder[Publication] =
|
||||
Encoders.kryo(classOf[Publication])
|
||||
}
|
||||
|
||||
def run(spark:SparkSession,sourcePath:String, targetPath:String):Unit = {
|
||||
implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
|
||||
implicit val mapOrcid: Encoder[OrcidDOI] = Encoders.kryo[OrcidDOI]
|
||||
implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPubs)
|
||||
|
||||
val mapper = new ObjectMapper()
|
||||
mapper.getDeserializationConfig.withFeatures(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES)
|
||||
|
||||
val dataset:Dataset[OrcidDOI] = spark.createDataset(spark.sparkContext.textFile(sourcePath).map(s => mapper.readValue(s,classOf[OrcidDOI])))
|
||||
|
||||
logger.info("Converting ORCID to OAF")
|
||||
dataset.map(o => ORCIDToOAF.convertTOOAF(o)).filter(p=>p!=null)
|
||||
.map(d => (d.getId, d))
|
||||
.groupByKey(_._1)(Encoders.STRING)
|
||||
.agg(getPublicationAggregator().toColumn)
|
||||
.map(p => p._2)
|
||||
.write.mode(SaveMode.Overwrite).save(targetPath)
|
||||
}
|
||||
|
||||
def main(args: Array[String]): Unit = {
|
||||
|
||||
val logger: Logger = LoggerFactory.getLogger(SparkConvertORCIDToOAF.getClass)
|
||||
|
||||
val conf: SparkConf = new SparkConf()
|
||||
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkConvertORCIDToOAF.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/convert_map_to_oaf_params.json")))
|
||||
parser.parseArgument(args)
|
||||
|
@ -26,19 +77,12 @@ object SparkConvertORCIDToOAF {
|
|||
.appName(getClass.getSimpleName)
|
||||
.master(parser.get("master")).getOrCreate()
|
||||
|
||||
implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
|
||||
implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPubs)
|
||||
import spark.implicits._
|
||||
|
||||
|
||||
val sourcePath = parser.get("sourcePath")
|
||||
val targetPath = parser.get("targetPath")
|
||||
val dataset:Dataset[ORCIDElement] = spark.read.json(sourcePath).as[ORCIDElement]
|
||||
run(spark, sourcePath, targetPath)
|
||||
|
||||
|
||||
logger.info("Converting ORCID to OAF")
|
||||
val d:RDD[Publication] = dataset.map(o => ORCIDToOAF.convertTOOAF(o)).filter(p=>p!=null).map(p=>(p.getId,p)).rdd.reduceByKey(ConversionUtil.mergePublication)
|
||||
.map(_._2)
|
||||
|
||||
spark.createDataset(d).as[Publication].write.mode(SaveMode.Overwrite).save(targetPath)
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -3,10 +3,8 @@ package eu.dnetlib.doiboost.orcid;
|
|||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.Date;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
|
@ -18,11 +16,9 @@ import org.apache.http.impl.client.CloseableHttpClient;
|
|||
import org.apache.http.impl.client.HttpClients;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaPairRDD;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.api.java.function.Function;
|
||||
import org.apache.spark.util.LongAccumulator;
|
||||
import org.mortbay.log.Log;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
|
@ -37,7 +33,7 @@ public class SparkDownloadOrcidAuthors {
|
|||
static final String DATE_FORMAT = "yyyy-MM-dd HH:mm:ss";
|
||||
static String lastUpdate;
|
||||
|
||||
public static void main(String[] args) throws IOException, Exception {
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
|
@ -52,12 +48,12 @@ public class SparkDownloadOrcidAuthors {
|
|||
.orElse(Boolean.TRUE);
|
||||
logger.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
final String workingPath = parser.get("workingPath");
|
||||
logger.info("workingPath: ", workingPath);
|
||||
logger.info("workingPath: {}", workingPath);
|
||||
final String outputPath = parser.get("outputPath");
|
||||
logger.info("outputPath: ", outputPath);
|
||||
logger.info("outputPath: {}", outputPath);
|
||||
final String token = parser.get("token");
|
||||
final String lambdaFileName = parser.get("lambdaFileName");
|
||||
logger.info("lambdaFileName: ", lambdaFileName);
|
||||
logger.info("lambdaFileName: {}", lambdaFileName);
|
||||
|
||||
lastUpdate = HDFSUtil.readFromTextFile(workingPath.concat("last_update.txt"));
|
||||
|
||||
|
@ -179,8 +175,8 @@ public class SparkDownloadOrcidAuthors {
|
|||
}
|
||||
|
||||
private static boolean isModified(String orcidId, String modifiedDate) {
|
||||
Date modifiedDateDt = null;
|
||||
Date lastUpdateDt = null;
|
||||
Date modifiedDateDt;
|
||||
Date lastUpdateDt;
|
||||
try {
|
||||
if (modifiedDate.length() != 19) {
|
||||
modifiedDate = modifiedDate.substring(0, 19);
|
||||
|
|
|
@ -5,5 +5,6 @@
|
|||
{"paramName": "cr", "paramLongName":"crossRefRelation", "paramDescription": "the UnpayWall Publication Path", "paramRequired": true},
|
||||
{"paramName": "da", "paramLongName":"dbaffiliationRelationPath", "paramDescription": "the MAG Publication Path", "paramRequired": true},
|
||||
{"paramName": "do", "paramLongName":"dbOrganizationPath", "paramDescription": "the MAG Publication Path", "paramRequired": true},
|
||||
{"paramName": "w", "paramLongName":"targetPath", "paramDescription": "the Working Path", "paramRequired": true}
|
||||
{"paramName": "w", "paramLongName":"targetPath", "paramDescription": "the Working Path", "paramRequired": true},
|
||||
{"paramName": "sp", "paramLongName":"sFilePath", "paramDescription": "the Sequence file Path", "paramRequired": true}
|
||||
]
|
||||
|
|
|
@ -3,5 +3,5 @@
|
|||
{"paramName": "hb", "paramLongName":"hostedByMapPath", "paramDescription": "the hosted By Map Path", "paramRequired": true},
|
||||
{"paramName": "ap", "paramLongName":"affiliationPath", "paramDescription": "the Affliation Path", "paramRequired": true},
|
||||
{"paramName": "pa", "paramLongName":"paperAffiliationPath", "paramDescription": "the paperAffiliation Path", "paramRequired": true},
|
||||
{"paramName": "w", "paramLongName":"workingDirPath", "paramDescription": "the Working Path", "paramRequired": true}
|
||||
{"paramName": "w", "paramLongName":"workingPath", "paramDescription": "the Working Path", "paramRequired": true}
|
||||
]
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
[
|
||||
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the path of the sequencial file to write", "paramRequired": true},
|
||||
{"paramName":"n", "paramLongName":"namenode", "paramDescription": "the hive metastore uris", "paramRequired": true},
|
||||
{"paramName":"ts", "paramLongName":"timestamp", "paramDescription": "timestamp", "paramRequired": false}
|
||||
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the path of the sequencial file to write", "paramRequired": true},
|
||||
{"paramName":"n", "paramLongName":"namenode", "paramDescription": "the hive metastore uris", "paramRequired": true},
|
||||
{"paramName":"ts", "paramLongName":"timestamp", "paramDescription": "timestamp", "paramRequired": false},
|
||||
{"paramName":"ess", "paramLongName":"esServer", "paramDescription": "elasticsearch server url", "paramRequired": true},
|
||||
{"paramName":"esi", "paramLongName":"esIndex", "paramDescription": "elasticsearch index name", "paramRequired": true}
|
||||
]
|
|
@ -39,14 +39,7 @@
|
|||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="ResetWorkingPath">
|
||||
<fs>
|
||||
<delete path='${workingDirPath}'/>
|
||||
<mkdir path='${workingDirPath}'/>
|
||||
</fs>
|
||||
<ok to="CreateDOIBoost"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="CreateDOIBoost">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
|
|
|
@ -8,6 +8,10 @@
|
|||
<name>targetPath</name>
|
||||
<description>the working dir base path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>workingPath</name>
|
||||
<description>the working dir base path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
|
@ -31,10 +35,10 @@
|
|||
|
||||
<action name="ResetWorkingPath">
|
||||
<fs>
|
||||
<delete path='${targetPath}'/>
|
||||
<mkdir path='${targetPath}'/>
|
||||
<delete path='${workingPath}'/>
|
||||
<mkdir path='${workingPath}'/>
|
||||
</fs>
|
||||
<ok to="PreprocessMag"/>
|
||||
<ok to="ConvertMagToDataset"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
@ -52,10 +56,10 @@
|
|||
${sparkExtraOPT}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||
<arg>--targetPath</arg><arg>${targetPath}</arg>
|
||||
<arg>--targetPath</arg><arg>${workingPath}</arg>
|
||||
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<ok to="PreprocessMag"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
@ -65,7 +69,7 @@
|
|||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Convert Mag to Dataset</name>
|
||||
<name>Convert Mag to OAF Dataset</name>
|
||||
<class>eu.dnetlib.doiboost.mag.SparkPreProcessMAG</class>
|
||||
<jar>dhp-doiboost-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
|
@ -75,7 +79,8 @@
|
|||
--conf spark.sql.shuffle.partitions=3840
|
||||
${sparkExtraOPT}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||
<arg>--sourcePath</arg><arg>${workingPath}</arg>
|
||||
<arg>--workingPath</arg><arg>${workingPath}/process</arg>
|
||||
<arg>--targetPath</arg><arg>${targetPath}</arg>
|
||||
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||
</spark>
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
[
|
||||
{"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the base path of MAG input", "paramRequired": true},
|
||||
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the working dir path", "paramRequired": true},
|
||||
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the target dir path", "paramRequired": true},
|
||||
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the working dir path", "paramRequired": true},
|
||||
{"paramName":"m", "paramLongName":"master", "paramDescription": "the master name", "paramRequired": true}
|
||||
|
||||
]
|
|
@ -0,0 +1,42 @@
|
|||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hive_metastore_uris</name>
|
||||
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2YarnHistoryServerAddress</name>
|
||||
<value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2EventLogDir</name>
|
||||
<value>/user/spark/spark2ApplicationHistory</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2ExtraListeners</name>
|
||||
<value>"com.cloudera.spark.lineage.NavigatorAppListener"</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2SqlQueryExecutionListeners</name>
|
||||
<value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -0,0 +1,335 @@
|
|||
<workflow-app name="Generate DOIBoost ActionSet" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorIntersectionMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores used by single executor</description>
|
||||
</property>
|
||||
|
||||
|
||||
<!-- Itersection Parameters -->
|
||||
<property>
|
||||
<name>workingPath</name>
|
||||
<description>the working Path</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>hostedByMapPath</name>
|
||||
<description>the hostedByMap Path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>outputPath</name>
|
||||
<description>the Path of the sequence file action set</description>
|
||||
</property>
|
||||
|
||||
|
||||
<!-- Crossref Parameters -->
|
||||
<property>
|
||||
<name>inputPathCrossref</name>
|
||||
<description>the Crossref input path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>crossrefTimestamp</name>
|
||||
<description>Timestamp for the Crossref incremental Harvesting</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>esServer</name>
|
||||
<description>elasticsearch server url for the Crossref Harvesting</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>esIndex</name>
|
||||
<description>elasticsearch index name for the Crossref Harvesting</description>
|
||||
</property>
|
||||
|
||||
<!-- MAG Parameters -->
|
||||
<property>
|
||||
<name>inputPathMAG</name>
|
||||
<description>the MAG working path</description>
|
||||
</property>
|
||||
|
||||
|
||||
<!-- UnpayWall Parameters -->
|
||||
<property>
|
||||
<name>inputPathUnpayWall</name>
|
||||
<description>the UnpayWall working path</description>
|
||||
</property>
|
||||
|
||||
<!-- ORCID Parameters -->
|
||||
<property>
|
||||
<name>inputPathOrcid</name>
|
||||
<description>the ORCID working path</description>
|
||||
</property>
|
||||
|
||||
</parameters>
|
||||
|
||||
<global>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>${oozieActionShareLibForSpark2}</value>
|
||||
</property>
|
||||
</configuration>
|
||||
</global>
|
||||
|
||||
<start to="resume_from"/>
|
||||
|
||||
<decision name="resume_from">
|
||||
<switch>
|
||||
<case to="ConvertCrossrefToOAF">${wf:conf('resumeFrom') eq 'ConvertCrossrefToOAF'}</case>
|
||||
<case to="ResetMagWorkingPath">${wf:conf('resumeFrom') eq 'ResetMagWorkingPath'}</case>
|
||||
<case to="ProcessMAG">${wf:conf('resumeFrom') eq 'PreprocessMag'}</case>
|
||||
<case to="ProcessUW">${wf:conf('resumeFrom') eq 'PreprocessUW'}</case>
|
||||
<case to="ProcessORCID">${wf:conf('resumeFrom') eq 'PreprocessORCID'}</case>
|
||||
<case to="CreateDOIBoost">${wf:conf('resumeFrom') eq 'CreateDOIBoost'}</case>
|
||||
<case to="GenerateActionSet">${wf:conf('resumeFrom') eq 'GenerateActionSet'}</case>
|
||||
<default to="ImportCrossRef"/>
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="ImportCrossRef">
|
||||
<java>
|
||||
<main-class>eu.dnetlib.doiboost.crossref.CrossrefImporter</main-class>
|
||||
<arg>--targetPath</arg><arg>${inputPathCrossref}/index_update</arg>
|
||||
<arg>--namenode</arg><arg>${nameNode}</arg>
|
||||
<arg>--esServer</arg><arg>${esServer}</arg>
|
||||
<arg>--esIndex</arg><arg>${esIndex}</arg>
|
||||
<arg>--timestamp</arg><arg>${crossrefTimestamp}</arg>
|
||||
</java>
|
||||
<ok to="GenerateCrossrefDataset"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<!-- CROSSREF SECTION -->
|
||||
|
||||
<action name="GenerateCrossrefDataset">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>GenerateCrossrefDataset</name>
|
||||
<class>eu.dnetlib.doiboost.crossref.CrossrefDataset</class>
|
||||
<jar>dhp-doiboost-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
${sparkExtraOPT}
|
||||
</spark-opts>
|
||||
<arg>--workingPath</arg><arg>${inputPathCrossref}</arg>
|
||||
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||
</spark>
|
||||
<ok to="RenameDataset"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="RenameDataset">
|
||||
<fs>
|
||||
<delete path="${inputPathCrossref}/crossref_ds"/>
|
||||
<move source="${inputPathCrossref}/crossref_ds_updated"
|
||||
target="${inputPathCrossref}/crossref_ds"/>
|
||||
</fs>
|
||||
<ok to="ConvertCrossrefToOAF"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="ConvertCrossrefToOAF">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>ConvertCrossrefToOAF</name>
|
||||
<class>eu.dnetlib.doiboost.crossref.SparkMapDumpIntoOAF</class>
|
||||
<jar>dhp-doiboost-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
${sparkExtraOPT}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${inputPathCrossref}/crossref_ds</arg>
|
||||
<arg>--targetPath</arg><arg>${workingPath}</arg>
|
||||
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||
</spark>
|
||||
<ok to="ResetMagWorkingPath"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
|
||||
<!-- MAG SECTION -->
|
||||
<action name="ResetMagWorkingPath">
|
||||
<fs>
|
||||
<delete path="${inputPathMAG}/dataset"/>
|
||||
<delete path="${inputPathMAG}/process"/>
|
||||
<delete path="${inputPathMAG}/dataset"/>
|
||||
</fs>
|
||||
<ok to="ConvertMagToDataset"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ConvertMagToDataset">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Convert Mag to Dataset</name>
|
||||
<class>eu.dnetlib.doiboost.mag.SparkImportMagIntoDataset</class>
|
||||
<jar>dhp-doiboost-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
${sparkExtraOPT}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${inputPathMAG}/input</arg>
|
||||
<arg>--targetPath</arg><arg>${inputPathMAG}/dataset</arg>
|
||||
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||
</spark>
|
||||
<ok to="ProcessMAG"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ProcessMAG">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Convert Mag to OAF Dataset</name>
|
||||
<class>eu.dnetlib.doiboost.mag.SparkProcessMAG</class>
|
||||
<jar>dhp-doiboost-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
${sparkExtraOPT}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${inputPathMAG}/dataset</arg>
|
||||
<arg>--workingPath</arg><arg>${inputPathMAG}/process</arg>
|
||||
<arg>--targetPath</arg><arg>${workingPath}</arg>
|
||||
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||
</spark>
|
||||
<ok to="ProcessUW"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<!-- UnpayWall SECTION -->
|
||||
|
||||
<action name="ProcessUW">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Convert UnpayWall to Dataset</name>
|
||||
<class>eu.dnetlib.doiboost.uw.SparkMapUnpayWallToOAF</class>
|
||||
<jar>dhp-doiboost-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
${sparkExtraOPT}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${inputPathUnpayWall}/uw_extracted</arg>
|
||||
<arg>--targetPath</arg><arg>${workingPath}</arg>
|
||||
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||
</spark>
|
||||
<ok to="ProcessORCID"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<!-- ORCID SECTION -->
|
||||
<action name="ProcessORCID">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Convert ORCID to Dataset</name>
|
||||
<class>eu.dnetlib.doiboost.orcid.SparkConvertORCIDToOAF</class>
|
||||
<jar>dhp-doiboost-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
${sparkExtraOPT}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${inputPathOrcid}</arg>
|
||||
<arg>--targetPath</arg><arg>${workingPath}</arg>
|
||||
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||
</spark>
|
||||
<ok to="CreateDOIBoost"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<!-- INTERSECTION SECTION-->
|
||||
<action name="CreateDOIBoost">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Create DOIBoost Infospace</name>
|
||||
<class>eu.dnetlib.doiboost.SparkGenerateDoiBoost</class>
|
||||
<jar>dhp-doiboost-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorIntersectionMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
${sparkExtraOPT}
|
||||
</spark-opts>
|
||||
<arg>--hostedByMapPath</arg><arg>${hostedByMapPath}</arg>
|
||||
<arg>--affiliationPath</arg><arg>${inputPathMAG}/process/Affiliations</arg>
|
||||
<arg>--paperAffiliationPath</arg><arg>${inputPathMAG}/process/PaperAuthorAffiliations</arg>
|
||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||
</spark>
|
||||
<ok to="GenerateActionSet"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="GenerateActionSet">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Generate DOIBoost ActionSet</name>
|
||||
<class>eu.dnetlib.doiboost.SparkGenerateDOIBoostActionSet</class>
|
||||
<jar>dhp-doiboost-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
${sparkExtraOPT}
|
||||
</spark-opts>
|
||||
<arg>--dbPublicationPath</arg><arg>${workingPath}/doiBoostPublicationFiltered</arg>
|
||||
<arg>--dbDatasetPath</arg><arg>${workingPath}/crossrefDataset</arg>
|
||||
<arg>--crossRefRelation</arg><arg>${workingPath}/crossrefRelation</arg>
|
||||
<arg>--dbaffiliationRelationPath</arg><arg>${workingPath}/doiBoostPublicationAffiliation</arg>
|
||||
<arg>--dbOrganizationPath</arg><arg>${workingPath}/doiBoostOrganization</arg>
|
||||
<arg>--targetPath</arg><arg>${workingPath}/actionDataSet</arg>
|
||||
<arg>--sFilePath</arg><arg>${outputPath}</arg>
|
||||
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -1,6 +1,9 @@
|
|||
package eu.dnetlib.doiboost.orcid
|
||||
|
||||
import org.codehaus.jackson.map.ObjectMapper
|
||||
import com.fasterxml.jackson.databind.ObjectMapper
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication
|
||||
import eu.dnetlib.doiboost.orcid.SparkConvertORCIDToOAF.getClass
|
||||
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
|
||||
import org.junit.jupiter.api.Assertions._
|
||||
import org.junit.jupiter.api.Test
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
@ -21,6 +24,30 @@ class MappingORCIDToOAFTest {
|
|||
})
|
||||
}
|
||||
|
||||
// @Test
|
||||
// def testOAFConvert():Unit ={
|
||||
//
|
||||
// val spark: SparkSession =
|
||||
// SparkSession
|
||||
// .builder()
|
||||
// .appName(getClass.getSimpleName)
|
||||
// .master("local[*]").getOrCreate()
|
||||
//
|
||||
//
|
||||
// SparkConvertORCIDToOAF.run( spark,"/Users/sandro/Downloads/orcid", "/Users/sandro/Downloads/orcid_oaf")
|
||||
// implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
|
||||
//
|
||||
// val df = spark.read.load("/Users/sandro/Downloads/orcid_oaf").as[Publication]
|
||||
// println(df.first.getId)
|
||||
// println(mapper.writeValueAsString(df.first()))
|
||||
//
|
||||
//
|
||||
//
|
||||
//
|
||||
// }
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -104,7 +104,7 @@ public class PrepareResultOrcidAssociationStep1 {
|
|||
+ " LATERAL VIEW EXPLODE (author) a AS MyT "
|
||||
+ " LATERAL VIEW EXPLODE (MyT.pid) p AS MyP "
|
||||
+ " WHERE lower(MyP.qualifier.classid) = '" + ModelConstants.ORCID + "' or "
|
||||
+ " lower(MyP.qalifier.classid) = '" + ModelConstants.ORCID_PENDING + "') tmp "
|
||||
+ " lower(MyP.qualifier.classid) = '" + ModelConstants.ORCID_PENDING + "') tmp "
|
||||
+ " GROUP BY id) r_t "
|
||||
+ " JOIN ("
|
||||
+ " SELECT source, target "
|
||||
|
|
|
@ -108,7 +108,7 @@ public class SparkResultToCommunityFromOrganizationJob {
|
|||
.stream()
|
||||
.map(con -> con.getId())
|
||||
.collect(Collectors.toList());
|
||||
Result res = new Result();
|
||||
R res = (R) ret.getClass().newInstance();
|
||||
res.setId(ret.getId());
|
||||
List<Context> propagatedContexts = new ArrayList<>();
|
||||
for (String cId : communitySet) {
|
||||
|
|
|
@ -130,7 +130,7 @@ public class SparkResultToCommunityThroughSemRelJob {
|
|||
})
|
||||
.filter(Objects::nonNull)
|
||||
.collect(Collectors.toList());
|
||||
Result r = new Result();
|
||||
R r = (R) ret.getClass().newInstance();
|
||||
r.setId(ret.getId());
|
||||
r.setContext(contextList);
|
||||
ret.mergeFrom(r);
|
||||
|
|
|
@ -170,6 +170,7 @@ public class OrcidPropagationJobTest {
|
|||
.filter(
|
||||
"id = '50|dedup_wf_001::95b033c0c3961f6a1cdcd41a99a9632e' "
|
||||
+ "and name = 'Vajinder' and surname = 'Kumar' and pidType = '" +
|
||||
|
||||
ModelConstants.ORCID_PENDING + "'")
|
||||
.count());
|
||||
|
||||
|
|
|
@ -24,7 +24,6 @@ import org.slf4j.LoggerFactory;
|
|||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.orcidtoresultfromsemrel.OrcidPropagationJobTest;
|
||||
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
||||
|
||||
public class ResultToCommunityJobTest {
|
||||
|
@ -66,7 +65,7 @@ public class ResultToCommunityJobTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
public void test1() throws Exception {
|
||||
public void testSparkResultToCommunityThroughSemRelJob() throws Exception {
|
||||
SparkResultToCommunityThroughSemRelJob
|
||||
.main(
|
||||
new String[] {
|
||||
|
|
|
@ -15,8 +15,11 @@ import eu.dnetlib.dhp.schema.oaf.*;
|
|||
|
||||
public class CleaningFunctions {
|
||||
|
||||
public static final String DOI_URL_PREFIX_REGEX = "(^http(s?):\\/\\/)(((dx\\.)?doi\\.org)|(handle\\.test\\.datacite\\.org))\\/";
|
||||
public static final String ORCID_PREFIX_REGEX = "^http(s?):\\/\\/orcid\\.org\\/";
|
||||
public static final String DOI_PREFIX_REGEX = "^10\\.";
|
||||
|
||||
public static final String ORCID_CLEANING_REGEX = ".*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9x]{4})";
|
||||
public static final int ORCID_LEN = 19;
|
||||
|
||||
public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)";
|
||||
|
||||
public static final Set<String> PID_BLACKLIST = new HashSet<>();
|
||||
|
@ -56,13 +59,19 @@ public class CleaningFunctions {
|
|||
}
|
||||
}
|
||||
if (Objects.nonNull(r.getAuthor())) {
|
||||
r.getAuthor().forEach(a -> {
|
||||
if (Objects.nonNull(a.getPid())) {
|
||||
a.getPid().forEach(p -> {
|
||||
fixVocabName(p.getQualifier(), ModelConstants.DNET_PID_TYPES);
|
||||
});
|
||||
}
|
||||
});
|
||||
r
|
||||
.getAuthor()
|
||||
.stream()
|
||||
.filter(Objects::nonNull)
|
||||
.forEach(a -> {
|
||||
if (Objects.nonNull(a.getPid())) {
|
||||
a
|
||||
.getPid()
|
||||
.stream()
|
||||
.filter(Objects::nonNull)
|
||||
.forEach(p -> fixVocabName(p.getQualifier(), ModelConstants.DNET_PID_TYPES));
|
||||
}
|
||||
});
|
||||
}
|
||||
if (value instanceof Publication) {
|
||||
|
||||
|
@ -86,7 +95,7 @@ public class CleaningFunctions {
|
|||
} else if (value instanceof Organization) {
|
||||
Organization o = (Organization) value;
|
||||
if (Objects.isNull(o.getCountry()) || StringUtils.isBlank(o.getCountry().getClassid())) {
|
||||
o.setCountry(qualifier("UNKNOWN", "Unknown", ModelConstants.DNET_COUNTRY_TYPE));
|
||||
o.setCountry(ModelConstants.UNKNOWN_COUNTRY);
|
||||
}
|
||||
} else if (value instanceof Relation) {
|
||||
// nothing to clean here
|
||||
|
@ -101,6 +110,16 @@ public class CleaningFunctions {
|
|||
.setLanguage(
|
||||
qualifier("und", "Undetermined", ModelConstants.DNET_LANGUAGES));
|
||||
}
|
||||
if (Objects.nonNull(r.getCountry())) {
|
||||
r
|
||||
.setCountry(
|
||||
r
|
||||
.getCountry()
|
||||
.stream()
|
||||
.filter(Objects::nonNull)
|
||||
.filter(c -> StringUtils.isNotBlank(c.getClassid()))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (Objects.nonNull(r.getSubject())) {
|
||||
r
|
||||
.setSubject(
|
||||
|
@ -153,12 +172,14 @@ public class CleaningFunctions {
|
|||
if (Objects.isNull(r.getResourcetype()) || StringUtils.isBlank(r.getResourcetype().getClassid())) {
|
||||
r
|
||||
.setResourcetype(
|
||||
qualifier("UNKNOWN", "Unknown", ModelConstants.DNET_DATA_CITE_RESOURCE));
|
||||
qualifier(ModelConstants.UNKNOWN, "Unknown", ModelConstants.DNET_DATA_CITE_RESOURCE));
|
||||
}
|
||||
if (Objects.nonNull(r.getInstance())) {
|
||||
for (Instance i : r.getInstance()) {
|
||||
if (Objects.isNull(i.getAccessright()) || StringUtils.isBlank(i.getAccessright().getClassid())) {
|
||||
i.setAccessright(qualifier("UNKNOWN", "not available", ModelConstants.DNET_ACCESS_MODES));
|
||||
i
|
||||
.setAccessright(
|
||||
qualifier(ModelConstants.UNKNOWN, "not available", ModelConstants.DNET_ACCESS_MODES));
|
||||
}
|
||||
if (Objects.isNull(i.getHostedby()) || StringUtils.isBlank(i.getHostedby().getKey())) {
|
||||
i.setHostedby(ModelConstants.UNKNOWN_REPOSITORY);
|
||||
|
@ -173,12 +194,22 @@ public class CleaningFunctions {
|
|||
if (Objects.isNull(bestaccessrights)) {
|
||||
r
|
||||
.setBestaccessright(
|
||||
qualifier("UNKNOWN", "not available", ModelConstants.DNET_ACCESS_MODES));
|
||||
qualifier(ModelConstants.UNKNOWN, "not available", ModelConstants.DNET_ACCESS_MODES));
|
||||
} else {
|
||||
r.setBestaccessright(bestaccessrights);
|
||||
}
|
||||
}
|
||||
if (Objects.nonNull(r.getAuthor())) {
|
||||
r
|
||||
.setAuthor(
|
||||
r
|
||||
.getAuthor()
|
||||
.stream()
|
||||
.filter(a -> Objects.nonNull(a))
|
||||
.filter(a -> StringUtils.isNotBlank(a.getFullname()))
|
||||
.filter(a -> StringUtils.isNotBlank(a.getFullname().replaceAll("[\\W]", "")))
|
||||
.collect(Collectors.toList()));
|
||||
|
||||
boolean nullRank = r
|
||||
.getAuthor()
|
||||
.stream()
|
||||
|
@ -199,6 +230,7 @@ public class CleaningFunctions {
|
|||
a
|
||||
.getPid()
|
||||
.stream()
|
||||
.filter(Objects::nonNull)
|
||||
.filter(p -> Objects.nonNull(p.getQualifier()))
|
||||
.filter(p -> StringUtils.isNotBlank(p.getValue()))
|
||||
.map(p -> {
|
||||
|
@ -211,14 +243,31 @@ public class CleaningFunctions {
|
|||
.map(Qualifier::getClassid)
|
||||
.orElse(""))
|
||||
.orElse("");
|
||||
if (pidProvenance.equals(ModelConstants.SYSIMPORT_CROSSWALK_ENTITYREGISTRY)) {
|
||||
p.getQualifier().setClassid(ModelConstants.ORCID);
|
||||
} else {
|
||||
p.getQualifier().setClassid(ModelConstants.ORCID_PENDING);
|
||||
if (p
|
||||
.getQualifier()
|
||||
.getClassid()
|
||||
.toLowerCase()
|
||||
.contains(ModelConstants.ORCID)) {
|
||||
if (pidProvenance
|
||||
.equals(ModelConstants.SYSIMPORT_CROSSWALK_ENTITYREGISTRY)) {
|
||||
p.getQualifier().setClassid(ModelConstants.ORCID);
|
||||
} else {
|
||||
p.getQualifier().setClassid(ModelConstants.ORCID_PENDING);
|
||||
}
|
||||
final String orcid = p
|
||||
.getValue()
|
||||
.trim()
|
||||
.toLowerCase()
|
||||
.replaceAll(ORCID_CLEANING_REGEX, "$1-$2-$3-$4");
|
||||
if (orcid.length() == ORCID_LEN) {
|
||||
p.setValue(orcid);
|
||||
} else {
|
||||
p.setValue("");
|
||||
}
|
||||
}
|
||||
p.setValue(p.getValue().trim().replaceAll(ORCID_PREFIX_REGEX, ""));
|
||||
return p;
|
||||
})
|
||||
.filter(p -> StringUtils.isNotBlank(p.getValue()))
|
||||
.collect(
|
||||
Collectors
|
||||
.toMap(
|
||||
|
@ -286,7 +335,7 @@ public class CleaningFunctions {
|
|||
|
||||
// TODO add cleaning for more PID types as needed
|
||||
case "doi":
|
||||
pid.setValue(value.toLowerCase().replaceAll(DOI_URL_PREFIX_REGEX, ""));
|
||||
pid.setValue(value.toLowerCase().replaceAll(DOI_PREFIX_REGEX, "10."));
|
||||
break;
|
||||
}
|
||||
return pid;
|
||||
|
|
|
@ -21,6 +21,7 @@ import org.apache.spark.sql.expressions.Aggregator;
|
|||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.DeserializationFeature;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.jayway.jsonpath.Configuration;
|
||||
import com.jayway.jsonpath.DocumentContext;
|
||||
|
@ -44,7 +45,8 @@ public class GroupEntitiesAndRelationsSparkJob {
|
|||
|
||||
private final static String SOURCE_JPATH = "$.source";
|
||||
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper()
|
||||
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
|
|
|
@ -40,4 +40,18 @@ public class Constants {
|
|||
coarCodeLabelMap.put("c_14cb", "CLOSED");
|
||||
coarCodeLabelMap.put("c_f1cf", "EMBARGO");
|
||||
}
|
||||
|
||||
public enum DUMPTYPE {
|
||||
COMPLETE("complete"), COMMUNITY("community"), FUNDER("funder");
|
||||
|
||||
private String type;
|
||||
|
||||
DUMPTYPE(String type) {
|
||||
this.type = type;
|
||||
}
|
||||
|
||||
public String getType() {
|
||||
return type;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -11,17 +11,12 @@ import java.util.Set;
|
|||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.FilterFunction;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
|
||||
import eu.dnetlib.dhp.oa.graph.dump.ResultMapper;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.Utils;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap;
|
||||
import eu.dnetlib.dhp.schema.dump.oaf.graph.ResearchInitiative;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
|
||||
/**
|
||||
|
@ -33,7 +28,7 @@ public class DumpProducts implements Serializable {
|
|||
public void run(Boolean isSparkSessionManaged, String inputPath, String outputPath, String communityMapPath,
|
||||
Class<? extends OafEntity> inputClazz,
|
||||
Class<? extends eu.dnetlib.dhp.schema.dump.oaf.Result> outputClazz,
|
||||
boolean graph) {
|
||||
String dumpType) {
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
|
||||
|
@ -42,7 +37,7 @@ public class DumpProducts implements Serializable {
|
|||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
Utils.removeOutputDir(spark, outputPath);
|
||||
execDump(spark, inputPath, outputPath, communityMapPath, inputClazz, outputClazz, graph);
|
||||
execDump(spark, inputPath, outputPath, communityMapPath, inputClazz, outputClazz, dumpType);
|
||||
});
|
||||
}
|
||||
|
||||
|
@ -53,13 +48,13 @@ public class DumpProducts implements Serializable {
|
|||
String communityMapPath,
|
||||
Class<I> inputClazz,
|
||||
Class<O> outputClazz,
|
||||
boolean graph) {
|
||||
String dumpType) {
|
||||
|
||||
CommunityMap communityMap = Utils.getCommunityMap(spark, communityMapPath);
|
||||
|
||||
Utils
|
||||
.readPath(spark, inputPath, inputClazz)
|
||||
.map((MapFunction<I, O>) value -> execMap(value, communityMap, graph), Encoders.bean(outputClazz))
|
||||
.map((MapFunction<I, O>) value -> execMap(value, communityMap, dumpType), Encoders.bean(outputClazz))
|
||||
.filter(Objects::nonNull)
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
|
@ -70,18 +65,18 @@ public class DumpProducts implements Serializable {
|
|||
|
||||
private static <I extends OafEntity, O extends eu.dnetlib.dhp.schema.dump.oaf.Result> O execMap(I value,
|
||||
CommunityMap communityMap,
|
||||
boolean graph) {
|
||||
String dumpType) {
|
||||
|
||||
Optional<DataInfo> odInfo = Optional.ofNullable(value.getDataInfo());
|
||||
if (odInfo.isPresent()) {
|
||||
if (odInfo.get().getDeletedbyinference()) {
|
||||
if (odInfo.get().getDeletedbyinference() || odInfo.get().getInvisible()) {
|
||||
return null;
|
||||
}
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
|
||||
if (!graph) {
|
||||
if (Constants.DUMPTYPE.COMMUNITY.getType().equals(dumpType)) {
|
||||
Set<String> communities = communityMap.keySet();
|
||||
|
||||
Optional<List<Context>> inputContext = Optional
|
||||
|
@ -102,7 +97,8 @@ public class DumpProducts implements Serializable {
|
|||
return null;
|
||||
}
|
||||
}
|
||||
return (O) ResultMapper.map(value, communityMap, graph);
|
||||
|
||||
return (O) ResultMapper.map(value, communityMap, dumpType);
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
@ -21,10 +21,10 @@ import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
|||
public class ResultMapper implements Serializable {
|
||||
|
||||
public static <E extends eu.dnetlib.dhp.schema.oaf.OafEntity> Result map(
|
||||
E in, Map<String, String> communityMap, boolean graph) {
|
||||
E in, Map<String, String> communityMap, String dumpType) {
|
||||
|
||||
Result out;
|
||||
if (graph) {
|
||||
if (Constants.DUMPTYPE.COMPLETE.getType().equals(dumpType)) {
|
||||
out = new GraphResult();
|
||||
} else {
|
||||
out = new CommunityResult();
|
||||
|
@ -217,7 +217,7 @@ public class ResultMapper implements Serializable {
|
|||
.ofNullable(input.getInstance());
|
||||
|
||||
if (oInst.isPresent()) {
|
||||
if (graph) {
|
||||
if (Constants.DUMPTYPE.COMPLETE.getType().equals(dumpType)) {
|
||||
((GraphResult) out)
|
||||
.setInstance(oInst.get().stream().map(i -> getGraphInstance(i)).collect(Collectors.toList()));
|
||||
} else {
|
||||
|
@ -296,7 +296,7 @@ public class ResultMapper implements Serializable {
|
|||
out.setType(input.getResulttype().getClassid());
|
||||
}
|
||||
|
||||
if (!graph) {
|
||||
if (!Constants.DUMPTYPE.COMPLETE.getType().equals(dumpType)) {
|
||||
((CommunityResult) out)
|
||||
.setCollectedfrom(
|
||||
input
|
||||
|
|
|
@ -9,6 +9,7 @@ import java.util.Set;
|
|||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.FilterFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
|
@ -54,7 +55,7 @@ public class CommunitySplit implements Serializable {
|
|||
|
||||
private static void printResult(String c, Dataset<CommunityResult> result, String outputPath) {
|
||||
Dataset<CommunityResult> community_products = result
|
||||
.filter(r -> containsCommunity(r, c));
|
||||
.filter((FilterFunction<CommunityResult>) r -> containsCommunity(r, c));
|
||||
|
||||
try {
|
||||
community_products.first();
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue