Merge branch 'master' of https://code-repo.d4science.org/D-Net/dnet-hadoop into orcid-no-doi

pull/98/head
Enrico Ottonello 3 years ago
commit c238561001

@ -0,0 +1,30 @@
package eu.dnetlib.dhp.common;
import java.util.Map;
import com.google.common.collect.Maps;
public class Constants {
public static final Map<String, String> accessRightsCoarMap = Maps.newHashMap();
public static final Map<String, String> coarCodeLabelMap = Maps.newHashMap();
public static String COAR_ACCESS_RIGHT_SCHEMA = "http://vocabularies.coar-repositories.org/documentation/access_rights/";
static {
accessRightsCoarMap.put("OPEN", "c_abf2");
accessRightsCoarMap.put("RESTRICTED", "c_16ec");
accessRightsCoarMap.put("OPEN SOURCE", "c_abf2");
accessRightsCoarMap.put("CLOSED", "c_14cb");
accessRightsCoarMap.put("EMBARGO", "c_f1cf");
}
static {
coarCodeLabelMap.put("c_abf2", "OPEN");
coarCodeLabelMap.put("c_16ec", "RESTRICTED");
coarCodeLabelMap.put("c_14cb", "CLOSED");
coarCodeLabelMap.put("c_f1cf", "EMBARGO");
}
}

@ -0,0 +1,412 @@
package eu.dnetlib.dhp.common;
import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.dump.oaf.*;
import eu.dnetlib.dhp.schema.dump.oaf.community.CommunityInstance;
import eu.dnetlib.dhp.schema.dump.oaf.community.CommunityResult;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.Journal;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class GraphResultMapper implements Serializable {
public static <E extends eu.dnetlib.dhp.schema.oaf.OafEntity> Result map(
E in) {
CommunityResult out = new CommunityResult();
eu.dnetlib.dhp.schema.oaf.Result input = (eu.dnetlib.dhp.schema.oaf.Result) in;
Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> ort = Optional.ofNullable(input.getResulttype());
if (ort.isPresent()) {
switch (ort.get().getClassid()) {
case "publication":
Optional<Journal> journal = Optional
.ofNullable(((eu.dnetlib.dhp.schema.oaf.Publication) input).getJournal());
if (journal.isPresent()) {
Journal j = journal.get();
Container c = new Container();
c.setConferencedate(j.getConferencedate());
c.setConferenceplace(j.getConferenceplace());
c.setEdition(j.getEdition());
c.setEp(j.getEp());
c.setIss(j.getIss());
c.setIssnLinking(j.getIssnLinking());
c.setIssnOnline(j.getIssnOnline());
c.setIssnPrinted(j.getIssnPrinted());
c.setName(j.getName());
c.setSp(j.getSp());
c.setVol(j.getVol());
out.setContainer(c);
out.setType(ModelConstants.PUBLICATION_DEFAULT_RESULTTYPE.getClassname());
}
break;
case "dataset":
eu.dnetlib.dhp.schema.oaf.Dataset id = (eu.dnetlib.dhp.schema.oaf.Dataset) input;
Optional.ofNullable(id.getSize()).ifPresent(v -> out.setSize(v.getValue()));
Optional.ofNullable(id.getVersion()).ifPresent(v -> out.setVersion(v.getValue()));
out
.setGeolocation(
Optional
.ofNullable(id.getGeolocation())
.map(
igl -> igl
.stream()
.filter(Objects::nonNull)
.map(gli -> {
GeoLocation gl = new GeoLocation();
gl.setBox(gli.getBox());
gl.setPlace(gli.getPlace());
gl.setPoint(gli.getPoint());
return gl;
})
.collect(Collectors.toList()))
.orElse(null));
out.setType(ModelConstants.DATASET_DEFAULT_RESULTTYPE.getClassname());
break;
case "software":
eu.dnetlib.dhp.schema.oaf.Software is = (eu.dnetlib.dhp.schema.oaf.Software) input;
Optional
.ofNullable(is.getCodeRepositoryUrl())
.ifPresent(value -> out.setCodeRepositoryUrl(value.getValue()));
Optional
.ofNullable(is.getDocumentationUrl())
.ifPresent(
value -> out
.setDocumentationUrl(
value
.stream()
.map(v -> v.getValue())
.collect(Collectors.toList())));
Optional
.ofNullable(is.getProgrammingLanguage())
.ifPresent(value -> out.setProgrammingLanguage(value.getClassid()));
out.setType(ModelConstants.SOFTWARE_DEFAULT_RESULTTYPE.getClassname());
break;
case "other":
eu.dnetlib.dhp.schema.oaf.OtherResearchProduct ir = (eu.dnetlib.dhp.schema.oaf.OtherResearchProduct) input;
out
.setContactgroup(
Optional
.ofNullable(ir.getContactgroup())
.map(value -> value.stream().map(cg -> cg.getValue()).collect(Collectors.toList()))
.orElse(null));
out
.setContactperson(
Optional
.ofNullable(ir.getContactperson())
.map(value -> value.stream().map(cp -> cp.getValue()).collect(Collectors.toList()))
.orElse(null));
out
.setTool(
Optional
.ofNullable(ir.getTool())
.map(value -> value.stream().map(t -> t.getValue()).collect(Collectors.toList()))
.orElse(null));
out.setType(ModelConstants.ORP_DEFAULT_RESULTTYPE.getClassname());
break;
}
Optional
.ofNullable(input.getAuthor())
.ifPresent(ats -> out.setAuthor(ats.stream().map(at -> getAuthor(at)).collect(Collectors.toList())));
// I do not map Access Right UNKNOWN or OTHER
Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> oar = Optional.ofNullable(input.getBestaccessright());
if (oar.isPresent()) {
if (Constants.accessRightsCoarMap.containsKey(oar.get().getClassid())) {
String code = Constants.accessRightsCoarMap.get(oar.get().getClassid());
out
.setBestaccessright(
AccessRight
.newInstance(
code,
Constants.coarCodeLabelMap.get(code),
Constants.COAR_ACCESS_RIGHT_SCHEMA));
}
}
final List<String> contributorList = new ArrayList<>();
Optional
.ofNullable(input.getContributor())
.ifPresent(value -> value.stream().forEach(c -> contributorList.add(c.getValue())));
out.setContributor(contributorList);
Optional
.ofNullable(input.getCountry())
.ifPresent(
value -> out
.setCountry(
value
.stream()
.map(
c -> {
if (c.getClassid().equals((ModelConstants.UNKNOWN))) {
return null;
}
Country country = new Country();
country.setCode(c.getClassid());
country.setLabel(c.getClassname());
Optional
.ofNullable(c.getDataInfo())
.ifPresent(
provenance -> country
.setProvenance(
Provenance
.newInstance(
provenance
.getProvenanceaction()
.getClassname(),
c.getDataInfo().getTrust())));
return country;
})
.filter(Objects::nonNull)
.collect(Collectors.toList())));
final List<String> coverageList = new ArrayList<>();
Optional
.ofNullable(input.getCoverage())
.ifPresent(value -> value.stream().forEach(c -> coverageList.add(c.getValue())));
out.setCoverage(coverageList);
out.setDateofcollection(input.getDateofcollection());
final List<String> descriptionList = new ArrayList<>();
Optional
.ofNullable(input.getDescription())
.ifPresent(value -> value.forEach(d -> descriptionList.add(d.getValue())));
out.setDescription(descriptionList);
Optional<Field<String>> oStr = Optional.ofNullable(input.getEmbargoenddate());
if (oStr.isPresent()) {
out.setEmbargoenddate(oStr.get().getValue());
}
final List<String> formatList = new ArrayList<>();
Optional
.ofNullable(input.getFormat())
.ifPresent(value -> value.stream().forEach(f -> formatList.add(f.getValue())));
out.setFormat(formatList);
out.setId(input.getId());
out.setOriginalId(input.getOriginalId());
Optional<List<eu.dnetlib.dhp.schema.oaf.Instance>> oInst = Optional
.ofNullable(input.getInstance());
if (oInst.isPresent()) {
out
.setInstance(
oInst.get().stream().map(i -> getInstance(i)).collect(Collectors.toList()));
}
Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> oL = Optional.ofNullable(input.getLanguage());
if (oL.isPresent()) {
eu.dnetlib.dhp.schema.oaf.Qualifier language = oL.get();
out.setLanguage(Qualifier.newInstance(language.getClassid(), language.getClassname()));
}
Optional<Long> oLong = Optional.ofNullable(input.getLastupdatetimestamp());
if (oLong.isPresent()) {
out.setLastupdatetimestamp(oLong.get());
}
Optional<List<StructuredProperty>> otitle = Optional.ofNullable(input.getTitle());
if (otitle.isPresent()) {
List<StructuredProperty> iTitle = otitle
.get()
.stream()
.filter(t -> t.getQualifier().getClassid().equalsIgnoreCase("main title"))
.collect(Collectors.toList());
if (iTitle.size() > 0) {
out.setMaintitle(iTitle.get(0).getValue());
}
iTitle = otitle
.get()
.stream()
.filter(t -> t.getQualifier().getClassid().equalsIgnoreCase("subtitle"))
.collect(Collectors.toList());
if (iTitle.size() > 0) {
out.setSubtitle(iTitle.get(0).getValue());
}
}
List<ControlledField> pids = new ArrayList<>();
Optional
.ofNullable(input.getPid())
.ifPresent(
value -> value
.stream()
.forEach(
p -> pids
.add(
ControlledField
.newInstance(p.getQualifier().getClassid(), p.getValue()))));
out.setPid(pids);
oStr = Optional.ofNullable(input.getDateofacceptance());
if (oStr.isPresent()) {
out.setPublicationdate(oStr.get().getValue());
}
oStr = Optional.ofNullable(input.getPublisher());
if (oStr.isPresent()) {
out.setPublisher(oStr.get().getValue());
}
List<String> sourceList = new ArrayList<>();
Optional
.ofNullable(input.getSource())
.ifPresent(value -> value.stream().forEach(s -> sourceList.add(s.getValue())));
// out.setSource(input.getSource().stream().map(s -> s.getValue()).collect(Collectors.toList()));
List<Subject> subjectList = new ArrayList<>();
Optional
.ofNullable(input.getSubject())
.ifPresent(
value -> value
.forEach(s -> subjectList.add(getSubject(s))));
out.setSubjects(subjectList);
out.setType(input.getResulttype().getClassid());
}
out
.setCollectedfrom(
input
.getCollectedfrom()
.stream()
.map(cf -> KeyValue.newInstance(cf.getKey(), cf.getValue()))
.collect(Collectors.toList()));
return out;
}
private static CommunityInstance getInstance(eu.dnetlib.dhp.schema.oaf.Instance i) {
CommunityInstance instance = new CommunityInstance();
setCommonValue(i, instance);
instance
.setCollectedfrom(
KeyValue
.newInstance(i.getCollectedfrom().getKey(), i.getCollectedfrom().getValue()));
instance
.setHostedby(
KeyValue.newInstance(i.getHostedby().getKey(), i.getHostedby().getValue()));
return instance;
}
private static <I extends Instance> void setCommonValue(eu.dnetlib.dhp.schema.oaf.Instance i, I instance) {
Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> opAr = Optional
.ofNullable(i.getAccessright());
if (opAr.isPresent()) {
if (Constants.accessRightsCoarMap.containsKey(opAr.get().getClassid())) {
String code = Constants.accessRightsCoarMap.get(opAr.get().getClassid());
instance
.setAccessright(
AccessRight
.newInstance(
code,
Constants.coarCodeLabelMap.get(code),
Constants.COAR_ACCESS_RIGHT_SCHEMA));
}
}
Optional
.ofNullable(i.getLicense())
.ifPresent(value -> instance.setLicense(value.getValue()));
Optional
.ofNullable(i.getDateofacceptance())
.ifPresent(value -> instance.setPublicationdate(value.getValue()));
Optional
.ofNullable(i.getRefereed())
.ifPresent(value -> instance.setRefereed(value.getClassname()));
Optional
.ofNullable(i.getInstancetype())
.ifPresent(value -> instance.setType(value.getClassname()));
Optional.ofNullable(i.getUrl()).ifPresent(value -> instance.setUrl(value));
}
private static Subject getSubject(StructuredProperty s) {
Subject subject = new Subject();
subject.setSubject(ControlledField.newInstance(s.getQualifier().getClassid(), s.getValue()));
Optional<DataInfo> di = Optional.ofNullable(s.getDataInfo());
if (di.isPresent()) {
Provenance p = new Provenance();
p.setProvenance(di.get().getProvenanceaction().getClassname());
p.setTrust(di.get().getTrust());
subject.setProvenance(p);
}
return subject;
}
private static Author getAuthor(eu.dnetlib.dhp.schema.oaf.Author oa) {
Author a = new Author();
a.setFullname(oa.getFullname());
a.setName(oa.getName());
a.setSurname(oa.getSurname());
a.setRank(oa.getRank());
Optional<List<StructuredProperty>> oPids = Optional
.ofNullable(oa.getPid());
if (oPids.isPresent()) {
Pid pid = getOrcid(oPids.get());
if (pid != null) {
a.setPid(pid);
}
}
return a;
}
private static Pid getOrcid(List<StructuredProperty> p) {
for (StructuredProperty pid : p) {
if (pid.getQualifier().getClassid().equals(ModelConstants.ORCID)) {
Optional<DataInfo> di = Optional.ofNullable(pid.getDataInfo());
if (di.isPresent()) {
return Pid
.newInstance(
ControlledField
.newInstance(
pid.getQualifier().getClassid(),
pid.getValue()),
Provenance
.newInstance(
di.get().getProvenanceaction().getClassname(),
di.get().getTrust()));
} else {
return Pid
.newInstance(
ControlledField
.newInstance(
pid.getQualifier().getClassid(),
pid.getValue())
);
}
}
}
return null;
}
}

@ -90,9 +90,6 @@ public class MakeTarArchive implements Serializable {
String p_string = p.toString();
if (!p_string.endsWith("_SUCCESS")) {
String name = p_string.substring(p_string.lastIndexOf("/") + 1);
if (name.trim().equalsIgnoreCase("communities_infrastructures")) {
name = "communities_infrastructures.json";
}
TarArchiveEntry entry = new TarArchiveEntry(dir_name + "/" + name);
entry.setSize(fileStatus.getLen());
current_size += fileStatus.getLen();

@ -4,6 +4,7 @@ package eu.dnetlib.dhp.oa.merge;
import java.text.Normalizer;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.lang3.StringUtils;
@ -32,27 +33,33 @@ public class AuthorMerger {
}
public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b) {
public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b, Double threshold) {
int pa = countAuthorsPids(a);
int pb = countAuthorsPids(b);
List<Author> base, enrich;
int sa = authorsSize(a);
int sb = authorsSize(b);
if (pa == pb) {
base = sa > sb ? a : b;
enrich = sa > sb ? b : a;
} else {
if (sa == sb) {
base = pa > pb ? a : b;
enrich = pa > pb ? b : a;
} else {
base = sa > sb ? a : b;
enrich = sa > sb ? b : a;
}
enrichPidFromList(base, enrich);
enrichPidFromList(base, enrich, threshold);
return base;
}
private static void enrichPidFromList(List<Author> base, List<Author> enrich) {
public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b) {
return mergeAuthor(a, b, THRESHOLD);
}
private static void enrichPidFromList(List<Author> base, List<Author> enrich, Double threshold) {
if (base == null || enrich == null)
return;
// <pidComparableString, Author> (if an Author has more than 1 pid, it appears 2 times in the list)
final Map<String, Author> basePidAuthorMap = base
.stream()
.filter(a -> a.getPid() != null && a.getPid().size() > 0)
@ -63,6 +70,7 @@ public class AuthorMerger {
.map(p -> new Tuple2<>(pidToComparableString(p), a)))
.collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1));
// <pid, Author> (list of pid that are missing in the other list)
final List<Tuple2<StructuredProperty, Author>> pidToEnrich = enrich
.stream()
.filter(a -> a.getPid() != null && a.getPid().size() > 0)
@ -83,10 +91,10 @@ public class AuthorMerger {
.max(Comparator.comparing(Tuple2::_1));
if (simAuthor.isPresent()) {
double th = THRESHOLD;
double th = threshold;
// increase the threshold if the surname is too short
if (simAuthor.get()._2().getSurname() != null
&& simAuthor.get()._2().getSurname().length() <= 3)
&& simAuthor.get()._2().getSurname().length() <= 3 && threshold > 0.0)
th = 0.99;
if (simAuthor.get()._1() > th) {
@ -156,7 +164,7 @@ public class AuthorMerger {
}
private static String normalize(final String s) {
return nfd(s)
String[] normalized = nfd(s)
.toLowerCase()
// do not compact the regexes in a single expression, would cause StackOverflowError
// in case
@ -166,7 +174,12 @@ public class AuthorMerger {
.replaceAll("(\\p{Punct})+", " ")
.replaceAll("(\\d)+", " ")
.replaceAll("(\\n)+", " ")
.trim();
.trim()
.split(" ");
Arrays.sort(normalized);
return String.join(" ", normalized);
}
private static String nfd(final String s) {

@ -0,0 +1,100 @@
package eu.dnetlib.dhp.oa.merge;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.pace.util.MapDocumentUtil;
import scala.Tuple2;
public class AuthorMergerTest {
private String publicationsBasePath;
private List<List<Author>> authors;
@BeforeEach
public void setUp() throws Exception {
publicationsBasePath = Paths
.get(AuthorMergerTest.class.getResource("/eu/dnetlib/dhp/oa/merge").toURI())
.toFile()
.getAbsolutePath();
authors = readSample(publicationsBasePath + "/publications_with_authors.json", Publication.class)
.stream()
.map(p -> p._2().getAuthor())
.collect(Collectors.toList());
}
@Test
public void mergeTest() { // used in the dedup: threshold set to 0.95
for (List<Author> authors1 : authors) {
System.out.println("List " + (authors.indexOf(authors1) + 1));
for (Author author : authors1) {
System.out.println(authorToString(author));
}
}
List<Author> merge = AuthorMerger.merge(authors);
System.out.println("Merge ");
for (Author author : merge) {
System.out.println(authorToString(author));
}
Assertions.assertEquals(7, merge.size());
}
public <T> List<Tuple2<String, T>> readSample(String path, Class<T> clazz) {
List<Tuple2<String, T>> res = new ArrayList<>();
BufferedReader reader;
try {
reader = new BufferedReader(new FileReader(path));
String line = reader.readLine();
while (line != null) {
res
.add(
new Tuple2<>(
MapDocumentUtil.getJPathString("$.id", line),
new ObjectMapper().readValue(line, clazz)));
// read next line
line = reader.readLine();
}
reader.close();
} catch (IOException e) {
e.printStackTrace();
}
return res;
}
public String authorToString(Author a) {
String print = "Fullname = ";
print += a.getFullname() + " pid = [";
if (a.getPid() != null)
for (StructuredProperty sp : a.getPid()) {
print += sp.toComparableString() + " ";
}
print += "]";
return print;
}
}

File diff suppressed because one or more lines are too long

@ -105,6 +105,8 @@ public class ModelConstants {
public static final KeyValue UNKNOWN_REPOSITORY = keyValue(
"10|openaire____::55045bd2a65019fd8e6741a755395c8c", "Unknown Repository");
public static final Qualifier UNKNOWN_COUNTRY = qualifier(UNKNOWN, "Unknown", DNET_COUNTRY_TYPE, DNET_COUNTRY_TYPE);
private static Qualifier qualifier(
final String classid,
final String classname,

@ -2,8 +2,12 @@
package eu.dnetlib.dhp.schema.oaf;
import java.io.Serializable;
import java.util.Collection;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.stream.Collectors;
import java.util.stream.Stream;
public abstract class Oaf implements Serializable {
@ -40,9 +44,34 @@ public abstract class Oaf implements Serializable {
this.lastupdatetimestamp = lastupdatetimestamp;
}
public void mergeOAFDataInfo(Oaf e) {
if (e.getDataInfo() != null && compareTrust(this, e) < 0)
dataInfo = e.getDataInfo();
public void mergeFrom(Oaf o) {
if (Objects.isNull(o)) {
return;
}
setCollectedfrom(
Stream
.concat(
Optional
.ofNullable(getCollectedfrom())
.map(Collection::stream)
.orElse(Stream.empty()),
Optional
.ofNullable(o.getCollectedfrom())
.map(Collection::stream)
.orElse(Stream.empty()))
.distinct() // relies on KeyValue.equals
.collect(Collectors.toList()));
setLastupdatetimestamp(
Math
.max(
Optional.ofNullable(getLastupdatetimestamp()).orElse(0L),
Optional.ofNullable(o.getLastupdatetimestamp()).orElse(0L)));
}
public void mergeOAFDataInfo(Oaf o) {
if (o.getDataInfo() != null && compareTrust(this, o) < 0)
dataInfo = o.getDataInfo();
}
protected String extractTrust(Oaf e) {
@ -62,7 +91,7 @@ public abstract class Oaf implements Serializable {
if (o == null || getClass() != o.getClass())
return false;
Oaf oaf = (Oaf) o;
return Objects.equals(dataInfo, oaf.dataInfo)
return Objects.equals(getDataInfo(), oaf.getDataInfo())
&& Objects.equals(lastupdatetimestamp, oaf.lastupdatetimestamp);
}

@ -78,14 +78,10 @@ public abstract class OafEntity extends Oaf implements Serializable {
}
public void mergeFrom(OafEntity e) {
if (e == null)
return;
super.mergeFrom(e);
originalId = mergeLists(originalId, e.getOriginalId());
collectedfrom = mergeLists(collectedfrom, e.getCollectedfrom());
pid = mergeLists(pid, e.getPid());
if (e.getDateofcollection() != null && compareTrust(this, e) < 0)

@ -351,8 +351,6 @@ public class Project extends OafEntity implements Serializable {
? p.getFundedamount()
: fundedamount;
// programme = mergeLists(programme, p.getProgramme());
h2020classification = mergeLists(h2020classification, p.getH2020classification());
mergeOAFDataInfo(e);

@ -130,19 +130,7 @@ public class Relation extends Oaf {
Objects.equals(getSubRelType(), r.getSubRelType()), "subRelType(s) must be equal");
checkArgument(Objects.equals(getRelClass(), r.getRelClass()), "relClass(es) must be equal");
setCollectedfrom(
Stream
.concat(
Optional
.ofNullable(getCollectedfrom())
.map(Collection::stream)
.orElse(Stream.empty()),
Optional
.ofNullable(r.getCollectedfrom())
.map(Collection::stream)
.orElse(Stream.empty()))
.distinct() // relies on KeyValue.equals
.collect(Collectors.toList()));
super.mergeFrom(r);
}
@Override

@ -243,7 +243,7 @@ public class Result extends OafEntity implements Serializable {
Result r = (Result) e;
// TODO consider merging also Measures
measures = mergeLists(measures, r.getMeasures());
instance = mergeLists(instance, r.getInstance());
@ -323,13 +323,13 @@ public class Result extends OafEntity implements Serializable {
if (a.size() == b.size()) {
int msa = a
.stream()
.filter(i -> i.getValue() != null)
.filter(i -> i != null && i.getValue() != null)
.map(i -> i.getValue().length())
.max(Comparator.naturalOrder())
.orElse(0);
int msb = b
.stream()
.filter(i -> i.getValue() != null)
.filter(i -> i != null && i.getValue() != null)
.map(i -> i.getValue().length())
.max(Comparator.naturalOrder())
.orElse(0);

@ -0,0 +1,28 @@
package eu.dnetlib.dhp.actionmanager.bipfinder;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
/**
* Class that maps the model of the bipFinder! input data.
* Only needed for deserialization purposes
*/
public class BipDeserialize extends HashMap<String, List<Score>> implements Serializable {
public BipDeserialize() {
super();
}
public List<Score> get(String key) {
if (super.get(key) == null) {
return new ArrayList<>();
}
return super.get(key);
}
}

@ -0,0 +1,30 @@
package eu.dnetlib.dhp.actionmanager.bipfinder;
import java.io.Serializable;
import java.util.List;
/**
* Rewriting of the bipFinder input data by extracting the identifier of the result (doi)
*/
public class BipScore implements Serializable {
private String id; // doi
private List<Score> scoreList; // unit as given in the inputfile
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public List<Score> getScoreList() {
return scoreList;
}
public void setScoreList(List<Score> scoreList) {
this.scoreList = scoreList;
}
}

@ -0,0 +1,85 @@
package eu.dnetlib.dhp.actionmanager.bipfinder;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.schema.oaf.Result;
/**
* Just collects all the atomic actions produced for the different results and saves them in
* outputpath for the ActionSet
*/
public class CollectAndSave implements Serializable {
private static final Logger log = LoggerFactory.getLogger(CollectAndSave.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static <I extends Result> void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
CollectAndSave.class
.getResourceAsStream(
"/eu/dnetlib/dhp/actionmanager/bipfinder/input_actionset_parameter.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("inputPath");
log.info("inputPath {}: ", inputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath {}: ", outputPath);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
removeOutputDir(spark, outputPath);
collectAndSave(spark, inputPath, outputPath);
});
}
private static void collectAndSave(SparkSession spark, String inputPath, String outputPath) {
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
sc
.sequenceFile(inputPath + "/publication", Text.class, Text.class)
.union(sc.sequenceFile(inputPath + "/dataset", Text.class, Text.class))
.union(sc.sequenceFile(inputPath + "/otherresearchproduct", Text.class, Text.class))
.union(sc.sequenceFile(inputPath + "/software", Text.class, Text.class))
.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
;
}
private static void removeOutputDir(SparkSession spark, String path) {
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
}
}

@ -0,0 +1,26 @@
package eu.dnetlib.dhp.actionmanager.bipfinder;
import java.io.Serializable;
public class KeyValue implements Serializable {
private String key;
private String value;
public String getKey() {
return key;
}
public void setKey(String key) {
this.key = key;
}
public String getValue() {
return value;
}
public void setValue(String value) {
this.value = value;
}
}

@ -0,0 +1,28 @@
package eu.dnetlib.dhp.actionmanager.bipfinder;
import java.io.Serializable;
/**
* Subset of the information of the generic results that are needed to create the atomic action
*/
public class PreparedResult implements Serializable {
private String id; // openaire id
private String value; // doi
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public String getValue() {
return value;
}
public void setValue(String value) {
this.value = value;
}
}

@ -0,0 +1,30 @@
package eu.dnetlib.dhp.actionmanager.bipfinder;
import java.io.Serializable;
import java.util.List;
/**
* represents the score in the input file
*/
public class Score implements Serializable {
private String id;
private List<KeyValue> unit;
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public List<KeyValue> getUnit() {
return unit;
}
public void setUnit(List<KeyValue> unit) {
this.unit = unit;
}
}

@ -0,0 +1,200 @@
package eu.dnetlib.dhp.actionmanager.bipfinder;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import scala.Tuple2;
/**
* created the Atomic Action for each tipe of results
*/
public class SparkAtomicActionScoreJob implements Serializable {
private static String DOI = "doi";
private static final Logger log = LoggerFactory.getLogger(SparkAtomicActionScoreJob.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static <I extends Result> void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
SparkAtomicActionScoreJob.class
.getResourceAsStream(
"/eu/dnetlib/dhp/actionmanager/bipfinder/input_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("inputPath");
log.info("inputPath {}: ", inputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath {}: ", outputPath);
final String bipScorePath = parser.get("bipScorePath");
log.info("bipScorePath: {}", bipScorePath);
final String resultClassName = parser.get("resultTableName");
log.info("resultTableName: {}", resultClassName);
Class<I> inputClazz = (Class<I>) Class.forName(resultClassName);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
removeOutputDir(spark, outputPath);
prepareResults(spark, inputPath, outputPath, bipScorePath, inputClazz);
});
}
private static <I extends Result> void prepareResults(SparkSession spark, String inputPath, String outputPath,
String bipScorePath, Class<I> inputClazz) {
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
JavaRDD<BipDeserialize> bipDeserializeJavaRDD = sc
.textFile(bipScorePath)
.map(item -> OBJECT_MAPPER.readValue(item, BipDeserialize.class));
Dataset<BipScore> bipScores = spark
.createDataset(bipDeserializeJavaRDD.flatMap(entry -> entry.keySet().stream().map(key -> {
BipScore bs = new BipScore();
bs.setId(key);
bs.setScoreList(entry.get(key));
return bs;
}).collect(Collectors.toList()).iterator()).rdd(), Encoders.bean(BipScore.class));
System.out.println(bipScores.count());
Dataset<I> results = readPath(spark, inputPath, inputClazz);
results.createOrReplaceTempView("result");
Dataset<PreparedResult> preparedResult = spark
.sql(
"select pIde.value value, id " +
"from result " +
"lateral view explode (pid) p as pIde " +
"where dataInfo.deletedbyinference = false and pIde.qualifier.classid = '" + DOI + "'")
.as(Encoders.bean(PreparedResult.class));
bipScores
.joinWith(
preparedResult, bipScores.col("id").equalTo(preparedResult.col("value")),
"inner")
.map((MapFunction<Tuple2<BipScore, PreparedResult>, BipScore>) value -> {
BipScore ret = value._1();
ret.setId(value._2().getId());
return ret;
}, Encoders.bean(BipScore.class))
.groupByKey((MapFunction<BipScore, String>) value -> value.getId(), Encoders.STRING())
.mapGroups((MapGroupsFunction<String, BipScore, Result>) (k, it) -> {
Result ret = new Result();
ret.setDataInfo(getDataInfo());
BipScore first = it.next();
ret.setId(first.getId());
ret.setMeasures(getMeasure(first));
it.forEachRemaining(value -> ret.getMeasures().addAll(getMeasure(value)));
return ret;
}, Encoders.bean(Result.class))
.toJavaRDD()
.map(p -> new AtomicAction(inputClazz, p))
.mapToPair(
aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
new Text(OBJECT_MAPPER.writeValueAsString(aa))))
.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
}
private static List<Measure> getMeasure(BipScore value) {
return value
.getScoreList()
.stream()
.map(score -> {
Measure m = new Measure();
m.setId(score.getId());
m
.setUnit(
score
.getUnit()
.stream()
.map(unit -> {
KeyValue kv = new KeyValue();
kv.setValue(unit.getValue());
kv.setKey(unit.getKey());
kv.setDataInfo(getDataInfo());
return kv;
})
.collect(Collectors.toList()));
return m;
})
.collect(Collectors.toList());
}
private static DataInfo getDataInfo() {
DataInfo di = new DataInfo();
di.setInferred(false);
di.setInvisible(false);
di.setDeletedbyinference(false);
di.setTrust("");
Qualifier qualifier = new Qualifier();
qualifier.setClassid("sysimport:actionset");
qualifier.setClassname("Harvested");
qualifier.setSchemename("dnet:provenanceActions");
qualifier.setSchemeid("dnet:provenanceActions");
di.setProvenanceaction(qualifier);
return di;
}
private static void removeOutputDir(SparkSession spark, String path) {
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
}
public static <R> Dataset<R> readPath(
SparkSession spark, String inputPath, Class<R> clazz) {
return spark
.read()
.textFile(inputPath)
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
}
}

@ -0,0 +1,20 @@
[
{
"paramName": "issm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "when true will stop SparkSession after job execution",
"paramRequired": false
},
{
"paramName": "ip",
"paramLongName": "inputPath",
"paramDescription": "the URL from where to get the programme file",
"paramRequired": true
},
{
"paramName": "o",
"paramLongName": "outputPath",
"paramDescription": "the path of the new ActionSet",
"paramRequired": true
}
]

@ -0,0 +1,32 @@
[
{
"paramName": "issm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "when true will stop SparkSession after job execution",
"paramRequired": false
},
{
"paramName": "ip",
"paramLongName": "inputPath",
"paramDescription": "the URL from where to get the programme file",
"paramRequired": true
},
{
"paramName": "o",
"paramLongName": "outputPath",
"paramDescription": "the path of the new ActionSet",
"paramRequired": true
},
{
"paramName": "rtn",
"paramLongName": "resultTableName",
"paramDescription": "the path of the new ActionSet",
"paramRequired": true
},
{
"paramName": "bsp",
"paramLongName": "bipScorePath",
"paramDescription": "the path of the new ActionSet",
"paramRequired": true
}
]

@ -0,0 +1,171 @@
<workflow-app name="BipFinderScore" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>inputPath</name>
<description>the input path of the resources to be extended</description>
</property>
<property>
<name>bipScorePath</name>
<description>the path where to find the bipFinder scores</description>
</property>
<property>
<name>outputPath</name>
<description>the path where to store the actionset</description>
</property>
</parameters>
<start to="deleteoutputpath"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="deleteoutputpath">
<fs>
<delete path="${outputPath}"/>
<mkdir path="${outputPath}"/>
<delete path="${workingDir}"/>
<mkdir path="${workingDir}"/>
</fs>
<ok to="atomicactions"/>
<error to="Kill"/>
</action>
<fork name="atomicactions">
<path start="atomicactions_publication"/>
<path start="atomicactions_dataset"/>
<path start="atomicactions_orp"/>
<path start="atomicactions_software"/>
</fork>
<action name="atomicactions_publication">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Produces the atomic action with the bip finder scores for publications</name>
<class>eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--inputPath</arg><arg>${inputPath}/publication</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--outputPath</arg><arg>${workingDir}/publication</arg>
<arg>--bipScorePath</arg><arg>${bipScorePath}</arg>
</spark>
<ok to="join_aa"/>
<error to="Kill"/>
</action>
<action name="atomicactions_dataset">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Produces the atomic action with the bip finder scores for datasets</name>
<class>eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--inputPath</arg><arg>${inputPath}/dataset</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--outputPath</arg><arg>${workingDir}/dataset</arg>
<arg>--bipScorePath</arg><arg>${bipScorePath}</arg>
</spark>
<ok to="join_aa"/>
<error to="Kill"/>
</action>
<action name="atomicactions_orp">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Produces the atomic action with the bip finder scores for orp</name>
<class>eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--inputPath</arg><arg>${inputPath}/otherresearchproduct</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--outputPath</arg><arg>${workingDir}/otherresearchproduct</arg>
<arg>--bipScorePath</arg><arg>${bipScorePath}</arg>
</spark>
<ok to="join_aa"/>
<error to="Kill"/>
</action>
<action name="atomicactions_software">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Produces the atomic action with the bip finder scores for software</name>
<class>eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--inputPath</arg><arg>${inputPath}/software</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--outputPath</arg><arg>${workingDir}/software</arg>
<arg>--bipScorePath</arg><arg>${bipScorePath}</arg>
</spark>
<ok to="join_aa"/>
<error to="Kill"/>
</action>
<join name="join_aa" to="collectandsave"/>
<action name="collectandsave">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>saves all the aa produced for the several types of results in the as output path</name>
<class>eu.dnetlib.dhp.actionmanager.bipfinder.CollectAndSave</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--inputPath</arg><arg>${workingDir}</arg>
<arg>--outputPath</arg><arg>${outputPath}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

@ -0,0 +1,323 @@
package eu.dnetlib.dhp.actionmanager.bipfinder;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.io.Text;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.oaf.Publication;
public class SparkAtomicActionScoreJobTest {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static SparkSession spark;
private static Path workingDir;
private static final Logger log = LoggerFactory
.getLogger(SparkAtomicActionScoreJobTest.class);
@BeforeAll
public static void beforeAll() throws IOException {
workingDir = Files
.createTempDirectory(SparkAtomicActionScoreJobTest.class.getSimpleName());
log.info("using work dir {}", workingDir);
SparkConf conf = new SparkConf();
conf.setAppName(SparkAtomicActionScoreJobTest.class.getSimpleName());
conf.setMaster("local[*]");
conf.set("spark.driver.host", "localhost");
conf.set("hive.metastore.local", "true");
conf.set("spark.ui.enabled", "false");
conf.set("spark.sql.warehouse.dir", workingDir.toString());
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
spark = SparkSession
.builder()
.appName(SparkAtomicActionScoreJobTest.class.getSimpleName())
.config(conf)
.getOrCreate();
}
@AfterAll
public static void afterAll() throws IOException {
FileUtils.deleteDirectory(workingDir.toFile());
spark.stop();
}
@Test
public void matchOne() throws Exception {
String bipScoresPath = getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/bipfinder/bip_scores.json")
.getPath();
String inputPath = getClass()
.getResource(
"/eu/dnetlib/dhp/actionmanager/bipfinder/publication.json")
.getPath();
SparkAtomicActionScoreJob
.main(
new String[] {
"-isSparkSessionManaged",
Boolean.FALSE.toString(),
"-inputPath",
inputPath,
"-bipScorePath",
bipScoresPath,
"-resultTableName",
"eu.dnetlib.dhp.schema.oaf.Publication",
"-outputPath",
workingDir.toString() + "/actionSet"
});
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
JavaRDD<Publication> tmp = sc
.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
.map(aa -> ((Publication) aa.getPayload()));
Assertions.assertTrue(tmp.count() == 1);
Dataset<Publication> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(Publication.class));
verificationDataset.createOrReplaceTempView("publication");
Dataset<Row> execVerification = spark
.sql(
"Select p.id oaid, mes.id, mUnit.value from publication p " +
"lateral view explode(measures) m as mes " +
"lateral view explode(mes.unit) u as mUnit ");
Assertions.assertEquals(2, execVerification.count());
Assertions
.assertEquals(
"50|355e65625b88::ffa5bad14f4adc0c9a15c00efbbccddb",
execVerification.select("oaid").collectAsList().get(0).getString(0));
Assertions
.assertEquals(
"1.47565045883e-08",
execVerification.filter("id = 'influence'").select("value").collectAsList().get(0).getString(0));
Assertions
.assertEquals(
"0.227515392",
execVerification.filter("id = 'popularity'").select("value").collectAsList().get(0).getString(0));
}
@Test
public void matchOneWithTwo() throws Exception {
String bipScoresPath = getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/bipfinder/bip_scores.json")
.getPath();
String inputPath = getClass()
.getResource(
"/eu/dnetlib/dhp/actionmanager/bipfinder/publication_2.json")
.getPath();
SparkAtomicActionScoreJob
.main(
new String[] {
"-isSparkSessionManaged",
Boolean.FALSE.toString(),
"-inputPath",
inputPath,
"-bipScorePath",
bipScoresPath,
"-resultTableName",
"eu.dnetlib.dhp.schema.oaf.Publication",
"-outputPath",
workingDir.toString() + "/actionSet"
});
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
JavaRDD<Publication> tmp = sc
.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
.map(aa -> ((Publication) aa.getPayload()));
Assertions.assertTrue(tmp.count() == 1);
Dataset<Publication> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(Publication.class));
verificationDataset.createOrReplaceTempView("publication");
Dataset<Row> execVerification = spark
.sql(
"Select p.id oaid, mes.id, mUnit.value from publication p " +
"lateral view explode(measures) m as mes " +
"lateral view explode(mes.unit) u as mUnit ");
Assertions.assertEquals(4, execVerification.count());
Assertions
.assertEquals(
"50|355e65625b88::ffa5bad14f4adc0c9a15c00efbbccddb",
execVerification.select("oaid").collectAsList().get(0).getString(0));
Assertions
.assertEquals(
2,
execVerification.filter("id = 'influence'").count());
Assertions
.assertEquals(
2,
execVerification.filter("id = 'popularity'").count());
List<Row> tmp_ds = execVerification.filter("id = 'influence'").select("value").collectAsList();
String tmp_influence = tmp_ds.get(0).getString(0);
Assertions
.assertTrue(
"1.47565045883e-08".equals(tmp_influence) ||
"1.98956540239e-08".equals(tmp_influence));
tmp_influence = tmp_ds.get(1).getString(0);
Assertions
.assertTrue(
"1.47565045883e-08".equals(tmp_influence) ||
"1.98956540239e-08".equals(tmp_influence));
Assertions.assertTrue(!tmp_ds.get(0).getString(0).equals(tmp_ds.get(1).getString(0)));
}
@Test
public void matchTwo() throws Exception {
String bipScoresPath = getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/bipfinder/bip_scores.json")
.getPath();
String inputPath = getClass()
.getResource(
"/eu/dnetlib/dhp/actionmanager/bipfinder/publication_3.json")
.getPath();
SparkAtomicActionScoreJob
.main(
new String[] {
"-isSparkSessionManaged",
Boolean.FALSE.toString(),
"-inputPath",
inputPath,
"-bipScorePath",
bipScoresPath,
"-resultTableName",
"eu.dnetlib.dhp.schema.oaf.Publication",
"-outputPath",
workingDir.toString() + "/actionSet"
});
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
JavaRDD<Publication> tmp = sc
.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
.map(aa -> ((Publication) aa.getPayload()));
Assertions.assertTrue(tmp.count() == 2);
Dataset<Publication> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(Publication.class));
verificationDataset.createOrReplaceTempView("publication");
Dataset<Row> execVerification = spark
.sql(
"Select p.id oaid, mes.id, mUnit.value from publication p " +
"lateral view explode(measures) m as mes " +
"lateral view explode(mes.unit) u as mUnit ");
Assertions.assertEquals(4, execVerification.count());
Assertions
.assertEquals(
2,
execVerification.filter("oaid = '50|355e65625b88::ffa5bad14f4adc0c9a15c00efbbccddb'").count());
Assertions
.assertEquals(
2,
execVerification.filter("oaid = '50|acm_________::faed5b7a1bd8f51118d13ed29cfaee09'").count());
Assertions
.assertEquals(
2,
execVerification.filter("id = 'influence'").count());
Assertions
.assertEquals(
2,
execVerification.filter("id = 'popularity'").count());
Assertions
.assertEquals(
"1.47565045883e-08",
execVerification
.filter(
"oaid = '50|355e65625b88::ffa5bad14f4adc0c9a15c00efbbccddb' " +
"and id = 'influence'")
.select("value")
.collectAsList()
.get(0)
.getString(0));
Assertions
.assertEquals(
"1.98956540239e-08",
execVerification
.filter(
"oaid = '50|acm_________::faed5b7a1bd8f51118d13ed29cfaee09' " +
"and id = 'influence'")
.select("value")
.collectAsList()
.get(0)
.getString(0));
Assertions
.assertEquals(
"0.282046161584",
execVerification
.filter(
"oaid = '50|acm_________::faed5b7a1bd8f51118d13ed29cfaee09' " +
"and id = 'popularity'")
.select("value")
.collectAsList()
.get(0)
.getString(0));
Assertions
.assertEquals(
"0.227515392",
execVerification
.filter(
"oaid = '50|355e65625b88::ffa5bad14f4adc0c9a15c00efbbccddb' " +
"and id = 'popularity'")
.select("value")
.collectAsList()
.get(0)
.getString(0));
}
}

@ -32,15 +32,15 @@ public class CheckDuplictedIdsJob {
IOUtils
.toString(
CheckDuplictedIdsJob.class
.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json")));
.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/check_duplicates.json")));
parser.parseArgument(args);
final SparkConf conf = new SparkConf();
final String eventsPath = parser.get("workingPath") + "/events";
final String eventsPath = parser.get("outputDir") + "/events";
log.info("eventsPath: {}", eventsPath);
final String countPath = parser.get("workingPath") + "/counts";
final String countPath = parser.get("outputDir") + "/counts";
log.info("countPath: {}", countPath);
final SparkSession spark = SparkSession.builder().config(conf).getOrCreate();
@ -59,6 +59,7 @@ public class CheckDuplictedIdsJob {
.map(o -> ClusterUtils.incrementAccumulator(o, total), Encoders.tuple(Encoders.STRING(), Encoders.LONG()))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(countPath);
;

@ -44,10 +44,10 @@ public class GenerateEventsJob {
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String workingPath = parser.get("workingPath");
log.info("workingPath: {}", workingPath);
final String workingDir = parser.get("workingDir");
log.info("workingDir: {}", workingDir);
final String eventsPath = workingPath + "/events";
final String eventsPath = parser.get("outputDir") + "/events";
log.info("eventsPath: {}", eventsPath);
final Set<String> dsIdWhitelist = ClusterUtils.parseParamAsList(parser, "datasourceIdWhitelist");
@ -59,6 +59,9 @@ public class GenerateEventsJob {
final Set<String> dsIdBlacklist = ClusterUtils.parseParamAsList(parser, "datasourceIdBlacklist");
log.info("datasourceIdBlacklist: {}", StringUtils.join(dsIdBlacklist, ","));
final Set<String> topicWhitelist = ClusterUtils.parseParamAsList(parser, "topicWhitelist");
log.info("topicWhitelist: {}", StringUtils.join(topicWhitelist, ","));
final SparkConf conf = new SparkConf();
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
@ -70,12 +73,12 @@ public class GenerateEventsJob {
final LongAccumulator total = spark.sparkContext().longAccumulator("total_events");
final Dataset<ResultGroup> groups = ClusterUtils
.readPath(spark, workingPath + "/duplicates", ResultGroup.class);
.readPath(spark, workingDir + "/duplicates", ResultGroup.class);
final Dataset<Event> dataset = groups
.map(
g -> EventFinder
.generateEvents(g, dsIdWhitelist, dsIdBlacklist, dsTypeWhitelist, accumulators),
.generateEvents(g, dsIdWhitelist, dsIdBlacklist, dsTypeWhitelist, topicWhitelist, accumulators),
Encoders
.bean(EventGroup.class))
.flatMap(g -> g.getData().iterator(), Encoders.bean(Event.class));

@ -46,7 +46,7 @@ public class GenerateStatsJob {
final SparkConf conf = new SparkConf();
final String eventsPath = parser.get("workingPath") + "/events";
final String eventsPath = parser.get("outputDir") + "/events";
log.info("eventsPath: {}", eventsPath);
final String dbUrl = parser.get("dbUrl");

@ -46,7 +46,7 @@ public class IndexEventSubsetJob {
final SparkConf conf = new SparkConf();
final String eventsPath = parser.get("workingPath") + "/events";
final String eventsPath = parser.get("outputDir") + "/events";
log.info("eventsPath: {}", eventsPath);
final String index = parser.get("index");
@ -55,6 +55,18 @@ public class IndexEventSubsetJob {
final String indexHost = parser.get("esHost");
log.info("indexHost: {}", indexHost);
final String esBatchWriteRetryCount = parser.get("esBatchWriteRetryCount");
log.info("esBatchWriteRetryCount: {}", esBatchWriteRetryCount);
final String esBatchWriteRetryWait = parser.get("esBatchWriteRetryWait");
log.info("esBatchWriteRetryWait: {}", esBatchWriteRetryWait);
final String esBatchSizeEntries = parser.get("esBatchSizeEntries");
log.info("esBatchSizeEntries: {}", esBatchSizeEntries);
final String esNodesWanOnly = parser.get("esNodesWanOnly");
log.info("esNodesWanOnly: {}", esNodesWanOnly);
final int maxEventsForTopic = NumberUtils.toInt(parser.get("maxEventsForTopic"));
log.info("maxEventsForTopic: {}", maxEventsForTopic);
@ -86,10 +98,10 @@ public class IndexEventSubsetJob {
esCfg.put("es.index.auto.create", "false");
esCfg.put("es.nodes", indexHost);
esCfg.put("es.mapping.id", "eventId"); // THE PRIMARY KEY
esCfg.put("es.batch.write.retry.count", "8");
esCfg.put("es.batch.write.retry.wait", "60s");
esCfg.put("es.batch.size.entries", "200");
esCfg.put("es.nodes.wan.only", "true");
esCfg.put("es.batch.write.retry.count", esBatchWriteRetryCount);
esCfg.put("es.batch.write.retry.wait", esBatchWriteRetryWait);
esCfg.put("es.batch.size.entries", esBatchSizeEntries);
esCfg.put("es.nodes.wan.only", esNodesWanOnly);
log.info("*** Start indexing");
JavaEsSpark.saveJsonToEs(inputRdd, index, esCfg);

@ -54,7 +54,7 @@ public class IndexNotificationsJob {
final SparkConf conf = new SparkConf();
final String eventsPath = parser.get("workingPath") + "/events";
final String eventsPath = parser.get("outputDir") + "/events";
log.info("eventsPath: {}", eventsPath);
final String index = parser.get("index");
@ -63,6 +63,18 @@ public class IndexNotificationsJob {
final String indexHost = parser.get("esHost");
log.info("indexHost: {}", indexHost);
final String esBatchWriteRetryCount = parser.get("esBatchWriteRetryCount");
log.info("esBatchWriteRetryCount: {}", esBatchWriteRetryCount);
final String esBatchWriteRetryWait = parser.get("esBatchWriteRetryWait");
log.info("esBatchWriteRetryWait: {}", esBatchWriteRetryWait);
final String esBatchSizeEntries = parser.get("esBatchSizeEntries");
log.info("esBatchSizeEntries: {}", esBatchSizeEntries);
final String esNodesWanOnly = parser.get("esNodesWanOnly");
log.info("esNodesWanOnly: {}", esNodesWanOnly);
final String brokerApiBaseUrl = parser.get("brokerApiBaseUrl");
log.info("brokerApiBaseUrl: {}", brokerApiBaseUrl);
@ -92,10 +104,10 @@ public class IndexNotificationsJob {
esCfg.put("es.index.auto.create", "false");
esCfg.put("es.nodes", indexHost);
esCfg.put("es.mapping.id", "notificationId"); // THE PRIMARY KEY
esCfg.put("es.batch.write.retry.count", "8");
esCfg.put("es.batch.write.retry.wait", "60s");
esCfg.put("es.batch.size.entries", "200");
esCfg.put("es.nodes.wan.only", "true");
esCfg.put("es.batch.write.retry.count", esBatchWriteRetryCount);
esCfg.put("es.batch.write.retry.wait", esBatchWriteRetryWait);
esCfg.put("es.batch.size.entries", esBatchSizeEntries);
esCfg.put("es.nodes.wan.only", esNodesWanOnly);
log.info("*** Start indexing");
JavaEsSpark.saveJsonToEs(inputRdd, index, esCfg);

@ -36,7 +36,7 @@ public class IndexOnESJob {
final SparkConf conf = new SparkConf();
final String eventsPath = parser.get("workingPath") + "/events";
final String eventsPath = parser.get("outputDir") + "/events";
log.info("eventsPath: {}", eventsPath);
final String index = parser.get("index");
@ -45,6 +45,18 @@ public class IndexOnESJob {
final String indexHost = parser.get("esHost");
log.info("indexHost: {}", indexHost);
final String esBatchWriteRetryCount = parser.get("esBatchWriteRetryCount");
log.info("esBatchWriteRetryCount: {}", esBatchWriteRetryCount);
final String esBatchWriteRetryWait = parser.get("esBatchWriteRetryWait");
log.info("esBatchWriteRetryWait: {}", esBatchWriteRetryWait);
final String esBatchSizeEntries = parser.get("esBatchSizeEntries");
log.info("esBatchSizeEntries: {}", esBatchSizeEntries);
final String esNodesWanOnly = parser.get("esNodesWanOnly");
log.info("esNodesWanOnly: {}", esNodesWanOnly);
final SparkSession spark = SparkSession.builder().config(conf).getOrCreate();
final JavaRDD<String> inputRdd = ClusterUtils
@ -53,15 +65,13 @@ public class IndexOnESJob {
.javaRDD();
final Map<String, String> esCfg = new HashMap<>();
// esCfg.put("es.nodes", "10.19.65.51, 10.19.65.52, 10.19.65.53, 10.19.65.54");
esCfg.put("es.index.auto.create", "false");
esCfg.put("es.nodes", indexHost);
esCfg.put("es.mapping.id", "eventId"); // THE PRIMARY KEY
esCfg.put("es.batch.write.retry.count", "8");
esCfg.put("es.batch.write.retry.wait", "60s");
esCfg.put("es.batch.size.entries", "200");
esCfg.put("es.nodes.wan.only", "true");
esCfg.put("es.batch.write.retry.count", esBatchWriteRetryCount);
esCfg.put("es.batch.write.retry.wait", esBatchWriteRetryWait);
esCfg.put("es.batch.size.entries", esBatchSizeEntries);
esCfg.put("es.nodes.wan.only", esNodesWanOnly);
JavaEsSpark.saveJsonToEs(inputRdd, index, esCfg);
}

@ -42,10 +42,10 @@ public class JoinStep0Job {
final String graphPath = parser.get("graphPath");
log.info("graphPath: {}", graphPath);
final String workingPath = parser.get("workingPath");
log.info("workingPath: {}", workingPath);
final String workingDir = parser.get("workingDir");
log.info("workingDir: {}", workingDir);
final String joinedEntitiesPath = workingPath + "/joinedEntities_step0";
final String joinedEntitiesPath = workingDir + "/joinedEntities_step0";
log.info("joinedEntitiesPath: {}", joinedEntitiesPath);
final SparkConf conf = new SparkConf();
@ -57,10 +57,10 @@ public class JoinStep0Job {
final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities");
final Dataset<OaBrokerMainEntity> sources = ClusterUtils
.readPath(spark, workingPath + "/simpleEntities", OaBrokerMainEntity.class);
.readPath(spark, workingDir + "/simpleEntities", OaBrokerMainEntity.class);
final Dataset<RelatedDatasource> typedRels = ClusterUtils
.readPath(spark, workingPath + "/relatedDatasources", RelatedDatasource.class);
.readPath(spark, workingDir + "/relatedDatasources", RelatedDatasource.class);
final TypedColumn<Tuple2<OaBrokerMainEntity, RelatedDatasource>, OaBrokerMainEntity> aggr = new RelatedDatasourceAggregator()
.toColumn();

@ -40,10 +40,10 @@ public class JoinStep1Job {
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String workingPath = parser.get("workingPath");
log.info("workingPath: {}", workingPath);
final String workingDir = parser.get("workingDir");
log.info("workingDir: {}", workingDir);
final String joinedEntitiesPath = workingPath + "/joinedEntities_step1";
final String joinedEntitiesPath = workingDir + "/joinedEntities_step1";
log.info("joinedEntitiesPath: {}", joinedEntitiesPath);
final SparkConf conf = new SparkConf();
@ -55,10 +55,10 @@ public class JoinStep1Job {
final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities");
final Dataset<OaBrokerMainEntity> sources = ClusterUtils
.readPath(spark, workingPath + "/joinedEntities_step0", OaBrokerMainEntity.class);
.readPath(spark, workingDir + "/joinedEntities_step0", OaBrokerMainEntity.class);
final Dataset<RelatedProject> typedRels = ClusterUtils
.readPath(spark, workingPath + "/relatedProjects", RelatedProject.class);
.readPath(spark, workingDir + "/relatedProjects", RelatedProject.class);
final TypedColumn<Tuple2<OaBrokerMainEntity, RelatedProject>, OaBrokerMainEntity> aggr = new RelatedProjectAggregator()
.toColumn();

@ -39,10 +39,10 @@ public class JoinStep2Job {
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String workingPath = parser.get("workingPath");
log.info("workingPath: {}", workingPath);
final String workingDir = parser.get("workingDir");
log.info("workingDir: {}", workingDir);
final String joinedEntitiesPath = workingPath + "/joinedEntities_step2";
final String joinedEntitiesPath = workingDir + "/joinedEntities_step2";
log.info("joinedEntitiesPath: {}", joinedEntitiesPath);
final SparkConf conf = new SparkConf();
@ -54,10 +54,10 @@ public class JoinStep2Job {
final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities");
final Dataset<OaBrokerMainEntity> sources = ClusterUtils
.readPath(spark, workingPath + "/joinedEntities_step1", OaBrokerMainEntity.class);
.readPath(spark, workingDir + "/joinedEntities_step1", OaBrokerMainEntity.class);
final Dataset<RelatedSoftware> typedRels = ClusterUtils
.readPath(spark, workingPath + "/relatedSoftwares", RelatedSoftware.class);
.readPath(spark, workingDir + "/relatedSoftwares", RelatedSoftware.class);
final TypedColumn<Tuple2<OaBrokerMainEntity, RelatedSoftware>, OaBrokerMainEntity> aggr = new RelatedSoftwareAggregator()
.toColumn();

@ -40,10 +40,10 @@ public class JoinStep3Job {
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String workingPath = parser.get("workingPath");
log.info("workingPath: {}", workingPath);
final String workingDir = parser.get("workingDir");
log.info("workingDir: {}", workingDir);
final String joinedEntitiesPath = workingPath + "/joinedEntities_step3";
final String joinedEntitiesPath = workingDir + "/joinedEntities_step3";
log.info("joinedEntitiesPath: {}", joinedEntitiesPath);
final SparkConf conf = new SparkConf();
@ -55,10 +55,10 @@ public class JoinStep3Job {
final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities");
final Dataset<OaBrokerMainEntity> sources = ClusterUtils
.readPath(spark, workingPath + "/joinedEntities_step2", OaBrokerMainEntity.class);
.readPath(spark, workingDir + "/joinedEntities_step2", OaBrokerMainEntity.class);
final Dataset<RelatedDataset> typedRels = ClusterUtils
.readPath(spark, workingPath + "/relatedDatasets", RelatedDataset.class);
.readPath(spark, workingDir + "/relatedDatasets", RelatedDataset.class);
final TypedColumn<Tuple2<OaBrokerMainEntity, RelatedDataset>, OaBrokerMainEntity> aggr = new RelatedDatasetAggregator()
.toColumn();

@ -40,10 +40,10 @@ public class JoinStep4Job {
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String workingPath = parser.get("workingPath");
log.info("workingPath: {}", workingPath);
final String workingDir = parser.get("workingDir");
log.info("workingDir: {}", workingDir);
final String joinedEntitiesPath = workingPath + "/joinedEntities_step4";
final String joinedEntitiesPath = workingDir + "/joinedEntities_step4";
log.info("joinedEntitiesPath: {}", joinedEntitiesPath);
final SparkConf conf = new SparkConf();
@ -55,10 +55,10 @@ public class JoinStep4Job {
final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities");
final Dataset<OaBrokerMainEntity> sources = ClusterUtils
.readPath(spark, workingPath + "/joinedEntities_step3", OaBrokerMainEntity.class);
.readPath(spark, workingDir + "/joinedEntities_step3", OaBrokerMainEntity.class);
final Dataset<RelatedPublication> typedRels = ClusterUtils
.readPath(spark, workingPath + "/relatedPublications", RelatedPublication.class);
.readPath(spark, workingDir + "/relatedPublications", RelatedPublication.class);
final TypedColumn<Tuple2<OaBrokerMainEntity, RelatedPublication>, OaBrokerMainEntity> aggr = new RelatedPublicationAggregator()
.toColumn();

@ -4,8 +4,13 @@ package eu.dnetlib.dhp.broker.oa;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
@ -13,6 +18,8 @@ import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.slf4j.Logger;
@ -37,7 +44,7 @@ public class PartitionEventsByDsIdJob {
IOUtils
.toString(
PartitionEventsByDsIdJob.class
.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json")));
.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/od_partitions_params.json")));
parser.parseArgument(args);
final Boolean isSparkSessionManaged = Optional
@ -48,24 +55,43 @@ public class PartitionEventsByDsIdJob {
final SparkConf conf = new SparkConf();
final String eventsPath = parser.get("workingPath") + "/events";
final String eventsPath = parser.get("outputDir") + "/events";
log.info("eventsPath: {}", eventsPath);
final String partitionPath = parser.get("workingPath") + "/eventsByOpendoarId";
final String partitionPath = parser.get("outputDir") + "/eventsByOpendoarId";
log.info("partitionPath: {}", partitionPath);
final String opendoarIds = parser.get("opendoarIds");
log.info("opendoarIds: {}", opendoarIds);
final Set<String> validOpendoarIds = new HashSet<>();
if (!opendoarIds.trim().equals("-")) {
validOpendoarIds
.addAll(
Arrays
.stream(opendoarIds.split(","))
.map(String::trim)
.filter(StringUtils::isNotBlank)
.map(s -> OPENDOAR_NSPREFIX + DigestUtils.md5Hex(s))
.collect(Collectors.toSet()));
}
log.info("validOpendoarIds: {}", validOpendoarIds);
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
ClusterUtils
.readPath(spark, eventsPath, Event.class)
.filter(e -> StringUtils.isNotBlank(e.getMap().getTargetDatasourceId()))
.filter(e -> e.getMap().getTargetDatasourceId().contains(OPENDOAR_NSPREFIX))
.limit(10000)
.map(e -> messageFromNotification(e), Encoders.bean(ShortEventMessageWithGroupId.class))
.filter((FilterFunction<Event>) e -> StringUtils.isNotBlank(e.getMap().getTargetDatasourceId()))
.filter((FilterFunction<Event>) e -> e.getMap().getTargetDatasourceId().startsWith(OPENDOAR_NSPREFIX))
.filter((FilterFunction<Event>) e -> validOpendoarIds.contains(e.getMap().getTargetDatasourceId()))
.map(
(MapFunction<Event, ShortEventMessageWithGroupId>) e -> messageFromNotification(e),
Encoders.bean(ShortEventMessageWithGroupId.class))
.coalesce(1)
.write()
.partitionBy("group")
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(partitionPath);
});
@ -97,6 +123,7 @@ public class PartitionEventsByDsIdJob {
final ShortEventMessageWithGroupId res = new ShortEventMessageWithGroupId();
res.setEventId(e.getEventId());
res.setOriginalId(payload.getResult().getOriginalId());
res.setTitle(payload.getResult().getTitles().stream().filter(StringUtils::isNotBlank).findFirst().orElse(null));
res.setTopic(e.getTopic());

@ -45,10 +45,10 @@ public class PrepareGroupsJob {
final String graphPath = parser.get("graphPath");
log.info("graphPath: {}", graphPath);
final String workingPath = parser.get("workingPath");
log.info("workingPath: {}", workingPath);
final String workingDir = parser.get("workingDir");
log.info("workingDir: {}", workingDir);
final String groupsPath = workingPath + "/duplicates";
final String groupsPath = workingDir + "/duplicates";
log.info("groupsPath: {}", groupsPath);
final SparkConf conf = new SparkConf();
@ -60,10 +60,10 @@ public class PrepareGroupsJob {
final LongAccumulator total = spark.sparkContext().longAccumulator("total_groups");
final Dataset<OaBrokerMainEntity> results = ClusterUtils
.readPath(spark, workingPath + "/joinedEntities_step4", OaBrokerMainEntity.class);
.readPath(spark, workingDir + "/joinedEntities_step4", OaBrokerMainEntity.class);
final Dataset<Relation> mergedRels = ClusterUtils
.readPath(spark, graphPath + "/relation", Relation.class)
.loadRelations(graphPath, spark)
.filter(r -> r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS));
final TypedColumn<Tuple2<OaBrokerMainEntity, Relation>, ResultGroup> aggr = new ResultAggregator()

@ -42,10 +42,10 @@ public class PrepareRelatedDatasetsJob {
final String graphPath = parser.get("graphPath");
log.info("graphPath: {}", graphPath);
final String workingPath = parser.get("workingPath");
log.info("workingPath: {}", workingPath);
final String workingDir = parser.get("workingDir");
log.info("workingDir: {}", workingDir);
final String relsPath = workingPath + "/relatedDatasets";
final String relsPath = workingDir + "/relatedDatasets";
log.info("relsPath: {}", relsPath);
final SparkConf conf = new SparkConf();
@ -62,7 +62,7 @@ public class PrepareRelatedDatasetsJob {
.map(ConversionUtils::oafDatasetToBrokerDataset, Encoders.bean(OaBrokerRelatedDataset.class));
final Dataset<Relation> rels = ClusterUtils
.readPath(spark, graphPath + "/relation", Relation.class)
.loadRelations(graphPath, spark)
.filter(r -> r.getDataInfo().getDeletedbyinference())
.filter(r -> r.getRelType().equals(ModelConstants.RESULT_RESULT))
.filter(r -> ClusterUtils.isValidResultResultClass(r.getRelClass()))
@ -72,7 +72,8 @@ public class PrepareRelatedDatasetsJob {
final Dataset<RelatedDataset> dataset = rels
.joinWith(datasets, datasets.col("openaireId").equalTo(rels.col("target")), "inner")
.map(t -> {
final RelatedDataset rel = new RelatedDataset(t._1.getSource(), t._2);
final RelatedDataset rel = new RelatedDataset(t._1.getSource(),
t._2);
rel.getRelDataset().setRelType(t._1.getRelClass());
return rel;
}, Encoders.bean(RelatedDataset.class));

@ -48,10 +48,10 @@ public class PrepareRelatedDatasourcesJob {
final String graphPath = parser.get("graphPath");
log.info("graphPath: {}", graphPath);
final String workingPath = parser.get("workingPath");
log.info("workingPath: {}", workingPath);
final String workingDir = parser.get("workingDir");
log.info("workingDir: {}", workingDir);
final String relsPath = workingPath + "/relatedDatasources";
final String relsPath = workingDir + "/relatedDatasources";
log.info("relsPath: {}", relsPath);
final SparkConf conf = new SparkConf();

@ -44,10 +44,10 @@ public class PrepareRelatedProjectsJob {
final String graphPath = parser.get("graphPath");
log.info("graphPath: {}", graphPath);
final String workingPath = parser.get("workingPath");
log.info("workingPath: {}", workingPath);
final String workingDir = parser.get("workingDir");
log.info("workingDir: {}", workingDir);
final String relsPath = workingPath + "/relatedProjects";
final String relsPath = workingDir + "/relatedProjects";
log.info("relsPath: {}", relsPath);
final SparkConf conf = new SparkConf();
@ -64,7 +64,7 @@ public class PrepareRelatedProjectsJob {
.map(ConversionUtils::oafProjectToBrokerProject, Encoders.bean(OaBrokerProject.class));
final Dataset<Relation> rels = ClusterUtils
.readPath(spark, graphPath + "/relation", Relation.class)
.loadRelations(graphPath, spark)
.filter(r -> r.getDataInfo().getDeletedbyinference())
.filter(r -> r.getRelType().equals(ModelConstants.RESULT_PROJECT))
.filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS))

@ -43,10 +43,10 @@ public class PrepareRelatedPublicationsJob {
final String graphPath = parser.get("graphPath");
log.info("graphPath: {}", graphPath);
final String workingPath = parser.get("workingPath");
log.info("workingPath: {}", workingPath);
final String workingDir = parser.get("workingDir");
log.info("workingDir: {}", workingDir);
final String relsPath = workingPath + "/relatedPublications";
final String relsPath = workingDir + "/relatedPublications";
log.info("relsPath: {}", relsPath);
final SparkConf conf = new SparkConf();
@ -65,7 +65,7 @@ public class PrepareRelatedPublicationsJob {
Encoders.bean(OaBrokerRelatedPublication.class));
final Dataset<Relation> rels = ClusterUtils
.readPath(spark, graphPath + "/relation", Relation.class)
.loadRelations(graphPath, spark)
.filter(r -> r.getDataInfo().getDeletedbyinference())
.filter(r -> r.getRelType().equals(ModelConstants.RESULT_RESULT))
.filter(r -> ClusterUtils.isValidResultResultClass(r.getRelClass()))
@ -75,7 +75,8 @@ public class PrepareRelatedPublicationsJob {
final Dataset<RelatedPublication> dataset = rels
.joinWith(pubs, pubs.col("openaireId").equalTo(rels.col("target")), "inner")
.map(t -> {
final RelatedPublication rel = new RelatedPublication(t._1.getSource(), t._2);
final RelatedPublication rel = new RelatedPublication(
t._1.getSource(), t._2);
rel.getRelPublication().setRelType(t._1.getRelClass());
return rel;
}, Encoders.bean(RelatedPublication.class));

@ -44,10 +44,10 @@ public class PrepareRelatedSoftwaresJob {
final String graphPath = parser.get("graphPath");
log.info("graphPath: {}", graphPath);
final String workingPath = parser.get("workingPath");
log.info("workingPath: {}", workingPath);
final String workingDir = parser.get("workingDir");
log.info("workingDir: {}", workingDir);
final String relsPath = workingPath + "/relatedSoftwares";
final String relsPath = workingDir + "/relatedSoftwares";
log.info("relsPath: {}", relsPath);
final SparkConf conf = new SparkConf();
@ -64,7 +64,7 @@ public class PrepareRelatedSoftwaresJob {
.map(ConversionUtils::oafSoftwareToBrokerSoftware, Encoders.bean(OaBrokerRelatedSoftware.class));
final Dataset<Relation> rels = ClusterUtils
.readPath(spark, graphPath + "/relation", Relation.class)
.loadRelations(graphPath, spark)
.filter(r -> r.getDataInfo().getDeletedbyinference())
.filter(r -> r.getRelType().equals(ModelConstants.RESULT_RESULT))
.filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS))

@ -44,10 +44,10 @@ public class PrepareSimpleEntititiesJob {
final String graphPath = parser.get("graphPath");
log.info("graphPath: {}", graphPath);
final String workingPath = parser.get("workingPath");
log.info("workingPath: {}", workingPath);
final String workingDir = parser.get("workingDir");
log.info("workingDir: {}", workingDir);
final String simpleEntitiesPath = workingPath + "/simpleEntities";
final String simpleEntitiesPath = workingDir + "/simpleEntities";
log.info("simpleEntitiesPath: {}", simpleEntitiesPath);
final SparkConf conf = new SparkConf();

@ -16,7 +16,24 @@ public class EnrichMissingSubject extends UpdateMatcher<OaBrokerTypedValue> {
public EnrichMissingSubject() {
super(20,
s -> Topic.fromPath("ENRICH/MISSING/SUBJECT/" + s.getType()),
s -> {
switch (s.getType().toLowerCase()) {
case "acm":
return Topic.ENRICH_MISSING_SUBJECT_ACM;
case "arxiv":
return Topic.ENRICH_MISSING_SUBJECT_ARXIV;
case "ddc":
return Topic.ENRICH_MISSING_SUBJECT_DDC;
case "jel":
return Topic.ENRICH_MISSING_SUBJECT_JEL;
case "mesh":
return Topic.ENRICH_MISSING_SUBJECT_MESHEUROPMC;
case "rvk":
return Topic.ENRICH_MISSING_SUBJECT_RVK;
default:
return null;
}
},
(p, s) -> p.getSubjects().add(s),
s -> subjectAsString(s));
}

@ -16,7 +16,24 @@ public class EnrichMoreSubject extends UpdateMatcher<OaBrokerTypedValue> {
public EnrichMoreSubject() {
super(20,
s -> Topic.fromPath("ENRICH/MORE/SUBJECT/" + s.getType()),
s -> {
switch (s.getType().toLowerCase()) {
case "acm":
return Topic.ENRICH_MORE_SUBJECT_ACM;
case "arxiv":
return Topic.ENRICH_MORE_SUBJECT_ARXIV;
case "ddc":
return Topic.ENRICH_MORE_SUBJECT_DDC;
case "jel":
return Topic.ENRICH_MORE_SUBJECT_JEL;
case "mesh":
return Topic.ENRICH_MORE_SUBJECT_MESHEUROPMC;
case "rvk":
return Topic.ENRICH_MORE_SUBJECT_RVK;
default:
return null;
}
},
(p, s) -> p.getSubjects().add(s),
s -> subjectAsString(s));
}

@ -17,6 +17,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.schema.oaf.Relation;
public class ClusterUtils {
@ -30,6 +31,16 @@ public class ClusterUtils {
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
}
public static Dataset<Relation> loadRelations(final String graphPath, final SparkSession spark) {
return ClusterUtils
.readPath(spark, graphPath + "/relation", Relation.class)
.map(r -> {
r.setSource(ConversionUtils.cleanOpenaireId(r.getSource()));
r.setTarget(ConversionUtils.cleanOpenaireId(r.getTarget()));
return r;
}, Encoders.bean(Relation.class));
}
public static <R> Dataset<R> readPath(
final SparkSession spark,
final String inputPath,
@ -67,6 +78,7 @@ public class ClusterUtils {
.map(o -> ClusterUtils.incrementAccumulator(o, acc), Encoders.bean(clazz))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(path);
}

@ -74,7 +74,7 @@ public class ConversionUtils {
}
final OaBrokerRelatedDataset res = new OaBrokerRelatedDataset();
res.setOpenaireId(d.getId());
res.setOpenaireId(cleanOpenaireId(d.getId()));
res.setOriginalId(first(d.getOriginalId()));
res.setTitle(structPropValue(d.getTitle()));
res.setPids(mappedList(d.getPid(), ConversionUtils::oafPidToBrokerPid));
@ -89,7 +89,7 @@ public class ConversionUtils {
}
final OaBrokerRelatedPublication res = new OaBrokerRelatedPublication();
res.setOpenaireId(p.getId());
res.setOpenaireId(cleanOpenaireId(p.getId()));
res.setOriginalId(first(p.getOriginalId()));
res.setTitle(structPropValue(p.getTitle()));
res.setPids(mappedList(p.getPid(), ConversionUtils::oafPidToBrokerPid));
@ -106,7 +106,7 @@ public class ConversionUtils {
final OaBrokerMainEntity res = new OaBrokerMainEntity();
res.setOpenaireId(result.getId());
res.setOpenaireId(cleanOpenaireId(result.getId()));
res.setOriginalId(first(result.getOriginalId()));
res.setTypology(classId(result.getResulttype()));
res.setTitles(structPropList(result.getTitle()));
@ -129,6 +129,10 @@ public class ConversionUtils {
return res;
}
public static String cleanOpenaireId(final String id) {
return id.contains("|") ? StringUtils.substringAfter(id, "|") : id;
}
private static OaBrokerAuthor oafAuthorToBrokerAuthor(final Author author) {
if (author == null) {
return null;
@ -188,7 +192,7 @@ public class ConversionUtils {
}
final OaBrokerProject res = new OaBrokerProject();
res.setOpenaireId(p.getId());
res.setOpenaireId(cleanOpenaireId(p.getId()));
res.setTitle(fieldValue(p.getTitle()));
res.setAcronym(fieldValue(p.getAcronym()));
res.setCode(fieldValue(p.getCode()));
@ -214,7 +218,7 @@ public class ConversionUtils {
}
final OaBrokerRelatedSoftware res = new OaBrokerRelatedSoftware();
res.setOpenaireId(sw.getId());
res.setOpenaireId(cleanOpenaireId(sw.getId()));
res.setName(structPropValue(sw.getTitle()));
res.setDescription(fieldValue(sw.getDescription()));
res.setRepository(fieldValue(sw.getCodeRepositoryUrl()));
@ -230,7 +234,7 @@ public class ConversionUtils {
final OaBrokerRelatedDatasource res = new OaBrokerRelatedDatasource();
res.setName(StringUtils.defaultIfBlank(fieldValue(ds.getOfficialname()), fieldValue(ds.getEnglishname())));
res.setOpenaireId(ds.getId());
res.setOpenaireId(cleanOpenaireId(ds.getId()));
res.setType(classId(ds.getDatasourcetype()));
return res;
}

@ -59,9 +59,18 @@ public class DatasourceRelationsAccumulator implements Serializable {
final DatasourceRelationsAccumulator res = new DatasourceRelationsAccumulator();
collectedFromSet
.stream()
.map(s -> new Tuple3<>(r.getId(), s, BrokerConstants.COLLECTED_FROM_REL))
.map(
s -> new Tuple3<>(ConversionUtils.cleanOpenaireId(r.getId()), ConversionUtils.cleanOpenaireId(s),
BrokerConstants.COLLECTED_FROM_REL))
.forEach(res::addTuple);
hostedBySet.stream().map(s -> new Tuple3<>(r.getId(), s, BrokerConstants.HOSTED_BY_REL)).forEach(res::addTuple);
hostedBySet
.stream()
.map(
s -> new Tuple3<>(ConversionUtils.cleanOpenaireId(r.getId()), ConversionUtils.cleanOpenaireId(s),
BrokerConstants.HOSTED_BY_REL))
.forEach(res::addTuple);
return res;
}

@ -76,6 +76,7 @@ public class EventFinder {
final Set<String> dsIdWhitelist,
final Set<String> dsIdBlacklist,
final Set<String> dsTypeWhitelist,
final Set<String> topicWhitelist,
final Map<String, LongAccumulator> accumulators) {
final List<UpdateInfo<?>> list = new ArrayList<>();
@ -84,7 +85,13 @@ public class EventFinder {
for (final OaBrokerRelatedDatasource targetDs : target.getDatasources()) {
if (verifyTarget(targetDs, dsIdWhitelist, dsIdBlacklist, dsTypeWhitelist)) {
for (final UpdateMatcher<?> matcher : matchers) {
list.addAll(matcher.searchUpdatesForRecord(target, targetDs, results.getData(), accumulators));
for (final UpdateInfo<?> info : matcher
.searchUpdatesForRecord(target, targetDs, results.getData(), accumulators)) {
if (topicWhitelist == null || topicWhitelist.isEmpty()
|| topicWhitelist.contains(info.getTopic().getPath())) {
list.add(info);
}
}
}
}
}

@ -0,0 +1,9 @@
[
{
"paramName": "o",
"paramLongName": "outputDir",
"paramDescription": "the path where the data are stored",
"paramRequired": true
}
]

@ -7,7 +7,7 @@
},
{
"paramName": "o",
"paramLongName": "workingPath",
"paramLongName": "workingDir",
"paramDescription": "the path where the temporary data will be stored",
"paramRequired": true
}

@ -6,7 +6,7 @@
<description>the path where the graph is stored</description>
</property>
<property>
<name>workingPath</name>
<name>outputDir</name>
<description>the path where the the generated data will be stored</description>
</property>
<property>
@ -24,6 +24,11 @@
<value>-</value>
<description>a black list (comma separeted, - for empty list) of datasource ids</description>
</property>
<property>
<name>topicWhitelist</name>
<value>*</value>
<description>a white list (comma separeted, * for all) of topics</description>
</property>
<property>
<name>esEventIndexName</name>
<description>the elasticsearch index name for events</description>
@ -36,6 +41,26 @@
<name>esIndexHost</name>
<description>the elasticsearch host</description>
</property>
<property>
<name>esBatchWriteRetryCount</name>
<value>8</value>
<description>an ES configuration property</description>
</property>
<property>
<name>esBatchWriteRetryWait</name>
<value>60s</value>
<description>an ES configuration property</description>
</property>
<property>
<name>esBatchSizeEntries</name>
<value>200</value>
<description>an ES configuration property</description>
</property>
<property>
<name>esNodesWanOnly</name>
<value>true</value>
<description>an ES configuration property</description>
</property>
<property>
<name>maxIndexedEventsForDsAndTopic</name>
<description>the max number of events for each couple (ds/topic)</description>
@ -111,15 +136,15 @@
</configuration>
</global>
<start to="ensure_working_path"/>
<start to="ensure_output_dir"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="ensure_working_path">
<action name="ensure_output_dir">
<fs>
<mkdir path='${workingPath}'/>
<mkdir path='${outputDir}'/>
</fs>
<ok to="start_entities_and_rels"/>
<error to="Kill"/>
@ -152,7 +177,7 @@
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}</arg>
</spark>
<ok to="wait_entities_and_rels"/>
<error to="Kill"/>
@ -176,7 +201,7 @@
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}</arg>
</spark>
<ok to="wait_entities_and_rels"/>
<error to="Kill"/>
@ -201,7 +226,7 @@
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}</arg>
</spark>
<ok to="wait_entities_and_rels"/>
<error to="Kill"/>
@ -225,7 +250,7 @@
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}</arg>
</spark>
<ok to="wait_entities_and_rels"/>
<error to="Kill"/>
@ -249,7 +274,7 @@
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}</arg>
</spark>
<ok to="wait_entities_and_rels"/>
<error to="Kill"/>
@ -273,7 +298,7 @@
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}</arg>
</spark>
<ok to="wait_entities_and_rels"/>
<error to="Kill"/>
@ -299,7 +324,7 @@
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}</arg>
</spark>
<ok to="join_entities_step1"/>
<error to="Kill"/>
@ -323,7 +348,7 @@
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}</arg>
</spark>
<ok to="join_entities_step2"/>
<error to="Kill"/>
@ -347,7 +372,7 @@
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}</arg>
</spark>
<ok to="join_entities_step3"/>
<error to="Kill"/>
@ -371,7 +396,7 @@
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}</arg>
</spark>
<ok to="join_entities_step4"/>
<error to="Kill"/>
@ -395,7 +420,7 @@
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}</arg>
</spark>
<ok to="prepare_groups"/>
<error to="Kill"/>
@ -419,7 +444,7 @@
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}</arg>
</spark>
<ok to="generate_events"/>
<error to="Kill"/>
@ -442,10 +467,12 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}</arg>
<arg>--outputDir</arg><arg>${outputDir}</arg>
<arg>--datasourceIdWhitelist</arg><arg>${datasourceIdWhitelist}</arg>
<arg>--datasourceTypeWhitelist</arg><arg>${datasourceTypeWhitelist}</arg>
<arg>--datasourceIdBlacklist</arg><arg>${datasourceIdBlacklist}</arg>
<arg>--topicWhitelist</arg><arg>${topicWhitelist}</arg>
</spark>
<ok to="index_event_subset"/>
<error to="Kill"/>
@ -468,9 +495,13 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--outputDir</arg><arg>${outputDir}</arg>
<arg>--index</arg><arg>${esEventIndexName}</arg>
<arg>--esHost</arg><arg>${esIndexHost}</arg>
<arg>--esBatchWriteRetryCount</arg><arg>${esBatchWriteRetryCount}</arg>
<arg>--esBatchWriteRetryWait</arg><arg>${esBatchWriteRetryWait}</arg>
<arg>--esBatchSizeEntries</arg><arg>${esBatchSizeEntries}</arg>
<arg>--esNodesWanOnly</arg><arg>${esNodesWanOnly}</arg>
<arg>--maxEventsForTopic</arg><arg>${maxIndexedEventsForDsAndTopic}</arg>
<arg>--brokerApiBaseUrl</arg><arg>${brokerApiBaseUrl}</arg>
</spark>
@ -495,9 +526,13 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--outputDir</arg><arg>${outputDir}</arg>
<arg>--index</arg><arg>${esNotificationsIndexName}</arg>
<arg>--esHost</arg><arg>${esIndexHost}</arg>
<arg>--esBatchWriteRetryCount</arg><arg>${esBatchWriteRetryCount}</arg>
<arg>--esBatchWriteRetryWait</arg><arg>${esBatchWriteRetryWait}</arg>
<arg>--esBatchSizeEntries</arg><arg>${esBatchSizeEntries}</arg>
<arg>--esNodesWanOnly</arg><arg>${esNodesWanOnly}</arg>
<arg>--brokerApiBaseUrl</arg><arg>${brokerApiBaseUrl}</arg>
</spark>
<ok to="stats"/>
@ -521,7 +556,7 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--outputDir</arg><arg>${outputDir}</arg>
<arg>--dbUrl</arg><arg>${brokerDbUrl}</arg>
<arg>--dbUser</arg><arg>${brokerDbUser}</arg>
<arg>--dbPassword</arg><arg>${brokerDbPassword}</arg>

@ -1,7 +1,13 @@
[
{
"paramName": "wp",
"paramLongName": "workingDir",
"paramDescription": "the path where the temporary data are stored",
"paramRequired": true
},
{
"paramName": "o",
"paramLongName": "workingPath",
"paramLongName": "outputDir",
"paramDescription": "the path where the generated events will be stored",
"paramRequired": true
},
@ -22,5 +28,11 @@
"paramLongName": "datasourceIdBlacklist",
"paramDescription": "a black list (comma separeted, - for empty list) of datasource ids",
"paramRequired": true
},
{
"paramName": "topicWhitelist",
"paramLongName": "topicWhitelist",
"paramDescription": "a white list (comma separeted, * for all) of topics",
"paramRequired": true
}
]

@ -1,8 +1,8 @@
[
{
"paramName": "o",
"paramLongName": "workingPath",
"paramDescription": "the workinh path",
"paramLongName": "outputDir",
"paramDescription": "the data path",
"paramRequired": true
},
{
@ -16,5 +16,29 @@
"paramLongName": "esHost",
"paramDescription": "the ES host",
"paramRequired": true
},
{
"paramName": "esBatchWriteRetryCount",
"paramLongName": "esBatchWriteRetryCount",
"paramDescription": "an ES configuration property",
"paramRequired": true
},
{
"paramName": "esBatchWriteRetryWait",
"paramLongName": "esBatchWriteRetryWait",
"paramDescription": "an ES configuration property",
"paramRequired": true
},
{
"paramName": "esBatchSizeEntries",
"paramLongName": "esBatchSizeEntries",
"paramDescription": "an ES configuration property",
"paramRequired": true
},
{
"paramName": "esNodesWanOnly",
"paramLongName": "esNodesWanOnly",
"paramDescription": "an ES configuration property",
"paramRequired": true
}
]

@ -1,8 +1,8 @@
[
{
"paramName": "o",
"paramLongName": "workingPath",
"paramDescription": "the workinh path",
"paramLongName": "outputDir",
"paramDescription": "the path where the generated data are stored",
"paramRequired": true
},
{
@ -16,7 +16,31 @@
"paramLongName": "esHost",
"paramDescription": "the ES host",
"paramRequired": true
},
{
"paramName": "esBatchWriteRetryCount",
"paramLongName": "esBatchWriteRetryCount",
"paramDescription": "an ES configuration property",
"paramRequired": true
},
{
"paramName": "esBatchWriteRetryWait",
"paramLongName": "esBatchWriteRetryWait",
"paramDescription": "an ES configuration property",
"paramRequired": true
},
{
"paramName": "esBatchSizeEntries",
"paramLongName": "esBatchSizeEntries",
"paramDescription": "an ES configuration property",
"paramRequired": true
},
{
"paramName": "esNodesWanOnly",
"paramLongName": "esNodesWanOnly",
"paramDescription": "an ES configuration property",
"paramRequired": true
},
{
"paramName": "n",
"paramLongName": "maxEventsForTopic",

@ -1,8 +1,8 @@
[
{
"paramName": "o",
"paramLongName": "workingPath",
"paramDescription": "the workinh path",
"paramLongName": "outputDir",
"paramDescription": "the dir that contains the events folder",
"paramRequired": true
},
{
@ -17,6 +17,30 @@
"paramDescription": "the ES host",
"paramRequired": true
},
{
"paramName": "esBatchWriteRetryCount",
"paramLongName": "esBatchWriteRetryCount",
"paramDescription": "an ES configuration property",
"paramRequired": true
},
{
"paramName": "esBatchWriteRetryWait",
"paramLongName": "esBatchWriteRetryWait",
"paramDescription": "an ES configuration property",
"paramRequired": true
},
{
"paramName": "esBatchSizeEntries",
"paramLongName": "esBatchSizeEntries",
"paramDescription": "an ES configuration property",
"paramRequired": true
},
{
"paramName": "esNodesWanOnly",
"paramLongName": "esNodesWanOnly",
"paramDescription": "an ES configuration property",
"paramRequired": true
},
{
"paramName": "broker",
"paramLongName": "brokerApiBaseUrl",

@ -6,8 +6,8 @@
<description>the path where the graph is stored</description>
</property>
<property>
<name>workingPath</name>
<description>the path where the the generated data will be stored</description>
<name>outputDir</name>
<description>the path where the the generated data are stored</description>
</property>
<property>
<name>datasourceIdWhitelist</name>
@ -36,6 +36,26 @@
<name>esIndexHost</name>
<description>the elasticsearch host</description>
</property>
<property>
<name>esBatchWriteRetryCount</name>
<value>8</value>
<description>an ES configuration property</description>
</property>
<property>
<name>esBatchWriteRetryWait</name>
<value>60s</value>
<description>an ES configuration property</description>
</property>
<property>
<name>esBatchSizeEntries</name>
<value>200</value>
<description>an ES configuration property</description>
</property>
<property>
<name>esNodesWanOnly</name>
<value>true</value>
<description>an ES configuration property</description>
</property>
<property>
<name>maxIndexedEventsForDsAndTopic</name>
<description>the max number of events for each couple (ds/topic)</description>
@ -122,9 +142,13 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--outputDir</arg><arg>${outputDir}</arg>
<arg>--index</arg><arg>${esNotificationsIndexName}</arg>
<arg>--esHost</arg><arg>${esIndexHost}</arg>
<arg>--esBatchWriteRetryCount</arg><arg>${esBatchWriteRetryCount}</arg>
<arg>--esBatchWriteRetryWait</arg><arg>${esBatchWriteRetryWait}</arg>
<arg>--esBatchSizeEntries</arg><arg>${esBatchSizeEntries}</arg>
<arg>--esNodesWanOnly</arg><arg>${esNodesWanOnly}</arg>
<arg>--brokerApiBaseUrl</arg><arg>${brokerApiBaseUrl}</arg>
</spark>
<ok to="End"/>

@ -0,0 +1,14 @@
[
{
"paramName": "o",
"paramLongName": "outputDir",
"paramDescription": "the path where the data will be stored",
"paramRequired": true
},
{
"paramName": "list",
"paramLongName": "opendoarIds",
"paramDescription": "the opendoar IDs whitelist (comma separated)",
"paramRequired": true
}
]

@ -0,0 +1,99 @@
<workflow-app name="partitionEventsByOpendoarIds" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>opendoarIds</name>
<description>the opendoar IDs whitelist (comma separated)</description>
</property>
<property>
<name>outputDir</name>
<description>the path where the the generated data will be stored</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
<property>
<name>oozieActionShareLibForSpark2</name>
<description>oozie action sharelib for spark 2.*</description>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
<description>spark 2.* extra listeners classname</description>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
<description>spark 2.* sql query execution listeners classname</description>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<description>spark 2.* yarn history server address</description>
</property>
<property>
<name>spark2EventLogDir</name>
<description>spark 2.* event log dir location</description>
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>mapreduce.job.queuename</name>
<value>${queueName}</value>
</property>
<property>
<name>oozie.launcher.mapred.job.queue.name</name>
<value>${oozieLauncherQueueName}</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>${oozieActionShareLibForSpark2}</value>
</property>
</configuration>
</global>
<start to="opendoarPartition"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="opendoarPartition">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>PartitionEventsByDsIdJob</name>
<class>eu.dnetlib.dhp.broker.oa.PartitionEventsByDsIdJob</class>
<jar>dhp-broker-events-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--outputDir</arg><arg>${outputDir}</arg>
<arg>--opendoarIds</arg><arg>${opendoarIds}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

@ -0,0 +1,18 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
</configuration>

@ -1,60 +1,45 @@
<workflow-app name="create broker events - partial" xmlns="uri:oozie:workflow:0.5">
<workflow-app name="reindex_events" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>graphInputPath</name>
<description>the path where the graph is stored</description>
</property>
<property>
<name>workingPath</name>
<name>outputDir</name>
<description>the path where the the generated data will be stored</description>
</property>
<property>
<name>datasourceIdWhitelist</name>
<value>-</value>
<description>a white list (comma separeted, - for empty list) of datasource ids</description>
</property>
<property>
<name>datasourceTypeWhitelist</name>
<value>-</value>
<description>a white list (comma separeted, - for empty list) of datasource types</description>
</property>
<property>
<name>datasourceIdBlacklist</name>
<value>-</value>
<description>a black list (comma separeted, - for empty list) of datasource ids</description>
</property>
<property>
<name>esEventIndexName</name>
<description>the elasticsearch index name for events</description>
</property>
<property>
<name>esNotificationsIndexName</name>
<description>the elasticsearch index name for notifications</description>
</property>
<property>
<name>esIndexHost</name>
<description>the elasticsearch host</description>
</property>
<property>
<name>maxIndexedEventsForDsAndTopic</name>
<description>the max number of events for each couple (ds/topic)</description>
<name>esBatchWriteRetryCount</name>
<value>8</value>
<description>an ES configuration property</description>
</property>
<property>
<name>brokerApiBaseUrl</name>
<description>the url of the broker service api</description>
<property>
<name>esBatchWriteRetryWait</name>
<value>60s</value>
<description>an ES configuration property</description>
</property>
<property>
<name>brokerDbUrl</name>
<description>the url of the broker database</description>
<property>
<name>esBatchSizeEntries</name>
<value>200</value>
<description>an ES configuration property</description>
</property>
<property>
<name>esNodesWanOnly</name>
<value>true</value>
<description>an ES configuration property</description>
</property>
<property>
<name>brokerDbUser</name>
<description>the user of the broker database</description>
<name>maxIndexedEventsForDsAndTopic</name>
<description>the max number of events for each couple (ds/topic)</description>
</property>
<property>
<name>brokerDbPassword</name>
<description>the password of the broker database</description>
<name>brokerApiBaseUrl</name>
<description>the url of the broker service api</description>
</property>
<property>
<name>sparkDriverMemory</name>
@ -111,36 +96,45 @@
</configuration>
</global>
<start to="partition"/>
<start to="index_event_subset"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="partition">
<action name="index_event_subset">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>PartitionEventsByDsIdJob</name>
<class>eu.dnetlib.dhp.broker.oa.PartitionEventsByDsIdJob</class>
<name>IndexEventSubsetOnESJob</name>
<class>eu.dnetlib.dhp.broker.oa.IndexEventSubsetJob</class>
<jar>dhp-broker-events-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.dynamicAllocation.maxExecutors="8"
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--outputDir</arg><arg>${outputDir}</arg>
<arg>--index</arg><arg>${esEventIndexName}</arg>
<arg>--esHost</arg><arg>${esIndexHost}</arg>
<arg>--esBatchWriteRetryCount</arg><arg>${esBatchWriteRetryCount}</arg>
<arg>--esBatchWriteRetryWait</arg><arg>${esBatchWriteRetryWait}</arg>
<arg>--esBatchSizeEntries</arg><arg>${esBatchSizeEntries}</arg>
<arg>--esNodesWanOnly</arg><arg>${esNodesWanOnly}</arg>
<arg>--maxEventsForTopic</arg><arg>${maxIndexedEventsForDsAndTopic}</arg>
<arg>--brokerApiBaseUrl</arg><arg>${brokerApiBaseUrl}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

@ -1,8 +1,8 @@
[
{
"paramName": "wp",
"paramLongName": "workingPath",
"paramDescription": "the working path",
"paramName": "o",
"paramLongName": "outputDir",
"paramDescription": "the path where generated data are stored",
"paramRequired": true
},
{

@ -10,10 +10,11 @@ import java.io.Serializable;
import java.nio.file.Paths;
import java.util.*;
import org.codehaus.jackson.map.ObjectMapper;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.oa.merge.AuthorMerger;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.pace.util.MapDocumentUtil;
@ -100,8 +101,8 @@ public class EntityMergerTest implements Serializable {
assertEquals(pub_merged.getDateofacceptance().getValue(), "2018-09-30");
// verify authors
assertEquals(pub_merged.getAuthor().size(), 9);
assertEquals(AuthorMerger.countAuthorsPids(pub_merged.getAuthor()), 4);
assertEquals(13, pub_merged.getAuthor().size());
assertEquals(4, AuthorMerger.countAuthorsPids(pub_merged.getAuthor()));
// verify title
int count = 0;

@ -7,7 +7,6 @@ import java.util.List;
import org.apache.commons.io.IOUtils;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.PairFunction;
@ -16,8 +15,8 @@ import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.codehaus.jackson.map.ObjectMapper;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.hash.Hashing;
import eu.dnetlib.dedup.graph.ConnectedComponent;

@ -10,7 +10,8 @@ import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.codehaus.jackson.map.ObjectMapper;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Oaf;

@ -4,14 +4,13 @@ import eu.dnetlib.dhp.schema.action.AtomicAction
import eu.dnetlib.dhp.schema.oaf.{DataInfo, Dataset, Field, Instance, KeyValue, Oaf, Organization, Publication, Qualifier, Relation, Result, StructuredProperty}
import eu.dnetlib.dhp.utils.DHPUtils
import org.apache.commons.lang3.StringUtils
import org.codehaus.jackson.map.ObjectMapper
import com.fasterxml.jackson.databind.ObjectMapper
import org.json4s
import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods.parse
import org.slf4j.{Logger, LoggerFactory}
import scala.collection.JavaConverters._
import scala.io.Source
case class HostedByItemType(id: String, officialname: String, issn: String, eissn: String, lissn: String, openAccess: Boolean) {}
@ -19,23 +18,18 @@ case class HostedByItemType(id: String, officialname: String, issn: String, eiss
case class DoiBoostAffiliation(PaperId:Long, AffiliationId:Long, GridId:Option[String], OfficialPage:Option[String], DisplayName:Option[String]){}
object DoiBoostMappingUtil {
def getUnknownCountry(): Qualifier = {
createQualifier("UNKNOWN","UNKNOWN","dnet:countries","dnet:countries")
}
def generateMAGAffiliationId(affId: String): String = {
s"20|microsoft___$SEPARATOR${DHPUtils.md5(affId)}"
}
val logger: Logger = LoggerFactory.getLogger(getClass)
//STATIC STRING
val MAG = "microsoft"
val MAG_NAME = "Microsoft Academic Graph"
val ORCID = "ORCID"
val ORCID = "orcid"
val ORCID_PENDING = "orcid_pending"
val CROSSREF = "Crossref"
val UNPAYWALL = "UnpayWall"
val GRID_AC = "grid.ac"

@ -39,33 +39,38 @@ object SparkGenerateDOIBoostActionSet {
val dbaffiliationRelationPath = parser.get("dbaffiliationRelationPath")
val dbOrganizationPath = parser.get("dbOrganizationPath")
val workingDirPath = parser.get("targetPath")
val sequenceFilePath = parser.get("sFilePath")
spark.read.load(dbDatasetPath).as[OafDataset]
val asDataset = spark.read.load(dbDatasetPath).as[OafDataset]
.map(d =>DoiBoostMappingUtil.fixResult(d))
.map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
.write.mode(SaveMode.Overwrite).save(s"$workingDirPath/actionSet")
// .write.mode(SaveMode.Overwrite).save(s"$workingDirPath/actionSet")
spark.read.load(dbPublicationPath).as[Publication]
val asPublication =spark.read.load(dbPublicationPath).as[Publication]
.map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
.write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
// .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
spark.read.load(dbOrganizationPath).as[Organization]
val asOrganization = spark.read.load(dbOrganizationPath).as[Organization]
.map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
.write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
// .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
spark.read.load(crossRefRelation).as[Relation]
val asCRelation = spark.read.load(crossRefRelation).as[Relation]
.map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
.write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
// .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
spark.read.load(dbaffiliationRelationPath).as[Relation]
val asRelAffiliation = spark.read.load(dbaffiliationRelationPath).as[Relation]
.map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
.write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
// .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
val d: Dataset[(String, String)] =spark.read.load(s"$workingDirPath/actionSet").as[(String,String)]
d.rdd.map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$workingDirPath/rawset", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text,Text]], classOf[GzipCodec])
val d: Dataset[(String, String)] = asDataset.union(asPublication).union(asOrganization).union(asCRelation).union(asRelAffiliation)
// spark.read.load(s"$workingDirPath/actionSet").as[(String,String)]
d.rdd.repartition(6000).map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$sequenceFilePath", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text,Text]], classOf[GzipCodec])

@ -2,6 +2,7 @@ package eu.dnetlib.doiboost
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.oa.merge.AuthorMerger
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.{Organization, Publication, Relation, Dataset => OafDataset}
import eu.dnetlib.doiboost.mag.ConversionUtil
import org.apache.commons.io.IOUtils
@ -30,7 +31,7 @@ object SparkGenerateDoiBoost {
import spark.implicits._
val hostedByMapPath = parser.get("hostedByMapPath")
val workingDirPath = parser.get("workingDirPath")
val workingDirPath = parser.get("workingPath")
implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
@ -62,7 +63,7 @@ object SparkGenerateDoiBoost {
val orcidPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/orcidPublication").as[Publication].map(p => (p.getId, p))
fj.joinWith(orcidPublication, fj("_1").equalTo(orcidPublication("_1")), "left").map(applyMerge).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/secondJoin")
logger.info("Phase 3) Join Result with MAG")
logger.info("Phase 4) Join Result with MAG")
val sj: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/secondJoin").as[Publication].map(p => (p.getId, p))
val magPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/magPublication").as[Publication].map(p => (p.getId, p))
@ -132,7 +133,7 @@ object SparkGenerateDoiBoost {
o.setLegalname(DoiBoostMappingUtil.asField(affiliation.DisplayName.get))
if (affiliation.OfficialPage.isDefined)
o.setWebsiteurl(DoiBoostMappingUtil.asField(affiliation.OfficialPage.get))
o.setCountry(DoiBoostMappingUtil.getUnknownCountry())
o.setCountry(ModelConstants.UNKNOWN_COUNTRY)
o
}
else

@ -200,7 +200,7 @@ case object Crossref2Oaf {
a.setSurname(family)
a.setFullname(s"$given $family")
if (StringUtils.isNotBlank(orcid))
a.setPid(List(createSP(orcid, ORCID, PID_TYPES, generateDataInfo())).asJava)
a.setPid(List(createSP(orcid, ORCID_PENDING, PID_TYPES, generateDataInfo())).asJava)
a
}

@ -2,18 +2,16 @@
package eu.dnetlib.doiboost.crossref;
import java.io.ByteArrayOutputStream;
import java.util.Optional;
import java.util.zip.Inflater;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
@ -30,34 +28,45 @@ public class CrossrefImporter {
parser.parseArgument(args);
final String hdfsuri = parser.get("namenode");
System.out.println("HDFS URI" + hdfsuri);
Path hdfswritepath = new Path(parser.get("targetPath"));
System.out.println("TargetPath: " + hdfsuri);
final String namenode = parser.get("namenode");
System.out.println("namenode: " + namenode);
final Long timestamp = StringUtils.isNotBlank(parser.get("timestamp"))
? Long.parseLong(parser.get("timestamp"))
: -1;
Path targetPath = new Path(parser.get("targetPath"));
System.out.println("targetPath: " + targetPath);
if (timestamp > 0)
System.out.println("Timestamp added " + timestamp);
final Long timestamp = Optional
.ofNullable(parser.get("timestamp"))
.map(s -> {
try {
return Long.parseLong(s);
} catch (NumberFormatException e) {
return -1L;
}
})
.orElse(-1L);
System.out.println("timestamp: " + timestamp);
final String esServer = parser.get("esServer");
System.out.println("esServer: " + esServer);
final String esIndex = parser.get("esIndex");
System.out.println("esIndex: " + esIndex);
// ====== Init HDFS File System Object
Configuration conf = new Configuration();
// Set FileSystem URI
conf.set("fs.defaultFS", hdfsuri);
conf.set("fs.defaultFS", namenode);
// Because of Maven
conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
ESClient client = timestamp > 0
? new ESClient("ip-90-147-167-25.ct1.garrservices.it", "crossref", timestamp)
: new ESClient("ip-90-147-167-25.ct1.garrservices.it", "crossref");
// "ip-90-147-167-25.ct1.garrservices.it", "crossref"
final ESClient client = new ESClient(esServer, esIndex, timestamp);
try (SequenceFile.Writer writer = SequenceFile
.createWriter(
conf,
SequenceFile.Writer.file(hdfswritepath),
SequenceFile.Writer.file(targetPath),
SequenceFile.Writer.keyClass(IntWritable.class),
SequenceFile.Writer.valueClass(Text.class))) {
@ -74,8 +83,7 @@ public class CrossrefImporter {
end = System.currentTimeMillis();
final float time = (end - start) / 1000.0F;
System.out
.println(
String.format("Imported %d records last 100000 imported in %f seconds", i, time));
.println(String.format("Imported %s records last 100000 imported in %s seconds", i, time));
start = System.currentTimeMillis();
}
}

@ -1,11 +1,11 @@
package eu.dnetlib.doiboost.crossref;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import org.apache.commons.io.IOUtils;
import org.apache.http.HttpHeaders;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.entity.StringEntity;
@ -17,13 +17,17 @@ import org.slf4j.LoggerFactory;
import com.jayway.jsonpath.JsonPath;
public class ESClient implements Iterator<String> {
private static final Logger logger = LoggerFactory.getLogger(ESClient.class);
static final String blobPath = "$.hits[*].hits[*]._source.blob";
static final String scrollIdPath = "$._scroll_id";
static final String JSON_NO_TS = "{\"size\":1000}";
static final String JSON_WITH_TS = "{\"size\":1000, \"query\":{\"range\":{\"timestamp\":{\"gte\":%d}}}}";
static final String JSON_SCROLL = "{\"scroll_id\":\"%s\",\"scroll\" : \"1m\"}";
private static final String BLOB_PATH = "$.hits.hits[*]._source.blob";
private static final String SCROLL_ID_PATH = "$._scroll_id";
private static final String JSON_NO_TS = "{\"size\":1000}";
private static final String JSON_WITH_TS = "{\"size\":1000, \"query\":{\"range\":{\"timestamp\":{\"gte\":%d}}}}";
private static final String JSON_SCROLL = "{\"scroll_id\":\"%s\",\"scroll\" : \"1m\"}";
public static final String APPLICATION_JSON = "application/json";
public static final String ES_SEARCH_URL = "http://%s:9200/%s/_search?scroll=1m";
public static final String ES_SCROLL_URL = "http://%s:9200/_search/scroll";
private final String scrollId;
@ -31,47 +35,30 @@ public class ESClient implements Iterator<String> {
private final String esHost;
public ESClient(final String esHost, final String esIndex) throws IOException {
public ESClient(final String esHost, final String esIndex, final long timestamp) {
this.esHost = esHost;
final String body = getResponse(
String.format("http://%s:9200/%s/_search?scroll=1m", esHost, esIndex), JSON_NO_TS);
scrollId = getJPathString(scrollIdPath, body);
buffer = getBlobs(body);
}
public ESClient(final String esHost, final String esIndex, final long timestamp)
throws IOException {
this.esHost = esHost;
final String body = getResponse(
String.format("http://%s:9200/%s/_search?scroll=1m", esHost, esIndex),
String.format(JSON_WITH_TS, timestamp));
scrollId = getJPathString(scrollIdPath, body);
final String body = timestamp > 0
? getResponse(String.format(ES_SEARCH_URL, esHost, esIndex), String.format(JSON_WITH_TS, timestamp))
: getResponse(String.format(ES_SEARCH_URL, esHost, esIndex), JSON_NO_TS);
scrollId = getJPathString(SCROLL_ID_PATH, body);
buffer = getBlobs(body);
}
private String getResponse(final String url, final String json) {
CloseableHttpClient client = HttpClients.createDefault();
try {
try (CloseableHttpClient client = HttpClients.createDefault()) {
HttpPost httpPost = new HttpPost(url);
if (json != null) {
StringEntity entity = new StringEntity(json);
httpPost.setEntity(entity);
httpPost.setHeader("Accept", "application/json");
httpPost.setHeader("Content-type", "application/json");
httpPost.setHeader(HttpHeaders.ACCEPT, APPLICATION_JSON);
httpPost.setHeader(HttpHeaders.CONTENT_TYPE, APPLICATION_JSON);
}
try (CloseableHttpResponse response = client.execute(httpPost)) {
return IOUtils.toString(response.getEntity().getContent());
}
CloseableHttpResponse response = client.execute(httpPost);
return IOUtils.toString(response.getEntity().getContent());
} catch (Throwable e) {
throw new RuntimeException("Error on executing request ", e);
} finally {
try {
client.close();
} catch (IOException e) {
throw new RuntimeException("Unable to close client ", e);
}
}
}
@ -87,7 +74,7 @@ public class ESClient implements Iterator<String> {
}
private List<String> getBlobs(final String body) {
final List<String> res = JsonPath.read(body, "$.hits.hits[*]._source.blob");
final List<String> res = JsonPath.read(body, BLOB_PATH);
return res;
}
@ -102,11 +89,11 @@ public class ESClient implements Iterator<String> {
if (buffer.isEmpty()) {
final String json_param = String.format(JSON_SCROLL, scrollId);
final String body = getResponse(String.format("http://%s:9200/_search/scroll", esHost), json_param);
final String body = getResponse(String.format(ES_SCROLL_URL, esHost), json_param);
try {
buffer = getBlobs(body);
} catch (Throwable e) {
logger.error("Error on get next page: body:" + body);
System.out.println("Error on get next page: body:" + body);
}
}
return nextItem;

@ -21,15 +21,17 @@ object SparkImportMagIntoDataset {
val stream = Map(
"Affiliations" -> Tuple2("mag/Affiliations.txt", Seq("AffiliationId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "GridId:string", "OfficialPage:string", "WikiPage:string", "PaperCount:long", "CitationCount:long", "Latitude:float?", "Longitude:float?", "CreatedDate:DateTime")),
"Authors" -> Tuple2("mag/Authors.txt", Seq("AuthorId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "LastKnownAffiliationId:long?", "PaperCount:long", "CitationCount:long", "CreatedDate:DateTime")),
"ConferenceInstances" -> Tuple2("mag/ConferenceInstances.txt", Seq("ConferenceInstanceId:long", "NormalizedName:string", "DisplayName:string", "ConferenceSeriesId:long", "Location:string", "OfficialUrl:string", "StartDate:DateTime?", "EndDate:DateTime?", "AbstractRegistrationDate:DateTime?", "SubmissionDeadlineDate:DateTime?", "NotificationDueDate:DateTime?", "FinalVersionDueDate:DateTime?", "PaperCount:long", "CitationCount:long", "Latitude:float?", "Longitude:float?", "CreatedDate:DateTime")),
"Affiliations" -> Tuple2("mag/Affiliations.txt", Seq("AffiliationId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "GridId:string", "OfficialPage:string", "WikiPage:string", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "Iso3166Code:string", "Latitude:float?", "Longitude:float?", "CreatedDate:DateTime")),
"AuthorExtendedAttributes" -> Tuple2("mag/AuthorExtendedAttributes.txt", Seq("AuthorId:long", "AttributeType:int", "AttributeValue:string")),
"Authors" -> Tuple2("mag/Authors.txt", Seq("AuthorId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "LastKnownAffiliationId:long?", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")),
"ConferenceInstances" -> Tuple2("mag/ConferenceInstances.txt", Seq("ConferenceInstanceId:long", "NormalizedName:string", "DisplayName:string", "ConferenceSeriesId:long", "Location:string", "OfficialUrl:string", "StartDate:DateTime?", "EndDate:DateTime?", "AbstractRegistrationDate:DateTime?", "SubmissionDeadlineDate:DateTime?", "NotificationDueDate:DateTime?", "FinalVersionDueDate:DateTime?", "PaperCount:long", "PaperFamilyCount:long" ,"CitationCount:long", "Latitude:float?", "Longitude:float?", "CreatedDate:DateTime")),
"ConferenceSeries" -> Tuple2("mag/ConferenceSeries.txt", Seq("ConferenceSeriesId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "PaperCount:long", "CitationCount:long", "CreatedDate:DateTime")),
"EntityRelatedEntities" -> Tuple2("advanced/EntityRelatedEntities.txt", Seq("EntityId:long", "EntityType:string", "RelatedEntityId:long", "RelatedEntityType:string", "RelatedType:int", "Score:float")),
"FieldOfStudyChildren" -> Tuple2("advanced/FieldOfStudyChildren.txt", Seq("FieldOfStudyId:long", "ChildFieldOfStudyId:long")),
"FieldOfStudyExtendedAttributes" -> Tuple2("advanced/FieldOfStudyExtendedAttributes.txt", Seq("FieldOfStudyId:long", "AttributeType:int", "AttributeValue:string")),
"FieldsOfStudy" -> Tuple2("advanced/FieldsOfStudy.txt", Seq("FieldOfStudyId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "MainType:string", "Level:int", "PaperCount:long", "CitationCount:long", "CreatedDate:DateTime")),
"Journals" -> Tuple2("mag/Journals.txt", Seq("JournalId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "Issn:string", "Publisher:string", "Webpage:string", "PaperCount:long", "CitationCount:long", "CreatedDate:DateTime")),
// ['FieldOfStudyId:long', 'Rank:uint', 'NormalizedName:string', 'DisplayName:string', 'MainType:string', 'Level:int', 'PaperCount:long', 'PaperFamilyCount:long', 'CitationCount:long', 'CreatedDate:DateTime']
"FieldsOfStudy" -> Tuple2("advanced/FieldsOfStudy.txt", Seq("FieldOfStudyId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "MainType:string", "Level:int", "PaperCount:long", "PaperFamilyCount:long", "CitationCount:long", "CreatedDate:DateTime")),
"Journals" -> Tuple2("mag/Journals.txt", Seq("JournalId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "Issn:string", "Publisher:string", "Webpage:string", "PaperCount:long", "PaperFamilyCount:long" ,"CitationCount:long", "CreatedDate:DateTime")),
"PaperAbstractsInvertedIndex" -> Tuple2("nlp/PaperAbstractsInvertedIndex.txt.*", Seq("PaperId:long", "IndexedAbstract:string")),
"PaperAuthorAffiliations" -> Tuple2("mag/PaperAuthorAffiliations.txt", Seq("PaperId:long", "AuthorId:long", "AffiliationId:long?", "AuthorSequenceNumber:uint", "OriginalAuthor:string", "OriginalAffiliation:string")),
"PaperCitationContexts" -> Tuple2("nlp/PaperCitationContexts.txt", Seq("PaperId:long", "PaperReferenceId:long", "CitationContext:string")),
@ -39,7 +41,7 @@ object SparkImportMagIntoDataset {
"PaperReferences" -> Tuple2("mag/PaperReferences.txt", Seq("PaperId:long", "PaperReferenceId:long")),
"PaperResources" -> Tuple2("mag/PaperResources.txt", Seq("PaperId:long", "ResourceType:int", "ResourceUrl:string", "SourceUrl:string", "RelationshipType:int")),
"PaperUrls" -> Tuple2("mag/PaperUrls.txt", Seq("PaperId:long", "SourceType:int?", "SourceUrl:string", "LanguageCode:string")),
"Papers" -> Tuple2("mag/Papers.txt", Seq("PaperId:long", "Rank:uint", "Doi:string", "DocType:string", "PaperTitle:string", "OriginalTitle:string", "BookTitle:string", "Year:int?", "Date:DateTime?", "Publisher:string", "JournalId:long?", "ConferenceSeriesId:long?", "ConferenceInstanceId:long?", "Volume:string", "Issue:string", "FirstPage:string", "LastPage:string", "ReferenceCount:long", "CitationCount:long", "EstimatedCitation:long", "OriginalVenue:string", "FamilyId:long?", "CreatedDate:DateTime")),
"Papers" -> Tuple2("mag/Papers.txt", Seq("PaperId:long", "Rank:uint", "Doi:string", "DocType:string", "PaperTitle:string", "OriginalTitle:string", "BookTitle:string", "Year:int?", "Date:DateTime?", "OnlineDate:DateTime?", "Publisher:string", "JournalId:long?", "ConferenceSeriesId:long?", "ConferenceInstanceId:long?", "Volume:string", "Issue:string", "FirstPage:string", "LastPage:string", "ReferenceCount:long", "CitationCount:long", "EstimatedCitation:long", "OriginalVenue:string", "FamilyId:long?", "FamilyRank:uint?", "CreatedDate:DateTime")),
"RelatedFieldOfStudy" -> Tuple2("advanced/RelatedFieldOfStudy.txt", Seq("FieldOfStudyId1:long", "Type1:string", "FieldOfStudyId2:long", "Type2:string", "Rank:float"))
)

@ -11,7 +11,7 @@ import org.slf4j.{Logger, LoggerFactory}
import scala.collection.JavaConverters._
object SparkPreProcessMAG {
object SparkProcessMAG {
def main(args: Array[String]): Unit = {
val logger: Logger = LoggerFactory.getLogger(getClass)
@ -26,12 +26,15 @@ object SparkPreProcessMAG {
.master(parser.get("master")).getOrCreate()
val sourcePath = parser.get("sourcePath")
val workingPath = parser.get("workingPath")
val targetPath = parser.get("targetPath")
import spark.implicits._
implicit val mapEncoderPubs: Encoder[Publication] = org.apache.spark.sql.Encoders.kryo[Publication]
implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPubs)
logger.info("Phase 1) make uninque DOI in Papers:")
val d: Dataset[MagPapers] = spark.read.load(s"${parser.get("sourcePath")}/Papers").as[MagPapers]
val d: Dataset[MagPapers] = spark.read.load(s"$sourcePath/Papers").as[MagPapers]
// Filtering Papers with DOI, and since for the same DOI we have multiple version of item with different PapersId we get the last one
val result: RDD[MagPapers] = d.where(col("Doi").isNotNull)
@ -41,11 +44,12 @@ object SparkPreProcessMAG {
.map(_._2)
val distinctPaper: Dataset[MagPapers] = spark.createDataset(result)
distinctPaper.write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/Papers_distinct")
distinctPaper.write.mode(SaveMode.Overwrite).save(s"$workingPath/Papers_distinct")
logger.info("Phase 0) Enrich Publication with description")
val pa = spark.read.load(s"${parser.get("sourcePath")}/PaperAbstractsInvertedIndex").as[MagPaperAbstract]
pa.map(ConversionUtil.transformPaperAbstract).write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/PaperAbstract")
val pa = spark.read.load(s"$sourcePath/PaperAbstractsInvertedIndex").as[MagPaperAbstract]
pa.map(ConversionUtil.transformPaperAbstract).write.mode(SaveMode.Overwrite).save(s"$workingPath/PaperAbstract")
logger.info("Phase 3) Group Author by PaperId")
val authors = spark.read.load(s"$sourcePath/Authors").as[MagAuthor]
@ -64,24 +68,24 @@ object SparkPreProcessMAG {
} else
mpa
}).groupBy("PaperId").agg(collect_list(struct($"author", $"affiliation")).as("authors"))
.write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/merge_step_1_paper_authors")
.write.mode(SaveMode.Overwrite).save(s"$workingPath/merge_step_1_paper_authors")
logger.info("Phase 4) create First Version of publication Entity with Paper Journal and Authors")
val journals = spark.read.load(s"$sourcePath/Journals").as[MagJournal]
val papers = spark.read.load((s"${parser.get("targetPath")}/Papers_distinct")).as[MagPapers]
val papers = spark.read.load((s"$workingPath/Papers_distinct")).as[MagPapers]
val paperWithAuthors = spark.read.load(s"${parser.get("targetPath")}/merge_step_1_paper_authors").as[MagPaperWithAuthorList]
val paperWithAuthors = spark.read.load(s"$workingPath/merge_step_1_paper_authors").as[MagPaperWithAuthorList]
val firstJoin = papers.joinWith(journals, papers("JournalId").equalTo(journals("JournalId")), "left")
firstJoin.joinWith(paperWithAuthors, firstJoin("_1.PaperId").equalTo(paperWithAuthors("PaperId")), "left")
.map { a => ConversionUtil.createOAFFromJournalAuthorPaper(a) }
.write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/merge_step_2")
.write.mode(SaveMode.Overwrite).save(s"$workingPath/merge_step_2")
var magPubs: Dataset[(String, Publication)] =
spark.read.load(s"${parser.get("targetPath")}/merge_step_2").as[Publication]
spark.read.load(s"$workingPath/merge_step_2").as[Publication]
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)]
@ -95,10 +99,10 @@ object SparkPreProcessMAG {
.map(item => ConversionUtil.updatePubsWithConferenceInfo(item))
.write
.mode(SaveMode.Overwrite)
.save(s"${parser.get("targetPath")}/merge_step_2_conference")
.save(s"$workingPath/merge_step_2_conference")
magPubs= spark.read.load(s"${parser.get("targetPath")}/merge_step_2_conference").as[Publication]
magPubs= spark.read.load(s"$workingPath/merge_step_2_conference").as[Publication]
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)]
val paperUrlDataset = spark.read.load(s"$sourcePath/PaperUrls").as[MagPaperUrl].groupBy("PaperId").agg(collect_list(struct("sourceUrl")).as("instances")).as[MagUrl]
@ -108,27 +112,27 @@ object SparkPreProcessMAG {
magPubs.joinWith(paperUrlDataset, col("_1").equalTo(paperUrlDataset("PaperId")), "left")
.map { a: ((String, Publication), MagUrl) => ConversionUtil.addInstances((a._1._2, a._2)) }
.write.mode(SaveMode.Overwrite)
.save(s"${parser.get("targetPath")}/merge_step_3")
.save(s"$workingPath/merge_step_3")
// logger.info("Phase 6) Enrich Publication with description")
// val pa = spark.read.load(s"${parser.get("sourcePath")}/PaperAbstractsInvertedIndex").as[MagPaperAbstract]
// pa.map(ConversionUtil.transformPaperAbstract).write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/PaperAbstract")
val paperAbstract = spark.read.load((s"${parser.get("targetPath")}/PaperAbstract")).as[MagPaperAbstract]
val paperAbstract = spark.read.load((s"$workingPath/PaperAbstract")).as[MagPaperAbstract]
magPubs = spark.read.load(s"${parser.get("targetPath")}/merge_step_3").as[Publication]
magPubs = spark.read.load(s"$workingPath/merge_step_3").as[Publication]
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)]
magPubs.joinWith(paperAbstract, col("_1").equalTo(paperAbstract("PaperId")), "left")
.map(item => ConversionUtil.updatePubsWithDescription(item)
).write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/merge_step_4")
).write.mode(SaveMode.Overwrite).save(s"$workingPath/merge_step_4")
logger.info("Phase 7) Enrich Publication with FieldOfStudy")
magPubs = spark.read.load(s"${parser.get("targetPath")}/merge_step_4").as[Publication]
magPubs = spark.read.load(s"$workingPath/merge_step_4").as[Publication]
.map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)]
val fos = spark.read.load(s"$sourcePath/FieldsOfStudy").select($"FieldOfStudyId".alias("fos"), $"DisplayName", $"MainType")
@ -144,14 +148,14 @@ object SparkPreProcessMAG {
.equalTo(paperField("PaperId")), "left")
.map(item => ConversionUtil.updatePubsWithSubject(item))
.write.mode(SaveMode.Overwrite)
.save(s"${parser.get("targetPath")}/mag_publication")
.save(s"$workingPath/mag_publication")
val s:RDD[Publication] = spark.read.load(s"${parser.get("targetPath")}/mag_publication").as[Publication]
val s:RDD[Publication] = spark.read.load(s"$workingPath/mag_publication").as[Publication]
.map(p=>Tuple2(p.getId, p)).rdd.reduceByKey((a:Publication, b:Publication) => ConversionUtil.mergePublication(a,b))
.map(_._2)
spark.createDataset(s).as[Publication].write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/mag_publication_u")
spark.createDataset(s).as[Publication].write.mode(SaveMode.Overwrite).save(s"$targetPath/magPublication")
}
}

@ -1,10 +1,11 @@
package eu.dnetlib.doiboost.orcid
import eu.dnetlib.dhp.schema.oaf.{Author, Publication}
import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Publication}
import eu.dnetlib.dhp.schema.orcid.OrcidDOI
import eu.dnetlib.doiboost.DoiBoostMappingUtil
import eu.dnetlib.doiboost.DoiBoostMappingUtil.{ORCID, PID_TYPES, createSP, generateDataInfo, generateIdentifier}
import org.apache.commons.lang.StringUtils
import org.codehaus.jackson.map.ObjectMapper
import org.slf4j.{Logger, LoggerFactory}
import scala.collection.JavaConverters._
@ -17,7 +18,7 @@ case class ORCIDItem(oid:String,name:String,surname:String,creditName:String,err
case class ORCIDElement(doi:String, authors:List[ORCIDItem]) {}
object ORCIDToOAF {
val logger: Logger = LoggerFactory.getLogger(ORCIDToOAF.getClass)
val mapper = new ObjectMapper
val mapper = new ObjectMapper()
def isJsonValid(inputStr: String): Boolean = {
import java.io.IOException
@ -43,16 +44,19 @@ object ORCIDToOAF {
}
def convertTOOAF(input:ORCIDElement) :Publication = {
val doi = input.doi
def convertTOOAF(input:OrcidDOI) :Publication = {
val doi = input.getDoi
val pub:Publication = new Publication
pub.setPid(List(createSP(doi, "doi", PID_TYPES)).asJava)
pub.setPid(List(createSP(doi.toLowerCase, "doi", PID_TYPES)).asJava)
pub.setDataInfo(generateDataInfo())
pub.setId(generateIdentifier(pub, doi.toLowerCase))
try{
pub.setAuthor(input.authors.map(a=> {
generateAuthor(a.name, a.surname, a.creditName, a.oid)
}).asJava)
val l:List[Author]= input.getAuthors.asScala.map(a=> {
generateAuthor(a.getName, a.getSurname, a.getCreditName, a.getOid)
})(collection.breakOut)
pub.setAuthor(l.asJava)
pub.setCollectedfrom(List(DoiBoostMappingUtil.createORIDCollectedFrom()).asJava)
pub.setDataInfo(DoiBoostMappingUtil.generateDataInfo())
pub
@ -63,6 +67,13 @@ object ORCIDToOAF {
}
}
def generateOricPIDDatainfo():DataInfo = {
val di =DoiBoostMappingUtil.generateDataInfo("0.91")
di.getProvenanceaction.setClassid("sysimport:crosswalk:entityregistry")
di.getProvenanceaction.setClassname("Harvested")
di
}
def generateAuthor(given: String, family: String, fullName:String, orcid: String): Author = {
val a = new Author
a.setName(given)
@ -72,7 +83,7 @@ object ORCIDToOAF {
else
a.setFullname(s"$given $family")
if (StringUtils.isNotBlank(orcid))
a.setPid(List(createSP(orcid, ORCID, PID_TYPES)).asJava)
a.setPid(List(createSP(orcid, ORCID, PID_TYPES, generateOricPIDDatainfo())).asJava)
a
}

@ -1,21 +1,72 @@
package eu.dnetlib.doiboost.orcid
import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper}
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.oa.merge.AuthorMerger
import eu.dnetlib.dhp.schema.oaf.Publication
import eu.dnetlib.dhp.schema.orcid.OrcidDOI
import eu.dnetlib.doiboost.mag.ConversionUtil
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
object SparkConvertORCIDToOAF {
val logger: Logger = LoggerFactory.getLogger(SparkConvertORCIDToOAF.getClass)
def getPublicationAggregator(): Aggregator[(String, Publication), Publication, Publication] = new Aggregator[(String, Publication), Publication, Publication]{
override def zero: Publication = new Publication()
override def reduce(b: Publication, a: (String, Publication)): Publication = {
b.mergeFrom(a._2)
b.setAuthor(AuthorMerger.mergeAuthor(a._2.getAuthor, b.getAuthor))
if (b.getId == null)
b.setId(a._2.getId)
b
}
override def merge(wx: Publication, wy: Publication): Publication = {
wx.mergeFrom(wy)
wx.setAuthor(AuthorMerger.mergeAuthor(wy.getAuthor, wx.getAuthor))
if(wx.getId == null && wy.getId.nonEmpty)
wx.setId(wy.getId)
wx
}
override def finish(reduction: Publication): Publication = reduction
override def bufferEncoder: Encoder[Publication] =
Encoders.kryo(classOf[Publication])
override def outputEncoder: Encoder[Publication] =
Encoders.kryo(classOf[Publication])
}
def run(spark:SparkSession,sourcePath:String, targetPath:String):Unit = {
implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
implicit val mapOrcid: Encoder[OrcidDOI] = Encoders.kryo[OrcidDOI]
implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPubs)
val mapper = new ObjectMapper()
mapper.getDeserializationConfig.withFeatures(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES)
val dataset:Dataset[OrcidDOI] = spark.createDataset(spark.sparkContext.textFile(sourcePath).map(s => mapper.readValue(s,classOf[OrcidDOI])))
logger.info("Converting ORCID to OAF")
dataset.map(o => ORCIDToOAF.convertTOOAF(o)).filter(p=>p!=null)
.map(d => (d.getId, d))
.groupByKey(_._1)(Encoders.STRING)
.agg(getPublicationAggregator().toColumn)
.map(p => p._2)
.write.mode(SaveMode.Overwrite).save(targetPath)
}
def main(args: Array[String]): Unit = {
val logger: Logger = LoggerFactory.getLogger(SparkConvertORCIDToOAF.getClass)
val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkConvertORCIDToOAF.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/convert_map_to_oaf_params.json")))
parser.parseArgument(args)
@ -26,19 +77,12 @@ object SparkConvertORCIDToOAF {
.appName(getClass.getSimpleName)
.master(parser.get("master")).getOrCreate()
implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPubs)
import spark.implicits._
val sourcePath = parser.get("sourcePath")
val targetPath = parser.get("targetPath")
val dataset:Dataset[ORCIDElement] = spark.read.json(sourcePath).as[ORCIDElement]
logger.info("Converting ORCID to OAF")
val d:RDD[Publication] = dataset.map(o => ORCIDToOAF.convertTOOAF(o)).filter(p=>p!=null).map(p=>(p.getId,p)).rdd.reduceByKey(ConversionUtil.mergePublication)
.map(_._2)
val sourcePath = parser.get("sourcePath")
val targetPath = parser.get("targetPath")
run(spark, sourcePath, targetPath)
spark.createDataset(d).as[Publication].write.mode(SaveMode.Overwrite).save(targetPath)
}
}

@ -3,10 +3,8 @@ package eu.dnetlib.doiboost.orcid;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
@ -18,11 +16,9 @@ import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.util.LongAccumulator;
import org.mortbay.log.Log;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -37,7 +33,7 @@ public class SparkDownloadOrcidAuthors {
static final String DATE_FORMAT = "yyyy-MM-dd HH:mm:ss";
static String lastUpdate;
public static void main(String[] args) throws IOException, Exception {
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
@ -52,12 +48,12 @@ public class SparkDownloadOrcidAuthors {
.orElse(Boolean.TRUE);
logger.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String workingPath = parser.get("workingPath");
logger.info("workingPath: ", workingPath);
logger.info("workingPath: {}", workingPath);
final String outputPath = parser.get("outputPath");
logger.info("outputPath: ", outputPath);
logger.info("outputPath: {}", outputPath);
final String token = parser.get("token");
final String lambdaFileName = parser.get("lambdaFileName");
logger.info("lambdaFileName: ", lambdaFileName);
logger.info("lambdaFileName: {}", lambdaFileName);
lastUpdate = HDFSUtil.readFromTextFile(workingPath.concat("last_update.txt"));
@ -179,8 +175,8 @@ public class SparkDownloadOrcidAuthors {
}
private static boolean isModified(String orcidId, String modifiedDate) {
Date modifiedDateDt = null;
Date lastUpdateDt = null;
Date modifiedDateDt;
Date lastUpdateDt;
try {
if (modifiedDate.length() != 19) {
modifiedDate = modifiedDate.substring(0, 19);

@ -5,5 +5,6 @@
{"paramName": "cr", "paramLongName":"crossRefRelation", "paramDescription": "the UnpayWall Publication Path", "paramRequired": true},
{"paramName": "da", "paramLongName":"dbaffiliationRelationPath", "paramDescription": "the MAG Publication Path", "paramRequired": true},
{"paramName": "do", "paramLongName":"dbOrganizationPath", "paramDescription": "the MAG Publication Path", "paramRequired": true},
{"paramName": "w", "paramLongName":"targetPath", "paramDescription": "the Working Path", "paramRequired": true}
{"paramName": "w", "paramLongName":"targetPath", "paramDescription": "the Working Path", "paramRequired": true},
{"paramName": "sp", "paramLongName":"sFilePath", "paramDescription": "the Sequence file Path", "paramRequired": true}
]

@ -3,5 +3,5 @@
{"paramName": "hb", "paramLongName":"hostedByMapPath", "paramDescription": "the hosted By Map Path", "paramRequired": true},
{"paramName": "ap", "paramLongName":"affiliationPath", "paramDescription": "the Affliation Path", "paramRequired": true},
{"paramName": "pa", "paramLongName":"paperAffiliationPath", "paramDescription": "the paperAffiliation Path", "paramRequired": true},
{"paramName": "w", "paramLongName":"workingDirPath", "paramDescription": "the Working Path", "paramRequired": true}
{"paramName": "w", "paramLongName":"workingPath", "paramDescription": "the Working Path", "paramRequired": true}
]

@ -1,5 +1,7 @@
[
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the path of the sequencial file to write", "paramRequired": true},
{"paramName":"n", "paramLongName":"namenode", "paramDescription": "the hive metastore uris", "paramRequired": true},
{"paramName":"ts", "paramLongName":"timestamp", "paramDescription": "timestamp", "paramRequired": false}
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the path of the sequencial file to write", "paramRequired": true},
{"paramName":"n", "paramLongName":"namenode", "paramDescription": "the hive metastore uris", "paramRequired": true},
{"paramName":"ts", "paramLongName":"timestamp", "paramDescription": "timestamp", "paramRequired": false},
{"paramName":"ess", "paramLongName":"esServer", "paramDescription": "elasticsearch server url", "paramRequired": true},
{"paramName":"esi", "paramLongName":"esIndex", "paramDescription": "elasticsearch index name", "paramRequired": true}
]

@ -39,14 +39,7 @@
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="ResetWorkingPath">
<fs>
<delete path='${workingDirPath}'/>
<mkdir path='${workingDirPath}'/>
</fs>
<ok to="CreateDOIBoost"/>
<error to="Kill"/>
</action>
<action name="CreateDOIBoost">
<spark xmlns="uri:oozie:spark-action:0.2">

@ -8,6 +8,10 @@
<name>targetPath</name>
<description>the working dir base path</description>
</property>
<property>
<name>workingPath</name>
<description>the working dir base path</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
@ -31,10 +35,10 @@
<action name="ResetWorkingPath">
<fs>
<delete path='${targetPath}'/>
<mkdir path='${targetPath}'/>
<delete path='${workingPath}'/>
<mkdir path='${workingPath}'/>
</fs>
<ok to="PreprocessMag"/>
<ok to="ConvertMagToDataset"/>
<error to="Kill"/>
</action>
@ -52,10 +56,10 @@
${sparkExtraOPT}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--targetPath</arg><arg>${targetPath}</arg>
<arg>--targetPath</arg><arg>${workingPath}</arg>
<arg>--master</arg><arg>yarn-cluster</arg>
</spark>
<ok to="End"/>
<ok to="PreprocessMag"/>
<error to="Kill"/>
</action>
@ -65,7 +69,7 @@
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Convert Mag to Dataset</name>
<name>Convert Mag to OAF Dataset</name>
<class>eu.dnetlib.doiboost.mag.SparkPreProcessMAG</class>
<jar>dhp-doiboost-${projectVersion}.jar</jar>
<spark-opts>
@ -75,7 +79,8 @@
--conf spark.sql.shuffle.partitions=3840
${sparkExtraOPT}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--sourcePath</arg><arg>${workingPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}/process</arg>
<arg>--targetPath</arg><arg>${targetPath}</arg>
<arg>--master</arg><arg>yarn-cluster</arg>
</spark>

@ -1,6 +1,7 @@
[
{"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the base path of MAG input", "paramRequired": true},
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the working dir path", "paramRequired": true},
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the target dir path", "paramRequired": true},
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the working dir path", "paramRequired": true},
{"paramName":"m", "paramLongName":"master", "paramDescription": "the master name", "paramRequired": true}
]

@ -0,0 +1,42 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
<property>
<name>hive_metastore_uris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
</property>
<property>
<name>spark2EventLogDir</name>
<value>/user/spark/spark2ApplicationHistory</value>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>"com.cloudera.spark.lineage.NavigatorAppListener"</value>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value>
</property>
</configuration>

@ -0,0 +1,335 @@
<workflow-app name="Generate DOIBoost ActionSet" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorIntersectionMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
<!-- Itersection Parameters -->
<property>
<name>workingPath</name>
<description>the working Path</description>
</property>
<property>
<name>hostedByMapPath</name>
<description>the hostedByMap Path</description>
</property>
<property>
<name>outputPath</name>
<description>the Path of the sequence file action set</description>
</property>
<!-- Crossref Parameters -->
<property>
<name>inputPathCrossref</name>
<description>the Crossref input path</description>
</property>
<property>
<name>crossrefTimestamp</name>
<description>Timestamp for the Crossref incremental Harvesting</description>
</property>
<property>
<name>esServer</name>
<description>elasticsearch server url for the Crossref Harvesting</description>
</property>
<property>
<name>esIndex</name>
<description>elasticsearch index name for the Crossref Harvesting</description>
</property>
<!-- MAG Parameters -->
<property>
<name>inputPathMAG</name>
<description>the MAG working path</description>
</property>
<!-- UnpayWall Parameters -->
<property>
<name>inputPathUnpayWall</name>
<description>the UnpayWall working path</description>
</property>
<!-- ORCID Parameters -->
<property>
<name>inputPathOrcid</name>
<description>the ORCID working path</description>
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>${oozieActionShareLibForSpark2}</value>
</property>
</configuration>
</global>
<start to="resume_from"/>
<decision name="resume_from">
<switch>
<case to="ConvertCrossrefToOAF">${wf:conf('resumeFrom') eq 'ConvertCrossrefToOAF'}</case>
<case to="ResetMagWorkingPath">${wf:conf('resumeFrom') eq 'ResetMagWorkingPath'}</case>
<case to="ProcessMAG">${wf:conf('resumeFrom') eq 'PreprocessMag'}</case>
<case to="ProcessUW">${wf:conf('resumeFrom') eq 'PreprocessUW'}</case>
<case to="ProcessORCID">${wf:conf('resumeFrom') eq 'PreprocessORCID'}</case>
<case to="CreateDOIBoost">${wf:conf('resumeFrom') eq 'CreateDOIBoost'}</case>
<case to="GenerateActionSet">${wf:conf('resumeFrom') eq 'GenerateActionSet'}</case>
<default to="ImportCrossRef"/>
</switch>
</decision>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="ImportCrossRef">
<java>
<main-class>eu.dnetlib.doiboost.crossref.CrossrefImporter</main-class>
<arg>--targetPath</arg><arg>${inputPathCrossref}/index_update</arg>
<arg>--namenode</arg><arg>${nameNode}</arg>
<arg>--esServer</arg><arg>${esServer}</arg>
<arg>--esIndex</arg><arg>${esIndex}</arg>
<arg>--timestamp</arg><arg>${crossrefTimestamp}</arg>
</java>
<ok to="GenerateCrossrefDataset"/>
<error to="Kill"/>
</action>
<!-- CROSSREF SECTION -->
<action name="GenerateCrossrefDataset">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>GenerateCrossrefDataset</name>
<class>eu.dnetlib.doiboost.crossref.CrossrefDataset</class>
<jar>dhp-doiboost-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
${sparkExtraOPT}
</spark-opts>
<arg>--workingPath</arg><arg>${inputPathCrossref}</arg>
<arg>--master</arg><arg>yarn-cluster</arg>
</spark>
<ok to="RenameDataset"/>
<error to="Kill"/>
</action>
<action name="RenameDataset">
<fs>
<delete path="${inputPathCrossref}/crossref_ds"/>
<move source="${inputPathCrossref}/crossref_ds_updated"
target="${inputPathCrossref}/crossref_ds"/>
</fs>
<ok to="ConvertCrossrefToOAF"/>
<error to="Kill"/>
</action>
<action name="ConvertCrossrefToOAF">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>ConvertCrossrefToOAF</name>
<class>eu.dnetlib.doiboost.crossref.SparkMapDumpIntoOAF</class>
<jar>dhp-doiboost-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
${sparkExtraOPT}
</spark-opts>
<arg>--sourcePath</arg><arg>${inputPathCrossref}/crossref_ds</arg>
<arg>--targetPath</arg><arg>${workingPath}</arg>
<arg>--master</arg><arg>yarn-cluster</arg>
</spark>
<ok to="ResetMagWorkingPath"/>
<error to="Kill"/>
</action>
<!-- MAG SECTION -->
<action name="ResetMagWorkingPath">
<fs>
<delete path="${inputPathMAG}/dataset"/>
<delete path="${inputPathMAG}/process"/>
<delete path="${inputPathMAG}/dataset"/>
</fs>
<ok to="ConvertMagToDataset"/>
<error to="Kill"/>
</action>
<action name="ConvertMagToDataset">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Convert Mag to Dataset</name>
<class>eu.dnetlib.doiboost.mag.SparkImportMagIntoDataset</class>
<jar>dhp-doiboost-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
${sparkExtraOPT}
</spark-opts>
<arg>--sourcePath</arg><arg>${inputPathMAG}/input</arg>
<arg>--targetPath</arg><arg>${inputPathMAG}/dataset</arg>
<arg>--master</arg><arg>yarn-cluster</arg>
</spark>
<ok to="ProcessMAG"/>
<error to="Kill"/>
</action>
<action name="ProcessMAG">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Convert Mag to OAF Dataset</name>
<class>eu.dnetlib.doiboost.mag.SparkProcessMAG</class>
<jar>dhp-doiboost-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
${sparkExtraOPT}
</spark-opts>
<arg>--sourcePath</arg><arg>${inputPathMAG}/dataset</arg>
<arg>--workingPath</arg><arg>${inputPathMAG}/process</arg>
<arg>--targetPath</arg><arg>${workingPath}</arg>
<arg>--master</arg><arg>yarn-cluster</arg>
</spark>
<ok to="ProcessUW"/>
<error to="Kill"/>
</action>
<!-- UnpayWall SECTION -->
<action name="ProcessUW">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Convert UnpayWall to Dataset</name>
<class>eu.dnetlib.doiboost.uw.SparkMapUnpayWallToOAF</class>
<jar>dhp-doiboost-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
${sparkExtraOPT}
</spark-opts>
<arg>--sourcePath</arg><arg>${inputPathUnpayWall}/uw_extracted</arg>
<arg>--targetPath</arg><arg>${workingPath}</arg>
<arg>--master</arg><arg>yarn-cluster</arg>
</spark>
<ok to="ProcessORCID"/>
<error to="Kill"/>
</action>
<!-- ORCID SECTION -->
<action name="ProcessORCID">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Convert ORCID to Dataset</name>
<class>eu.dnetlib.doiboost.orcid.SparkConvertORCIDToOAF</class>
<jar>dhp-doiboost-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
${sparkExtraOPT}
</spark-opts>
<arg>--sourcePath</arg><arg>${inputPathOrcid}</arg>
<arg>--targetPath</arg><arg>${workingPath}</arg>
<arg>--master</arg><arg>yarn-cluster</arg>
</spark>
<ok to="CreateDOIBoost"/>
<error to="Kill"/>
</action>
<!-- INTERSECTION SECTION-->
<action name="CreateDOIBoost">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Create DOIBoost Infospace</name>
<class>eu.dnetlib.doiboost.SparkGenerateDoiBoost</class>
<jar>dhp-doiboost-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorIntersectionMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
${sparkExtraOPT}
</spark-opts>
<arg>--hostedByMapPath</arg><arg>${hostedByMapPath}</arg>
<arg>--affiliationPath</arg><arg>${inputPathMAG}/process/Affiliations</arg>
<arg>--paperAffiliationPath</arg><arg>${inputPathMAG}/process/PaperAuthorAffiliations</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--master</arg><arg>yarn-cluster</arg>
</spark>
<ok to="GenerateActionSet"/>
<error to="Kill"/>
</action>
<action name="GenerateActionSet">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Generate DOIBoost ActionSet</name>
<class>eu.dnetlib.doiboost.SparkGenerateDOIBoostActionSet</class>
<jar>dhp-doiboost-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
${sparkExtraOPT}
</spark-opts>
<arg>--dbPublicationPath</arg><arg>${workingPath}/doiBoostPublicationFiltered</arg>
<arg>--dbDatasetPath</arg><arg>${workingPath}/crossrefDataset</arg>
<arg>--crossRefRelation</arg><arg>${workingPath}/crossrefRelation</arg>
<arg>--dbaffiliationRelationPath</arg><arg>${workingPath}/doiBoostPublicationAffiliation</arg>
<arg>--dbOrganizationPath</arg><arg>${workingPath}/doiBoostOrganization</arg>
<arg>--targetPath</arg><arg>${workingPath}/actionDataSet</arg>
<arg>--sFilePath</arg><arg>${outputPath}</arg>
<arg>--master</arg><arg>yarn-cluster</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

@ -1,6 +1,9 @@
package eu.dnetlib.doiboost.orcid
import org.codehaus.jackson.map.ObjectMapper
import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.schema.oaf.Publication
import eu.dnetlib.doiboost.orcid.SparkConvertORCIDToOAF.getClass
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
import org.junit.jupiter.api.Assertions._
import org.junit.jupiter.api.Test
import org.slf4j.{Logger, LoggerFactory}
@ -21,6 +24,30 @@ class MappingORCIDToOAFTest {
})
}
// @Test
// def testOAFConvert():Unit ={
//
// val spark: SparkSession =
// SparkSession
// .builder()
// .appName(getClass.getSimpleName)
// .master("local[*]").getOrCreate()
//
//
// SparkConvertORCIDToOAF.run( spark,"/Users/sandro/Downloads/orcid", "/Users/sandro/Downloads/orcid_oaf")
// implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
//
// val df = spark.read.load("/Users/sandro/Downloads/orcid_oaf").as[Publication]
// println(df.first.getId)
// println(mapper.writeValueAsString(df.first()))
//
//
//
//
// }

@ -104,7 +104,7 @@ public class PrepareResultOrcidAssociationStep1 {
+ " LATERAL VIEW EXPLODE (author) a AS MyT "
+ " LATERAL VIEW EXPLODE (MyT.pid) p AS MyP "
+ " WHERE lower(MyP.qualifier.classid) = '" + ModelConstants.ORCID + "' or "
+ " lower(MyP.qalifier.classid) = '" + ModelConstants.ORCID_PENDING + "') tmp "
+ " lower(MyP.qualifier.classid) = '" + ModelConstants.ORCID_PENDING + "') tmp "
+ " GROUP BY id) r_t "
+ " JOIN ("
+ " SELECT source, target "

@ -108,7 +108,7 @@ public class SparkResultToCommunityFromOrganizationJob {
.stream()
.map(con -> con.getId())
.collect(Collectors.toList());
Result res = new Result();
R res = (R) ret.getClass().newInstance();
res.setId(ret.getId());
List<Context> propagatedContexts = new ArrayList<>();
for (String cId : communitySet) {

@ -130,7 +130,7 @@ public class SparkResultToCommunityThroughSemRelJob {
})
.filter(Objects::nonNull)
.collect(Collectors.toList());
Result r = new Result();
R r = (R) ret.getClass().newInstance();
r.setId(ret.getId());
r.setContext(contextList);
ret.mergeFrom(r);

@ -170,6 +170,7 @@ public class OrcidPropagationJobTest {
.filter(
"id = '50|dedup_wf_001::95b033c0c3961f6a1cdcd41a99a9632e' "
+ "and name = 'Vajinder' and surname = 'Kumar' and pidType = '" +
ModelConstants.ORCID_PENDING + "'")
.count());

@ -24,7 +24,6 @@ import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.orcidtoresultfromsemrel.OrcidPropagationJobTest;
import eu.dnetlib.dhp.schema.oaf.Dataset;
public class ResultToCommunityJobTest {
@ -66,7 +65,7 @@ public class ResultToCommunityJobTest {
}
@Test
public void test1() throws Exception {
public void testSparkResultToCommunityThroughSemRelJob() throws Exception {
SparkResultToCommunityThroughSemRelJob
.main(
new String[] {

@ -15,8 +15,11 @@ import eu.dnetlib.dhp.schema.oaf.*;
public class CleaningFunctions {
public static final String DOI_URL_PREFIX_REGEX = "(^http(s?):\\/\\/)(((dx\\.)?doi\\.org)|(handle\\.test\\.datacite\\.org))\\/";
public static final String ORCID_PREFIX_REGEX = "^http(s?):\\/\\/orcid\\.org\\/";
public static final String DOI_PREFIX_REGEX = "^10\\.";
public static final String ORCID_CLEANING_REGEX = ".*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9x]{4})";
public static final int ORCID_LEN = 19;
public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)";
public static final Set<String> PID_BLACKLIST = new HashSet<>();
@ -56,13 +59,19 @@ public class CleaningFunctions {
}
}
if (Objects.nonNull(r.getAuthor())) {
r.getAuthor().forEach(a -> {
if (Objects.nonNull(a.getPid())) {
a.getPid().forEach(p -> {
fixVocabName(p.getQualifier(), ModelConstants.DNET_PID_TYPES);
});
}
});
r
.getAuthor()
.stream()
.filter(Objects::nonNull)
.forEach(a -> {
if (Objects.nonNull(a.getPid())) {
a
.getPid()
.stream()
.filter(Objects::nonNull)
.forEach(p -> fixVocabName(p.getQualifier(), ModelConstants.DNET_PID_TYPES));
}
});
}
if (value instanceof Publication) {
@ -86,7 +95,7 @@ public class CleaningFunctions {
} else if (value instanceof Organization) {
Organization o = (Organization) value;
if (Objects.isNull(o.getCountry()) || StringUtils.isBlank(o.getCountry().getClassid())) {
o.setCountry(qualifier("UNKNOWN", "Unknown", ModelConstants.DNET_COUNTRY_TYPE));
o.setCountry(ModelConstants.UNKNOWN_COUNTRY);
}
} else if (value instanceof Relation) {
// nothing to clean here
@ -101,6 +110,16 @@ public class CleaningFunctions {
.setLanguage(
qualifier("und", "Undetermined", ModelConstants.DNET_LANGUAGES));
}
if (Objects.nonNull(r.getCountry())) {
r
.setCountry(
r
.getCountry()
.stream()
.filter(Objects::nonNull)
.filter(c -> StringUtils.isNotBlank(c.getClassid()))
.collect(Collectors.toList()));
}
if (Objects.nonNull(r.getSubject())) {
r
.setSubject(
@ -153,12 +172,14 @@ public class CleaningFunctions {
if (Objects.isNull(r.getResourcetype()) || StringUtils.isBlank(r.getResourcetype().getClassid())) {
r
.setResourcetype(
qualifier("UNKNOWN", "Unknown", ModelConstants.DNET_DATA_CITE_RESOURCE));
qualifier(ModelConstants.UNKNOWN, "Unknown", ModelConstants.DNET_DATA_CITE_RESOURCE));
}
if (Objects.nonNull(r.getInstance())) {
for (Instance i : r.getInstance()) {
if (Objects.isNull(i.getAccessright()) || StringUtils.isBlank(i.getAccessright().getClassid())) {
i.setAccessright(qualifier("UNKNOWN", "not available", ModelConstants.DNET_ACCESS_MODES));
i
.setAccessright(
qualifier(ModelConstants.UNKNOWN, "not available", ModelConstants.DNET_ACCESS_MODES));
}
if (Objects.isNull(i.getHostedby()) || StringUtils.isBlank(i.getHostedby().getKey())) {
i.setHostedby(ModelConstants.UNKNOWN_REPOSITORY);
@ -173,12 +194,22 @@ public class CleaningFunctions {
if (Objects.isNull(bestaccessrights)) {
r
.setBestaccessright(
qualifier("UNKNOWN", "not available", ModelConstants.DNET_ACCESS_MODES));
qualifier(ModelConstants.UNKNOWN, "not available", ModelConstants.DNET_ACCESS_MODES));
} else {
r.setBestaccessright(bestaccessrights);
}
}
if (Objects.nonNull(r.getAuthor())) {
r
.setAuthor(
r
.getAuthor()
.stream()
.filter(a -> Objects.nonNull(a))
.filter(a -> StringUtils.isNotBlank(a.getFullname()))
.filter(a -> StringUtils.isNotBlank(a.getFullname().replaceAll("[\\W]", "")))
.collect(Collectors.toList()));
boolean nullRank = r
.getAuthor()
.stream()
@ -199,6 +230,7 @@ public class CleaningFunctions {
a
.getPid()
.stream()
.filter(Objects::nonNull)
.filter(p -> Objects.nonNull(p.getQualifier()))
.filter(p -> StringUtils.isNotBlank(p.getValue()))
.map(p -> {
@ -211,14 +243,31 @@ public class CleaningFunctions {
.map(Qualifier::getClassid)
.orElse(""))
.orElse("");
if (pidProvenance.equals(ModelConstants.SYSIMPORT_CROSSWALK_ENTITYREGISTRY)) {
p.getQualifier().setClassid(ModelConstants.ORCID);
} else {
p.getQualifier().setClassid(ModelConstants.ORCID_PENDING);
if (p
.getQualifier()
.getClassid()
.toLowerCase()
.contains(ModelConstants.ORCID)) {
if (pidProvenance
.equals(ModelConstants.SYSIMPORT_CROSSWALK_ENTITYREGISTRY)) {
p.getQualifier().setClassid(ModelConstants.ORCID);
} else {
p.getQualifier().setClassid(ModelConstants.ORCID_PENDING);
}
final String orcid = p
.getValue()
.trim()
.toLowerCase()
.replaceAll(ORCID_CLEANING_REGEX, "$1-$2-$3-$4");
if (orcid.length() == ORCID_LEN) {
p.setValue(orcid);
} else {
p.setValue("");
}
}
p.setValue(p.getValue().trim().replaceAll(ORCID_PREFIX_REGEX, ""));
return p;
})
.filter(p -> StringUtils.isNotBlank(p.getValue()))
.collect(
Collectors
.toMap(
@ -286,7 +335,7 @@ public class CleaningFunctions {
// TODO add cleaning for more PID types as needed
case "doi":
pid.setValue(value.toLowerCase().replaceAll(DOI_URL_PREFIX_REGEX, ""));
pid.setValue(value.toLowerCase().replaceAll(DOI_PREFIX_REGEX, "10."));
break;
}
return pid;

@ -21,6 +21,7 @@ import org.apache.spark.sql.expressions.Aggregator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.jayway.jsonpath.Configuration;
import com.jayway.jsonpath.DocumentContext;
@ -44,7 +45,8 @@ public class GroupEntitiesAndRelationsSparkJob {
private final static String SOURCE_JPATH = "$.source";
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper()
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
public static void main(String[] args) throws Exception {

@ -40,4 +40,18 @@ public class Constants {
coarCodeLabelMap.put("c_14cb", "CLOSED");
coarCodeLabelMap.put("c_f1cf", "EMBARGO");
}
public enum DUMPTYPE {
COMPLETE("complete"), COMMUNITY("community"), FUNDER("funder");
private String type;
DUMPTYPE(String type) {
this.type = type;
}
public String getType() {
return type;
}
}
}

@ -11,17 +11,12 @@ import java.util.Set;
import java.util.stream.Collectors;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import eu.dnetlib.dhp.oa.graph.dump.ResultMapper;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap;
import eu.dnetlib.dhp.schema.dump.oaf.graph.ResearchInitiative;
import eu.dnetlib.dhp.schema.oaf.*;
/**
@ -33,7 +28,7 @@ public class DumpProducts implements Serializable {
public void run(Boolean isSparkSessionManaged, String inputPath, String outputPath, String communityMapPath,
Class<? extends OafEntity> inputClazz,
Class<? extends eu.dnetlib.dhp.schema.dump.oaf.Result> outputClazz,
boolean graph) {
String dumpType) {
SparkConf conf = new SparkConf();
@ -42,7 +37,7 @@ public class DumpProducts implements Serializable {
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, outputPath);
execDump(spark, inputPath, outputPath, communityMapPath, inputClazz, outputClazz, graph);
execDump(spark, inputPath, outputPath, communityMapPath, inputClazz, outputClazz, dumpType);
});
}
@ -53,13 +48,13 @@ public class DumpProducts implements Serializable {
String communityMapPath,
Class<I> inputClazz,
Class<O> outputClazz,
boolean graph) {
String dumpType) {
CommunityMap communityMap = Utils.getCommunityMap(spark, communityMapPath);
Utils
.readPath(spark, inputPath, inputClazz)
.map((MapFunction<I, O>) value -> execMap(value, communityMap, graph), Encoders.bean(outputClazz))
.map((MapFunction<I, O>) value -> execMap(value, communityMap, dumpType), Encoders.bean(outputClazz))
.filter(Objects::nonNull)
.write()
.mode(SaveMode.Overwrite)
@ -70,18 +65,18 @@ public class DumpProducts implements Serializable {
private static <I extends OafEntity, O extends eu.dnetlib.dhp.schema.dump.oaf.Result> O execMap(I value,
CommunityMap communityMap,
boolean graph) {
String dumpType) {
Optional<DataInfo> odInfo = Optional.ofNullable(value.getDataInfo());
if (odInfo.isPresent()) {
if (odInfo.get().getDeletedbyinference()) {
if (odInfo.get().getDeletedbyinference() || odInfo.get().getInvisible()) {
return null;
}
} else {
return null;
}
if (!graph) {
if (Constants.DUMPTYPE.COMMUNITY.getType().equals(dumpType)) {
Set<String> communities = communityMap.keySet();
Optional<List<Context>> inputContext = Optional
@ -102,7 +97,8 @@ public class DumpProducts implements Serializable {
return null;
}
}
return (O) ResultMapper.map(value, communityMap, graph);
return (O) ResultMapper.map(value, communityMap, dumpType);
}
}

@ -21,10 +21,10 @@ import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class ResultMapper implements Serializable {
public static <E extends eu.dnetlib.dhp.schema.oaf.OafEntity> Result map(
E in, Map<String, String> communityMap, boolean graph) {
E in, Map<String, String> communityMap, String dumpType) {
Result out;
if (graph) {
if (Constants.DUMPTYPE.COMPLETE.getType().equals(dumpType)) {
out = new GraphResult();
} else {
out = new CommunityResult();
@ -217,7 +217,7 @@ public class ResultMapper implements Serializable {
.ofNullable(input.getInstance());
if (oInst.isPresent()) {
if (graph) {
if (Constants.DUMPTYPE.COMPLETE.getType().equals(dumpType)) {
((GraphResult) out)
.setInstance(oInst.get().stream().map(i -> getGraphInstance(i)).collect(Collectors.toList()));
} else {
@ -296,7 +296,7 @@ public class ResultMapper implements Serializable {
out.setType(input.getResulttype().getClassid());
}
if (!graph) {
if (!Constants.DUMPTYPE.COMPLETE.getType().equals(dumpType)) {
((CommunityResult) out)
.setCollectedfrom(
input

@ -9,6 +9,7 @@ import java.util.Set;
import java.util.stream.Collectors;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
@ -54,7 +55,7 @@ public class CommunitySplit implements Serializable {
private static void printResult(String c, Dataset<CommunityResult> result, String outputPath) {
Dataset<CommunityResult> community_products = result
.filter(r -> containsCommunity(r, c));
.filter((FilterFunction<CommunityResult>) r -> containsCommunity(r, c));
try {
community_products.first();

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save