Fixed issues encountered while running on the cluster

This commit is contained in:
Miriam Baglioni 2020-06-15 11:12:14 +02:00
parent 56e70573c2
commit f96ca900e1
10 changed files with 867 additions and 565 deletions

View File

@ -1,7 +1,8 @@
package eu.dnetlib.dhp.oa.graph.dump; package eu.dnetlib.dhp.oa.graph.dump;
import java.io.Serializable;
import java.util.HashMap; import java.util.HashMap;
public class CommunityMap extends HashMap<String, String> { public class CommunityMap extends HashMap<String, String> implements Serializable {
} }

View File

@ -1,18 +1,29 @@
package eu.dnetlib.dhp.oa.graph.dump;
import com.google.common.collect.Maps; package eu.dnetlib.dhp.oa.graph.dump;
import java.util.Map; import java.util.Map;
import com.google.common.collect.Maps;
public class Constants { public class Constants {
public static final Map<String, String> accessRightsCoarMap = Maps.newHashMap(); public static final Map<String, String> accessRightsCoarMap = Maps.newHashMap();
public static final Map<String, String> coarCodeLabelMap = Maps.newHashMap();
static { public static String COAR_ACCESS_RIGHT_SCHEMA = "http://vocabularies.coar-repositories.org/documentation/access_rights/";
accessRightsCoarMap.put("OPEN", "http://purl.org/coar/access_right/c_abf2");
accessRightsCoarMap.put("RESTRICTED", "http://purl.org/coar/access_right/c_16ec"); static {
accessRightsCoarMap.put("OPEN SOURCE", "http://purl.org/coar/access_right/c_abf2"); accessRightsCoarMap.put("OPEN", "c_abf2");
accessRightsCoarMap.put("CLOSED", "http://purl.org/coar/access_right/c_14cb //metadataonly for coar"); accessRightsCoarMap.put("RESTRICTED", "c_16ec");
accessRightsCoarMap.put("EMBARGO", "http://purl.org/coar/access_right/c_f1cf"); accessRightsCoarMap.put("OPEN SOURCE", "c_abf2");
} accessRightsCoarMap.put("CLOSED", "c_14cb");
accessRightsCoarMap.put("EMBARGO", "c_f1cf");
}
static {
coarCodeLabelMap.put("c_abf2", "OPEN");
coarCodeLabelMap.put("c_16ec", "RESTRICTED");
coarCodeLabelMap.put("c_14cb", "CLOSED");
coarCodeLabelMap.put("c_f1cf", "EMBARGO");
}
} }

View File

@ -1,223 +1,407 @@
package eu.dnetlib.dhp.oa.graph.dump;
import eu.dnetlib.dhp.schema.dump.oaf.*; package eu.dnetlib.dhp.oa.graph.dump;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Journal;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import scala.collection.immutable.Stream;
import java.io.Serializable; import java.io.Serializable;
import java.util.*; import java.util.*;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import javax.swing.text.html.Option;
import org.apache.avro.generic.GenericData;
import eu.dnetlib.dhp.schema.dump.oaf.*;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.Journal;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class Mapper implements Serializable { public class Mapper implements Serializable {
public static <I extends eu.dnetlib.dhp.schema.oaf.Result, O extends eu.dnetlib.dhp.schema.dump.oaf.Result> O map( public static <I extends eu.dnetlib.dhp.schema.oaf.Result, O extends eu.dnetlib.dhp.schema.dump.oaf.Result> O map(
I input, Map<String,String> communityMap){ I input, Map<String, String> communityMap) {
O out = null; O out = null;
switch (input.getResulttype().getClassid()){ Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> ort = Optional.ofNullable(input.getResulttype());
case "publication": if (ort.isPresent()) {
out = (O)new Publication(); switch (ort.get().getClassid()) {
Optional<Journal> journal = Optional.ofNullable(((eu.dnetlib.dhp.schema.oaf.Publication) input).getJournal()); case "publication":
if(journal.isPresent()){ out = (O) new Publication();
Journal j = journal.get(); Optional<Journal> journal = Optional
Container c = new Container(); .ofNullable(((eu.dnetlib.dhp.schema.oaf.Publication) input).getJournal());
c.setConferencedate(j.getConferencedate()); if (journal.isPresent()) {
c.setConferenceplace(j.getConferenceplace()); Journal j = journal.get();
c.setEdition(j.getEdition()); Container c = new Container();
c.setEp(j.getEp()); c.setConferencedate(j.getConferencedate());
c.setIss(j.getIss()); c.setConferenceplace(j.getConferenceplace());
c.setIssnLinking(j.getIssnLinking()); c.setEdition(j.getEdition());
c.setIssnOnline(j.getIssnOnline()); c.setEp(j.getEp());
c.setIssnPrinted(j.getIssnPrinted()); c.setIss(j.getIss());
c.setName(j.getName()); c.setIssnLinking(j.getIssnLinking());
c.setSp(j.getSp()); c.setIssnOnline(j.getIssnOnline());
c.setVol(j.getVol()); c.setIssnPrinted(j.getIssnPrinted());
out.setContainer(c); c.setName(j.getName());
} c.setSp(j.getSp());
break; c.setVol(j.getVol());
case "dataset": out.setContainer(c);
Dataset d = new Dataset(); }
eu.dnetlib.dhp.schema.oaf.Dataset id = (eu.dnetlib.dhp.schema.oaf.Dataset)input; break;
d.setSize(id.getSize().getValue()); case "dataset":
d.setVersion(id.getVersion().getValue()); Dataset d = new Dataset();
eu.dnetlib.dhp.schema.oaf.Dataset id = (eu.dnetlib.dhp.schema.oaf.Dataset) input;
Optional.ofNullable(id.getSize()).ifPresent(v -> d.setSize(v.getValue()));
Optional.ofNullable(id.getVersion()).ifPresent(v -> d.setVersion(v.getValue()));
List<eu.dnetlib.dhp.schema.oaf.GeoLocation> igl = id.getGeolocation(); d
d.setGeolocation(igl.stream() .setGeolocation(
.filter(Objects::nonNull) Optional
.map(gli -> { .ofNullable(id.getGeolocation())
GeoLocation gl = new GeoLocation(); .map(
gl.setBox(gli.getBox()); igl -> igl
gl.setPlace(gli.getPlace()); .stream()
gl.setPoint(gli.getPoint()); .filter(Objects::nonNull)
return gl; .map(gli -> {
}).collect(Collectors.toList())); GeoLocation gl = new GeoLocation();
out = (O)d; gl.setBox(gli.getBox());
gl.setPlace(gli.getPlace());
gl.setPoint(gli.getPoint());
return gl;
})
.collect(Collectors.toList()))
.orElse(null));
break; out = (O) d;
case "software":
Software s = new Software();
eu.dnetlib.dhp.schema.oaf.Software is = (eu.dnetlib.dhp.schema.oaf.Software)input;
s.setCodeRepositoryUrl(is.getCodeRepositoryUrl().getValue());
s.setDocumentationUrl(is.getDocumentationUrl()
.stream()
.map(du -> du.getValue()).collect(Collectors.toList()));
s.setProgrammingLanguage(is.getProgrammingLanguage().getClassid());
out = (O) s; break;
break; case "software":
case "otherresearchproduct": Software s = new Software();
OtherResearchProduct or = new OtherResearchProduct(); eu.dnetlib.dhp.schema.oaf.Software is = (eu.dnetlib.dhp.schema.oaf.Software) input;
eu.dnetlib.dhp.schema.oaf.OtherResearchProduct ir = (eu.dnetlib.dhp.schema.oaf.OtherResearchProduct)input; Optional
or.setContactgroup(ir.getContactgroup().stream().map(cg -> cg.getValue()).collect(Collectors.toList())); .ofNullable(is.getCodeRepositoryUrl())
or.setContactperson(ir.getContactperson().stream().map(cp->cp.getValue()).collect(Collectors.toList())); .ifPresent(value -> s.setCodeRepositoryUrl(value.getValue()));
or.setTool(ir.getTool().stream().map(t -> t.getValue()).collect(Collectors.toList())); Optional
out = (O) or; .ofNullable(is.getDocumentationUrl())
break; .ifPresent(
} value -> s
out.setAuthor(input.getAuthor() .setDocumentationUrl(
.stream() value
.map(oa -> { .stream()
Author a = new Author(); .map(v -> v.getValue())
a.setAffiliation(oa.getAffiliation().stream().map(aff -> aff.getValue()).collect(Collectors.toList())); .collect(Collectors.toList())));
a.setFullname(oa.getFullname());
a.setName(oa.getName());
a.setSurname(oa.getSurname());
a.setRank(oa.getRank());
a.setPid(oa.getPid().stream().map(p -> {
ControlledField cf = new ControlledField();
cf.setScheme( p.getQualifier().getClassid());
cf.setValue( p.getValue());
return cf;
}).collect(Collectors.toList()));
return a;
}).collect(Collectors.toList()));
//I do not map Access Right UNKNOWN or OTHER
if (Constants.accessRightsCoarMap.containsKey(input.getBestaccessright().getClassid())){
AccessRight ar = new AccessRight();
ar.setSchema(Constants.accessRightsCoarMap.get(input.getBestaccessright().getClassid()));
ar.setCode(input.getBestaccessright().getClassid());
ar.setLabel(input.getBestaccessright().getClassname());
out.setBestaccessright(ar);
}
out.setCollectedfrom(input.getCollectedfrom().stream().map(cf -> KeyValue.newInstance(cf.getKey(), cf.getValue())) Optional
.collect(Collectors.toList())); .ofNullable(is.getProgrammingLanguage())
.ifPresent(value -> s.setProgrammingLanguage(value.getClassid()));
Set<String> communities = communityMap.keySet(); out = (O) s;
List<Context> contextList = input.getContext() break;
.stream() case "other":
.map(c -> { OtherResearchProduct or = new OtherResearchProduct();
if(communities.contains(c.getId())){ eu.dnetlib.dhp.schema.oaf.OtherResearchProduct ir = (eu.dnetlib.dhp.schema.oaf.OtherResearchProduct) input;
Context context = new Context(); or
context.setCode(c.getId()); .setContactgroup(
context.setLabel(communityMap.get(c.getId())); Optional
Optional<List<DataInfo>> dataInfo = Optional.ofNullable(c.getDataInfo()); .ofNullable(ir.getContactgroup())
if(dataInfo.isPresent()){ .map(value -> value.stream().map(cg -> cg.getValue()).collect(Collectors.toList()))
context.setProvenance(dataInfo.get().stream() .orElse(null));
.map(di -> {
if (di.getInferred()){
return di.getProvenanceaction().getClassid();
}
return null;
}).filter(Objects::nonNull)
.collect(Collectors.toList()));
}
return context;
}
return null;
}
).filter(Objects::nonNull)
.collect(Collectors.toList());
if(contextList.size() > 0){
out.setContext(contextList);
}
out.setContributor(input.getContributor()
.stream()
.map(c -> c.getValue()).collect(Collectors.toList()));
out.setCountry(input.getCountry()
.stream()
.map(c -> {
Country country = new Country();
country.setCode(c.getClassid());
country.setLabel(c.getClassname());
Optional<DataInfo> dataInfo = Optional.ofNullable(c.getDataInfo());
if(dataInfo.isPresent()){
country.setProvenance(dataInfo.get().getProvenanceaction().getClassid());
}
return country;
}).collect(Collectors.toList()));
out.setCoverage(input.getCoverage().stream().map(c->c.getValue()).collect(Collectors.toList()));
out.setDateofcollection(input.getDateofcollection()); or
out.setDescription(input.getDescription().stream().map(d->d.getValue()).collect(Collectors.toList())); .setContactperson(
out.setEmbargoenddate(input.getEmbargoenddate().getValue()); Optional
out.setFormat(input.getFormat().stream().map(f->f.getValue()).collect(Collectors.toList())); .ofNullable(ir.getContactperson())
out.setId(input.getId()); .map(value -> value.stream().map(cp -> cp.getValue()).collect(Collectors.toList()))
out.setOriginalId(input.getOriginalId()); .orElse(null));
out.setInstance(input.getInstance() or
.stream() .setTool(
.map(i -> { Optional
Instance instance = new Instance(); .ofNullable(ir.getTool())
AccessRight ar = new AccessRight(); .map(value -> value.stream().map(t -> t.getValue()).collect(Collectors.toList()))
ar.setCode(i.getAccessright().getClassid()); .orElse(null));
ar.setLabel(i.getAccessright().getClassname()); out = (O) or;
if(Constants.accessRightsCoarMap.containsKey(i.getAccessright().getClassid())){ break;
ar.setSchema(Constants.accessRightsCoarMap.get(i.getAccessright().getClassid())); }
} Optional<List<eu.dnetlib.dhp.schema.oaf.Author>> oAuthor = Optional.ofNullable(input.getAuthor());
instance.setAccessright(ar); if (oAuthor.isPresent()) {
instance.setCollectedfrom(KeyValue.newInstance(i.getCollectedfrom().getKey(), i.getCollectedfrom().getValue())); // List<eu.dnetlib.dhp.schema.dump.oaf.Author> authorList = new ArrayList<>();
instance.setHostedby(KeyValue.newInstance(i.getHostedby().getKey(),i.getHostedby().getValue())); out
instance.setLicense(i.getLicense().getValue()); .setAuthor(
instance.setPublicationdata(i.getDateofacceptance().getValue()); oAuthor
instance.setRefereed(i.getRefereed().getValue()); .get()
instance.setType(i.getInstancetype().getClassid()); .stream()
instance.setUrl(i.getUrl()); .map(oa -> getAuthor(oa))
return instance; .collect(Collectors.toList()));
}).collect(Collectors.toList())); }
out.setLanguage(Qualifier.newInstance(input.getLanguage().getClassid(), input.getLanguage().getClassname())); // I do not map Access Right UNKNOWN or OTHER
out.setLastupdatetimestamp(input.getLastupdatetimestamp());
Optional<List<StructuredProperty>> otitle = Optional.ofNullable(input.getTitle()); Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> oar = Optional.ofNullable(input.getBestaccessright());
if(otitle.isPresent()){ if (oar.isPresent()) {
List<StructuredProperty> iTitle = otitle.get() if (Constants.accessRightsCoarMap.containsKey(oar.get().getClassid())) {
.stream() String code = Constants.accessRightsCoarMap.get(oar.get().getClassid());
.filter(t -> t.getQualifier().getClassid().equalsIgnoreCase("main title")) out
.collect(Collectors.toList()); .setBestaccessright(
if(iTitle.size() > 0 ){ AccessRight
out.setMaintitle(iTitle.get(0).getValue()); .newInstance(
} code,
Constants.coarCodeLabelMap.get(code),
Constants.COAR_ACCESS_RIGHT_SCHEMA));
}
}
iTitle = otitle.get() out
.stream() .setCollectedfrom(
.filter(t -> t.getQualifier().getClassid().equalsIgnoreCase("subtitle")) input
.collect(Collectors.toList()); .getCollectedfrom()
if(iTitle.size() > 0){ .stream()
out.setSubtitle(iTitle.get(0).getValue()); .map(cf -> KeyValue.newInstance(cf.getKey(), cf.getValue()))
} .collect(Collectors.toList()));
} Set<String> communities = communityMap.keySet();
List<Context> contextList = input
.getContext()
.stream()
.map(c -> {
if (communities.contains(c.getId())
|| communities.contains(c.getId().substring(0, c.getId().indexOf("::")))) {
Context context = new Context();
if (!communityMap.containsKey(c.getId())) {
context.setCode(c.getId().substring(0, c.getId().indexOf("::")));
context.setLabel(communityMap.get(context.getCode()));
} else {
context.setCode(c.getId());
context.setLabel(communityMap.get(c.getId()));
}
Optional<List<DataInfo>> dataInfo = Optional.ofNullable(c.getDataInfo());
if (dataInfo.isPresent()) {
List<String> provenance = new ArrayList<>();
provenance
.addAll(
dataInfo
.get()
.stream()
.map(di -> {
if (di.getInferred()) {
return di.getProvenanceaction().getClassname();
}
return null;
})
.filter(Objects::nonNull)
.collect(Collectors.toSet()));
context.setProvenance(provenance);
}
return context;
}
return null;
})
.filter(Objects::nonNull)
.collect(Collectors.toList());
if (contextList.size() > 0) {
out.setContext(contextList);
}
final List<String> contributorList = new ArrayList<>();
Optional
.ofNullable(input.getContributor())
.ifPresent(value -> value.stream().forEach(c -> contributorList.add(c.getValue())));
out.setContributor(contributorList);
out.setPid(input.getPid().stream().map(p -> { List<Country> countryList = new ArrayList<>();
ControlledField pid = new ControlledField(); Optional
pid.setScheme(p.getQualifier().getClassid()); .ofNullable(input.getCountry())
pid.setValue(p.getValue()); .ifPresent(
return pid; value -> value
}).collect(Collectors.toList())); .stream()
out.setPublicationdata(input.getDateofacceptance().getValue()); .forEach(
out.setPublisher(input.getPublisher().getValue()); c -> {
out.setSource(input.getSource().stream().map(s -> s.getValue()).collect(Collectors.toList())); Country country = new Country();
out.setSubject(input.getSubject().stream().map(s->{ country.setCode(c.getClassid());
ControlledField subject = new ControlledField(); country.setLabel(c.getClassname());
subject.setScheme(s.getQualifier().getClassid()); Optional
subject.setValue(s.getValue()); .ofNullable(c.getDataInfo())
return subject; .ifPresent(
}).collect(Collectors.toList())); provenance -> country
out.setType(input.getResulttype().getClassid()); .setProvenance(
provenance
.getProvenanceaction()
.getClassname()));
countryList
.add(country);
}));
return out; out.setCountry(countryList);
}
final List<String> coverageList = new ArrayList<>();
Optional
.ofNullable(input.getCoverage())
.ifPresent(value -> value.stream().forEach(c -> coverageList.add(c.getValue())));
out.setCoverage(coverageList);
out.setDateofcollection(input.getDateofcollection());
final List<String> descriptionList = new ArrayList<>();
Optional
.ofNullable(input.getDescription())
.ifPresent(value -> value.stream().forEach(d -> descriptionList.add(d.getValue())));
out.setDescription(descriptionList);
Optional<Field<String>> oStr = Optional.ofNullable(input.getEmbargoenddate());
if (oStr.isPresent()) {
out.setEmbargoenddate(oStr.get().getValue());
}
final List<String> formatList = new ArrayList<>();
Optional
.ofNullable(input.getFormat())
.ifPresent(value -> value.stream().forEach(f -> formatList.add(f.getValue())));
out.setFormat(formatList);
out.setId(input.getId());
out.setOriginalId(input.getOriginalId());
final List<Instance> instanceList = new ArrayList<>();
Optional
.ofNullable(input.getInstance())
.ifPresent(
inst -> inst
.stream()
.forEach(i -> {
Instance instance = new Instance();
Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> opAr = Optional
.ofNullable(i.getAccessright());
if (opAr.isPresent()) {
if (Constants.accessRightsCoarMap.containsKey(opAr.get().getClassid())) {
String code = Constants.accessRightsCoarMap.get(opAr.get().getClassid());
instance
.setAccessright(
AccessRight
.newInstance(
code,
Constants.coarCodeLabelMap.get(code),
Constants.COAR_ACCESS_RIGHT_SCHEMA));
}
}
instance
.setCollectedfrom(
KeyValue
.newInstance(i.getCollectedfrom().getKey(), i.getCollectedfrom().getValue()));
instance
.setHostedby(
KeyValue.newInstance(i.getHostedby().getKey(), i.getHostedby().getValue()));
Optional
.ofNullable(i.getLicense())
.ifPresent(value -> instance.setLicense(value.getValue()));
Optional
.ofNullable(i.getDateofacceptance())
.ifPresent(value -> instance.setPublicationdate(value.getValue()));
Optional
.ofNullable(i.getRefereed())
.ifPresent(value -> instance.setRefereed(value.getValue()));
Optional
.ofNullable(i.getInstancetype())
.ifPresent(value -> instance.setType(value.getClassname()));
Optional.ofNullable(i.getUrl()).ifPresent(value -> instance.setUrl(value));
instanceList.add(instance);
}));
out
.setInstance(instanceList);
Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> oL = Optional.ofNullable(input.getLanguage());
if (oL.isPresent()) {
eu.dnetlib.dhp.schema.oaf.Qualifier language = oL.get();
out.setLanguage(Qualifier.newInstance(language.getClassid(), language.getClassname()));
}
Optional<Long> oLong = Optional.ofNullable(input.getLastupdatetimestamp());
if (oLong.isPresent()) {
out.setLastupdatetimestamp(oLong.get());
}
Optional<List<StructuredProperty>> otitle = Optional.ofNullable(input.getTitle());
if (otitle.isPresent()) {
List<StructuredProperty> iTitle = otitle
.get()
.stream()
.filter(t -> t.getQualifier().getClassid().equalsIgnoreCase("main title"))
.collect(Collectors.toList());
if (iTitle.size() > 0) {
out.setMaintitle(iTitle.get(0).getValue());
}
iTitle = otitle
.get()
.stream()
.filter(t -> t.getQualifier().getClassid().equalsIgnoreCase("subtitle"))
.collect(Collectors.toList());
if (iTitle.size() > 0) {
out.setSubtitle(iTitle.get(0).getValue());
}
}
List<ControlledField> pids = new ArrayList<>();
Optional
.ofNullable(input.getPid())
.ifPresent(
value -> value
.stream()
.forEach(
p -> pids
.add(
ControlledField
.newInstance(p.getQualifier().getClassid(), p.getValue()))));
out.setPid(pids);
oStr = Optional.ofNullable(input.getDateofacceptance());
if (oStr.isPresent()) {
out.setPublicationdate(oStr.get().getValue());
}
oStr = Optional.ofNullable(input.getPublisher());
if (oStr.isPresent()) {
out.setPublisher(oStr.get().getValue());
}
List<String> sourceList = new ArrayList<>();
Optional
.ofNullable(input.getSource())
.ifPresent(value -> value.stream().forEach(s -> sourceList.add(s.getValue())));
// out.setSource(input.getSource().stream().map(s -> s.getValue()).collect(Collectors.toList()));
List<ControlledField> subjectList = new ArrayList<>();
Optional
.ofNullable(input.getSubject())
.ifPresent(
value -> value
.stream()
.forEach(
s -> subjectList
.add(ControlledField.newInstance(s.getQualifier().getClassid(), s.getValue()))));
out.setSubject(subjectList);
out.setType(input.getResulttype().getClassid());
}
return out;
}
private static Author getAuthor(eu.dnetlib.dhp.schema.oaf.Author oa) {
Author a = new Author();
Optional
.ofNullable(oa.getAffiliation())
.ifPresent(
value -> a
.setAffiliation(
value
.stream()
.map(aff -> aff.getValue())
.collect(Collectors.toList())));
a.setFullname(oa.getFullname());
a.setName(oa.getName());
a.setSurname(oa.getSurname());
a.setRank(oa.getRank());
Optional
.ofNullable(oa.getPid())
.ifPresent(
value -> a
.setPid(
value
.stream()
.map(p -> ControlledField.newInstance(p.getQualifier().getClassid(), p.getValue()))
.collect(Collectors.toList())));
return a;
}
} }

View File

@ -1,34 +1,32 @@
package eu.dnetlib.dhp.oa.graph.dump; package eu.dnetlib.dhp.oa.graph.dump;
import eu.dnetlib.dhp.utils.ISLookupClientFactory; import java.io.StringReader;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import java.util.List;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import org.dom4j.Document; import org.dom4j.Document;
import org.dom4j.DocumentException; import org.dom4j.DocumentException;
import org.dom4j.Element; import org.dom4j.Element;
import org.dom4j.io.SAXReader; import org.dom4j.io.SAXReader;
import java.io.StringReader; import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import java.util.HashMap; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import java.util.List; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import java.util.Map;
public class QueryInformationSystem { public class QueryInformationSystem {
private ISLookUpService isLookUp; private ISLookUpService isLookUp;
private static final String XQUERY = "for $x in collection('/db/DRIVER/ContextDSResources/ContextDSResourceType') " + private static final String XQUERY = "for $x in collection('/db/DRIVER/ContextDSResources/ContextDSResourceType') "
" where $x//CONFIGURATION/context[./@type='community' or ./@type='ri'] " + +
" return " + " where $x//CONFIGURATION/context[./@type='community' or ./@type='ri'] " +
"<community> " + " return " +
"{$x//CONFIGURATION/context/@id}" + "<community> " +
"{$x//CONFIGURATION/context/@label}" + "{$x//CONFIGURATION/context/@id}" +
"</community>"; "{$x//CONFIGURATION/context/@label}" +
"</community>";
public CommunityMap getCommunityMap()
public Map<String,String> getCommunityMap()
throws ISLookUpException { throws ISLookUpException {
return getMap(isLookUp.quickSearchProfile(XQUERY)); return getMap(isLookUp.quickSearchProfile(XQUERY));
@ -42,12 +40,8 @@ public class QueryInformationSystem {
this.isLookUp = isLookUpService; this.isLookUp = isLookUpService;
} }
public void set(String isLookUpUrl){ public static CommunityMap getMap(List<String> communityMap) {
isLookUpUrl = get(isLookUpUrl); final CommunityMap map = new CommunityMap();
}
public ISLookUpService
private static Map<String, String> getMap(List<String> communityMap) {
final Map<String, String> map = new HashMap<>();
communityMap.stream().forEach(xml -> { communityMap.stream().forEach(xml -> {
final Document doc; final Document doc;
@ -59,7 +53,6 @@ public void set(String isLookUpUrl){
e.printStackTrace(); e.printStackTrace();
} }
}); });
return map; return map;

View File

@ -1,27 +1,28 @@
package eu.dnetlib.dhp.oa.graph.dump;
import eu.dnetlib.dhp.schema.dump.oaf.Projects; package eu.dnetlib.dhp.oa.graph.dump;
import java.io.Serializable; import java.io.Serializable;
import java.util.List; import java.util.List;
import eu.dnetlib.dhp.schema.dump.oaf.Projects;
public class ResultProject implements Serializable { public class ResultProject implements Serializable {
private String resultId; private String resultId;
private List<Projects> projectsList; private List<Projects> projectsList;
public String getResultId() { public String getResultId() {
return resultId; return resultId;
} }
public void setResultId(String resultId) { public void setResultId(String resultId) {
this.resultId = resultId; this.resultId = resultId;
} }
public List<Projects> getProjectsList() { public List<Projects> getProjectsList() {
return projectsList; return projectsList;
} }
public void setProjectsList(List<Projects> projectsList) { public void setProjectsList(List<Projects> projectsList) {
this.projectsList = projectsList; this.projectsList = projectsList;
} }
} }

View File

@ -1,3 +1,4 @@
package eu.dnetlib.dhp.oa.graph.dump; package eu.dnetlib.dhp.oa.graph.dump;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
@ -6,10 +7,8 @@ import java.io.Serializable;
import java.util.*; import java.util.*;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import eu.dnetlib.dhp.schema.oaf.Context; import javax.management.Query;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Dataset;
@ -19,127 +18,129 @@ import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.google.gson.Gson;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Context;
import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import javax.management.Query; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
public class SparkDumpCommunityProducts implements Serializable { public class SparkDumpCommunityProducts implements Serializable {
private static final Logger log = LoggerFactory.getLogger(SparkDumpCommunityProducts.class); private static final Logger log = LoggerFactory.getLogger(SparkDumpCommunityProducts.class);
private QueryInformationSystem queryInformationSystem; private static QueryInformationSystem queryInformationSystem;
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
SparkDumpCommunityProducts.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/input_parameters.json"));
public static void main(String[] args) throws Exception { final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
String jsonConfiguration = IOUtils parser.parseArgument(args);
.toString(
SparkDumpCommunityProducts.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/input_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); Boolean isSparkSessionManaged = Optional
parser.parseArgument(args); .ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
Boolean isSparkSessionManaged = Optional final String inputPath = parser.get("sourcePath");
.ofNullable(parser.get("isSparkSessionManaged")) log.info("inputPath: {}", inputPath);
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
final String inputPath = parser.get("sourcePath"); final String resultClassName = parser.get("resultTableName");
log.info("inputPath: {}", inputPath); log.info("resultTableName: {}", resultClassName);
final String outputPath = parser.get("outputPath"); final String dumpClassName = parser.get("dumpTableName");
log.info("outputPath: {}", outputPath); log.info("dumpClassName: {}", dumpClassName);
final String resultClassName = parser.get("resultTableName"); final String isLookUpUrl = parser.get("isLookUpUrl");
log.info("resultTableName: {}", resultClassName); log.info("isLookUpUrl: {}", isLookUpUrl);
final String dumpClassName = parser.get("dumpTableName"); // final String resultType = parser.get("resultType");
log.info("dumpClassName: {}", dumpClassName); // log.info("resultType: {}", resultType);
final String isLookUpUrl = parser.get("isLookUpUrl"); final Optional<String> cm = Optional.ofNullable(parser.get("communityMap"));
log.info("isLookUpUrl: {}", isLookUpUrl);
final String resultType = parser.get("resultType"); Class<? extends Result> inputClazz = (Class<? extends Result>) Class.forName(resultClassName);
log.info("resultType: {}", resultType); Class<? extends eu.dnetlib.dhp.schema.dump.oaf.Result> dumpClazz = (Class<? extends eu.dnetlib.dhp.schema.dump.oaf.Result>) Class
.forName(dumpClassName);
SparkConf conf = new SparkConf();
SparkDumpCommunityProducts sdcp = new SparkDumpCommunityProducts(); CommunityMap communityMap;
sdcp.exec(isLookUpUrl, isSparkSessionManaged, outputPath, if (!isLookUpUrl.equals("BASEURL:8280/is/services/isLookUp")) {
inputPath, resultClassName, dumpClassName); queryInformationSystem = new QueryInformationSystem();
queryInformationSystem.setIsLookUp(getIsLookUpService(isLookUpUrl));
communityMap = queryInformationSystem.getCommunityMap();
} else {
communityMap = new Gson().fromJson(cm.get(), CommunityMap.class);
}
} runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, outputPath);
execDump(spark, inputPath, outputPath, communityMap, inputClazz, dumpClazz);
public QueryInformationSystem getQueryInformationSystem() { });
return queryInformationSystem;
}
public void setQueryInformationSystem(QueryInformationSystem queryInformationSystem) { }
this.queryInformationSystem = queryInformationSystem;
}
public ISLookUpService getIsLookUpService(String isLookUpUrl){ public static ISLookUpService getIsLookUpService(String isLookUpUrl) {
return ISLookupClientFactory.getLookUpService(isLookUpUrl); return ISLookupClientFactory.getLookUpService(isLookUpUrl);
} }
public void exec(String isLookUpUrl, Boolean isSparkSessionManaged, String outputPath, String inputPath, public static <I extends Result, O extends eu.dnetlib.dhp.schema.dump.oaf.Result> void execDump(SparkSession spark,
String resultClassName, String dumpClassName) throws ISLookUpException, ClassNotFoundException { String inputPath,
SparkConf conf = new SparkConf(); String outputPath,
CommunityMap communityMap,
Class<I> inputClazz,
Class<O> dumpClazz) {
Class<? extends Result> inputClazz = (Class<? extends Result>) Class.forName(resultClassName); // Set<String> communities = communityMap.keySet();
Class<? extends eu.dnetlib.dhp.schema.dump.oaf.Result> dumpClazz = Dataset<I> tmp = Utils.readPath(spark, inputPath, inputClazz);
(Class<? extends eu.dnetlib.dhp.schema.dump.oaf.Result>) Class.forName(dumpClassName);
queryInformationSystem.setIsLookUp(getIsLookUpService(isLookUpUrl)); tmp
Map<String,String> .map(value -> execMap(value, communityMap), Encoders.bean(dumpClazz))
communityMap = queryInformationSystem.getCommunityMap(); .filter(Objects::nonNull)
runWithSparkSession( .write()
conf, .mode(SaveMode.Overwrite)
isSparkSessionManaged, .option("compression", "gzip")
spark -> { .json(outputPath);
Utils.removeOutputDir(spark, outputPath);
execDump(spark, inputPath, outputPath , communityMap, inputClazz, dumpClazz);
});
}
}
private <I extends Result, O extends eu.dnetlib.dhp.schema.dump.oaf.Result > void execDump( private static <O extends eu.dnetlib.dhp.schema.dump.oaf.Result, I extends Result> O execMap(I value,
SparkSession spark, CommunityMap communityMap) {
String inputPath, {
String outputPath, Set<String> communities = communityMap.keySet();
Map<String,String> communityMap, Optional<List<Context>> inputContext = Optional.ofNullable(value.getContext());
Class<I> inputClazz, if (!inputContext.isPresent()) {
Class<O> dumpClazz) { return null;
}
Set<String> communities = communityMap.keySet(); List<String> toDumpFor = inputContext.get().stream().map(c -> {
Dataset<I> tmp = Utils.readPath(spark, inputPath, inputClazz); if (communities.contains(c.getId())) {
tmp.map(value -> { return c.getId();
Optional<List<Context>> inputContext = Optional.ofNullable(value.getContext()); }
if(!inputContext.isPresent()){ if (c.getId().contains("::") && communities.contains(c.getId().substring(0, c.getId().indexOf("::")))) {
return null; return c.getId().substring(0, 3);
} }
List<String> toDumpFor = inputContext.get().stream().map(c -> { return null;
if (communities.contains(c.getId())) { }).filter(Objects::nonNull).collect(Collectors.toList());
return c.getId(); if (toDumpFor.size() == 0) {
} return null;
return null; }
}).filter(Objects::nonNull).collect(Collectors.toList()); return Mapper.map(value, communityMap);
if(toDumpFor.size() == 0){ }
return null; }
}
return Mapper.map(value, communityMap);
},Encoders.bean(dumpClazz))
.write()
.mode(SaveMode.Overwrite)
.option("compression","gzip")
.json(outputPath);
}
} }

View File

@ -1,11 +1,13 @@
package eu.dnetlib.dhp.oa.graph.dump; package eu.dnetlib.dhp.oa.graph.dump;
import com.fasterxml.jackson.databind.ObjectMapper; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.dump.oaf.Projects; import java.io.Serializable;
import eu.dnetlib.dhp.schema.dump.oaf.Result; import java.io.StringReader;
import eu.dnetlib.dhp.schema.oaf.Project; import java.util.*;
import eu.dnetlib.dhp.schema.oaf.Relation; import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.api.java.function.MapFunction;
@ -14,86 +16,165 @@ import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders; import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.SparkSession;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.Node;
import org.dom4j.io.SAXReader;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.dump.oaf.Funder;
import eu.dnetlib.dhp.schema.dump.oaf.Projects;
import eu.dnetlib.dhp.schema.dump.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Relation;
import scala.Tuple2; import scala.Tuple2;
import java.io.Serializable; public class SparkPrepareResultProject implements Serializable {
import java.util.Arrays; private static final Logger log = LoggerFactory.getLogger(SparkPrepareResultProject.class);
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
SparkPrepareResultProject.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/project_prepare_parameters.json"));
public class SparkPrepareResultProject implements Serializable { final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
private static final Logger log = LoggerFactory.getLogger(SparkPrepareResultProject.class); parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
public static void main(String[] args) throws Exception { final String inputPath = parser.get("sourcePath");
String jsonConfiguration = IOUtils log.info("inputPath: {}", inputPath);
.toString(
SparkPrepareResultProject.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/project_prepare_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); final String outputPath = parser.get("outputPath");
parser.parseArgument(args); log.info("outputPath: {}", outputPath);
Boolean isSparkSessionManaged = Optional SparkConf conf = new SparkConf();
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("sourcePath"); runWithSparkSession(
log.info("inputPath: {}", inputPath); conf,
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, outputPath);
prepareResultProjectList(spark, inputPath, outputPath);
});
}
final String outputPath = parser.get("outputPath"); private static void prepareResultProjectList(SparkSession spark, String inputPath, String outputPath) {
log.info("outputPath: {}", outputPath); Dataset<Relation> relation = Utils
.readPath(spark, inputPath + "/relation", Relation.class)
.filter("dataInfo.deletedbyinference = false and relClass = 'produces'");
Dataset<Project> projects = Utils.readPath(spark, inputPath + "/project", Project.class);
projects
.joinWith(relation, projects.col("id").equalTo(relation.col("source")))
.groupByKey(
(MapFunction<Tuple2<Project, Relation>, String>) value -> value._2().getTarget(), Encoders.STRING())
.mapGroups((MapGroupsFunction<String, Tuple2<Project, Relation>, ResultProject>) (s, it) -> {
Set<String> projectSet = new HashSet<>();
Tuple2<Project, Relation> first = it.next();
ResultProject rp = new ResultProject();
rp.setResultId(first._2().getTarget());
Project p = first._1();
projectSet.add(p.getId());
Projects ps = Projects
.newInstance(
p.getId(), p.getCode().getValue(),
Optional
.ofNullable(p.getAcronym())
.map(a -> a.getValue())
.orElse(null),
Optional
.ofNullable(p.getTitle())
.map(v -> v.getValue())
.orElse(null),
Optional
.ofNullable(p.getFundingtree())
.map(
value -> value
.stream()
.map(ft -> getFunder(ft.getValue()))
.collect(Collectors.toList())
.get(0))
.orElse(null));
List<Projects> projList = new ArrayList<>();
projList.add(ps);
rp.setProjectsList(projList);
it.forEachRemaining(c -> {
Project op = c._1();
if (!projectSet.contains(op.getId())) {
projList
.add(
Projects
.newInstance(
op.getId(),
op.getCode().getValue(),
Optional
.ofNullable(op.getAcronym())
.map(a -> a.getValue())
.orElse(null),
Optional
.ofNullable(op.getTitle())
.map(v -> v.getValue())
.orElse(null),
Optional
.ofNullable(op.getFundingtree())
.map(
value -> value
.stream()
.map(ft -> getFunder(ft.getValue()))
.collect(Collectors.toList())
.get(0))
.orElse(null)));
projectSet.add(op.getId());
SparkConf conf = new SparkConf(); }
runWithSparkSession( });
conf, return rp;
isSparkSessionManaged, }, Encoders.bean(ResultProject.class))
spark -> { .write()
Utils.removeOutputDir(spark, outputPath); .mode(SaveMode.Overwrite)
prepareResultProjectList(spark, inputPath, outputPath); .option("compression", "gzip")
}); .json(outputPath);
} }
private static void prepareResultProjectList(SparkSession spark, String inputPath, String outputPath) { private static Funder getFunder(String fundingtree) {
Dataset<Relation> relation = Utils.readPath(spark, inputPath + "/relation" , Relation.class) // ["<fundingtree><funder><id>nsf_________::NSF</id><shortname>NSF</shortname><name>National Science
.filter("dataInfo.deletedbyinference = false and relClass = 'produces'"); // Foundation</name><jurisdiction>US</jurisdiction></funder><funding_level_1><id>nsf_________::NSF::CISE/OAD::CISE/CCF</id><description>Division
Dataset<Project> projects = Utils.readPath(spark, inputPath + "/project" , Project.class); // of Computing and Communication Foundations</description><name>Division of Computing and Communication
// Foundations</name><parent><funding_level_0><id>nsf_________::NSF::CISE/OAD</id><description>Directorate for
// Computer &amp; Information Science &amp; Engineering</description><name>Directorate for Computer &amp;
// Information Science &amp;
// Engineering</name><parent/><class>nsf:fundingStream</class></funding_level_0></parent></funding_level_1></fundingtree>"]
Funder f = new Funder();
final Document doc;
try {
doc = new SAXReader().read(new StringReader(fundingtree));
f.setShortName(((Node) (doc.selectNodes("//funder/shortname").get(0))).getText());
f.setName(((Node) (doc.selectNodes("//funder/name").get(0))).getText());
f.setJurisdiction(((Node) (doc.selectNodes("//funder/jurisdiction").get(0))).getText());
for (Object o : doc.selectNodes("//funding_level_0")) {
List node = ((Node) o).selectNodes("./name");
f.setFundingStream(((Node) node.get(0)).getText());
projects.joinWith(relation, projects.col("id").equalTo(relation.col("source"))) }
.groupByKey((MapFunction<Tuple2<Project,Relation>,String>)value -> value._2().getTarget(), Encoders.STRING())
.mapGroups((MapGroupsFunction<String, Tuple2<Project,Relation>, ResultProject>) (s, it) -> return f;
{ } catch (DocumentException e) {
Tuple2<Project, Relation> first = it.next(); e.printStackTrace();
ResultProject rp = new ResultProject(); }
rp.setResultId(first._2().getTarget()); return f;
Project p = first._1(); }
Projects ps = Projects.newInstance(p.getId(), p.getCode().getValue(), p.getAcronym().getValue(),
p.getTitle().getValue(), p.getFundingtree()
.stream()
.map(ft -> ft.getValue()).collect(Collectors.toList()));
List<Projects> projList = Arrays.asList(ps);
rp.setProjectsList(projList);
it.forEachRemaining(c -> {
Project op = c._1();
projList.add(Projects.newInstance(op.getId(), op.getCode().getValue(),
op.getAcronym().getValue(), op.getTitle().getValue(),
op.getFundingtree().stream().map(ft -> ft.getValue()).collect(Collectors.toList())));
});
return rp;
} ,Encoders.bean(ResultProject.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath);
}
} }

View File

@ -1,98 +1,122 @@
package eu.dnetlib.dhp.oa.graph.dump; package eu.dnetlib.dhp.oa.graph.dump;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.io.StringReader;
import java.util.*;
import java.util.stream.Collectors;
import eu.dnetlib.dhp.schema.dump.oaf.Result;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.SparkSession;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import java.io.Serializable; import com.google.gson.Gson;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.dump.oaf.Result;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
public class SparkSplitForCommunity implements Serializable { public class SparkSplitForCommunity implements Serializable {
private static final Logger log = LoggerFactory.getLogger(SparkSplitForCommunity.class); private static final Logger log = LoggerFactory.getLogger(SparkSplitForCommunity.class);
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
SparkSplitForCommunity.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/split_parameters.json"));
public static void main(String[] args) throws Exception { final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
String jsonConfiguration = IOUtils parser.parseArgument(args);
.toString(
SparkSplitForCommunity.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/split_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); Boolean isSparkSessionManaged = Optional
parser.parseArgument(args); .ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
Boolean isSparkSessionManaged = Optional final String inputPath = parser.get("sourcePath");
.ofNullable(parser.get("isSparkSessionManaged")) log.info("inputPath: {}", inputPath);
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
final String inputPath = parser.get("sourcePath"); final String resultClassName = parser.get("resultTableName");
log.info("inputPath: {}", inputPath); log.info("resultTableName: {}", resultClassName);
final String outputPath = parser.get("outputPath"); final String isLookUpUrl = parser.get("isLookUpUrl");
log.info("outputPath: {}", outputPath); log.info("isLookUpUrl: {}", isLookUpUrl);
final String resultClassName = parser.get("resultTableName"); final Optional<String> cm = Optional.ofNullable(parser.get("communityMap"));
log.info("resultTableName: {}", resultClassName);
final String isLookUpUrl = parser.get("isLookUpUrl"); Class<? extends Result> inputClazz = (Class<? extends Result>) Class.forName(resultClassName);
log.info("isLookUpUrl: {}", isLookUpUrl);
SparkConf conf = new SparkConf();
Class<? extends Result> inputClazz = (Class<? extends Result>) Class.forName(resultClassName); CommunityMap communityMap;
SparkConf conf = new SparkConf(); if (!isLookUpUrl.equals("BASEURL:8280/is/services/isLookUp")) {
QueryInformationSystem queryInformationSystem = new QueryInformationSystem();
queryInformationSystem.setIsLookUp(getIsLookUpService(isLookUpUrl));
communityMap = queryInformationSystem.getCommunityMap();
} else {
communityMap = new Gson().fromJson(cm.get(), CommunityMap.class);
}
Map<String,String> runWithSparkSession(
communityMap = QueryInformationSystem.getCommunityMap(isLookUpUrl); conf,
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, outputPath);
execSplit(spark, inputPath, outputPath, communityMap.keySet(), inputClazz);
});
}
public static ISLookUpService getIsLookUpService(String isLookUpUrl) {
return ISLookupClientFactory.getLookUpService(isLookUpUrl);
}
runWithSparkSession( private static <R extends Result> void execSplit(SparkSession spark, String inputPath, String outputPath,
conf, Set<String> communities, Class<R> inputClazz) {
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, outputPath);
execSplit(spark, inputPath, outputPath , communityMap.keySet(), inputClazz);
});
}
private static <R extends Result> void execSplit(SparkSession spark, String inputPath, String outputPath, Set<String> communities Dataset<R> result = Utils.readPath(spark, inputPath, inputClazz);
, Class<R> inputClazz) {
Dataset<R> result = Utils.readPath(spark, inputPath, inputClazz); communities
.stream()
.forEach(c -> printResult(c, result, outputPath));
communities.stream() }
.forEach(c -> printResult(c, result, outputPath));
} private static <R extends Result> void printResult(String c, Dataset<R> result, String outputPath) {
result
.filter(r -> containsCommunity(r, c))
.write()
.option("compression", "gzip")
.mode(SaveMode.Append)
.json(outputPath + "/" + c);
}
private static <R extends Result> void printResult(String c, Dataset<R> result, String outputPath) { private static <R extends Result> boolean containsCommunity(R r, String c) {
result.filter(r -> containsCommunity(r, c)) if (Optional.ofNullable(r.getContext()).isPresent()) {
.write() return r
.option("compression","gzip") .getContext()
.mode(SaveMode.Append) .stream()
.json(outputPath + "/" + c); .filter(con -> con.getCode().equals(c))
} .collect(Collectors.toList())
.size() > 0;
private static <R extends Result> boolean containsCommunity(R r, String c) { }
if(Optional.ofNullable(r.getContext()).isPresent()) { return false;
return r.getContext().stream().filter(con -> con.getCode().equals(c)).collect(Collectors.toList()).size() > 0; }
}
return false;
}
} }

View File

@ -1,11 +1,11 @@
package eu.dnetlib.dhp.oa.graph.dump; package eu.dnetlib.dhp.oa.graph.dump;
import com.fasterxml.jackson.databind.ObjectMapper; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import java.io.Serializable;
import java.util.Optional;
import eu.dnetlib.dhp.schema.dump.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Relation;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.api.java.function.MapFunction;
@ -16,84 +16,86 @@ import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.dump.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Relation;
import scala.Tuple2; import scala.Tuple2;
import java.io.Serializable;
import java.util.Optional;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
public class SparkUpdateProjectInfo implements Serializable { public class SparkUpdateProjectInfo implements Serializable {
private static final Logger log = LoggerFactory.getLogger(SparkUpdateProjectInfo.class); private static final Logger log = LoggerFactory.getLogger(SparkUpdateProjectInfo.class);
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static void main(String[] args) throws Exception { public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils String jsonConfiguration = IOUtils
.toString( .toString(
SparkUpdateProjectInfo.class SparkUpdateProjectInfo.class
.getResourceAsStream( .getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/project_input_parameters.json")); "/eu/dnetlib/dhp/oa/graph/dump/project_input_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args); parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged")) .ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf) .map(Boolean::valueOf)
.orElse(Boolean.TRUE); .orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged); log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("sourcePath"); final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath); log.info("inputPath: {}", inputPath);
final String outputPath = parser.get("outputPath"); final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath); log.info("outputPath: {}", outputPath);
final String resultClassName = parser.get("resultTableName"); final String resultClassName = parser.get("resultTableName");
log.info("resultTableName: {}", resultClassName); log.info("resultTableName: {}", resultClassName);
final String preparedInfoPath = parser.get("preparedInfoPath");
log.info("preparedInfoPath: {}", preparedInfoPath);
final String preparedInfoPath = parser.get("preparedInfoPath"); Class<? extends Result> inputClazz = (Class<? extends Result>) Class.forName(resultClassName);
log.info("preparedInfoPath: {}", preparedInfoPath);
Class<? extends Result> inputClazz = (Class<? extends Result>) Class.forName(resultClassName); SparkConf conf = new SparkConf();
SparkConf conf = new SparkConf(); runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, outputPath);
extend(spark, inputPath, outputPath, preparedInfoPath, inputClazz);
});
}
runWithSparkSession( private static <R extends Result> void extend(
conf, SparkSession spark,
isSparkSessionManaged, String inputPath,
spark -> { String outputPath,
Utils.removeOutputDir(spark, outputPath); String preparedInfoPath,
extend(spark, inputPath, outputPath , preparedInfoPath, inputClazz); Class<R> inputClazz) {
});
}
private static <R extends Result > void extend( Dataset<R> result = Utils.readPath(spark, inputPath, inputClazz);
SparkSession spark, Dataset<ResultProject> resultProject = Utils.readPath(spark, preparedInfoPath, ResultProject.class);
String inputPath, result
String outputPath, .joinWith(
String preparedInfoPath, resultProject, result.col("id").equalTo(resultProject.col("resultId")),
Class<R> inputClazz) { "left")
.map(value -> {
R r = value._1();
Optional.ofNullable(value._2()).ifPresent(rp -> {
r.setProjects(rp.getProjectsList());
});
return r;
}, Encoders.bean(inputClazz))
.write()
.option("compression", "gzip")
.mode(SaveMode.Overwrite)
.json(outputPath);
Dataset<R> result = Utils.readPath(spark, inputPath , inputClazz); }
Dataset<ResultProject> resultProject = Utils.readPath(spark, preparedInfoPath, ResultProject.class);
result.joinWith(resultProject, result.col("id").equalTo(resultProject.col("resultId")),
"left")
.map(value -> {
R r = value._1();
Optional.ofNullable(value._2()).ifPresent(rp -> {
r.setProjects(rp.getProjectsList());
});
return r;
},Encoders.bean(inputClazz))
.write()
.option("compression","gzip")
.mode(SaveMode.Overwrite)
.json(outputPath);
}
}
}

View File

@ -1,23 +1,27 @@
package eu.dnetlib.dhp.oa.graph.dump; package eu.dnetlib.dhp.oa.graph.dump;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.common.HdfsSupport;
import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders; import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.SparkSession;
public class Utils { import com.fasterxml.jackson.databind.ObjectMapper;
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static void removeOutputDir(SparkSession spark, String path) {
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
}
public static <R> Dataset<R> readPath( import eu.dnetlib.dhp.common.HdfsSupport;
SparkSession spark, String inputPath, Class<R> clazz) {
return spark public class Utils {
.read() public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
.textFile(inputPath)
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz)); public static void removeOutputDir(SparkSession spark, String path) {
} HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
}
public static <R> Dataset<R> readPath(
SparkSession spark, String inputPath, Class<R> clazz) {
return spark
.read()
.textFile(inputPath)
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
}
} }