forked from D-Net/dnet-hadoop

fixed issues while running on cluster
commit f96ca900e1, parent 56e70573c2
CommunityMap.java
@ -1,7 +1,8 @@
The class now implements Serializable (new java.io.Serializable import plus the implements clause); nothing else changes.

package eu.dnetlib.dhp.oa.graph.dump;

import java.io.Serializable;
import java.util.HashMap;

public class CommunityMap extends HashMap<String, String> implements Serializable {
}
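The dump drivers further down deserialize this map from a JSON job parameter with Gson (new Gson().fromJson(cm.get(), CommunityMap.class)). A minimal round-trip sketch, assuming only that community ids map to display labels (the entry below is a made-up example, not from the repository):

package eu.dnetlib.dhp.oa.graph.dump;

import com.google.gson.Gson;

public class CommunityMapExample {
    public static void main(String[] args) {
        CommunityMap map = new CommunityMap();
        map.put("egi", "EGI Federation"); // hypothetical entry, for illustration only
        String json = new Gson().toJson(map); // e.g. {"egi":"EGI Federation"}
        CommunityMap parsed = new Gson().fromJson(json, CommunityMap.class);
        System.out.println(parsed.get("egi"));
    }
}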
Constants.java
@ -1,18 +1,29 @@
accessRightsCoarMap now stores bare COAR codes instead of full purl.org URIs (e.g. "OPEN" -> "c_abf2" rather than "http://purl.org/coar/access_right/c_abf2", and the stray "//metadataonly for coar" note that had leaked into the CLOSED value is gone); a reverse coarCodeLabelMap and the COAR_ACCESS_RIGHT_SCHEMA constant are added.

package eu.dnetlib.dhp.oa.graph.dump;

import java.util.Map;

import com.google.common.collect.Maps;

public class Constants {

    public static final Map<String, String> accessRightsCoarMap = Maps.newHashMap();
    public static final Map<String, String> coarCodeLabelMap = Maps.newHashMap();

    public static String COAR_ACCESS_RIGHT_SCHEMA = "http://vocabularies.coar-repositories.org/documentation/access_rights/";

    static {
        accessRightsCoarMap.put("OPEN", "c_abf2");
        accessRightsCoarMap.put("RESTRICTED", "c_16ec");
        accessRightsCoarMap.put("OPEN SOURCE", "c_abf2");
        accessRightsCoarMap.put("CLOSED", "c_14cb");
        accessRightsCoarMap.put("EMBARGO", "c_f1cf");
    }

    static {
        coarCodeLabelMap.put("c_abf2", "OPEN");
        coarCodeLabelMap.put("c_16ec", "RESTRICTED");
        coarCodeLabelMap.put("c_14cb", "CLOSED");
        coarCodeLabelMap.put("c_f1cf", "EMBARGO");
    }
}
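A minimal sketch of how the two maps and the schema constant compose, following the AccessRight.newInstance(code, label, scheme) pattern used in Mapper below (a sketch, not project code; it assumes the dump schema's AccessRight factory as shown there):

package eu.dnetlib.dhp.oa.graph.dump;

import eu.dnetlib.dhp.schema.dump.oaf.AccessRight;

public class CoarAccessRightSketch {

    // Resolves a graph access-right classid ("OPEN", "EMBARGO", ...) to the COAR triple, or null if unmapped.
    public static AccessRight toCoarAccessRight(String classid) {
        if (!Constants.accessRightsCoarMap.containsKey(classid)) {
            return null; // UNKNOWN / OTHER classids are deliberately not mapped
        }
        String code = Constants.accessRightsCoarMap.get(classid); // e.g. "OPEN" -> "c_abf2"
        return AccessRight
            .newInstance(code, Constants.coarCodeLabelMap.get(code), Constants.COAR_ACCESS_RIGHT_SCHEMA);
    }
}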
Mapper.java
@ -1,223 +1,407 @@
The map method is rewritten defensively: every nullable graph field is wrapped in Optional.ofNullable(...) before it is copied (the previous version dereferenced getters directly), the result-type switch is guarded and its "otherresearchproduct" case becomes "other", access rights are resolved through the new Constants maps (COAR code, label and schema instead of the raw classid/classname), community context ids are accepted both as plain ids and as "id::subcommunity" prefixes, context provenance now records the provenance-action classname rather than the classid, and author conversion is extracted into a getAuthor helper. Reconstructed new version:

package eu.dnetlib.dhp.oa.graph.dump;

import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;

import javax.swing.text.html.Option;

import org.apache.avro.generic.GenericData;

import eu.dnetlib.dhp.schema.dump.oaf.*;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.Journal;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;

public class Mapper implements Serializable {

    public static <I extends eu.dnetlib.dhp.schema.oaf.Result, O extends eu.dnetlib.dhp.schema.dump.oaf.Result> O map(
        I input, Map<String, String> communityMap) {

        O out = null;
        Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> ort = Optional.ofNullable(input.getResulttype());
        if (ort.isPresent()) {
            switch (ort.get().getClassid()) {
                case "publication":
                    out = (O) new Publication();
                    Optional<Journal> journal = Optional
                        .ofNullable(((eu.dnetlib.dhp.schema.oaf.Publication) input).getJournal());
                    if (journal.isPresent()) {
                        Journal j = journal.get();
                        Container c = new Container();
                        c.setConferencedate(j.getConferencedate());
                        c.setConferenceplace(j.getConferenceplace());
                        c.setEdition(j.getEdition());
                        c.setEp(j.getEp());
                        c.setIss(j.getIss());
                        c.setIssnLinking(j.getIssnLinking());
                        c.setIssnOnline(j.getIssnOnline());
                        c.setIssnPrinted(j.getIssnPrinted());
                        c.setName(j.getName());
                        c.setSp(j.getSp());
                        c.setVol(j.getVol());
                        out.setContainer(c);
                    }
                    break;
                case "dataset":
                    Dataset d = new Dataset();
                    eu.dnetlib.dhp.schema.oaf.Dataset id = (eu.dnetlib.dhp.schema.oaf.Dataset) input;
                    Optional.ofNullable(id.getSize()).ifPresent(v -> d.setSize(v.getValue()));
                    Optional.ofNullable(id.getVersion()).ifPresent(v -> d.setVersion(v.getValue()));
                    d.setGeolocation(
                        Optional.ofNullable(id.getGeolocation())
                            .map(igl -> igl.stream().filter(Objects::nonNull).map(gli -> {
                                GeoLocation gl = new GeoLocation();
                                gl.setBox(gli.getBox());
                                gl.setPlace(gli.getPlace());
                                gl.setPoint(gli.getPoint());
                                return gl;
                            }).collect(Collectors.toList()))
                            .orElse(null));
                    out = (O) d;
                    break;
                case "software":
                    Software s = new Software();
                    eu.dnetlib.dhp.schema.oaf.Software is = (eu.dnetlib.dhp.schema.oaf.Software) input;
                    Optional.ofNullable(is.getCodeRepositoryUrl())
                        .ifPresent(value -> s.setCodeRepositoryUrl(value.getValue()));
                    Optional.ofNullable(is.getDocumentationUrl())
                        .ifPresent(value -> s.setDocumentationUrl(
                            value.stream().map(v -> v.getValue()).collect(Collectors.toList())));
                    Optional.ofNullable(is.getProgrammingLanguage())
                        .ifPresent(value -> s.setProgrammingLanguage(value.getClassid()));
                    out = (O) s;
                    break;
                case "other":
                    OtherResearchProduct or = new OtherResearchProduct();
                    eu.dnetlib.dhp.schema.oaf.OtherResearchProduct ir = (eu.dnetlib.dhp.schema.oaf.OtherResearchProduct) input;
                    or.setContactgroup(Optional.ofNullable(ir.getContactgroup())
                        .map(value -> value.stream().map(cg -> cg.getValue()).collect(Collectors.toList()))
                        .orElse(null));
                    or.setContactperson(Optional.ofNullable(ir.getContactperson())
                        .map(value -> value.stream().map(cp -> cp.getValue()).collect(Collectors.toList()))
                        .orElse(null));
                    or.setTool(Optional.ofNullable(ir.getTool())
                        .map(value -> value.stream().map(t -> t.getValue()).collect(Collectors.toList()))
                        .orElse(null));
                    out = (O) or;
                    break;
            }

            Optional<List<eu.dnetlib.dhp.schema.oaf.Author>> oAuthor = Optional.ofNullable(input.getAuthor());
            if (oAuthor.isPresent()) {
                out.setAuthor(oAuthor.get().stream().map(oa -> getAuthor(oa)).collect(Collectors.toList()));
            }

            // I do not map Access Right UNKNOWN or OTHER
            Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> oar = Optional.ofNullable(input.getBestaccessright());
            if (oar.isPresent() && Constants.accessRightsCoarMap.containsKey(oar.get().getClassid())) {
                String code = Constants.accessRightsCoarMap.get(oar.get().getClassid());
                out.setBestaccessright(
                    AccessRight.newInstance(code, Constants.coarCodeLabelMap.get(code), Constants.COAR_ACCESS_RIGHT_SCHEMA));
            }

            out.setCollectedfrom(input.getCollectedfrom().stream()
                .map(cf -> KeyValue.newInstance(cf.getKey(), cf.getValue()))
                .collect(Collectors.toList()));

            Set<String> communities = communityMap.keySet();
            List<Context> contextList = input.getContext().stream().map(c -> {
                if (communities.contains(c.getId())
                    || communities.contains(c.getId().substring(0, c.getId().indexOf("::")))) {
                    Context context = new Context();
                    if (!communityMap.containsKey(c.getId())) {
                        context.setCode(c.getId().substring(0, c.getId().indexOf("::")));
                        context.setLabel(communityMap.get(context.getCode()));
                    } else {
                        context.setCode(c.getId());
                        context.setLabel(communityMap.get(c.getId()));
                    }
                    Optional<List<DataInfo>> dataInfo = Optional.ofNullable(c.getDataInfo());
                    if (dataInfo.isPresent()) {
                        List<String> provenance = new ArrayList<>();
                        provenance.addAll(dataInfo.get().stream()
                            .map(di -> di.getInferred() ? di.getProvenanceaction().getClassname() : null)
                            .filter(Objects::nonNull)
                            .collect(Collectors.toSet()));
                        context.setProvenance(provenance);
                    }
                    return context;
                }
                return null;
            }).filter(Objects::nonNull).collect(Collectors.toList());
            if (contextList.size() > 0) {
                out.setContext(contextList);
            }

            final List<String> contributorList = new ArrayList<>();
            Optional.ofNullable(input.getContributor())
                .ifPresent(value -> value.stream().forEach(c -> contributorList.add(c.getValue())));
            out.setContributor(contributorList);

            List<Country> countryList = new ArrayList<>();
            Optional.ofNullable(input.getCountry())
                .ifPresent(value -> value.stream().forEach(c -> {
                    Country country = new Country();
                    country.setCode(c.getClassid());
                    country.setLabel(c.getClassname());
                    Optional.ofNullable(c.getDataInfo())
                        .ifPresent(provenance -> country.setProvenance(provenance.getProvenanceaction().getClassname()));
                    countryList.add(country);
                }));
            out.setCountry(countryList);

            final List<String> coverageList = new ArrayList<>();
            Optional.ofNullable(input.getCoverage())
                .ifPresent(value -> value.stream().forEach(c -> coverageList.add(c.getValue())));
            out.setCoverage(coverageList);

            out.setDateofcollection(input.getDateofcollection());

            final List<String> descriptionList = new ArrayList<>();
            Optional.ofNullable(input.getDescription())
                .ifPresent(value -> value.stream().forEach(d -> descriptionList.add(d.getValue())));
            out.setDescription(descriptionList);

            Optional<Field<String>> oStr = Optional.ofNullable(input.getEmbargoenddate());
            if (oStr.isPresent()) {
                out.setEmbargoenddate(oStr.get().getValue());
            }

            final List<String> formatList = new ArrayList<>();
            Optional.ofNullable(input.getFormat())
                .ifPresent(value -> value.stream().forEach(f -> formatList.add(f.getValue())));
            out.setFormat(formatList);

            out.setId(input.getId());
            out.setOriginalId(input.getOriginalId());

            final List<Instance> instanceList = new ArrayList<>();
            Optional.ofNullable(input.getInstance())
                .ifPresent(inst -> inst.stream().forEach(i -> {
                    Instance instance = new Instance();
                    Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> opAr = Optional.ofNullable(i.getAccessright());
                    if (opAr.isPresent() && Constants.accessRightsCoarMap.containsKey(opAr.get().getClassid())) {
                        String code = Constants.accessRightsCoarMap.get(opAr.get().getClassid());
                        instance.setAccessright(
                            AccessRight.newInstance(code, Constants.coarCodeLabelMap.get(code), Constants.COAR_ACCESS_RIGHT_SCHEMA));
                    }
                    instance.setCollectedfrom(
                        KeyValue.newInstance(i.getCollectedfrom().getKey(), i.getCollectedfrom().getValue()));
                    instance.setHostedby(
                        KeyValue.newInstance(i.getHostedby().getKey(), i.getHostedby().getValue()));
                    Optional.ofNullable(i.getLicense()).ifPresent(value -> instance.setLicense(value.getValue()));
                    Optional.ofNullable(i.getDateofacceptance()).ifPresent(value -> instance.setPublicationdate(value.getValue()));
                    Optional.ofNullable(i.getRefereed()).ifPresent(value -> instance.setRefereed(value.getValue()));
                    Optional.ofNullable(i.getInstancetype()).ifPresent(value -> instance.setType(value.getClassname()));
                    Optional.ofNullable(i.getUrl()).ifPresent(value -> instance.setUrl(value));
                    instanceList.add(instance);
                }));
            out.setInstance(instanceList);

            Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> oL = Optional.ofNullable(input.getLanguage());
            if (oL.isPresent()) {
                eu.dnetlib.dhp.schema.oaf.Qualifier language = oL.get();
                out.setLanguage(Qualifier.newInstance(language.getClassid(), language.getClassname()));
            }

            Optional<Long> oLong = Optional.ofNullable(input.getLastupdatetimestamp());
            if (oLong.isPresent()) {
                out.setLastupdatetimestamp(oLong.get());
            }

            Optional<List<StructuredProperty>> otitle = Optional.ofNullable(input.getTitle());
            if (otitle.isPresent()) {
                List<StructuredProperty> iTitle = otitle.get().stream()
                    .filter(t -> t.getQualifier().getClassid().equalsIgnoreCase("main title"))
                    .collect(Collectors.toList());
                if (iTitle.size() > 0) {
                    out.setMaintitle(iTitle.get(0).getValue());
                }
                iTitle = otitle.get().stream()
                    .filter(t -> t.getQualifier().getClassid().equalsIgnoreCase("subtitle"))
                    .collect(Collectors.toList());
                if (iTitle.size() > 0) {
                    out.setSubtitle(iTitle.get(0).getValue());
                }
            }

            List<ControlledField> pids = new ArrayList<>();
            Optional.ofNullable(input.getPid())
                .ifPresent(value -> value.stream()
                    .forEach(p -> pids.add(ControlledField.newInstance(p.getQualifier().getClassid(), p.getValue()))));
            out.setPid(pids);

            oStr = Optional.ofNullable(input.getDateofacceptance());
            if (oStr.isPresent()) {
                out.setPublicationdate(oStr.get().getValue());
            }
            oStr = Optional.ofNullable(input.getPublisher());
            if (oStr.isPresent()) {
                out.setPublisher(oStr.get().getValue());
            }

            List<String> sourceList = new ArrayList<>();
            Optional.ofNullable(input.getSource())
                .ifPresent(value -> value.stream().forEach(s -> sourceList.add(s.getValue())));
            // out.setSource(input.getSource().stream().map(s -> s.getValue()).collect(Collectors.toList()));

            List<ControlledField> subjectList = new ArrayList<>();
            Optional.ofNullable(input.getSubject())
                .ifPresent(value -> value.stream()
                    .forEach(s -> subjectList.add(ControlledField.newInstance(s.getQualifier().getClassid(), s.getValue()))));
            out.setSubject(subjectList);

            out.setType(input.getResulttype().getClassid());
        }

        return out;
    }

    private static Author getAuthor(eu.dnetlib.dhp.schema.oaf.Author oa) {
        Author a = new Author();
        Optional.ofNullable(oa.getAffiliation())
            .ifPresent(value -> a.setAffiliation(value.stream().map(aff -> aff.getValue()).collect(Collectors.toList())));
        a.setFullname(oa.getFullname());
        a.setName(oa.getName());
        a.setSurname(oa.getSurname());
        a.setRank(oa.getRank());
        Optional.ofNullable(oa.getPid())
            .ifPresent(value -> a.setPid(value.stream()
                .map(p -> ControlledField.newInstance(p.getQualifier().getClassid(), p.getValue()))
                .collect(Collectors.toList())));
        return a;
    }
}
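The fix above is one recurring null-guard pattern applied field by field. A standalone sketch of its two shapes, scalar and list, using a stand-in Field class rather than the project schema (plain Java, illustration only):

import java.util.Arrays;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;

public class NullSafeCopy {

    // Stand-in for eu.dnetlib.dhp.schema.oaf.Field<String>.
    static class Field {
        private final String value;
        Field(String v) { this.value = v; }
        String getValue() { return value; }
    }

    // Scalar: copy the value only if the field is present.
    static String copyScalar(Field f) {
        return Optional.ofNullable(f).map(Field::getValue).orElse(null);
    }

    // List: map each element, or fall back to null when the whole list is missing.
    static List<String> copyList(List<Field> fields) {
        return Optional.ofNullable(fields)
            .map(l -> l.stream().map(Field::getValue).collect(Collectors.toList()))
            .orElse(null);
    }

    public static void main(String[] args) {
        System.out.println(copyScalar(null)); // null instead of a NullPointerException
        System.out.println(copyList(Arrays.asList(new Field("a"), new Field("b"))));
    }
}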
QueryInformationSystem.java
Both getCommunityMap() and the (now public) static getMap(List<String>) return the CommunityMap type instead of a raw Map<String, String>/HashMap, the imports are regrouped, and the leftover set(String isLookUpUrl)/getter stubs are removed. Reconstructed new version of the shown hunks:

@ -1,34 +1,32 @@
package eu.dnetlib.dhp.oa.graph.dump;

import java.io.StringReader;
import java.util.List;

import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;

import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;

public class QueryInformationSystem {

    private ISLookUpService isLookUp;

    private static final String XQUERY = "for $x in collection('/db/DRIVER/ContextDSResources/ContextDSResourceType') "
        + " where $x//CONFIGURATION/context[./@type='community' or ./@type='ri'] "
        + " return "
        + "<community> "
        + "{$x//CONFIGURATION/context/@id}"
        + "{$x//CONFIGURATION/context/@label}"
        + "</community>";

    public CommunityMap getCommunityMap()
        throws ISLookUpException {
        return getMap(isLookUp.quickSearchProfile(XQUERY));
    }

@ -42,12 +40,8 @@ public class QueryInformationSystem {
        this.isLookUp = isLookUpService;
    }

    public static CommunityMap getMap(List<String> communityMap) {
        final CommunityMap map = new CommunityMap();

        communityMap.stream().forEach(xml -> {
            final Document doc;

@ -59,7 +53,6 @@
                e.printStackTrace();
            }
        });

        return map;
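The body of the forEach lambda between the last two hunks is not part of this diff. A hedged sketch of what parsing one <community> record produced by the XQUERY above could look like with dom4j (the id/label attribute layout follows from the query, but the exact handling in the repository is an assumption):

            // Sketch only: parse one "<community id='...' label='...'/>"-style record into the map.
            try {
                Document parsed = new SAXReader().read(new StringReader(xml));
                Element root = parsed.getRootElement();
                map.put(root.attribute("id").getValue(), root.attribute("label").getValue()); // assumed attribute layout
            } catch (DocumentException e) {
                e.printStackTrace(); // the real method also just prints the stack trace
            }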
ResultProject.java
@ -1,27 +1,28 @@
Only the import order changes (the eu.dnetlib.dhp.schema.dump.oaf.Projects import moves below the java.* imports); the bean itself is untouched.

package eu.dnetlib.dhp.oa.graph.dump;

import java.io.Serializable;
import java.util.List;

import eu.dnetlib.dhp.schema.dump.oaf.Projects;

public class ResultProject implements Serializable {
    private String resultId;
    private List<Projects> projectsList;

    public String getResultId() {
        return resultId;
    }

    public void setResultId(String resultId) {
        this.resultId = resultId;
    }

    public List<Projects> getProjectsList() {
        return projectsList;
    }

    public void setProjectsList(List<Projects> projectsList) {
        this.projectsList = projectsList;
    }
}
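A brief sketch of how SparkPrepareResultProject (below) fills this bean, using the Projects.newInstance(id, code, acronym, title, funder) factory it calls; all identifier values here are placeholders, not repository data:

        // Sketch only: one result linked to one project; acronym/title/funder may be null.
        ResultProject rp = new ResultProject();
        rp.setResultId("50|doi_________::abc");                        // hypothetical result identifier
        Projects ps = Projects.newInstance("40|corda__h2020::xyz",     // hypothetical project identifier
            "101003374", null, null, null);                            // hypothetical project code
        rp.setProjectsList(new ArrayList<>(Arrays.asList(ps)));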
SparkDumpCommunityProducts.java
The driver is reworked: the previous instance state (a QueryInformationSystem getter/setter, an exec(...) instance method, the unused resultType parameter) is replaced by static execDump/execMap methods; the community map is either queried from the IS via ISLookUp or, when the isLookUpUrl is still the BASEURL placeholder, deserialized from the communityMap parameter with Gson; records whose contexts match no dumped community are filtered out before writing gzip-compressed JSON. Reconstructed new version of the shown hunks:

@ -1,3 +1,4 @@
package eu.dnetlib.dhp.oa.graph.dump;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

@ -6,10 +7,8 @@ import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;

import javax.management.Query;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.Dataset;

@ -19,127 +18,129 @@ import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.gson.Gson;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Context;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;

public class SparkDumpCommunityProducts implements Serializable {

    private static final Logger log = LoggerFactory.getLogger(SparkDumpCommunityProducts.class);
    private static QueryInformationSystem queryInformationSystem;

    public static void main(String[] args) throws Exception {
        String jsonConfiguration = IOUtils
            .toString(
                SparkDumpCommunityProducts.class
                    .getResourceAsStream(
                        "/eu/dnetlib/dhp/oa/graph/dump/input_parameters.json"));

        final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
        parser.parseArgument(args);

        Boolean isSparkSessionManaged = Optional
            .ofNullable(parser.get("isSparkSessionManaged"))
            .map(Boolean::valueOf)
            .orElse(Boolean.TRUE);
        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

        final String inputPath = parser.get("sourcePath");
        log.info("inputPath: {}", inputPath);

        final String outputPath = parser.get("outputPath");
        log.info("outputPath: {}", outputPath);

        final String resultClassName = parser.get("resultTableName");
        log.info("resultTableName: {}", resultClassName);

        final String dumpClassName = parser.get("dumpTableName");
        log.info("dumpClassName: {}", dumpClassName);

        final String isLookUpUrl = parser.get("isLookUpUrl");
        log.info("isLookUpUrl: {}", isLookUpUrl);

        // final String resultType = parser.get("resultType");
        // log.info("resultType: {}", resultType);

        final Optional<String> cm = Optional.ofNullable(parser.get("communityMap"));

        Class<? extends Result> inputClazz = (Class<? extends Result>) Class.forName(resultClassName);
        Class<? extends eu.dnetlib.dhp.schema.dump.oaf.Result> dumpClazz = (Class<? extends eu.dnetlib.dhp.schema.dump.oaf.Result>) Class
            .forName(dumpClassName);

        SparkConf conf = new SparkConf();

        CommunityMap communityMap;

        if (!isLookUpUrl.equals("BASEURL:8280/is/services/isLookUp")) {
            queryInformationSystem = new QueryInformationSystem();
            queryInformationSystem.setIsLookUp(getIsLookUpService(isLookUpUrl));
            communityMap = queryInformationSystem.getCommunityMap();
        } else {
            communityMap = new Gson().fromJson(cm.get(), CommunityMap.class);
        }

        runWithSparkSession(
            conf,
            isSparkSessionManaged,
            spark -> {
                Utils.removeOutputDir(spark, outputPath);
                execDump(spark, inputPath, outputPath, communityMap, inputClazz, dumpClazz);
            });
    }

    public static ISLookUpService getIsLookUpService(String isLookUpUrl) {
        return ISLookupClientFactory.getLookUpService(isLookUpUrl);
    }

    public static <I extends Result, O extends eu.dnetlib.dhp.schema.dump.oaf.Result> void execDump(SparkSession spark,
        String inputPath,
        String outputPath,
        CommunityMap communityMap,
        Class<I> inputClazz,
        Class<O> dumpClazz) {

        // Set<String> communities = communityMap.keySet();
        Dataset<I> tmp = Utils.readPath(spark, inputPath, inputClazz);

        tmp
            .map(value -> execMap(value, communityMap), Encoders.bean(dumpClazz))
            .filter(Objects::nonNull)
            .write()
            .mode(SaveMode.Overwrite)
            .option("compression", "gzip")
            .json(outputPath);

    }

    private static <O extends eu.dnetlib.dhp.schema.dump.oaf.Result, I extends Result> O execMap(I value,
        CommunityMap communityMap) {
        {
            Set<String> communities = communityMap.keySet();
            Optional<List<Context>> inputContext = Optional.ofNullable(value.getContext());
            if (!inputContext.isPresent()) {
                return null;
            }
            List<String> toDumpFor = inputContext.get().stream().map(c -> {
                if (communities.contains(c.getId())) {
                    return c.getId();
                }
                if (c.getId().contains("::") && communities.contains(c.getId().substring(0, c.getId().indexOf("::")))) {
                    return c.getId().substring(0, 3);
                }
                return null;
            }).filter(Objects::nonNull).collect(Collectors.toList());
            if (toDumpFor.size() == 0) {
                return null;
            }
            return Mapper.map(value, communityMap);
        }
    }
}
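A small standalone sketch of the context-to-community matching rule that execMap applies before handing a record to Mapper (plain Java, no Spark; the community and context ids below are made-up examples):

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

public class CommunityMatchSketch {

    // A context id matches either exactly ("egi") or by its prefix before "::" ("egi::subcommunity").
    static boolean matches(String contextId, Set<String> communities) {
        if (communities.contains(contextId)) {
            return true;
        }
        return contextId.contains("::")
            && communities.contains(contextId.substring(0, contextId.indexOf("::")));
    }

    public static void main(String[] args) {
        Set<String> communities = new HashSet<>(Arrays.asList("egi", "dh-ch")); // hypothetical community ids
        System.out.println(matches("egi", communities));                  // true
        System.out.println(matches("dh-ch::subcommunity", communities));  // true (prefix match)
        System.out.println(matches("unrelated", communities));            // false -> record is skipped
    }
}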
SparkPrepareResultProject.java
The result-to-project preparation now guards acronym, title and fundingtree with Optional, de-duplicates projects per result through a projectSet, collects them in an ArrayList (the old code added to an Arrays.asList-backed list), and parses the funder (shortname, name, jurisdiction, funding stream) out of the fundingtree XML with dom4j instead of copying the raw fundingtree strings. Reconstructed new version of the shown hunks:

@ -1,11 +1,13 @@
package eu.dnetlib.dhp.oa.graph.dump;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.io.Serializable;
import java.io.StringReader;
import java.util.*;
import java.util.stream.Collectors;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;

@ -14,86 +16,165 @@ import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.Node;
import org.dom4j.io.SAXReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.dump.oaf.Funder;
import eu.dnetlib.dhp.schema.dump.oaf.Projects;
import eu.dnetlib.dhp.schema.dump.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Relation;
import scala.Tuple2;

public class SparkPrepareResultProject implements Serializable {
    private static final Logger log = LoggerFactory.getLogger(SparkPrepareResultProject.class);

    public static void main(String[] args) throws Exception {
        String jsonConfiguration = IOUtils
            .toString(
                SparkPrepareResultProject.class
                    .getResourceAsStream(
                        "/eu/dnetlib/dhp/oa/graph/dump/project_prepare_parameters.json"));

        final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
        parser.parseArgument(args);

        Boolean isSparkSessionManaged = Optional
            .ofNullable(parser.get("isSparkSessionManaged"))
            .map(Boolean::valueOf)
            .orElse(Boolean.TRUE);
        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

        final String inputPath = parser.get("sourcePath");
        log.info("inputPath: {}", inputPath);

        final String outputPath = parser.get("outputPath");
        log.info("outputPath: {}", outputPath);

        SparkConf conf = new SparkConf();

        runWithSparkSession(
            conf,
            isSparkSessionManaged,
            spark -> {
                Utils.removeOutputDir(spark, outputPath);
                prepareResultProjectList(spark, inputPath, outputPath);
            });
    }

    private static void prepareResultProjectList(SparkSession spark, String inputPath, String outputPath) {
        Dataset<Relation> relation = Utils
            .readPath(spark, inputPath + "/relation", Relation.class)
            .filter("dataInfo.deletedbyinference = false and relClass = 'produces'");
        Dataset<Project> projects = Utils.readPath(spark, inputPath + "/project", Project.class);

        projects
            .joinWith(relation, projects.col("id").equalTo(relation.col("source")))
            .groupByKey(
                (MapFunction<Tuple2<Project, Relation>, String>) value -> value._2().getTarget(), Encoders.STRING())
            .mapGroups((MapGroupsFunction<String, Tuple2<Project, Relation>, ResultProject>) (s, it) -> {
                Set<String> projectSet = new HashSet<>();
                Tuple2<Project, Relation> first = it.next();
                ResultProject rp = new ResultProject();
                rp.setResultId(first._2().getTarget());
                Project p = first._1();
                projectSet.add(p.getId());
                Projects ps = Projects
                    .newInstance(
                        p.getId(), p.getCode().getValue(),
                        Optional.ofNullable(p.getAcronym()).map(a -> a.getValue()).orElse(null),
                        Optional.ofNullable(p.getTitle()).map(v -> v.getValue()).orElse(null),
                        Optional.ofNullable(p.getFundingtree())
                            .map(value -> value.stream().map(ft -> getFunder(ft.getValue())).collect(Collectors.toList()).get(0))
                            .orElse(null));
                List<Projects> projList = new ArrayList<>();
                projList.add(ps);
                rp.setProjectsList(projList);
                it.forEachRemaining(c -> {
                    Project op = c._1();
                    if (!projectSet.contains(op.getId())) {
                        projList
                            .add(
                                Projects
                                    .newInstance(
                                        op.getId(),
                                        op.getCode().getValue(),
                                        Optional.ofNullable(op.getAcronym()).map(a -> a.getValue()).orElse(null),
                                        Optional.ofNullable(op.getTitle()).map(v -> v.getValue()).orElse(null),
                                        Optional.ofNullable(op.getFundingtree())
                                            .map(value -> value.stream().map(ft -> getFunder(ft.getValue())).collect(Collectors.toList()).get(0))
                                            .orElse(null)));
                        projectSet.add(op.getId());
                    }
                });
                return rp;
            }, Encoders.bean(ResultProject.class))
            .write()
            .mode(SaveMode.Overwrite)
            .option("compression", "gzip")
            .json(outputPath);
    }

    private static Funder getFunder(String fundingtree) {
        // e.g. ["<fundingtree><funder><id>nsf_________::NSF</id><shortname>NSF</shortname><name>National Science
        // Foundation</name><jurisdiction>US</jurisdiction></funder><funding_level_1>...
        // <funding_level_0><id>nsf_________::NSF::CISE/OAD</id><description>Directorate for Computer &
        // Information Science & Engineering</description><name>Directorate for Computer & Information Science &
        // Engineering</name><parent/><class>nsf:fundingStream</class></funding_level_0>...</funding_level_1></fundingtree>"]
        Funder f = new Funder();
        final Document doc;
        try {
            doc = new SAXReader().read(new StringReader(fundingtree));
            f.setShortName(((Node) (doc.selectNodes("//funder/shortname").get(0))).getText());
            f.setName(((Node) (doc.selectNodes("//funder/name").get(0))).getText());
            f.setJurisdiction(((Node) (doc.selectNodes("//funder/jurisdiction").get(0))).getText());
            for (Object o : doc.selectNodes("//funding_level_0")) {
                List node = ((Node) o).selectNodes("./name");
                f.setFundingStream(((Node) node.get(0)).getText());
            }
            return f;
        } catch (DocumentException e) {
            e.printStackTrace();
        }
        return f;
    }
}
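A minimal sketch of the same dom4j extraction run against a trimmed, flattened version of the sample fundingtree from the comment above (the same XPath expressions as getFunder; a sketch of the parsing approach, not a test from the repository):

import java.io.StringReader;

import org.dom4j.Document;
import org.dom4j.Node;
import org.dom4j.io.SAXReader;

public class FundingTreeSketch {
    public static void main(String[] args) throws Exception {
        String fundingtree = "<fundingtree><funder><id>nsf_________::NSF</id><shortname>NSF</shortname>"
            + "<name>National Science Foundation</name><jurisdiction>US</jurisdiction></funder>"
            + "<funding_level_0><id>nsf_________::NSF::CISE/OAD</id>"
            + "<name>Directorate for Computer &amp; Information Science &amp; Engineering</name>"
            + "<class>nsf:fundingStream</class></funding_level_0></fundingtree>";
        Document doc = new SAXReader().read(new StringReader(fundingtree));
        // shortname, name and jurisdiction come from the <funder> element ...
        System.out.println(((Node) doc.selectNodes("//funder/shortname").get(0)).getText());    // NSF
        System.out.println(((Node) doc.selectNodes("//funder/name").get(0)).getText());         // National Science Foundation
        System.out.println(((Node) doc.selectNodes("//funder/jurisdiction").get(0)).getText()); // US
        // ... while the funding stream name comes from <funding_level_0>/<name>.
        Node level0 = (Node) doc.selectNodes("//funding_level_0").get(0);
        System.out.println(((Node) level0.selectNodes("./name").get(0)).getText());
    }
}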
SparkSplitForCommunity.java
@ -1,98 +1,122 @@
The split driver gets the same community-map bootstrapping as the dump driver (ISLookUp query, or Gson-parsed communityMap parameter when the isLookUpUrl is still the BASEURL placeholder); execSplit then writes one gzip JSON folder per community under outputPath, appending every record whose context codes contain that community. Reconstructed new version (the diff view is cut off in the source inside containsCommunity):

package eu.dnetlib.dhp.oa.graph.dump;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.io.Serializable;
import java.io.StringReader;
import java.util.*;
import java.util.stream.Collectors;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.gson.Gson;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.dump.oaf.Result;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;

public class SparkSplitForCommunity implements Serializable {

    private static final Logger log = LoggerFactory.getLogger(SparkSplitForCommunity.class);

    public static void main(String[] args) throws Exception {
        String jsonConfiguration = IOUtils
            .toString(
                SparkSplitForCommunity.class
                    .getResourceAsStream(
                        "/eu/dnetlib/dhp/oa/graph/dump/split_parameters.json"));

        final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
        parser.parseArgument(args);

        Boolean isSparkSessionManaged = Optional
            .ofNullable(parser.get("isSparkSessionManaged"))
            .map(Boolean::valueOf)
            .orElse(Boolean.TRUE);
        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

        final String inputPath = parser.get("sourcePath");
        log.info("inputPath: {}", inputPath);

        final String outputPath = parser.get("outputPath");
        log.info("outputPath: {}", outputPath);

        final String resultClassName = parser.get("resultTableName");
        log.info("resultTableName: {}", resultClassName);

        final String isLookUpUrl = parser.get("isLookUpUrl");
        log.info("isLookUpUrl: {}", isLookUpUrl);

        final Optional<String> cm = Optional.ofNullable(parser.get("communityMap"));

        Class<? extends Result> inputClazz = (Class<? extends Result>) Class.forName(resultClassName);

        SparkConf conf = new SparkConf();

        CommunityMap communityMap;

        if (!isLookUpUrl.equals("BASEURL:8280/is/services/isLookUp")) {
            QueryInformationSystem queryInformationSystem = new QueryInformationSystem();
            queryInformationSystem.setIsLookUp(getIsLookUpService(isLookUpUrl));
            communityMap = queryInformationSystem.getCommunityMap();
        } else {
            communityMap = new Gson().fromJson(cm.get(), CommunityMap.class);
        }

        runWithSparkSession(
            conf,
            isSparkSessionManaged,
            spark -> {
                Utils.removeOutputDir(spark, outputPath);
                execSplit(spark, inputPath, outputPath, communityMap.keySet(), inputClazz);
            });
    }

    public static ISLookUpService getIsLookUpService(String isLookUpUrl) {
        return ISLookupClientFactory.getLookUpService(isLookUpUrl);
    }

    private static <R extends Result> void execSplit(SparkSession spark, String inputPath, String outputPath,
        Set<String> communities, Class<R> inputClazz) {

        Dataset<R> result = Utils.readPath(spark, inputPath, inputClazz);

        communities
            .stream()
            .forEach(c -> printResult(c, result, outputPath));

    }

    private static <R extends Result> void printResult(String c, Dataset<R> result, String outputPath) {
        result
            .filter(r -> containsCommunity(r, c))
            .write()
            .option("compression", "gzip")
            .mode(SaveMode.Append)
            .json(outputPath + "/" + c);
    }

    private static <R extends Result> boolean containsCommunity(R r, String c) {
        if (Optional.ofNullable(r.getContext()).isPresent()) {
            return r
                .getContext()
                .stream()
                .filter(con -> con.getCode().equals(c))
                .collect(Collectors.toList())
                [diff truncated in the source at this point]
|
.size() > 0;
|
||||||
private static <R extends Result> boolean containsCommunity(R r, String c) {
|
}
|
||||||
if(Optional.ofNullable(r.getContext()).isPresent()) {
|
return false;
|
||||||
return r.getContext().stream().filter(con -> con.getCode().equals(c)).collect(Collectors.toList()).size() > 0;
|
}
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
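For reference, a minimal sketch (not part of this commit) of how the communityMap argument handled in the else-branch above might be produced and consumed. The community codes and labels are invented example values, and the JSON shape (a flat object of code-to-label pairs) is an assumption inferred from the fromJson(cm.get(), CommunityMap.class) call and the communityMap.keySet() usage above.

// Sketch only: invented community codes/labels.
// Assumes this class sits in eu.dnetlib.dhp.oa.graph.dump so CommunityMap resolves without an import.
import com.google.gson.Gson;

public class CommunityMapSketch {

    public static void main(String[] args) {
        // Assumed JSON shape: community code -> community label.
        String json = "{\"egi\": \"EGI Federation\", \"dh-ch\": \"Digital Humanities and Cultural Heritage\"}";

        CommunityMap communityMap = new Gson().fromJson(json, CommunityMap.class);

        // execSplit iterates over the key set, so each code would become one
        // output sub-directory, e.g. <outputPath>/egi and <outputPath>/dh-ch.
        communityMap.keySet().forEach(System.out::println);
    }
}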
@ -1,11 +1,11 @@

package eu.dnetlib.dhp.oa.graph.dump;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

import java.io.Serializable;
import java.util.Optional;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;

@ -16,84 +16,86 @@ import org.apache.spark.sql.SaveMode;

import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.dump.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Relation;
import scala.Tuple2;

public class SparkUpdateProjectInfo implements Serializable {

    private static final Logger log = LoggerFactory.getLogger(SparkUpdateProjectInfo.class);
    public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    public static void main(String[] args) throws Exception {
        String jsonConfiguration = IOUtils
            .toString(
                SparkUpdateProjectInfo.class
                    .getResourceAsStream(
                        "/eu/dnetlib/dhp/oa/graph/dump/project_input_parameters.json"));

        final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
        parser.parseArgument(args);

        Boolean isSparkSessionManaged = Optional
            .ofNullable(parser.get("isSparkSessionManaged"))
            .map(Boolean::valueOf)
            .orElse(Boolean.TRUE);
        log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

        final String inputPath = parser.get("sourcePath");
        log.info("inputPath: {}", inputPath);

        final String outputPath = parser.get("outputPath");
        log.info("outputPath: {}", outputPath);

        final String resultClassName = parser.get("resultTableName");
        log.info("resultTableName: {}", resultClassName);

        final String preparedInfoPath = parser.get("preparedInfoPath");
        log.info("preparedInfoPath: {}", preparedInfoPath);

        Class<? extends Result> inputClazz = (Class<? extends Result>) Class.forName(resultClassName);

        SparkConf conf = new SparkConf();
        runWithSparkSession(
            conf,
            isSparkSessionManaged,
            spark -> {
                Utils.removeOutputDir(spark, outputPath);
                extend(spark, inputPath, outputPath, preparedInfoPath, inputClazz);
            });
    }

    private static <R extends Result> void extend(
        SparkSession spark,
        String inputPath,
        String outputPath,
        String preparedInfoPath,
        Class<R> inputClazz) {

        Dataset<R> result = Utils.readPath(spark, inputPath, inputClazz);
        Dataset<ResultProject> resultProject = Utils.readPath(spark, preparedInfoPath, ResultProject.class);
        result
            .joinWith(
                resultProject, result.col("id").equalTo(resultProject.col("resultId")),
                "left")
            .map(value -> {
                R r = value._1();
                Optional.ofNullable(value._2()).ifPresent(rp -> {
                    r.setProjects(rp.getProjectsList());
                });
                return r;
            }, Encoders.bean(inputClazz))
            .write()
            .option("compression", "gzip")
            .mode(SaveMode.Overwrite)
            .json(outputPath);
    }
}
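A minimal, self-contained sketch of the same left-join enrichment pattern used by extend above, with hypothetical Foo/FooInfo beans standing in for the result and ResultProject types. It is an illustration under those assumptions, not code from this commit; the explicit MapFunction cast is one way to keep the Java overload resolution unambiguous.

// Sketch only: hypothetical bean types and local execution.
import java.io.Serializable;
import java.util.Arrays;
import java.util.Optional;

import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;

import scala.Tuple2;

public class LeftJoinEnrichSketch {

    public static class Foo implements Serializable {
        private String id;
        private String label;
        public String getId() { return id; }
        public void setId(String id) { this.id = id; }
        public String getLabel() { return label; }
        public void setLabel(String label) { this.label = label; }
    }

    public static class FooInfo implements Serializable {
        private String fooId;
        private String extra;
        public String getFooId() { return fooId; }
        public void setFooId(String fooId) { this.fooId = fooId; }
        public String getExtra() { return extra; }
        public void setExtra(String extra) { this.extra = extra; }
    }

    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().master("local[*]").appName("left-join-sketch").getOrCreate();

        Foo a = new Foo(); a.setId("1"); a.setLabel("first");
        Foo b = new Foo(); b.setId("2"); b.setLabel("second");
        FooInfo ia = new FooInfo(); ia.setFooId("1"); ia.setExtra("enriched");

        Dataset<Foo> foos = spark.createDataset(Arrays.asList(a, b), Encoders.bean(Foo.class));
        Dataset<FooInfo> infos = spark.createDataset(Arrays.asList(ia), Encoders.bean(FooInfo.class));

        // "left" keeps every Foo; the right side of the tuple is null when no info matches,
        // which is why the map step guards the enrichment with Optional.ofNullable.
        Dataset<Foo> enriched = foos
            .joinWith(infos, foos.col("id").equalTo(infos.col("fooId")), "left")
            .map((MapFunction<Tuple2<Foo, FooInfo>, Foo>) t -> {
                Foo f = t._1();
                Optional.ofNullable(t._2()).ifPresent(info -> f.setLabel(f.getLabel() + "/" + info.getExtra()));
                return f;
            }, Encoders.bean(Foo.class));

        enriched.show(false);
        spark.stop();
    }
}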
@ -1,23 +1,27 @@

package eu.dnetlib.dhp.oa.graph.dump;

import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.common.HdfsSupport;

public class Utils {

    public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    public static void removeOutputDir(SparkSession spark, String path) {
        HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
    }

    public static <R> Dataset<R> readPath(
        SparkSession spark, String inputPath, Class<R> clazz) {
        return spark
            .read()
            .textFile(inputPath)
            .map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
    }
}
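A possible usage sketch for Utils.readPath (not part of the commit). It assumes the input is newline-delimited JSON, one serialized record per line, which is what the dump jobs above write via .json(outputPath); the path and the local master are placeholders.

// Sketch only: placeholder path, local execution.
// Assumes this class sits in eu.dnetlib.dhp.oa.graph.dump so Utils resolves without an import.
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.SparkSession;

import eu.dnetlib.dhp.schema.dump.oaf.Result;

public class ReadPathSketch {

    public static void main(String[] args) {
        SparkSession spark = SparkSession
            .builder()
            .master("local[*]")
            .appName("readPath-sketch")
            .getOrCreate();

        // readPath parses each text line with Jackson, so gzipped JSON-lines
        // output produced by the jobs above can be read back as typed beans.
        Dataset<Result> results = Utils.readPath(spark, "/tmp/dump/publication", Result.class);

        results.printSchema();
        spark.stop();
    }
}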