split the classes related to the communities dump and to the whole graph dump

This commit is contained in:
Miriam Baglioni 2020-07-24 17:21:48 +02:00
parent 355d7e426e
commit 332258d199
39 changed files with 1142 additions and 629 deletions

View File

@@ -1,41 +0,0 @@
package eu.dnetlib.dhp.schema.dump.oaf;
import java.io.Serializable;
import java.util.List;
import java.util.Objects;
import java.util.stream.Collectors;
public abstract class Oaf implements Serializable {
/**
* The list of datasource id/name pairs providing this relationship.
*/
protected List<KeyValue> collectedfrom;
private Long lastupdatetimestamp;
public List<KeyValue> getCollectedfrom() {
return collectedfrom;
}
public void setCollectedfrom(List<KeyValue> collectedfrom) {
this.collectedfrom = collectedfrom;
}
public Long getLastupdatetimestamp() {
return lastupdatetimestamp;
}
public void setLastupdatetimestamp(Long lastupdatetimestamp) {
this.lastupdatetimestamp = lastupdatetimestamp;
}
// public void setAllowedValues(eu.dnetlib.dhp.schema.oaf.Oaf o){
// collectedfrom = o.getCollectedfrom().stream().map(cf -> KeyValue.newInstance(cf)).collect(Collectors.toList());
//
// lastupdatetimestamp = o.getLastupdatetimestamp();
//
// }
}

View File

@@ -1,61 +0,0 @@
package eu.dnetlib.dhp.schema.dump.oaf;
import eu.dnetlib.dhp.schema.dump.oaf.community.Project;
import java.io.Serializable;
import java.util.List;
public abstract class OafEntity extends Oaf implements Serializable {
private String id;
private List<String> originalId;
private List<ControlledField> pid;
private String dateofcollection;
private List<Project> projects;
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public List<String> getOriginalId() {
return originalId;
}
public void setOriginalId(List<String> originalId) {
this.originalId = originalId;
}
public List<ControlledField> getPid() {
return pid;
}
public void setPid(List<ControlledField> pid) {
this.pid = pid;
}
public String getDateofcollection() {
return dateofcollection;
}
public void setDateofcollection(String dateofcollection) {
this.dateofcollection = dateofcollection;
}
public List<Project> getProjects() {
return projects;
}
public void setProjects(List<Project> projects) {
this.projects = projects;
}
}

View File

@@ -0,0 +1,41 @@
package eu.dnetlib.dhp.schema.dump.oaf.community;
import java.util.List;
import eu.dnetlib.dhp.schema.dump.oaf.Context;
import eu.dnetlib.dhp.schema.dump.oaf.KeyValue;
import eu.dnetlib.dhp.schema.dump.oaf.Result;
public class CommunityResult extends Result {
private List<Project> projects;
private List<Context> context;
protected List<KeyValue> collectedfrom;
public List<KeyValue> getCollectedfrom() {
return collectedfrom;
}
public void setCollectedfrom(List<KeyValue> collectedfrom) {
this.collectedfrom = collectedfrom;
}
public List<Project> getProjects() {
return projects;
}
public void setProjects(List<Project> projects) {
this.projects = projects;
}
public List<Context> getContext() {
return context;
}
public void setContext(List<Context> context) {
this.context = context;
}
}

View File

@@ -1,11 +1,11 @@
package eu.dnetlib.dhp.schema.dump.oaf.community;
import java.io.Serializable;
import eu.dnetlib.dhp.schema.dump.oaf.Funder;
import eu.dnetlib.dhp.schema.dump.oaf.Provenance;
import java.io.Serializable;
public class Project implements Serializable {
private String id; // OpenAIRE id

View File

@@ -0,0 +1,21 @@
package eu.dnetlib.dhp.schema.dump.oaf.graph;
import java.io.Serializable;
public class Constants implements Serializable {
// collectedFrom goes with isProvidedBy -> taken from ModelSupport
public static final String HOSTED_BY = "isHostedBy";
public static final String HOSTS = "hosts";
// for community results use isRelatedTo
public static final String RESULT_ENTITY = "result";
public static final String DATASOURCE_ENTITY = "datasource";
public static final String CONTEXT_ENTITY = "context";
public static final String CONTEXT_ID = "60";
public static final String CONTEXT_NS_PREFIX = "context____";
}

View File

@@ -1,27 +1,28 @@
package eu.dnetlib.dhp.schema.dump.oaf.graph;
import java.io.Serializable;
import java.util.List;
public class Funder implements Serializable {
private String id ;
// private String id ;
private String shortName;
private String name;
private List<Levels> funding_levels;
// private List<Levels> funding_levels;
private Fundings funding_stream;
private String jurisdiction;
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
// public String getId() {
// return id;
// }
//
// public void setId(String id) {
// this.id = id;
// }
public String getShortName() {
return shortName;
@@ -39,13 +40,13 @@ public class Funder implements Serializable {
this.name = name;
}
public List<Levels> getFunding_levels() {
return funding_levels;
}
public void setFunding_levels(List<Levels> funding_levels) {
this.funding_levels = funding_levels;
}
// public List<Levels> getFunding_levels() {
// return funding_levels;
// }
//
// public void setFunding_levels(List<Levels> funding_levels) {
// this.funding_levels = funding_levels;
// }
public String getJurisdiction() {
return jurisdiction;
@@ -54,4 +55,12 @@ public class Funder implements Serializable {
public void setJurisdiction(String jurisdiction) {
this.jurisdiction = jurisdiction;
}
public Fundings getFunding_stream() {
return funding_stream;
}
public void setFunding_stream(Fundings funding_stream) {
this.funding_stream = funding_stream;
}
}

View File

@@ -0,0 +1,26 @@
package eu.dnetlib.dhp.schema.dump.oaf.graph;
import java.io.Serializable;
public class Fundings implements Serializable {
private String id;
private String description;
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public String getDescription() {
return description;
}
public void setDescription(String description) {
this.description = description;
}
}

View File

@@ -1,11 +1,13 @@
package eu.dnetlib.dhp.schema.dump.oaf.graph;
import java.io.Serializable;
import java.util.Optional;
public class Granted implements Serializable {
private String currency;
private String totalcost;
private String fundedamount;
private float totalcost;
private float fundedamount;
public String getCurrency() {
return currency;
@@ -15,19 +17,34 @@ public class Granted implements Serializable {
this.currency = currency;
}
public String getTotalcost() {
public float getTotalcost() {
return totalcost;
}
public void setTotalcost(String totalcost) {
public void setTotalcost(float totalcost) {
this.totalcost = totalcost;
}
public String getFundedamount() {
public float getFundedamount() {
return fundedamount;
}
public void setFundedamount(String fundedamount) {
public void setFundedamount(float fundedamount) {
this.fundedamount = fundedamount;
}
public static Granted newInstance(String currency, float totalcost, float fundedamount) {
Granted granted = new Granted();
granted.currency = currency;
granted.totalcost = totalcost;
granted.fundedamount = fundedamount;
return granted;
}
public static Granted newInstance(String currency, float fundedamount) {
Granted granted = new Granted();
granted.currency = currency;
granted.fundedamount = fundedamount;
return granted;
}
}
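A quick usage sketch of the two factory methods above; the figures are invented for illustration. The three-argument variant fits projects whose total cost is known, the two-argument one those where only the funded amount is:
// illustrative values only
Granted g1 = Granted.newInstance("EUR", 2_500_000f, 2_000_000f); // currency, total cost, funded amount
Granted g2 = Granted.newInstance("EUR", 2_000_000f); // currency and funded amount only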

View File

@@ -1,10 +1,11 @@
package eu.dnetlib.dhp.schema.dump.oaf.graph;
import java.io.Serializable;
public class Levels implements Serializable {
private String level;
private String il;
private String id;
private String description;
private String name;
@@ -16,12 +17,12 @@ public class Levels implements Serializable {
this.level = level;
}
public String getIl() {
return il;
public String getId() {
return id;
}
public void setIl(String il) {
this.il = il;
public void setId(String id) {
this.id = id;
}
public String getDescription() {

View File

@@ -1,3 +1,4 @@
package eu.dnetlib.dhp.schema.dump.oaf.graph;
import java.io.Serializable;
@@ -21,4 +22,11 @@ public class Node implements Serializable {
public void setType(String type) {
this.type = type;
}
public static Node newInstance(String id, String type) {
Node node = new Node();
node.id = id;
node.type = type;
return node;
}
}

View File

@@ -1,21 +1,24 @@
package eu.dnetlib.dhp.schema.dump.oaf.graph;
import eu.dnetlib.dhp.schema.dump.oaf.ControlledField;
import eu.dnetlib.dhp.schema.dump.oaf.Country;
import eu.dnetlib.dhp.schema.dump.oaf.community.Project;
package eu.dnetlib.dhp.schema.dump.oaf.graph;
import java.io.Serializable;
import java.util.List;
import eu.dnetlib.dhp.schema.dump.oaf.ControlledField;
import eu.dnetlib.dhp.schema.dump.oaf.Country;
import eu.dnetlib.dhp.schema.dump.oaf.KeyValue;
import eu.dnetlib.dhp.schema.dump.oaf.Qualifier;
import eu.dnetlib.dhp.schema.dump.oaf.community.Project;
public class Organization implements Serializable {
private String legalshortname;
private String legalname;
private String websiteurl;
private List<String> alternativenames;
private Country country;
private Qualifier country;
private String id;
private List<ControlledField> pid;
private String collectedfrom;
private List<KeyValue> collectedfrom;
public String getLegalshortname() {
return legalshortname;
@@ -49,11 +52,11 @@ public class Organization implements Serializable {
this.alternativenames = alternativenames;
}
public Country getCountry() {
public Qualifier getCountry() {
return country;
}
public void setCountry(Country country) {
public void setCountry(Qualifier country) {
this.country = country;
}
@@ -73,11 +76,11 @@ public class Organization implements Serializable {
this.pid = pid;
}
public String getCollectedfrom() {
public List<KeyValue> getCollectedfrom() {
return collectedfrom;
}
public void setCollectedfrom(String collectedfrom) {
public void setCollectedfrom(List<KeyValue> collectedfrom) {
this.collectedfrom = collectedfrom;
}
}

View File

@@ -1,3 +1,4 @@
package eu.dnetlib.dhp.schema.dump.oaf.graph;
import java.io.Serializable;
@@ -22,5 +23,10 @@ public class Programme implements Serializable {
this.description = description;
}
public static Programme newInstance(String code, String description) {
Programme p = new Programme();
p.code = code;
p.description = description;
return p;
}
}

View File

@@ -1,14 +1,14 @@
package eu.dnetlib.dhp.schema.dump.oaf.graph;
import java.io.Serializable;
import java.util.List;
import eu.dnetlib.dhp.schema.dump.oaf.KeyValue;
public class Project implements Serializable {
private String id;
private List<KeyValue> collectedfrom;
private String websiteurl;
private String code;
private String acronym;
@@ -27,13 +27,14 @@
private boolean openaccessmandatefordataset;
private List<String> subject;
private Funder funding;
private List<Funder> funding;
private String summary;
private Granted granted;
private Programme programme;
private List<Programme> programme;
public String getId() {
return id;
@@ -139,11 +140,11 @@
this.subject = subject;
}
public Funder getFunding() {
public List<Funder> getFunding() {
return funding;
}
public void setFunding(Funder funding) {
public void setFunding(List<Funder> funding) {
this.funding = funding;
}
@@ -163,11 +164,19 @@
this.granted = granted;
}
public Programme getProgramme() {
public List<Programme> getProgramme() {
return programme;
}
public void setProgramme(Programme programme) {
public void setProgramme(List<Programme> programme) {
this.programme = programme;
}
public List<KeyValue> getCollectedfrom() {
return collectedfrom;
}
public void setCollectedfrom(List<KeyValue> collectedfrom) {
this.collectedfrom = collectedfrom;
}
}

View File

@@ -1,10 +1,11 @@
package eu.dnetlib.dhp.schema.dump.oaf.graph;
import java.io.Serializable;
public class RelType implements Serializable {
private String name ; //relclass
private String type ; //subreltype
private String name; // relclass
private String type; // subreltype
public String getName() {
return name;
@@ -21,4 +22,11 @@
public void setType(String type) {
this.type = type;
}
public static RelType newInstance(String name, String type) {
RelType rel = new RelType();
rel.name = name;
rel.type = type;
return rel;
}
}

View File

@@ -1,9 +1,10 @@
package eu.dnetlib.dhp.schema.dump.oaf.graph;
import eu.dnetlib.dhp.schema.dump.oaf.Provenance;
import java.io.Serializable;
import eu.dnetlib.dhp.schema.dump.oaf.Provenance;
public class Relation implements Serializable {
private Node source;
private Node target;
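As a sketch of how the graph-dump classes combine, a hosts relation between a datasource and a result could be wired as below. The setSource/setTarget/setReltype setters and the "provision" subreltype are assumptions not shown in this diff; the identifiers are invented:
Node source = Node.newInstance("10|fake_datasource::0001", Constants.DATASOURCE_ENTITY);
Node target = Node.newInstance("50|fake_result____::0002", Constants.RESULT_ENTITY);
Relation hostsRel = new Relation();
hostsRel.setSource(source); // assumed setter
hostsRel.setTarget(target); // assumed setter
hostsRel.setReltype(RelType.newInstance(Constants.HOSTS, "provision")); // subreltype value is an assumption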

View File

@@ -0,0 +1,16 @@
package eu.dnetlib.dhp.schema.dump.oaf.graph;
import java.util.List;
public class ResearchCommunity extends ResearchInitiative {
private List<String> subject;
public List<String> getSubject() {
return subject;
}
public void setSubject(List<String> subject) {
this.subject = subject;
}
}

View File

@@ -0,0 +1,61 @@
package eu.dnetlib.dhp.schema.dump.oaf.graph;
import java.io.Serializable;
public class ResearchInitiative implements Serializable {
private String id; // openaireId
private String originalId; // context id
private String name; // context name
private String type; // context type: research initiative or research community
private String description;
private String zenodo_community;
public String getZenodo_community() {
return zenodo_community;
}
public void setZenodo_community(String zenodo_community) {
this.zenodo_community = zenodo_community;
}
public String getType() {
return type;
}
public void setType(String type) {
this.type = type;
}
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public String getName() {
return name;
}
public void setName(String label) {
this.name = label;
}
public String getOriginalId() {
return originalId;
}
public void setOriginalId(String originalId) {
this.originalId = originalId;
}
public String getDescription() {
return description;
}
public void setDescription(String description) {
this.description = description;
}
}

View File

@@ -0,0 +1,4 @@
package eu.dnetlib.dhp.oa.graph.dump.graph;
public class ContextInfo {
}

View File

@@ -0,0 +1,77 @@
package eu.dnetlib.dhp.oa.graph.dump.graph;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.schema.dump.oaf.graph.ResearchCommunity;
import eu.dnetlib.dhp.schema.dump.oaf.graph.ResearchInitiative;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.Serializable;
import java.util.Optional;
public class SparkCreateContextEntities implements Serializable {
// read the contexts from the IS and build the map id -> ContextInfo
// create the entities with the general information
// create the relations from the information in projectList and datasourceList. The relations are of type isRelatedTo in both directions
// take a community_organization parameter to create relations between communities and organizations. N.B. the organization id
// is not deduplicated => it must be resolved and the distinct dedup ids taken
private static final Logger log = LoggerFactory.getLogger(SparkCreateContextEntities.class);
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
SparkCreateContextEntities.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/input_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
final String isLookUpUrl = parser.get("isLookUpUrl");
log.info("isLookUpUrl: {}", isLookUpUrl);
QueryInformationSystem queryInformationSystem = new QueryInformationSystem();
queryInformationSystem.setIsLookUp(Utils.getIsLookUpService(isLookUpUrl));
CommunityMap communityMap = queryInformationSystem.getCommunityMap();
createEntities(communityMap, outputPath + "/context");
}
private static void createEntities(CommunityMap communityMap, String outputPath) {
communityMap.keySet().stream()
.map(key -> {
ResearchInitiative community;
ContextInfo cinfo = communityMap.get(key);
if (cinfo.getType().equals("community")) {
community = new ResearchCommunity();
} else {
community = new ResearchInitiative();
}
return community;
});
}
}

View File

@@ -0,0 +1,4 @@
package eu.dnetlib.dhp.oa.graph.dump.graph;
public class CreateContextRelation {
}

View File

@@ -0,0 +1,253 @@
package eu.dnetlib.dhp.oa.graph.dump.graph;
import eu.dnetlib.dhp.oa.graph.dump.DumpProducts;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.dump.oaf.ControlledField;
import eu.dnetlib.dhp.schema.dump.oaf.Country;
import eu.dnetlib.dhp.schema.dump.oaf.KeyValue;
import eu.dnetlib.dhp.schema.dump.oaf.Qualifier;
import eu.dnetlib.dhp.schema.dump.oaf.graph.*;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Node;
import org.dom4j.io.SAXReader;
import java.io.Serializable;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
public class DumpGraph implements Serializable {
public void run(Boolean isSparkSessionManaged,
String inputPath,
String outputPath,
Class<? extends OafEntity> inputClazz,
CommunityMap communityMap) {
SparkConf conf = new SparkConf();
switch (ModelSupport.idPrefixMap.get(inputClazz)) {
case "50":
DumpProducts d = new DumpProducts();
d.run(isSparkSessionManaged, inputPath, outputPath, communityMap, inputClazz, true);
break;
case "40":
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, outputPath);
projectMap(spark, inputPath, outputPath, inputClazz);
});
break;
case "20":
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, outputPath);
organizationMap(spark, inputPath, outputPath, inputClazz);
});
break;
}
}
private static <E extends OafEntity> void projectMap(SparkSession spark, String inputPath, String outputPath, Class<E> inputClazz) {
Utils.readPath(spark, inputPath, inputClazz)
.map((MapFunction<E, Project>) p -> mapProject((eu.dnetlib.dhp.schema.oaf.Project) p), Encoders.bean(Project.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression","gzip")
.json(outputPath);
}
private static Project mapProject(eu.dnetlib.dhp.schema.oaf.Project p) {
Project project = new Project();
project.setCollectedfrom(Optional.ofNullable(p.getCollectedfrom())
.map(cf -> cf.stream().map(coll -> coll.getValue()).collect(Collectors.toList()))
.orElse(new ArrayList<>()));
Optional.ofNullable(p.getId())
.ifPresent(id -> project.setId(id));
Optional.ofNullable(p.getWebsiteurl())
.ifPresent(w -> project.setWebsiteurl(w.getValue()));
Optional.ofNullable(p.getCode())
.ifPresent(code -> project.setCode(code.getValue()));
Optional.ofNullable(p.getAcronym())
.ifPresent(acronym -> project.setAcronym(acronym.getValue()));
Optional.ofNullable(p.getTitle())
.ifPresent(title -> project.setTitle(title.getValue()));
Optional.ofNullable(p.getStartdate())
.ifPresent(sdate -> project.setStartdate(sdate.getValue()));
Optional.ofNullable(p.getEnddate())
.ifPresent(edate -> project.setEnddate(edate.getValue()));
Optional.ofNullable(p.getCallidentifier())
.ifPresent(cide -> project.setCallidentifier(cide.getValue()));
Optional.ofNullable(p.getKeywords())
.ifPresent(key -> project.setKeywords(key.getValue()));
Optional.ofNullable(p.getDuration())
.ifPresent(duration -> project.setDuration(duration.getValue()));
Optional<Field<String>> omandate = Optional.ofNullable(p.getOamandatepublications());
Optional<Field<String>> oecsc39 = Optional.ofNullable(p.getEcsc39());
boolean mandate = false;
if(omandate.isPresent()){
if (!omandate.get().getValue().equals("N")){
mandate = true;
}
}
if(oecsc39.isPresent()){
if(!oecsc39.get().getValue().equals("N")){
mandate = true;
}
}
project.setOpenaccessmandateforpublications(mandate);
project.setOpenaccessmandatefordataset(false);
Optional.ofNullable(p.getEcarticle29_3())
.ifPresent(oamandate -> project.setOpenaccessmandatefordataset(oamandate.getValue().equals("Y")));
project.setSubject(Optional.ofNullable(p.getSubjects())
.map(subjs -> subjs.stream().map(s -> s.getValue()).collect(Collectors.toList()))
.orElse(new ArrayList<>()));
Optional.ofNullable(p.getSummary())
.ifPresent(summary -> project.setSummary(summary.getValue()));
Optional<Float> ofundedamount = Optional.ofNullable(p.getFundedamount());
Optional<Field<String>> ocurrency = Optional.ofNullable(p.getCurrency());
Optional<Float> ototalcost = Optional.ofNullable(p.getTotalcost());
if(ocurrency.isPresent()){
if(ofundedamount.isPresent()){
if(ototalcost.isPresent()){
project.setGranted(Granted.newInstance(ocurrency.get().getValue(), ototalcost.get(), ofundedamount.get()));
}else{
project.setGranted(Granted.newInstance(ocurrency.get().getValue(), ofundedamount.get()));
}
}
}
project.setProgramme(Optional.ofNullable(p.getProgramme())
.map(programme -> programme.stream().map(pg -> Programme.newInstance(pg.getCode(), pg.getDescription()))
.collect(Collectors.toList()))
.orElse(new ArrayList<>()));
project.setFunding(Optional.ofNullable(p.getFundingtree())
.map(value -> value.stream()
.map(fundingtree -> getFunder(fundingtree.getValue())).collect(Collectors.toList()))
.orElse(new ArrayList<>()));
return project;
}
public static Funder getFunder(String fundingtree){
Funder f = new Funder();
final Document doc;
try {
doc = new SAXReader().read(new StringReader(fundingtree));
f.setShortName(((org.dom4j.Node) (doc.selectNodes("//funder/shortname").get(0))).getText());
f.setName(((org.dom4j.Node) (doc.selectNodes("//funder/name").get(0))).getText());
f.setJurisdiction(((org.dom4j.Node) (doc.selectNodes("//funder/jurisdiction").get(0))).getText());
f.setId(((org.dom4j.Node) (doc.selectNodes("//funder/id").get(0))).getText());
List<Levels> fundings = new ArrayList<>();
int level = 0;
List<org.dom4j.Node> nodes = doc.selectNodes("//funding_level_"+level);
while(nodes.size() > 0 ) {
for(org.dom4j.Node n: nodes) {
Levels funding_stream = new Levels();
funding_stream.setLevel(String.valueOf(level));
List node = n.selectNodes("./name");
funding_stream.setName(((org.dom4j.Node)node.get(0)).getText());
node = n.selectNodes("./id");
funding_stream.setId(((org.dom4j.Node)node.get(0)).getText());
node = n.selectNodes("./description");
funding_stream.setDescription(((Node)node.get(0)).getText());
fundings.add(funding_stream);
}
level += 1;
nodes = doc.selectNodes("//funding_level_"+level);
}
if(fundings.size() > 0 ) {
f.setFunding_levels(fundings);
}
return f;
} catch (DocumentException e) {
e.printStackTrace();
}
return f;
}
private static <E extends OafEntity> void organizationMap(SparkSession spark, String inputPath, String outputPath, Class<E> inputClazz) {
Utils.readPath(spark, inputPath, inputClazz)
.map((MapFunction<E, Organization>) o -> mapOrganization((eu.dnetlib.dhp.schema.oaf.Organization) o), Encoders.bean(Organization.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression","gzip")
.json(outputPath);
}
private static Organization mapOrganization(eu.dnetlib.dhp.schema.oaf.Organization org){
Organization organization = new Organization();
Optional.ofNullable(org.getLegalshortname())
.ifPresent(value -> organization.setLegalshortname(value.getValue()));
Optional.ofNullable(org.getLegalname())
.ifPresent(value -> organization.setLegalname(value.getValue()));
Optional.ofNullable(org.getWebsiteurl())
.ifPresent(value -> organization.setWebsiteurl(value.getValue()));
Optional.ofNullable(org.getAlternativeNames())
.ifPresent(value -> organization.setAlternativenames(value.stream()
.map( v-> v.getValue()).collect(Collectors.toList())));
Optional.ofNullable(org.getCountry())
.ifPresent(value -> organization.setCountry(Qualifier.newInstance(value.getClassid(), value.getClassname())));
Optional.ofNullable(org.getId())
.ifPresent(value -> organization.setId(value));
Optional.ofNullable(org.getPid())
.ifPresent(value -> organization.setPid(
value.stream().map(p -> ControlledField.newInstance(p.getQualifier().getClassid(), p.getValue())).collect(Collectors.toList())
));
organization.setCollectedfrom(Optional.ofNullable(org.getCollectedfrom())
.map(value -> value.stream()
.map(cf -> KeyValue.newInstance(cf.getKey(),cf.getValue())).collect(Collectors.toList()))
.orElse(new ArrayList<>()));
return organization;
}
}
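The XPath expressions in getFunder imply a fundingtree document shaped roughly as below; this is a minimal sketch with invented values, showing a single funding_level_0 node (deeper levels follow the same name/id/description pattern):
String fundingtree = "<fundingtree>"
+ "<funder>"
+ "<id>ec__________::EC</id>"
+ "<shortname>EC</shortname>"
+ "<name>European Commission</name>"
+ "<jurisdiction>EU</jurisdiction>"
+ "</funder>"
+ "<funding_level_0>"
+ "<id>ec__________::EC::H2020</id>"
+ "<name>H2020</name>"
+ "<description>Horizon 2020 Framework Programme</description>"
+ "</funding_level_0>"
+ "</fundingtree>";
Funder funder = DumpGraph.getFunder(fundingtree); // shortName "EC", jurisdiction "EU", one level-0 funding stream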

View File

@@ -0,0 +1,21 @@
package eu.dnetlib.dhp.resulttocommunityfromorganization;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
public class OrganizationMap extends HashMap<String, List<String>> {
public OrganizationMap() {
super();
}
public List<String> get(String key) {
if (super.get(key) == null) {
return new ArrayList<>();
}
return super.get(key);
}
}
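The overridden get spares callers a null check when a key is missing; note that the empty list is returned, not stored back into the map. A small usage sketch with invented identifiers:
OrganizationMap map = new OrganizationMap();
map.get("20|missing_org__::0001").add("mes"); // safe: an empty list is returned, but it is NOT stored in the map
map.get("20|missing_org__::0001").isEmpty(); // still true: each call on a missing key yields a fresh list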

View File

@@ -0,0 +1,63 @@
package eu.dnetlib.dhp.oa.graph.dump;
import java.io.StringReader;
import java.util.List;
import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
public class QueryInformationSystem {
private ISLookUpService isLookUp;
private static final String XQUERY = "for $x in collection('/db/DRIVER/ContextDSResources/ContextDSResourceType') "
+
" where $x//CONFIGURATION/context[./@type='community' or ./@type='ri'] " +
" and ($x//context/param[./@name = 'status']/text() = 'manager' or $x//context/param[./@name = 'status']/text() = 'all') "
+
" return " +
"<community> " +
"{$x//CONFIGURATION/context/@id}" +
"{$x//CONFIGURATION/context/@label}" +
"</community>";
public CommunityMap getCommunityMap()
throws ISLookUpException {
return getMap(isLookUp.quickSearchProfile(XQUERY));
}
public ISLookUpService getIsLookUp() {
return isLookUp;
}
public void setIsLookUp(ISLookUpService isLookUpService) {
this.isLookUp = isLookUpService;
}
private CommunityMap getMap(List<String> communityMap) {
final CommunityMap map = new CommunityMap();
communityMap.stream().forEach(xml -> {
final Document doc;
try {
doc = new SAXReader().read(new StringReader(xml));
Element root = doc.getRootElement();
map.put(root.attribute("id").getValue(), root.attribute("label").getValue());
} catch (DocumentException e) {
e.printStackTrace();
}
});
return map;
}
}
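Each record returned by the XQuery carries the context id and label as attributes of a community element, which getMap flattens into a single map entry. A minimal sketch of the parsing step, with invented values:
// one profile record as produced by the XQuery above:
String xml = "<community id=\"mes\" label=\"European Marine Science\"/>";
Element root = new SAXReader().read(new StringReader(xml)).getRootElement();
String id = root.attribute("id").getValue(); // "mes" -> map key
String label = root.attribute("label").getValue(); // "European Marine Science" -> map value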

View File

@@ -0,0 +1,84 @@
package eu.dnetlib.dhp.oa.graph.dump.graph;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.DumpProducts;
import eu.dnetlib.dhp.oa.graph.dump.QueryInformationSystem;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap;
import eu.dnetlib.dhp.oa.graph.dump.community.SparkDumpCommunityProducts;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.dump.oaf.ControlledField;
import eu.dnetlib.dhp.schema.dump.oaf.Country;
import eu.dnetlib.dhp.schema.dump.oaf.graph.*;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Node;
import org.dom4j.io.SAXReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.Serializable;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
public class SparkDumpJob implements Serializable {
private static final Logger log = LoggerFactory.getLogger(SparkDumpCommunityProducts.class);
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
SparkDumpJob.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/input_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
final String resultClassName = parser.get("resultTableName");
log.info("resultTableName: {}", resultClassName);
final String isLookUpUrl = parser.get("isLookUpUrl");
log.info("isLookUpUrl: {}", isLookUpUrl);
Class<? extends OafEntity> inputClazz = (Class<? extends OafEntity>) Class.forName(resultClassName);
QueryInformationSystem queryInformationSystem = new QueryInformationSystem();
queryInformationSystem.setIsLookUp(Utils.getIsLookUpService(isLookUpUrl));
CommunityMap communityMap = queryInformationSystem.getCommunityMap();
DumpGraph dg = new DumpGraph();
dg.run(isSparkSessionManaged, inputPath, outputPath, inputClazz, communityMap);
}
}

View File

@@ -1,184 +0,0 @@
package eu.dnetlib.dhp.oa.graph.dump.graph;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.DumpProducts;
import eu.dnetlib.dhp.oa.graph.dump.QueryInformationSystem;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap;
import eu.dnetlib.dhp.oa.graph.dump.community.SparkDumpCommunityProducts;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.dump.oaf.ControlledField;
import eu.dnetlib.dhp.schema.dump.oaf.Country;
import eu.dnetlib.dhp.schema.dump.oaf.graph.*;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.Serializable;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
public class SparkDumpJob implements Serializable {
private static final Logger log = LoggerFactory.getLogger(SparkDumpCommunityProducts.class);
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
SparkDumpJob.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/input_graphdump_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
final String resultClassName = parser.get("resultTableName");
log.info("resultTableName: {}", resultClassName);
final String isLookUpUrl = parser.get("isLookUpUrl");
log.info("isLookUpUrl: {}", isLookUpUrl);
Class<? extends OafEntity> inputClazz = (Class<? extends OafEntity>) Class.forName(resultClassName);
QueryInformationSystem queryInformationSystem = new QueryInformationSystem();
queryInformationSystem.setIsLookUp(Utils.getIsLookUpService(isLookUpUrl));
CommunityMap communityMap = queryInformationSystem.getCommunityMap();
SparkConf conf = new SparkConf();
switch (ModelSupport.idPrefixMap.get(inputClazz)){
case "50":
DumpProducts d = new DumpProducts();
d.run(isSparkSessionManaged,inputPath,outputPath,communityMap, inputClazz, true);
break;
case "40":
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, outputPath);
projectMap(spark, inputPath, outputPath);
});
break;
case "20":
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
Utils.removeOutputDir(spark, outputPath);
organizationMap(spark, inputPath, outputPath);
});
break;
}
}
private static void projectMap(SparkSession spark, String inputPath, String outputPath) {
Utils.readPath(spark, inputPath, eu.dnetlib.dhp.schema.oaf.Project.class)
.map(p -> mapProject(p), Encoders.bean(Project.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression","gzip")
.json(outputPath);
}
private static Project mapProject(eu.dnetlib.dhp.schema.oaf.Project p) {
Project project = new Project();
Optional.ofNullable(p.getId())
.ifPresent(id -> project.setId(id));
Optional.ofNullable(p.getWebsiteurl())
.ifPresent(w -> project.setWebsiteurl(w.getValue()));
Optional.ofNullable(p.getCode())
.ifPresent(code -> project.setCode(code));
// private String acronym;
// private String title;
// private String startdate;
//
// private String enddate;
//
// private String callidentifier;
//
// private String keywords;
//
// private String duration;
//
// private boolean openaccessmandateforpublications;
//
// private boolean openaccessmandatefordataset;
// private List<String> subject;
// private Funder funding;
//
// private String summary;
//
// private Granted granted;
//
// private Programme programme;
return project;
}
private static void organizationMap(SparkSession spark, String inputPath, String outputPath) {
Utils.readPath(spark, inputPath, eu.dnetlib.dhp.schema.oaf.Organization.class)
.map(o -> mapOrganization(o), Encoders.bean(Organization.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression","gzip")
.json(outputPath);
}
private static Organization mapOrganization(eu.dnetlib.dhp.schema.oaf.Organization org){
Organization organization = new Organization();
Optional.ofNullable(org.getLegalshortname())
.ifPresent(value -> organization.setLegalshortname(value.getValue()));
Optional.ofNullable(org.getLegalname())
.ifPresent(value -> organization.setLegalname(value.getValue()));
Optional.ofNullable(org.getWebsiteurl())
.ifPresent(value -> organization.setWebsiteurl(value.getValue()));
Optional.ofNullable(org.getAlternativeNames())
.ifPresent(value -> organization.setAlternativenames(value.stream()
.map( v-> v.getValue()).collect(Collectors.toList())));
Optional.ofNullable(org.getCountry())
.ifPresent(value -> organization.setCountry(Country.newInstance(value.getClassid(), value.getClassname(), null)));
Optional.ofNullable(org.getId())
.ifPresent(value -> organization.setId(value));
Optional.ofNullable(org.getPid())
.ifPresent(value -> organization.setPid(
value.stream().map(p -> ControlledField.newInstance(p.getQualifier().getClassid(), p.getValue())).collect(Collectors.toList())
));
return organization;
}
}

View File

@@ -0,0 +1,4 @@
package eu.dnetlib.dhp.oa.graph.dump.graph;
public class SparkDumpRelationJob {
}

View File

@@ -0,0 +1,4 @@
package eu.dnetlib.dhp.oa.graph.dump.graph;
public class SparkExtractRelationFromEntities {
}

View File

@@ -0,0 +1,4 @@
package eu.dnetlib.dhp.oa.graph.dump.graph;
public class SparkOrganizationRelation {
}

View File

@@ -0,0 +1,38 @@
[
{
"paramName":"is",
"paramLongName":"isLookUpUrl",
"paramDescription": "URL of the isLookUp Service",
"paramRequired": false
},
{
"paramName":"s",
"paramLongName":"sourcePath",
"paramDescription": "the path of the sequencial file to read",
"paramRequired": true
},
{
"paramName": "out",
"paramLongName": "outputPath",
"paramDescription": "the path used to store temporary output files",
"paramRequired": true
},
{
"paramName": "ssm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "true if the spark session is managed, false otherwise",
"paramRequired": false
},
{
"paramName":"tn",
"paramLongName":"resultTableName",
"paramDescription": "the name of the result table we are currently working on",
"paramRequired": true
}
]

View File

@@ -0,0 +1,4 @@
package eu.dnetlib.dhp.oa.graph.dump.graph;
public class DumpOrganizationProjectTest {
}

View File

@@ -0,0 +1,4 @@
package eu.dnetlib.dhp.oa.graph.dump.graph;
public class DumpRelationTest {
}

View File

@@ -0,0 +1,4 @@
package eu.dnetlib.dhp.oa.graph.dump.graph;
public class FunderParsingTest {
}

View File

@@ -0,0 +1,4 @@
package eu.dnetlib.dhp.oa.graph.dump.graph;
public class QueryInformationSystemTest {
}