This commit is contained in:
Miriam Baglioni 2020-09-14 14:33:10 +02:00
parent b72a7dad46
commit 1f893e63dc
9 changed files with 72 additions and 20 deletions

View File

@ -0,0 +1,23 @@
package eu.dnetlib.dhp.schema.dump.pidgraph;
import java.io.Serializable;
/**
 * Serializable bean of the pidgraph dump schema that carries a single string identifier.
 *
 * <p>Instances can be obtained either through the implicit public no-argument constructor
 * followed by {@link #setId(String)}, or in one step through {@link #newInstance(String)}.
 */
public class Entity implements Serializable {

	/** Identifier held by this entity; stays {@code null} until one is assigned. */
	private String id;

	/**
	 * Creates a new entity already holding the given identifier.
	 *
	 * @param id the identifier to store; it is stored as-is, without validation
	 * @return a freshly created entity whose {@link #getId()} returns {@code id}
	 */
	public static Entity newInstance(String id) {
		final Entity created = new Entity();
		created.setId(id);
		return created;
	}

	/**
	 * Returns the identifier currently stored in this entity.
	 *
	 * @return the stored identifier, or {@code null} if none has been set
	 */
	public String getId() {
		return this.id;
	}

	/**
	 * Replaces the identifier stored in this entity.
	 *
	 * @param id the new identifier; it is stored as-is, without validation
	 */
	public void setId(String id) {
		this.id = id;
	}
}

View File

@ -21,6 +21,7 @@ public class Constants implements Serializable {
public static final String CONTEXT_ID = "00"; public static final String CONTEXT_ID = "00";
public static final String CONTEXT_NS_PREFIX = "context_____"; public static final String CONTEXT_NS_PREFIX = "context_____";
public static final String UNKNOWN = "UNKNOWN";
// public static final String FUNDER_DS = "entityregistry::projects"; // public static final String FUNDER_DS = "entityregistry::projects";
} }

View File

@ -484,7 +484,12 @@ public class DumpGraphEntities implements Serializable {
Optional Optional
.ofNullable(org.getCountry()) .ofNullable(org.getCountry())
.ifPresent( .ifPresent(
value -> organization.setCountry(Qualifier.newInstance(value.getClassid(), value.getClassname()))); value -> {
if (!value.getClassid().equals(Constants.UNKNOWN)) {
organization.setCountry(Qualifier.newInstance(value.getClassid(), value.getClassname()));
}
});
Optional Optional
.ofNullable(org.getId()) .ofNullable(org.getId())

View File

@ -27,7 +27,8 @@ import eu.dnetlib.dhp.schema.oaf.Result;
* new Relations are created for the datasource in the collectedfrom and hostedby elements and for the context related * new Relations are created for the datasource in the collectedfrom and hostedby elements and for the context related
* to communities and research initiative/infrastructures. For collectedfrom elements it creates: datasource -> provides * to communities and research initiative/infrastructures. For collectedfrom elements it creates: datasource -> provides
* -> result and result -> isProvidedBy -> datasource For hostedby elements it creates: datasource -> hosts -> result * -> result and result -> isProvidedBy -> datasource For hostedby elements it creates: datasource -> hosts -> result
* and result -> isHostedBy -> datasource For context elements it creates: context <-> isRelatedTo <-> result * and result -> isHostedBy -> datasource For context elements it creates: context <-> isRelatedTo <-> result. Note for
* context: it gets the first provenance in the dataInfo. If more than one is present the others are not dumped
*/ */
public class Extractor implements Serializable { public class Extractor implements Serializable {

View File

@ -5,6 +5,7 @@ import java.io.Serializable;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -38,7 +39,9 @@ public class Process implements Serializable {
ri.setDescription(ci.getDescription()); ri.setDescription(ci.getDescription());
ri.setName(ci.getName()); ri.setName(ci.getName());
if (StringUtils.isNotEmpty(ci.getZenodocommunity())) {
ri.setZenodo_community(Constants.ZENODO_COMMUNITY_PREFIX + ci.getZenodocommunity()); ri.setZenodo_community(Constants.ZENODO_COMMUNITY_PREFIX + ci.getZenodocommunity());
}
return (R) ri; return (R) ri;
} catch (final Exception e) { } catch (final Exception e) {

View File

@ -16,7 +16,7 @@ import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.Utils; import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.schema.dump.oaf.Result; import eu.dnetlib.dhp.schema.dump.oaf.graph.GraphResult;
import eu.dnetlib.dhp.schema.dump.oaf.graph.Relation; import eu.dnetlib.dhp.schema.dump.oaf.graph.Relation;
/** /**
@ -69,10 +69,10 @@ public class SparkCollectAndSave implements Serializable {
private static void run(SparkSession spark, String inputPath, String outputPath, boolean aggregate) { private static void run(SparkSession spark, String inputPath, String outputPath, boolean aggregate) {
if (aggregate) { if (aggregate) {
Utils Utils
.readPath(spark, inputPath + "/result/publication", Result.class) .readPath(spark, inputPath + "/result/publication", GraphResult.class)
.union(Utils.readPath(spark, inputPath + "/result/dataset", Result.class)) .union(Utils.readPath(spark, inputPath + "/result/dataset", GraphResult.class))
.union(Utils.readPath(spark, inputPath + "/result/otherresearchproduct", Result.class)) .union(Utils.readPath(spark, inputPath + "/result/otherresearchproduct", GraphResult.class))
.union(Utils.readPath(spark, inputPath + "/result/software", Result.class)) .union(Utils.readPath(spark, inputPath + "/result/software", GraphResult.class))
.write() .write()
.option("compression", "gzip") .option("compression", "gzip")
.mode(SaveMode.Overwrite) .mode(SaveMode.Overwrite)
@ -80,19 +80,19 @@ public class SparkCollectAndSave implements Serializable {
} else { } else {
write( write(
Utils Utils
.readPath(spark, inputPath + "/result/publication", Result.class), .readPath(spark, inputPath + "/result/publication", GraphResult.class),
outputPath + "/publication"); outputPath + "/publication");
write( write(
Utils Utils
.readPath(spark, inputPath + "/result/dataset", Result.class), .readPath(spark, inputPath + "/result/dataset", GraphResult.class),
outputPath + "/dataset"); outputPath + "/dataset");
write( write(
Utils Utils
.readPath(spark, inputPath + "/result/otherresearchproduct", Result.class), .readPath(spark, inputPath + "/result/otherresearchproduct", GraphResult.class),
outputPath + "/otheresearchproduct"); outputPath + "/otheresearchproduct");
write( write(
Utils Utils
.readPath(spark, inputPath + "/result/software", Result.class), .readPath(spark, inputPath + "/result/software", GraphResult.class),
outputPath + "/software"); outputPath + "/software");
} }
@ -112,7 +112,7 @@ public class SparkCollectAndSave implements Serializable {
} }
private static void write(Dataset<Result> dataSet, String outputPath) { private static void write(Dataset<GraphResult> dataSet, String outputPath) {
dataSet dataSet
.write() .write()
.option("compression", "gzip") .option("compression", "gzip")

View File

@ -19,6 +19,7 @@ import com.google.gson.Gson;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.Utils; import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap;
import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.dump.oaf.Provenance; import eu.dnetlib.dhp.schema.dump.oaf.Provenance;
@ -59,6 +60,9 @@ public class SparkOrganizationRelation implements Serializable {
.fromJson(parser.get("organizationCommunityMap"), OrganizationMap.class); .fromJson(parser.get("organizationCommunityMap"), OrganizationMap.class);
log.info("organization map : {}", new Gson().toJson(organizationMap)); log.info("organization map : {}", new Gson().toJson(organizationMap));
final String communityMapPath = parser.get("communityMapPath");
log.info("communityMapPath: {} ", communityMapPath);
SparkConf conf = new SparkConf(); SparkConf conf = new SparkConf();
runWithSparkSession( runWithSparkSession(
@ -66,14 +70,17 @@ public class SparkOrganizationRelation implements Serializable {
isSparkSessionManaged, isSparkSessionManaged,
spark -> { spark -> {
Utils.removeOutputDir(spark, outputPath); Utils.removeOutputDir(spark, outputPath);
extractRelation(spark, inputPath, organizationMap, outputPath); extractRelation(spark, inputPath, organizationMap, outputPath, communityMapPath);
}); });
} }
private static void extractRelation(SparkSession spark, String inputPath, OrganizationMap organizationMap, private static void extractRelation(SparkSession spark, String inputPath, OrganizationMap organizationMap,
String outputPath) { String outputPath, String communityMapPath) {
CommunityMap communityMap = Utils.getCommunityMap(spark, communityMapPath);
Dataset<Relation> relationDataset = Utils.readPath(spark, inputPath, Relation.class); Dataset<Relation> relationDataset = Utils.readPath(spark, inputPath, Relation.class);
relationDataset.createOrReplaceTempView("relation"); relationDataset.createOrReplaceTempView("relation");
@ -97,32 +104,43 @@ public class SparkOrganizationRelation implements Serializable {
}, Encoders.bean(MergedRels.class)) }, Encoders.bean(MergedRels.class))
.filter(Objects::nonNull) .filter(Objects::nonNull)
.collectAsList() .collectAsList()
.forEach(getMergedRelsConsumer(organizationMap, relList)); .forEach(getMergedRelsConsumer(organizationMap, relList, communityMap));
organizationMap organizationMap
.keySet() .keySet()
.forEach( .forEach(
oId -> organizationMap oId -> organizationMap
.get(oId) .get(oId)
.forEach(community -> addRelations(relList, community, oId))); .forEach(community -> {
if (communityMap.containsKey(community)) {
addRelations(relList, community, oId);
}
}));
// if (relList.size() > 0) {
spark spark
.createDataset(relList, Encoders.bean(eu.dnetlib.dhp.schema.dump.oaf.graph.Relation.class)) .createDataset(relList, Encoders.bean(eu.dnetlib.dhp.schema.dump.oaf.graph.Relation.class))
.write() .write()
.mode(SaveMode.Overwrite) .mode(SaveMode.Overwrite)
.option("compression", "gzip") .option("compression", "gzip")
.json(outputPath); .json(outputPath);
// }
} }
@NotNull @NotNull
private static Consumer<MergedRels> getMergedRelsConsumer(OrganizationMap organizationMap, private static Consumer<MergedRels> getMergedRelsConsumer(OrganizationMap organizationMap,
List<eu.dnetlib.dhp.schema.dump.oaf.graph.Relation> relList) { List<eu.dnetlib.dhp.schema.dump.oaf.graph.Relation> relList, CommunityMap communityMap) {
return mergedRels -> { return mergedRels -> {
String oId = mergedRels.getOrganizationId(); String oId = mergedRels.getOrganizationId();
organizationMap organizationMap
.get(oId) .get(oId)
.forEach(community -> addRelations(relList, community, mergedRels.getRepresentativeId())); .forEach(community -> {
if (communityMap.containsKey(community)) {
addRelations(relList, community, mergedRels.getRepresentativeId());
}
});
organizationMap.remove(oId); organizationMap.remove(oId);
}; };
} }

View File

@ -0,0 +1 @@
{"ee":"SDSN - Greece","epos":"EPOS","enrmaps":"Energy Research","fet-h2020":"FET H2020","instruct":"Instruct-Eric","egi":"EGI Federation","euromarine":"Euromarine","covid-19":"COVID-19","dariah":"DARIAH EU","rda":"Research Data Alliance","clarin":"CLARIN","aginfra":"Agricultural and Food Sciences","risis":"RISI","fam":"Fisheries and Aquaculture Management","beopen":"Transport Research","elixir-gr":"ELIXIR GR","fet-fp7":"FET FP7","ifremer":"Ifremer","science-innovation-policy":"Science and Innovation Policy Studies","mes":"European Marine Scinece","oa-pg":"EC Post-Grant Open Access Pilot","ni":"Neuroinformatics","dh-ch":"Digital Humanities and Cultural Heritage"}