forked from antonis.lempesis/dnet-hadoop
This commit is contained in:
parent
b72a7dad46
commit
1f893e63dc
|
@ -0,0 +1,23 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.dump.pidgraph;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
public class Entity implements Serializable {
|
||||
private String id;
|
||||
|
||||
public String getId() {
|
||||
return id;
|
||||
}
|
||||
|
||||
public void setId(String id) {
|
||||
this.id = id;
|
||||
}
|
||||
|
||||
public static Entity newInstance(String id) {
|
||||
Entity entity = new Entity();
|
||||
entity.id = id;
|
||||
|
||||
return entity;
|
||||
}
|
||||
}
|
|
@ -59,7 +59,7 @@ public class DumpProducts implements Serializable {
|
|||
|
||||
Utils
|
||||
.readPath(spark, inputPath, inputClazz)
|
||||
.map((MapFunction<I, O>) value -> execMap(value, communityMap, graph), Encoders.bean(outputClazz))
|
||||
.map((MapFunction<I, O>) value -> execMap(value, communityMap, graph), Encoders.bean(outputClazz))
|
||||
.filter(Objects::nonNull)
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
|
|
|
@ -21,6 +21,7 @@ public class Constants implements Serializable {
|
|||
|
||||
public static final String CONTEXT_ID = "00";
|
||||
public static final String CONTEXT_NS_PREFIX = "context_____";
|
||||
public static final String UNKNOWN = "UNKNOWN";
|
||||
|
||||
// public static final String FUNDER_DS = "entityregistry::projects";
|
||||
}
|
||||
|
|
|
@ -484,7 +484,12 @@ public class DumpGraphEntities implements Serializable {
|
|||
Optional
|
||||
.ofNullable(org.getCountry())
|
||||
.ifPresent(
|
||||
value -> organization.setCountry(Qualifier.newInstance(value.getClassid(), value.getClassname())));
|
||||
value -> {
|
||||
if (!value.getClassid().equals(Constants.UNKNOWN)) {
|
||||
organization.setCountry(Qualifier.newInstance(value.getClassid(), value.getClassname()));
|
||||
}
|
||||
|
||||
});
|
||||
|
||||
Optional
|
||||
.ofNullable(org.getId())
|
||||
|
|
|
@ -27,7 +27,8 @@ import eu.dnetlib.dhp.schema.oaf.Result;
|
|||
* new Relations are created for the datasource in the collectedfrom and hostedby elements and for the context related
|
||||
* to communities and research initiative/infrastructures. For collectedfrom elements it creates: datasource -> provides
|
||||
* -> result and result -> isProvidedBy -> datasource For hostedby elements it creates: datasource -> hosts -> result
|
||||
* and result -> isHostedBy -> datasource For context elements it creates: context <-> isRelatedTo <-> result
|
||||
* and result -> isHostedBy -> datasource For context elements it creates: context <-> isRelatedTo <-> result. Note for
|
||||
* context: it gets the first provenance in the dataInfo. If more than one is present the others are not dumped
|
||||
*/
|
||||
public class Extractor implements Serializable {
|
||||
|
||||
|
|
|
@ -5,6 +5,7 @@ import java.io.Serializable;
|
|||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
|
@ -38,7 +39,9 @@ public class Process implements Serializable {
|
|||
|
||||
ri.setDescription(ci.getDescription());
|
||||
ri.setName(ci.getName());
|
||||
ri.setZenodo_community(Constants.ZENODO_COMMUNITY_PREFIX + ci.getZenodocommunity());
|
||||
if (StringUtils.isNotEmpty(ci.getZenodocommunity())) {
|
||||
ri.setZenodo_community(Constants.ZENODO_COMMUNITY_PREFIX + ci.getZenodocommunity());
|
||||
}
|
||||
return (R) ri;
|
||||
|
||||
} catch (final Exception e) {
|
||||
|
|
|
@ -16,7 +16,7 @@ import org.slf4j.LoggerFactory;
|
|||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.Utils;
|
||||
import eu.dnetlib.dhp.schema.dump.oaf.Result;
|
||||
import eu.dnetlib.dhp.schema.dump.oaf.graph.GraphResult;
|
||||
import eu.dnetlib.dhp.schema.dump.oaf.graph.Relation;
|
||||
|
||||
/**
|
||||
|
@ -69,10 +69,10 @@ public class SparkCollectAndSave implements Serializable {
|
|||
private static void run(SparkSession spark, String inputPath, String outputPath, boolean aggregate) {
|
||||
if (aggregate) {
|
||||
Utils
|
||||
.readPath(spark, inputPath + "/result/publication", Result.class)
|
||||
.union(Utils.readPath(spark, inputPath + "/result/dataset", Result.class))
|
||||
.union(Utils.readPath(spark, inputPath + "/result/otherresearchproduct", Result.class))
|
||||
.union(Utils.readPath(spark, inputPath + "/result/software", Result.class))
|
||||
.readPath(spark, inputPath + "/result/publication", GraphResult.class)
|
||||
.union(Utils.readPath(spark, inputPath + "/result/dataset", GraphResult.class))
|
||||
.union(Utils.readPath(spark, inputPath + "/result/otherresearchproduct", GraphResult.class))
|
||||
.union(Utils.readPath(spark, inputPath + "/result/software", GraphResult.class))
|
||||
.write()
|
||||
.option("compression", "gzip")
|
||||
.mode(SaveMode.Overwrite)
|
||||
|
@ -80,19 +80,19 @@ public class SparkCollectAndSave implements Serializable {
|
|||
} else {
|
||||
write(
|
||||
Utils
|
||||
.readPath(spark, inputPath + "/result/publication", Result.class),
|
||||
.readPath(spark, inputPath + "/result/publication", GraphResult.class),
|
||||
outputPath + "/publication");
|
||||
write(
|
||||
Utils
|
||||
.readPath(spark, inputPath + "/result/dataset", Result.class),
|
||||
.readPath(spark, inputPath + "/result/dataset", GraphResult.class),
|
||||
outputPath + "/dataset");
|
||||
write(
|
||||
Utils
|
||||
.readPath(spark, inputPath + "/result/otherresearchproduct", Result.class),
|
||||
.readPath(spark, inputPath + "/result/otherresearchproduct", GraphResult.class),
|
||||
outputPath + "/otheresearchproduct");
|
||||
write(
|
||||
Utils
|
||||
.readPath(spark, inputPath + "/result/software", Result.class),
|
||||
.readPath(spark, inputPath + "/result/software", GraphResult.class),
|
||||
outputPath + "/software");
|
||||
|
||||
}
|
||||
|
@ -112,7 +112,7 @@ public class SparkCollectAndSave implements Serializable {
|
|||
|
||||
}
|
||||
|
||||
private static void write(Dataset<Result> dataSet, String outputPath) {
|
||||
private static void write(Dataset<GraphResult> dataSet, String outputPath) {
|
||||
dataSet
|
||||
.write()
|
||||
.option("compression", "gzip")
|
||||
|
|
|
@ -19,6 +19,7 @@ import com.google.gson.Gson;
|
|||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.Utils;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.schema.dump.oaf.Provenance;
|
||||
|
@ -59,6 +60,9 @@ public class SparkOrganizationRelation implements Serializable {
|
|||
.fromJson(parser.get("organizationCommunityMap"), OrganizationMap.class);
|
||||
log.info("organization map : {}", new Gson().toJson(organizationMap));
|
||||
|
||||
final String communityMapPath = parser.get("communityMapPath");
|
||||
log.info("communityMapPath: {} ", communityMapPath);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
|
||||
runWithSparkSession(
|
||||
|
@ -66,14 +70,17 @@ public class SparkOrganizationRelation implements Serializable {
|
|||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
Utils.removeOutputDir(spark, outputPath);
|
||||
extractRelation(spark, inputPath, organizationMap, outputPath);
|
||||
extractRelation(spark, inputPath, organizationMap, outputPath, communityMapPath);
|
||||
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
private static void extractRelation(SparkSession spark, String inputPath, OrganizationMap organizationMap,
|
||||
String outputPath) {
|
||||
String outputPath, String communityMapPath) {
|
||||
|
||||
CommunityMap communityMap = Utils.getCommunityMap(spark, communityMapPath);
|
||||
|
||||
Dataset<Relation> relationDataset = Utils.readPath(spark, inputPath, Relation.class);
|
||||
|
||||
relationDataset.createOrReplaceTempView("relation");
|
||||
|
@ -97,32 +104,43 @@ public class SparkOrganizationRelation implements Serializable {
|
|||
}, Encoders.bean(MergedRels.class))
|
||||
.filter(Objects::nonNull)
|
||||
.collectAsList()
|
||||
.forEach(getMergedRelsConsumer(organizationMap, relList));
|
||||
.forEach(getMergedRelsConsumer(organizationMap, relList, communityMap));
|
||||
|
||||
organizationMap
|
||||
.keySet()
|
||||
.forEach(
|
||||
oId -> organizationMap
|
||||
.get(oId)
|
||||
.forEach(community -> addRelations(relList, community, oId)));
|
||||
.forEach(community -> {
|
||||
if (communityMap.containsKey(community)) {
|
||||
addRelations(relList, community, oId);
|
||||
}
|
||||
}));
|
||||
|
||||
// if (relList.size() > 0) {
|
||||
spark
|
||||
.createDataset(relList, Encoders.bean(eu.dnetlib.dhp.schema.dump.oaf.graph.Relation.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(outputPath);
|
||||
// }
|
||||
|
||||
}
|
||||
|
||||
@NotNull
|
||||
private static Consumer<MergedRels> getMergedRelsConsumer(OrganizationMap organizationMap,
|
||||
List<eu.dnetlib.dhp.schema.dump.oaf.graph.Relation> relList) {
|
||||
List<eu.dnetlib.dhp.schema.dump.oaf.graph.Relation> relList, CommunityMap communityMap) {
|
||||
return mergedRels -> {
|
||||
String oId = mergedRels.getOrganizationId();
|
||||
organizationMap
|
||||
.get(oId)
|
||||
.forEach(community -> addRelations(relList, community, mergedRels.getRepresentativeId()));
|
||||
.forEach(community -> {
|
||||
if (communityMap.containsKey(community)) {
|
||||
addRelations(relList, community, mergedRels.getRepresentativeId());
|
||||
}
|
||||
|
||||
});
|
||||
organizationMap.remove(oId);
|
||||
};
|
||||
}
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
{"ee":"SDSN - Greece","epos":"EPOS","enrmaps":"Energy Research","fet-h2020":"FET H2020","instruct":"Instruct-Eric","egi":"EGI Federation","euromarine":"Euromarine","covid-19":"COVID-19","dariah":"DARIAH EU","rda":"Research Data Alliance","clarin":"CLARIN","aginfra":"Agricultural and Food Sciences","risis":"RISI","fam":"Fisheries and Aquaculture Management","beopen":"Transport Research","elixir-gr":"ELIXIR GR","fet-fp7":"FET FP7","ifremer":"Ifremer","science-innovation-policy":"Science and Innovation Policy Studies","mes":"European Marine Scinece","oa-pg":"EC Post-Grant Open Access Pilot","ni":"Neuroinformatics","dh-ch":"Digital Humanities and Cultural Heritage"}
|
Loading…
Reference in New Issue