forked from antonis.lempesis/dnet-hadoop
This commit is contained in:
parent
b72a7dad46
commit
1f893e63dc
|
@ -0,0 +1,23 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.schema.dump.pidgraph;
|
||||||
|
|
||||||
|
import java.io.Serializable;
|
||||||
|
|
||||||
|
/**
 * Minimal serializable bean that carries the identifier of an entity.
 *
 * <p>The class exposes a public no-argument constructor (the implicit default one), a getter/setter
 * pair for {@code id}, and a static factory for building an instance in a single call.
 */
public class Entity implements Serializable {

	/** Identifier of the entity; {@code null} until it is assigned. */
	private String id;

	/**
	 * Returns the identifier of this entity.
	 *
	 * @return the identifier, or {@code null} if none has been set
	 */
	public String getId() {
		return id;
	}

	/**
	 * Sets the identifier of this entity.
	 *
	 * @param id the identifier to store; stored as given, no validation is applied
	 */
	public void setId(String id) {
		this.id = id;
	}

	/**
	 * Creates a new {@code Entity} whose identifier is already set.
	 *
	 * @param id the identifier to assign to the new instance; stored as given
	 * @return a freshly created entity carrying {@code id}
	 */
	public static Entity newInstance(String id) {
		Entity entity = new Entity();
		entity.id = id;

		return entity;
	}
}
|
|
@ -21,6 +21,7 @@ public class Constants implements Serializable {
|
||||||
|
|
||||||
public static final String CONTEXT_ID = "00";
|
public static final String CONTEXT_ID = "00";
|
||||||
public static final String CONTEXT_NS_PREFIX = "context_____";
|
public static final String CONTEXT_NS_PREFIX = "context_____";
|
||||||
|
public static final String UNKNOWN = "UNKNOWN";
|
||||||
|
|
||||||
// public static final String FUNDER_DS = "entityregistry::projects";
|
// public static final String FUNDER_DS = "entityregistry::projects";
|
||||||
}
|
}
|
||||||
|
|
|
@ -484,7 +484,12 @@ public class DumpGraphEntities implements Serializable {
|
||||||
Optional
|
Optional
|
||||||
.ofNullable(org.getCountry())
|
.ofNullable(org.getCountry())
|
||||||
.ifPresent(
|
.ifPresent(
|
||||||
value -> organization.setCountry(Qualifier.newInstance(value.getClassid(), value.getClassname())));
|
value -> {
|
||||||
|
if (!value.getClassid().equals(Constants.UNKNOWN)) {
|
||||||
|
organization.setCountry(Qualifier.newInstance(value.getClassid(), value.getClassname()));
|
||||||
|
}
|
||||||
|
|
||||||
|
});
|
||||||
|
|
||||||
Optional
|
Optional
|
||||||
.ofNullable(org.getId())
|
.ofNullable(org.getId())
|
||||||
|
|
|
@ -27,7 +27,8 @@ import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
* new Relations are created for the datasource in the collectedfrom and hostedby elements and for the context related
|
* new Relations are created for the datasource in the collectedfrom and hostedby elements and for the context related
|
||||||
* to communities and research initiative/infrastructures. For collectedfrom elements it creates: datasource -> provides
|
* to communities and research initiative/infrastructures. For collectedfrom elements it creates: datasource -> provides
|
||||||
* -> result and result -> isProvidedBy -> datasource For hostedby elements it creates: datasource -> hosts -> result
|
* -> result and result -> isProvidedBy -> datasource For hostedby elements it creates: datasource -> hosts -> result
|
||||||
* and result -> isHostedBy -> datasource For context elements it creates: context <-> isRelatedTo <-> result
|
* and result -> isHostedBy -> datasource For context elements it creates: context <-> isRelatedTo <-> result. Note for
|
||||||
|
* context: it gets the first provenance in the dataInfo. If more than one is present, the others are not dumped.
|
||||||
*/
|
*/
|
||||||
public class Extractor implements Serializable {
|
public class Extractor implements Serializable {
|
||||||
|
|
||||||
|
|
|
@ -5,6 +5,7 @@ import java.io.Serializable;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
@ -38,7 +39,9 @@ public class Process implements Serializable {
|
||||||
|
|
||||||
ri.setDescription(ci.getDescription());
|
ri.setDescription(ci.getDescription());
|
||||||
ri.setName(ci.getName());
|
ri.setName(ci.getName());
|
||||||
|
if (StringUtils.isNotEmpty(ci.getZenodocommunity())) {
|
||||||
ri.setZenodo_community(Constants.ZENODO_COMMUNITY_PREFIX + ci.getZenodocommunity());
|
ri.setZenodo_community(Constants.ZENODO_COMMUNITY_PREFIX + ci.getZenodocommunity());
|
||||||
|
}
|
||||||
return (R) ri;
|
return (R) ri;
|
||||||
|
|
||||||
} catch (final Exception e) {
|
} catch (final Exception e) {
|
||||||
|
|
|
@ -16,7 +16,7 @@ import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.oa.graph.dump.Utils;
|
import eu.dnetlib.dhp.oa.graph.dump.Utils;
|
||||||
import eu.dnetlib.dhp.schema.dump.oaf.Result;
|
import eu.dnetlib.dhp.schema.dump.oaf.graph.GraphResult;
|
||||||
import eu.dnetlib.dhp.schema.dump.oaf.graph.Relation;
|
import eu.dnetlib.dhp.schema.dump.oaf.graph.Relation;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -69,10 +69,10 @@ public class SparkCollectAndSave implements Serializable {
|
||||||
private static void run(SparkSession spark, String inputPath, String outputPath, boolean aggregate) {
|
private static void run(SparkSession spark, String inputPath, String outputPath, boolean aggregate) {
|
||||||
if (aggregate) {
|
if (aggregate) {
|
||||||
Utils
|
Utils
|
||||||
.readPath(spark, inputPath + "/result/publication", Result.class)
|
.readPath(spark, inputPath + "/result/publication", GraphResult.class)
|
||||||
.union(Utils.readPath(spark, inputPath + "/result/dataset", Result.class))
|
.union(Utils.readPath(spark, inputPath + "/result/dataset", GraphResult.class))
|
||||||
.union(Utils.readPath(spark, inputPath + "/result/otherresearchproduct", Result.class))
|
.union(Utils.readPath(spark, inputPath + "/result/otherresearchproduct", GraphResult.class))
|
||||||
.union(Utils.readPath(spark, inputPath + "/result/software", Result.class))
|
.union(Utils.readPath(spark, inputPath + "/result/software", GraphResult.class))
|
||||||
.write()
|
.write()
|
||||||
.option("compression", "gzip")
|
.option("compression", "gzip")
|
||||||
.mode(SaveMode.Overwrite)
|
.mode(SaveMode.Overwrite)
|
||||||
|
@ -80,19 +80,19 @@ public class SparkCollectAndSave implements Serializable {
|
||||||
} else {
|
} else {
|
||||||
write(
|
write(
|
||||||
Utils
|
Utils
|
||||||
.readPath(spark, inputPath + "/result/publication", Result.class),
|
.readPath(spark, inputPath + "/result/publication", GraphResult.class),
|
||||||
outputPath + "/publication");
|
outputPath + "/publication");
|
||||||
write(
|
write(
|
||||||
Utils
|
Utils
|
||||||
.readPath(spark, inputPath + "/result/dataset", Result.class),
|
.readPath(spark, inputPath + "/result/dataset", GraphResult.class),
|
||||||
outputPath + "/dataset");
|
outputPath + "/dataset");
|
||||||
write(
|
write(
|
||||||
Utils
|
Utils
|
||||||
.readPath(spark, inputPath + "/result/otherresearchproduct", Result.class),
|
.readPath(spark, inputPath + "/result/otherresearchproduct", GraphResult.class),
|
||||||
outputPath + "/otheresearchproduct");
|
outputPath + "/otheresearchproduct");
|
||||||
write(
|
write(
|
||||||
Utils
|
Utils
|
||||||
.readPath(spark, inputPath + "/result/software", Result.class),
|
.readPath(spark, inputPath + "/result/software", GraphResult.class),
|
||||||
outputPath + "/software");
|
outputPath + "/software");
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -112,7 +112,7 @@ public class SparkCollectAndSave implements Serializable {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void write(Dataset<Result> dataSet, String outputPath) {
|
private static void write(Dataset<GraphResult> dataSet, String outputPath) {
|
||||||
dataSet
|
dataSet
|
||||||
.write()
|
.write()
|
||||||
.option("compression", "gzip")
|
.option("compression", "gzip")
|
||||||
|
|
|
@ -19,6 +19,7 @@ import com.google.gson.Gson;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.oa.graph.dump.Utils;
|
import eu.dnetlib.dhp.oa.graph.dump.Utils;
|
||||||
|
import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||||
import eu.dnetlib.dhp.schema.dump.oaf.Provenance;
|
import eu.dnetlib.dhp.schema.dump.oaf.Provenance;
|
||||||
|
@ -59,6 +60,9 @@ public class SparkOrganizationRelation implements Serializable {
|
||||||
.fromJson(parser.get("organizationCommunityMap"), OrganizationMap.class);
|
.fromJson(parser.get("organizationCommunityMap"), OrganizationMap.class);
|
||||||
log.info("organization map : {}", new Gson().toJson(organizationMap));
|
log.info("organization map : {}", new Gson().toJson(organizationMap));
|
||||||
|
|
||||||
|
final String communityMapPath = parser.get("communityMapPath");
|
||||||
|
log.info("communityMapPath: {} ", communityMapPath);
|
||||||
|
|
||||||
SparkConf conf = new SparkConf();
|
SparkConf conf = new SparkConf();
|
||||||
|
|
||||||
runWithSparkSession(
|
runWithSparkSession(
|
||||||
|
@ -66,14 +70,17 @@ public class SparkOrganizationRelation implements Serializable {
|
||||||
isSparkSessionManaged,
|
isSparkSessionManaged,
|
||||||
spark -> {
|
spark -> {
|
||||||
Utils.removeOutputDir(spark, outputPath);
|
Utils.removeOutputDir(spark, outputPath);
|
||||||
extractRelation(spark, inputPath, organizationMap, outputPath);
|
extractRelation(spark, inputPath, organizationMap, outputPath, communityMapPath);
|
||||||
|
|
||||||
});
|
});
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void extractRelation(SparkSession spark, String inputPath, OrganizationMap organizationMap,
|
private static void extractRelation(SparkSession spark, String inputPath, OrganizationMap organizationMap,
|
||||||
String outputPath) {
|
String outputPath, String communityMapPath) {
|
||||||
|
|
||||||
|
CommunityMap communityMap = Utils.getCommunityMap(spark, communityMapPath);
|
||||||
|
|
||||||
Dataset<Relation> relationDataset = Utils.readPath(spark, inputPath, Relation.class);
|
Dataset<Relation> relationDataset = Utils.readPath(spark, inputPath, Relation.class);
|
||||||
|
|
||||||
relationDataset.createOrReplaceTempView("relation");
|
relationDataset.createOrReplaceTempView("relation");
|
||||||
|
@ -97,32 +104,43 @@ public class SparkOrganizationRelation implements Serializable {
|
||||||
}, Encoders.bean(MergedRels.class))
|
}, Encoders.bean(MergedRels.class))
|
||||||
.filter(Objects::nonNull)
|
.filter(Objects::nonNull)
|
||||||
.collectAsList()
|
.collectAsList()
|
||||||
.forEach(getMergedRelsConsumer(organizationMap, relList));
|
.forEach(getMergedRelsConsumer(organizationMap, relList, communityMap));
|
||||||
|
|
||||||
organizationMap
|
organizationMap
|
||||||
.keySet()
|
.keySet()
|
||||||
.forEach(
|
.forEach(
|
||||||
oId -> organizationMap
|
oId -> organizationMap
|
||||||
.get(oId)
|
.get(oId)
|
||||||
.forEach(community -> addRelations(relList, community, oId)));
|
.forEach(community -> {
|
||||||
|
if (communityMap.containsKey(community)) {
|
||||||
|
addRelations(relList, community, oId);
|
||||||
|
}
|
||||||
|
}));
|
||||||
|
|
||||||
|
// if (relList.size() > 0) {
|
||||||
spark
|
spark
|
||||||
.createDataset(relList, Encoders.bean(eu.dnetlib.dhp.schema.dump.oaf.graph.Relation.class))
|
.createDataset(relList, Encoders.bean(eu.dnetlib.dhp.schema.dump.oaf.graph.Relation.class))
|
||||||
.write()
|
.write()
|
||||||
.mode(SaveMode.Overwrite)
|
.mode(SaveMode.Overwrite)
|
||||||
.option("compression", "gzip")
|
.option("compression", "gzip")
|
||||||
.json(outputPath);
|
.json(outputPath);
|
||||||
|
// }
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@NotNull
|
@NotNull
|
||||||
private static Consumer<MergedRels> getMergedRelsConsumer(OrganizationMap organizationMap,
|
private static Consumer<MergedRels> getMergedRelsConsumer(OrganizationMap organizationMap,
|
||||||
List<eu.dnetlib.dhp.schema.dump.oaf.graph.Relation> relList) {
|
List<eu.dnetlib.dhp.schema.dump.oaf.graph.Relation> relList, CommunityMap communityMap) {
|
||||||
return mergedRels -> {
|
return mergedRels -> {
|
||||||
String oId = mergedRels.getOrganizationId();
|
String oId = mergedRels.getOrganizationId();
|
||||||
organizationMap
|
organizationMap
|
||||||
.get(oId)
|
.get(oId)
|
||||||
.forEach(community -> addRelations(relList, community, mergedRels.getRepresentativeId()));
|
.forEach(community -> {
|
||||||
|
if (communityMap.containsKey(community)) {
|
||||||
|
addRelations(relList, community, mergedRels.getRepresentativeId());
|
||||||
|
}
|
||||||
|
|
||||||
|
});
|
||||||
organizationMap.remove(oId);
|
organizationMap.remove(oId);
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1 @@
|
||||||
|
{"ee":"SDSN - Greece","epos":"EPOS","enrmaps":"Energy Research","fet-h2020":"FET H2020","instruct":"Instruct-Eric","egi":"EGI Federation","euromarine":"Euromarine","covid-19":"COVID-19","dariah":"DARIAH EU","rda":"Research Data Alliance","clarin":"CLARIN","aginfra":"Agricultural and Food Sciences","risis":"RISI","fam":"Fisheries and Aquaculture Management","beopen":"Transport Research","elixir-gr":"ELIXIR GR","fet-fp7":"FET FP7","ifremer":"Ifremer","science-innovation-policy":"Science and Innovation Policy Studies","mes":"European Marine Scinece","oa-pg":"EC Post-Grant Open Access Pilot","ni":"Neuroinformatics","dh-ch":"Digital Humanities and Cultural Heritage"}
|
Loading…
Reference in New Issue