added code for dumping only openorgs organizations #12

Merged
miriam.baglioni merged 1 commit from dumpOrganizationOnly into master 2024-07-29 11:03:57 +02:00
3 changed files with 52 additions and 126 deletions

SparkDumpOrganizationJob.java

@@ -33,6 +33,7 @@ import eu.dnetlib.dhp.oa.graph.dump.Constants;
import eu.dnetlib.dhp.oa.graph.dump.ResultMapper;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap;
+import eu.dnetlib.dhp.oa.graph.dump.complete.SparkOrganizationRelation;
import eu.dnetlib.dhp.oa.graph.dump.exceptions.CardinalityTooHighException;
import eu.dnetlib.dhp.oa.graph.dump.exceptions.NoAvailableEntityTypeException;
import eu.dnetlib.dhp.oa.model.Container;
@@ -57,150 +58,45 @@ public class SparkDumpOrganizationJob implements Serializable {
public static final String GZIP = "gzip";
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
SparkOrganizationRelation.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/dump/organization_only_input_parameters.json"));
-Boolean isSparkSessionManaged = Boolean.TRUE;
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
+Boolean isSparkSessionManaged = Optional
+.ofNullable(parser.get("isSparkSessionManaged"))
+.map(Boolean::valueOf)
+.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = "/tmp/prod_provision/graph/20_graph_blacklisted/";
final String inputPath = parser.get("sourcePath");
log.info("inputPath: {}", inputPath);
final String outputPath = "/tmp/miriam/organizationsOnly/";
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
-// Utils.removeOutputDir(spark, outputPath);
+Utils.removeOutputDir(spark, outputPath);
organizationMap(spark, inputPath, outputPath);
-// relationMap2(spark, inputPath, outputPath);
});
}
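Read against the parameter spec below, a minimal invocation sketch — assuming ArgumentApplicationParser resolves the short option names "s", "out" and "ssm" declared in organization_only_input_parameters.json, as elsewhere in dnet-hadoop; both paths are hypothetical:

SparkDumpOrganizationJob.main(new String[] {
    "-ssm", "true",                    // optional: let the job manage its own SparkSession (the default)
    "-s", "/tmp/graph/",               // hypothetical graph source path
    "-out", "/tmp/dump/organization/"  // hypothetical dump output path
});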
-private static void relationMap2(SparkSession spark, String inputPath, String outputPath) {
-Utils
-.readPath(spark, inputPath + "relation", Relation.class)
-.filter((FilterFunction<Relation>) r -> r.getRelType().equalsIgnoreCase("organizationOrganization"))
-.map((MapFunction<Relation, eu.dnetlib.dhp.oa.model.graph.Relation>) relation -> {
-eu.dnetlib.dhp.oa.model.graph.Relation relNew = new eu.dnetlib.dhp.oa.model.graph.Relation();
-relNew
-.setSource(getEntityId(relation.getSource(), ENTITY_ID_SEPARATOR));
-relNew.setSourceType(ModelSupport.idPrefixEntity.get(relation.getSource().substring(0, 2)));
-relNew
-.setTarget(getEntityId(relation.getTarget(), ENTITY_ID_SEPARATOR));
-relNew.setTargetType(ModelSupport.idPrefixEntity.get(relation.getTarget().substring(0, 2)));
-relNew
-.setRelType(
-RelType
-.newInstance(
-relation.getRelClass(),
-relation.getSubRelType()));
-Optional<DataInfo> odInfo = Optional.ofNullable(relation.getDataInfo());
-if (odInfo.isPresent()) {
-DataInfo dInfo = odInfo.get();
-if (Optional.ofNullable(dInfo.getProvenanceaction()).isPresent() &&
-Optional.ofNullable(dInfo.getProvenanceaction().getClassname()).isPresent()) {
-relNew
-.setProvenance(
-Provenance
-.newInstance(
-dInfo.getProvenanceaction().getClassname(),
-dInfo.getTrust()));
-}
-}
-if (Boolean.TRUE.equals(relation.getValidated())) {
-relNew.setValidated(relation.getValidated());
-relNew.setValidationDate(relation.getValidationDate());
-}
-return relNew;
-}, Encoders.bean(eu.dnetlib.dhp.oa.model.graph.Relation.class))
-.write()
-.mode(SaveMode.Overwrite)
-.option("compression", "gzip")
-.json(outputPath + "relation");
-}
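The source/target handling above follows the OpenAIRE id layout: a two-character entity-type prefix, a separator, then the namespace-qualified identifier. A hedged illustration, assuming ENTITY_ID_SEPARATOR is "|" and getEntityId returns everything after it (the id value is made up):

String raw = "20|openorgs____::abc123";                                   // hypothetical organization id
String entityId = raw.substring(raw.indexOf('|') + 1);                    // "openorgs____::abc123"
String entityType = ModelSupport.idPrefixEntity.get(raw.substring(0, 2)); // "20" -> "organization"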
-private static void relationMap(SparkSession spark, String inputPath, String outputPath) {
-Dataset<eu.dnetlib.dhp.schema.oaf.Organization> organization = Utils
-.readPath(spark, inputPath + "organization", eu.dnetlib.dhp.schema.oaf.Organization.class);
-Dataset<Relation> rels = Utils.readPath(spark, inputPath + "relation", Relation.class);
-organization
-.joinWith(rels, organization.col("id").equalTo(rels.col("source")), "left")
-.map(
-(MapFunction<Tuple2<eu.dnetlib.dhp.schema.oaf.Organization, Relation>, Relation>) t2 -> t2._2(),
-Encoders.bean(Relation.class))
-.filter(Objects::nonNull)
-.write()
-.mode(SaveMode.Overwrite)
-.option("compression", "gzip")
-.json("/tmp/orgSource");
-rels = Utils.readPath(spark, "/tmp/orgSource", Relation.class);
-organization
-.joinWith(rels, organization.col("id").equalTo(rels.col("target")), "left")
-.map(
-(MapFunction<Tuple2<eu.dnetlib.dhp.schema.oaf.Organization, Relation>, Relation>) t2 -> t2._2(),
-Encoders.bean(Relation.class))
-.filter(Objects::nonNull)
-.write()
-.mode(SaveMode.Overwrite)
-.option("compression", "gzip")
-.json("/tmp/orgSourceTarget");
-Utils
-.readPath(spark, "/tmp/orgSourceTarget", Relation.class)
-.map((MapFunction<Relation, eu.dnetlib.dhp.oa.model.graph.Relation>) relation -> {
-eu.dnetlib.dhp.oa.model.graph.Relation relNew = new eu.dnetlib.dhp.oa.model.graph.Relation();
-relNew
-.setSource(getEntityId(relation.getSource(), ENTITY_ID_SEPARATOR));
-relNew.setSourceType(ModelSupport.idPrefixEntity.get(relation.getSource().substring(0, 2)));
-relNew
-.setTarget(getEntityId(relation.getTarget(), ENTITY_ID_SEPARATOR));
-relNew.setTargetType(ModelSupport.idPrefixEntity.get(relation.getTarget().substring(0, 2)));
-relNew
-.setRelType(
-RelType
-.newInstance(
-relation.getRelClass(),
-relation.getSubRelType()));
-Optional<DataInfo> odInfo = Optional.ofNullable(relation.getDataInfo());
-if (odInfo.isPresent()) {
-DataInfo dInfo = odInfo.get();
-if (Optional.ofNullable(dInfo.getProvenanceaction()).isPresent() &&
-Optional.ofNullable(dInfo.getProvenanceaction().getClassname()).isPresent()) {
-relNew
-.setProvenance(
-Provenance
-.newInstance(
-dInfo.getProvenanceaction().getClassname(),
-dInfo.getTrust()));
-}
-}
-if (Boolean.TRUE.equals(relation.getValidated())) {
-relNew.setValidated(relation.getValidated());
-relNew.setValidationDate(relation.getValidationDate());
-}
-return relNew;
-}, Encoders.bean(eu.dnetlib.dhp.oa.model.graph.Relation.class))
-.write()
-.mode(SaveMode.Overwrite)
-.option("compression", "gzip")
-.json(outputPath + "relation");
-}
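relationMap keeps only relations whose source and target both resolve to an organization, but it stages each join on a fixed /tmp path and reads it back. A hedged sketch of the same restriction chained in memory with inner joins (equivalent to the left joins plus nonNull filters above), before the final mapping and write:

Dataset<Relation> bySource = organization
    .joinWith(rels, organization.col("id").equalTo(rels.col("source")))
    .map(
        (MapFunction<Tuple2<eu.dnetlib.dhp.schema.oaf.Organization, Relation>, Relation>) t2 -> t2._2(),
        Encoders.bean(Relation.class));
Dataset<Relation> orgToOrg = organization
    .joinWith(bySource, organization.col("id").equalTo(bySource.col("target")))
    .map(
        (MapFunction<Tuple2<eu.dnetlib.dhp.schema.oaf.Organization, Relation>, Relation>) t2 -> t2._2(),
        Encoders.bean(Relation.class)); // both endpoints are organizations; no intermediate /tmp writes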
private static void organizationMap(SparkSession spark, String inputPath, String outputPath) {
Utils
.readPath(spark, inputPath + "organization", eu.dnetlib.dhp.schema.oaf.Organization.class)
.filter(
(FilterFunction<eu.dnetlib.dhp.schema.oaf.Organization>) o -> !o.getDataInfo().getDeletedbyinference()
&& o.getId().startsWith("20|openorgs"))
.map(
(MapFunction<eu.dnetlib.dhp.schema.oaf.Organization, Organization>) o -> mapOrganization(o),
Encoders.bean(Organization.class))
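The filter keeps only current OpenOrgs records: "20|" is the organization entity prefix, "openorgs" the namespace of the OpenOrgs registry, and deletedbyinference flags records merged away by deduplication. The predicate restated as a standalone helper (hypothetical, not part of this change):

static boolean isDumpableOrganization(eu.dnetlib.dhp.schema.oaf.Organization o) {
    return o.getDataInfo() != null                   // defensive null check, added in this sketch
        && !o.getDataInfo().getDeletedbyinference()  // skip records superseded by deduplication
        && o.getId().startsWith("20|openorgs");      // keep only OpenOrgs-provided organizations
}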

organization_only_input_parameters.json

@@ -0,0 +1,23 @@
[
{
"paramName":"s",
"paramLongName":"sourcePath",
"paramDescription": "the path of the sequencial file to read",
"paramRequired": true
},
{
"paramName": "out",
"paramLongName": "outputPath",
"paramDescription": "the path used to store temporary output files",
"paramRequired": true
},
{
"paramName": "ssm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "true if the spark session is managed, false otherwise",
"paramRequired": false
}
]
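This spec is the jsonConfiguration the job loads above; only sourcePath and outputPath are required, which is why main() wraps isSparkSessionManaged in Optional.ofNullable. A hedged usage sketch (paths hypothetical, in a context that declares throws Exception):

ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(new String[] { "-s", "/tmp/graph/", "-out", "/tmp/dump/" });
parser.get("sourcePath");            // "/tmp/graph/"
parser.get("isSparkSessionManaged"); // null: optional and not supplied, hence the Optional.ofNullable above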

workflow.xml

@@ -1,6 +1,13 @@
<workflow-app name="dump_graph" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>sourcePath</name>
<description>the graph source path</description>
</property>
<property>
<name>outputPath</name>
<description>the dump output path</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>