fixed issue. Need to extend wf
This commit is contained in:
parent
fd242c1c87
commit
0f602bae9d
|
@ -76,13 +76,13 @@ public class ExtendEoscResultWithOrganizationStep2 implements Serializable {
|
||||||
isSparkSessionManaged,
|
isSparkSessionManaged,
|
||||||
spark -> {
|
spark -> {
|
||||||
Utils.removeOutputDir(spark, workingPath + resultType + "extendedaffiliation");
|
Utils.removeOutputDir(spark, workingPath + resultType + "extendedaffiliation");
|
||||||
addOrganizations(spark, inputPath, workingPath, outputPath, resultType);
|
addOrganizations(spark, inputPath, workingPath, resultType);
|
||||||
dumpOrganizationAndRelations(spark, inputPath, workingPath, outputPath, resultType);
|
dumpOrganizationAndRelations(spark, inputPath, workingPath, resultType);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void dumpOrganizationAndRelations(SparkSession spark, String inputPath, String workingPath,
|
private static void dumpOrganizationAndRelations(SparkSession spark, String inputPath, String workingPath,
|
||||||
String outputPath, String resultType) {
|
String resultType) {
|
||||||
Dataset<Relation> relation = Utils
|
Dataset<Relation> relation = Utils
|
||||||
.readPath(spark, inputPath + "/relation", Relation.class)
|
.readPath(spark, inputPath + "/relation", Relation.class)
|
||||||
.filter(
|
.filter(
|
||||||
|
@ -102,36 +102,46 @@ public class ExtendEoscResultWithOrganizationStep2 implements Serializable {
|
||||||
.map((MapFunction<Tuple2<Result, Relation>, Relation>) t2 -> t2._2(), Encoders.bean(Relation.class));
|
.map((MapFunction<Tuple2<Result, Relation>, Relation>) t2 -> t2._2(), Encoders.bean(Relation.class));
|
||||||
|
|
||||||
// from eoscRelation select the organization
|
// from eoscRelation select the organization
|
||||||
eoscRelation
|
Dataset<String> organizationIds = eoscRelation
|
||||||
.joinWith(organization, eoscRelation.col("target").equalTo(organization.col("id")))
|
.joinWith(organization, eoscRelation.col("target").equalTo(organization.col("id")))
|
||||||
.map(
|
.map(
|
||||||
(MapFunction<Tuple2<Relation, Organization>, eu.dnetlib.dhp.eosc.model.Organization>) t2 -> mapOrganization(
|
(MapFunction<Tuple2<Relation, Organization>, String>) t2 -> t2._2().getId(),
|
||||||
|
Encoders.STRING())
|
||||||
|
.distinct();
|
||||||
|
|
||||||
|
organizationIds
|
||||||
|
.joinWith(organization, organizationIds.col("value").equalTo(organization.col("id")))
|
||||||
|
.map(
|
||||||
|
(MapFunction<Tuple2<String, Organization>, eu.dnetlib.dhp.eosc.model.Organization>) t2 -> mapOrganization(
|
||||||
t2._2()),
|
t2._2()),
|
||||||
Encoders.bean(eu.dnetlib.dhp.eosc.model.Organization.class))
|
Encoders.bean(eu.dnetlib.dhp.eosc.model.Organization.class))
|
||||||
.groupByKey((MapFunction<eu.dnetlib.dhp.eosc.model.Organization, String>) o -> o.getId(), Encoders.STRING())
|
|
||||||
.mapGroups(
|
.filter(Objects::nonNull)
|
||||||
(MapGroupsFunction<String, eu.dnetlib.dhp.eosc.model.Organization, eu.dnetlib.dhp.eosc.model.Organization>) (
|
|
||||||
k, v) -> v.next(),
|
|
||||||
Encoders.bean(eu.dnetlib.dhp.eosc.model.Organization.class))
|
|
||||||
.write()
|
.write()
|
||||||
.mode(SaveMode.Overwrite)
|
.mode(SaveMode.Overwrite)
|
||||||
.option("compression", "gzip")
|
.option("compression", "gzip")
|
||||||
.json(outputPath + "organization");
|
.json(workingPath + resultType + "/organization");
|
||||||
|
|
||||||
eoscRelation
|
eoscRelation
|
||||||
.joinWith(organization, eoscRelation.col("target").equalTo(organization.col("id")))
|
.joinWith(organization, eoscRelation.col("target").equalTo(organization.col("id")))
|
||||||
.map(
|
.map(
|
||||||
(MapFunction<Tuple2<Relation, Organization>, eu.dnetlib.dhp.eosc.model.Relation>) t2 -> eu.dnetlib.dhp.eosc.model.Relation
|
(MapFunction<Tuple2<Relation, Organization>, eu.dnetlib.dhp.eosc.model.Relation>) t2 -> {
|
||||||
.newInstance(t2._1().getSource(), t2._1().getTarget()),
|
if (isToBeDumpedOrg(t2._2()))
|
||||||
|
return eu.dnetlib.dhp.eosc.model.Relation
|
||||||
|
.newInstance(t2._1().getSource(), t2._1().getTarget());
|
||||||
|
return null;
|
||||||
|
},
|
||||||
Encoders.bean(eu.dnetlib.dhp.eosc.model.Relation.class))
|
Encoders.bean(eu.dnetlib.dhp.eosc.model.Relation.class))
|
||||||
|
.filter(Objects::nonNull)
|
||||||
.write()
|
.write()
|
||||||
.mode(SaveMode.Overwrite)
|
.mode(SaveMode.Overwrite)
|
||||||
.option("compression", "gzip")
|
.option("compression", "gzip")
|
||||||
.json(outputPath + "resultOrganization");
|
.json(workingPath + resultType + "/resultOrganization");
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void addOrganizations(SparkSession spark, String inputPath, String workingPath, String outputPath,
|
private static void addOrganizations(SparkSession spark, String inputPath, String workingPath,
|
||||||
String resultType) {
|
String resultType) {
|
||||||
|
|
||||||
Dataset<Result> results = Utils
|
Dataset<Result> results = Utils
|
||||||
|
@ -219,10 +229,7 @@ public class ExtendEoscResultWithOrganizationStep2 implements Serializable {
|
||||||
|
|
||||||
private static eu.dnetlib.dhp.eosc.model.Organization mapOrganization(Organization org) {
|
private static eu.dnetlib.dhp.eosc.model.Organization mapOrganization(Organization org) {
|
||||||
|
|
||||||
if (Boolean.TRUE.equals(org.getDataInfo().getDeletedbyinference()))
|
if (isToBeDumpedOrg(org))
|
||||||
return null;
|
|
||||||
if (!Optional.ofNullable(org.getLegalname()).isPresent()
|
|
||||||
&& !Optional.ofNullable(org.getLegalshortname()).isPresent())
|
|
||||||
return null;
|
return null;
|
||||||
|
|
||||||
eu.dnetlib.dhp.eosc.model.Organization organization = new eu.dnetlib.dhp.eosc.model.Organization();
|
eu.dnetlib.dhp.eosc.model.Organization organization = new eu.dnetlib.dhp.eosc.model.Organization();
|
||||||
|
@ -278,4 +285,13 @@ public class ExtendEoscResultWithOrganizationStep2 implements Serializable {
|
||||||
return organization;
|
return organization;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static boolean isToBeDumpedOrg(Organization org) {
|
||||||
|
if (Boolean.TRUE.equals(org.getDataInfo().getDeletedbyinference()))
|
||||||
|
return true;
|
||||||
|
if (!Optional.ofNullable(org.getLegalname()).isPresent()
|
||||||
|
&& !Optional.ofNullable(org.getLegalshortname()).isPresent())
|
||||||
|
return true;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -12,4 +12,5 @@ depositionId=6616871
|
||||||
removeSet=merges;isMergedIn
|
removeSet=merges;isMergedIn
|
||||||
postgresURL=jdbc:postgresql://postgresql.services.openaire.eu:5432/dnet_openaireplus
|
postgresURL=jdbc:postgresql://postgresql.services.openaire.eu:5432/dnet_openaireplus
|
||||||
postgresUser=dnet
|
postgresUser=dnet
|
||||||
postgresPassword=dnetPwd
|
postgresPassword=dnetPwd
|
||||||
|
isLookUpUrl=http://services.openaire.eu:8280/is/services/isLookUp?wsdl
|
Loading…
Reference in New Issue