refactoring

This commit is contained in:
Miriam Baglioni 2023-05-31 18:56:58 +02:00
parent 97d72d41c3
commit daf4d7971b
7 changed files with 518 additions and 496 deletions

View File

@ -185,7 +185,8 @@ public class PropagationConstant {
String source,
String target,
String rel_class) {
return getRelation(source, target ,
return getRelation(
source, target,
rel_class,
ModelConstants.PROJECT_ORGANIZATION,
ModelConstants.PARTICIPATION,
@ -198,7 +199,8 @@ public class PropagationConstant {
String source,
String target,
String rel_class) {
return getRelation(source, target ,
return getRelation(
source, target,
rel_class,
ModelConstants.RESULT_ORGANIZATION,
ModelConstants.AFFILIATION,

View File

@ -7,7 +7,6 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
import java.io.Serializable;
import java.util.*;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.*;
@ -15,6 +14,8 @@ import org.apache.spark.sql.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.KeyValueSet;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.resulttoorganizationfrominstrepo.SparkResultToOrganizationFromIstRepoJob;
@ -103,7 +104,8 @@ public class PrepareInfo implements Serializable {
}
private static void prepareInfo(SparkSession spark, String inputPath, String childParentOrganizationPath,
String currentIterationPath, String resultOrganizationPath, String projectOrganizationPath, String relationPath) {
String currentIterationPath, String resultOrganizationPath, String projectOrganizationPath,
String relationPath) {
Dataset<Relation> relation = readPath(spark, inputPath + "/relation", Relation.class);
relation.createOrReplaceTempView("relation");

View File

@ -155,7 +155,8 @@ public class SparkResultToOrganizationFromSemRel implements Serializable {
}
private static void doPropagate(SparkSession spark, String leavesPath, String childParentPath,
String resultOrganizationPath, String projectOrganizationPath, String graphPath, String workingPath, String outputPath,
String resultOrganizationPath, String projectOrganizationPath, String graphPath, String workingPath,
String outputPath,
PropagationCounter propagationCounter) {
int iteration = 0;
long leavesCount;
@ -253,8 +254,7 @@ public class SparkResultToOrganizationFromSemRel implements Serializable {
.mapGroups(
(MapGroupsFunction<String, Relation, Relation>) (k, it) -> it.next(), Encoders.bean(Relation.class))
.flatMap(
(FlatMapFunction<Relation, Relation>) r ->
{
(FlatMapFunction<Relation, Relation>) r -> {
if (r.getSource().startsWith("50|")) {
return Arrays
.asList(
@ -270,8 +270,6 @@ public class SparkResultToOrganizationFromSemRel implements Serializable {
}
}
, Encoders.bean(Relation.class))
.write()

View File

@ -50,7 +50,8 @@ public class StepActions implements Serializable {
spark, resultOrgPath, readPath(spark, selectedRelsPath, Relation.class), orgOutputPath);
}
public static void prepareForNextStep(SparkSession spark, String selectedRelsPath, String resultOrgPath, String projectOrgPath,
public static void prepareForNextStep(SparkSession spark, String selectedRelsPath, String resultOrgPath,
String projectOrgPath,
String leavesPath, String chldParentOrgPath, String leavesOutputPath,
String orgOutputPath, String outputProjectPath) {
// use of the parents as new leaves set
@ -58,10 +59,12 @@ public class StepActions implements Serializable {
// add the new relations obtained from propagation to the keyvalueset result organization
updateEntityOrganization(
spark, resultOrgPath, readPath(spark, selectedRelsPath + NEW_RESULT_RELATION_PATH, Relation.class), orgOutputPath);
spark, resultOrgPath, readPath(spark, selectedRelsPath + NEW_RESULT_RELATION_PATH, Relation.class),
orgOutputPath);
updateEntityOrganization(
spark, projectOrgPath, readPath(spark, selectedRelsPath + NEW_PROJECT_RELATION_PATH, Relation.class), outputProjectPath);
spark, projectOrgPath, readPath(spark, selectedRelsPath + NEW_PROJECT_RELATION_PATH, Relation.class),
outputProjectPath);
}
private static void updateEntityOrganization(SparkSession spark, String entityOrgPath,
@ -128,7 +131,6 @@ public class StepActions implements Serializable {
// construction of the set)
// if at least one relation in the set was not produced by propagation no new relation will be returned
relationDataset
.union(newRels)
.groupByKey((MapFunction<Relation, String>) r -> r.getSource() + r.getTarget(), Encoders.STRING())
@ -145,7 +147,8 @@ public class StepActions implements Serializable {
.getDataInfo()
.getProvenanceaction()
.getClassid()
.equals(PROPAGATION_RELATION_RESULT_ORGANIZATION_SEM_REL_CLASS_ID) && !rel
.equals(PROPAGATION_RELATION_RESULT_ORGANIZATION_SEM_REL_CLASS_ID)
&& !rel
.getDataInfo()
.getProvenanceaction()
.getClassid()
@ -166,8 +169,6 @@ public class StepActions implements Serializable {
.option("compression", "gzip")
.json(newRelationPath);
}
// get the possible relations from propagation
@ -202,7 +203,6 @@ public class StepActions implements Serializable {
"GROUP BY entityId")
.as(Encoders.bean(KeyValueSet.class));
// create new relations from entity to organization for each entity linked to a leaf
return resultParent
.flatMap(
@ -220,6 +220,4 @@ public class StepActions implements Serializable {
}
}

View File

@ -161,19 +161,24 @@ public class SparkJobTest {
.foreach(r -> Assertions.assertEquals(ModelConstants.HAS_AUTHOR_INSTITUTION, r.getRelClass()));
Assertions
.assertEquals(
2, result.filter(r -> r.getSource().equals("50|doajarticles::1cae0b82b56ccd97c2db1f698def7074")).count());
2,
result.filter(r -> r.getSource().equals("50|doajarticles::1cae0b82b56ccd97c2db1f698def7074")).count());
Assertions
.assertEquals(
3, result.filter(r -> r.getSource().equals("50|dedup_wf_001::2899e571609779168222fdeb59cb916d")).count());
3,
result.filter(r -> r.getSource().equals("50|dedup_wf_001::2899e571609779168222fdeb59cb916d")).count());
Assertions
.assertEquals(
2, result.filter(r -> r.getSource().equals("50|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")).count());
2,
result.filter(r -> r.getSource().equals("50|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")).count());
Assertions
.assertEquals(
1, result.filter(r -> r.getSource().equals("50|openaire____::ec653e804967133b9436fdd30d3ff51d")).count());
1,
result.filter(r -> r.getSource().equals("50|openaire____::ec653e804967133b9436fdd30d3ff51d")).count());
Assertions
.assertEquals(
1, result.filter(r -> r.getSource().equals("50|doajarticles::03748bcb5d754c951efec9700e18a56d")).count());
1,
result.filter(r -> r.getSource().equals("50|doajarticles::03748bcb5d754c951efec9700e18a56d")).count());
Assertions.assertEquals(9, result.filter(r -> r.getSource().substring(0, 3).equals("20|")).count());
result
@ -181,19 +186,24 @@ public class SparkJobTest {
.foreach(r -> Assertions.assertEquals(ModelConstants.IS_AUTHOR_INSTITUTION_OF, r.getRelClass()));
Assertions
.assertEquals(
1, result.filter(r -> r.getSource().equals("20|doajarticles::1cae0b82b56ccd97c2db1f698def7074")).count());
1,
result.filter(r -> r.getSource().equals("20|doajarticles::1cae0b82b56ccd97c2db1f698def7074")).count());
Assertions
.assertEquals(
1, result.filter(r -> r.getSource().equals("20|dedup_wf_001::2899e571609779168222fdeb59cb916d")).count());
1,
result.filter(r -> r.getSource().equals("20|dedup_wf_001::2899e571609779168222fdeb59cb916d")).count());
Assertions
.assertEquals(
2, result.filter(r -> r.getSource().equals("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")).count());
2,
result.filter(r -> r.getSource().equals("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")).count());
Assertions
.assertEquals(
2, result.filter(r -> r.getSource().equals("20|openaire____::ec653e804967133b9436fdd30d3ff51d")).count());
2,
result.filter(r -> r.getSource().equals("20|openaire____::ec653e804967133b9436fdd30d3ff51d")).count());
Assertions
.assertEquals(
3, result.filter(r -> r.getSource().equals("20|doajarticles::03748bcb5d754c951efec9700e18a56d")).count());
3,
result.filter(r -> r.getSource().equals("20|doajarticles::03748bcb5d754c951efec9700e18a56d")).count());
Assertions
.assertTrue(
@ -395,7 +405,8 @@ public class SparkJobTest {
Assertions.assertEquals(36, temp.count());
JavaRDD<Relation> project = temp.filter(r -> r.getSource().startsWith("40|") || r.getTarget().startsWith("40|"));
JavaRDD<Relation> project = temp
.filter(r -> r.getSource().startsWith("40|") || r.getTarget().startsWith("40|"));
Assertions.assertEquals(18, project.count());
project.foreach(r -> Assertions.assertEquals(ModelConstants.PARTICIPATION, r.getSubRelType()));
@ -430,19 +441,24 @@ public class SparkJobTest {
.foreach(r -> Assertions.assertEquals(ModelConstants.HAS_PARTICIPANT, r.getRelClass()));
Assertions
.assertEquals(
2, project.filter(r -> r.getSource().equals("40|doajarticles::1cae0b82b56ccd97c2db1f698def7074")).count());
2,
project.filter(r -> r.getSource().equals("40|doajarticles::1cae0b82b56ccd97c2db1f698def7074")).count());
Assertions
.assertEquals(
3, project.filter(r -> r.getSource().equals("40|dedup_wf_001::2899e571609779168222fdeb59cb916d")).count());
3,
project.filter(r -> r.getSource().equals("40|dedup_wf_001::2899e571609779168222fdeb59cb916d")).count());
Assertions
.assertEquals(
2, project.filter(r -> r.getSource().equals("40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")).count());
2,
project.filter(r -> r.getSource().equals("40|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")).count());
Assertions
.assertEquals(
1, project.filter(r -> r.getSource().equals("40|openaire____::ec653e804967133b9436fdd30d3ff51d")).count());
1,
project.filter(r -> r.getSource().equals("40|openaire____::ec653e804967133b9436fdd30d3ff51d")).count());
Assertions
.assertEquals(
1, project.filter(r -> r.getSource().equals("40|doajarticles::03748bcb5d754c951efec9700e18a56d")).count());
1,
project.filter(r -> r.getSource().equals("40|doajarticles::03748bcb5d754c951efec9700e18a56d")).count());
Assertions.assertEquals(9, project.filter(r -> r.getSource().substring(0, 3).equals("20|")).count());
project
@ -450,19 +466,24 @@ public class SparkJobTest {
.foreach(r -> Assertions.assertEquals(ModelConstants.IS_PARTICIPANT, r.getRelClass()));
Assertions
.assertEquals(
1, project.filter(r -> r.getSource().equals("20|doajarticles::1cae0b82b56ccd97c2db1f698def7074")).count());
1,
project.filter(r -> r.getSource().equals("20|doajarticles::1cae0b82b56ccd97c2db1f698def7074")).count());
Assertions
.assertEquals(
1, project.filter(r -> r.getSource().equals("20|dedup_wf_001::2899e571609779168222fdeb59cb916d")).count());
1,
project.filter(r -> r.getSource().equals("20|dedup_wf_001::2899e571609779168222fdeb59cb916d")).count());
Assertions
.assertEquals(
2, project.filter(r -> r.getSource().equals("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")).count());
2,
project.filter(r -> r.getSource().equals("20|doajarticles::2baa9032dc058d3c8ff780c426b0c19f")).count());
Assertions
.assertEquals(
2, project.filter(r -> r.getSource().equals("20|openaire____::ec653e804967133b9436fdd30d3ff51d")).count());
2,
project.filter(r -> r.getSource().equals("20|openaire____::ec653e804967133b9436fdd30d3ff51d")).count());
Assertions
.assertEquals(
3, project.filter(r -> r.getSource().equals("20|doajarticles::03748bcb5d754c951efec9700e18a56d")).count());
3,
project.filter(r -> r.getSource().equals("20|doajarticles::03748bcb5d754c951efec9700e18a56d")).count());
Assertions
.assertTrue(

View File

@ -87,7 +87,8 @@ public class StepActionsTest {
getClass()
.getResource(
"/eu/dnetlib/dhp/entitytoorganizationfromsemrel/execstep/resultOrganization/")
.getPath(), ModelConstants.HAS_AUTHOR_INSTITUTION);
.getPath(),
ModelConstants.HAS_AUTHOR_INSTITUTION);
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());