Master branch updates from beta September 2023 #337

Manually merged
claudio.atzori merged 1271 commits from beta into master 2023-09-06 11:31:09 +02:00
7 changed files with 235 additions and 215 deletions
Showing only changes of commit be320ba3c1 - Show all commits

View File

@ -3,7 +3,6 @@ package eu.dnetlib.dhp.actionmanager;
import java.util.Optional; import java.util.Optional;
import eu.dnetlib.dhp.common.HdfsSupport;
import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders; import org.apache.spark.sql.Encoders;
@ -12,6 +11,7 @@ import org.apache.spark.sql.SparkSession;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty; import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.schema.oaf.Subject; import eu.dnetlib.dhp.schema.oaf.Subject;
@ -94,6 +94,7 @@ public class Constants {
return s; return s;
} }
public static void removeOutputDir(SparkSession spark, String path) { public static void removeOutputDir(SparkSession spark, String path) {
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
} }

View File

@ -8,12 +8,6 @@ import java.util.Arrays;
import java.util.List; import java.util.List;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import eu.dnetlib.dhp.actionmanager.Constants;
import eu.dnetlib.dhp.actionmanager.bipaffiliations.model.*;
import eu.dnetlib.dhp.actionmanager.ror.GenerateRorActionSetJob;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.SequenceFileOutputFormat; import org.apache.hadoop.mapred.SequenceFileOutputFormat;
@ -22,17 +16,23 @@ import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction; import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders; import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import org.apache.spark.sql.Dataset;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.actionmanager.Constants;
import eu.dnetlib.dhp.actionmanager.bipaffiliations.model.*;
import eu.dnetlib.dhp.actionmanager.ror.GenerateRorActionSetJob;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import scala.Tuple2; import scala.Tuple2;
@ -79,7 +79,8 @@ public class PrepareAffiliationRelations implements Serializable {
}); });
} }
private static <I extends Result> void prepareAffiliationRelations(SparkSession spark, String inputPath, String outputPath) { private static <I extends Result> void prepareAffiliationRelations(SparkSession spark, String inputPath,
String outputPath) {
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
@ -89,15 +90,23 @@ public class PrepareAffiliationRelations implements Serializable {
.map(item -> OBJECT_MAPPER.readValue(item, AffiliationRelationDeserializer.class)); .map(item -> OBJECT_MAPPER.readValue(item, AffiliationRelationDeserializer.class));
// convert affiliation to an internal representation // convert affiliation to an internal representation
Dataset<AffiliationRelationModel> affiliationRelations = Dataset<AffiliationRelationModel> affiliationRelations = spark
spark.createDataset( .createDataset(
affiliationRelationsDeserializeRDD.flatMap(entry -> affiliationRelationsDeserializeRDD
entry.getMatchings().stream().flatMap(matching -> .flatMap(
matching.getRorId().stream().map( rorId -> new AffiliationRelationModel( entry -> entry
.getMatchings()
.stream()
.flatMap(
matching -> matching
.getRorId()
.stream()
.map(
rorId -> new AffiliationRelationModel(
entry.getDoi(), entry.getDoi(),
rorId, rorId,
matching.getConfidence() matching.getConfidence())))
))).collect(Collectors.toList()) .collect(Collectors.toList())
.iterator()) .iterator())
.rdd(), .rdd(),
Encoders.bean(AffiliationRelationModel.class)); Encoders.bean(AffiliationRelationModel.class));
@ -107,19 +116,22 @@ public class PrepareAffiliationRelations implements Serializable {
.flatMap((FlatMapFunction<AffiliationRelationModel, Relation>) affRel -> { .flatMap((FlatMapFunction<AffiliationRelationModel, Relation>) affRel -> {
// DOI to OpenAIRE id // DOI to OpenAIRE id
final String paperId = ID_PREFIX + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", affRel.getDoi())); final String paperId = ID_PREFIX
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", affRel.getDoi()));
// ROR id to OpenAIRE id // ROR id to OpenAIRE id
final String affId = GenerateRorActionSetJob.calculateOpenaireId(affRel.getRorId()); final String affId = GenerateRorActionSetJob.calculateOpenaireId(affRel.getRorId());
Qualifier qualifier = OafMapperUtils.qualifier( Qualifier qualifier = OafMapperUtils
.qualifier(
BIP_AFFILIATIONS_CLASSID, BIP_AFFILIATIONS_CLASSID,
BIP_AFFILIATIONS_CLASSNAME, BIP_AFFILIATIONS_CLASSNAME,
ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS,
ModelConstants.DNET_PROVENANCE_ACTIONS); ModelConstants.DNET_PROVENANCE_ACTIONS);
// format data info; setting `confidence` into relation's `trust` // format data info; setting `confidence` into relation's `trust`
DataInfo dataInfo = OafMapperUtils.dataInfo( DataInfo dataInfo = OafMapperUtils
.dataInfo(
false, false,
BIP_INFERENCE_PROVENANCE, BIP_INFERENCE_PROVENANCE,
true, true,
@ -141,8 +153,10 @@ public class PrepareAffiliationRelations implements Serializable {
} }
private static List<Relation> getAffiliationRelationPair(String paperId, String affId, DataInfo dataInfo) { private static List<Relation> getAffiliationRelationPair(String paperId, String affId, DataInfo dataInfo) {
return Arrays.asList( return Arrays
OafMapperUtils.getRelation( .asList(
OafMapperUtils
.getRelation(
paperId, paperId,
affId, affId,
ModelConstants.RESULT_ORGANIZATION, ModelConstants.RESULT_ORGANIZATION,
@ -151,7 +165,8 @@ public class PrepareAffiliationRelations implements Serializable {
null, null,
dataInfo, dataInfo,
null), null),
OafMapperUtils.getRelation( OafMapperUtils
.getRelation(
affId, affId,
paperId, paperId,
ModelConstants.RESULT_ORGANIZATION, ModelConstants.RESULT_ORGANIZATION,
@ -159,7 +174,6 @@ public class PrepareAffiliationRelations implements Serializable {
ModelConstants.IS_AUTHOR_INSTITUTION_OF, ModelConstants.IS_AUTHOR_INSTITUTION_OF,
null, null,
dataInfo, dataInfo,
null) null));
);
} }
} }

View File

@ -1,11 +1,13 @@
package eu.dnetlib.dhp.actionmanager.bipaffiliations.model;
import com.fasterxml.jackson.annotation.JsonProperty; package eu.dnetlib.dhp.actionmanager.bipaffiliations.model;
import lombok.Data;
import java.io.Serializable; import java.io.Serializable;
import java.util.List; import java.util.List;
import com.fasterxml.jackson.annotation.JsonProperty;
import lombok.Data;
@Data @Data
public class AffiliationRelationDeserializer implements Serializable { public class AffiliationRelationDeserializer implements Serializable {
@JsonProperty("DOI") @JsonProperty("DOI")
@ -23,4 +25,3 @@ public class AffiliationRelationDeserializer implements Serializable {
} }
} }

View File

@ -1,11 +1,12 @@
package eu.dnetlib.dhp.actionmanager.bipaffiliations.model; package eu.dnetlib.dhp.actionmanager.bipaffiliations.model;
import java.io.Serializable;
import lombok.AllArgsConstructor; import lombok.AllArgsConstructor;
import lombok.Data; import lombok.Data;
import lombok.NoArgsConstructor; import lombok.NoArgsConstructor;
import java.io.Serializable;
@Data @Data
@AllArgsConstructor @AllArgsConstructor
public class AffiliationRelationModel implements Serializable { public class AffiliationRelationModel implements Serializable {

View File

@ -28,15 +28,8 @@ oozie.use.system.libpath=true
spark2ExtraListeners=com.cloudera.spark.lineage.NavigatorAppListener spark2ExtraListeners=com.cloudera.spark.lineage.NavigatorAppListener
spark2SqlQueryExecutionListeners=com.cloudera.spark.lineage.NavigatorQueryListener spark2SqlQueryExecutionListeners=com.cloudera.spark.lineage.NavigatorQueryListener
# I think this should be the oozie workflow directory
# oozieWorkflowPath=/user/ilias.kanellos/workflow_example/
# The workflow application path
wfAppPath=${oozieTopWfApplicationPath}
# The following is needed as a property of a workflow # The following is needed as a property of a workflow
oozie.wf.application.path=${oozieTopWfApplicationPath} oozie.wf.application.path=${oozieTopWfApplicationPath}
inputPath=/user/schatz/affiliations/data-v3.json inputPath=/user/schatz/affiliations/data-v3.1.json
outputPath=/tmp/crossref-affiliations-output-v3 outputPath=/tmp/crossref-affiliations-output-v3.1

View File

@ -1,4 +1,4 @@
<workflow-app name="BipFinderScore" xmlns="uri:oozie:workflow:0.5"> <workflow-app name="BipAffiliations" xmlns="uri:oozie:workflow:0.5">
<parameters> <parameters>
<property> <property>
@ -84,7 +84,7 @@
<master>yarn</master> <master>yarn</master>
<mode>cluster</mode> <mode>cluster</mode>
<name>Produces the atomic action with the inferred by BIP! affiliation relations from Crossref</name> <name>Produces the atomic action with the inferred by BIP! affiliation relations from Crossref</name>
<class>eu.dnetlib.dhp.actionmanager.bipaffiliations.PrepareAffiliationRelations/class> <class>eu.dnetlib.dhp.actionmanager.bipaffiliations.PrepareAffiliationRelations</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar> <jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts> <spark-opts>
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}

View File

@ -1,3 +1,4 @@
package eu.dnetlib.dhp.actionmanager.bipaffiliations; package eu.dnetlib.dhp.actionmanager.bipaffiliations;
import static org.junit.jupiter.api.Assertions.*; import static org.junit.jupiter.api.Assertions.*;
@ -6,10 +7,6 @@ import java.io.IOException;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import org.apache.commons.io.FileUtils; import org.apache.commons.io.FileUtils;
import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Text;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
@ -29,6 +26,10 @@ import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.action.AtomicAction; import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
public class PrepareAffiliationRelationsTest { public class PrepareAffiliationRelationsTest {
@ -105,16 +106,21 @@ public class PrepareAffiliationRelationsTest {
Dataset<Relation> dataset = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class)); Dataset<Relation> dataset = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class));
dataset.createOrReplaceTempView("result"); dataset.createOrReplaceTempView("result");
Dataset<Row> execVerification = spark.sql("select r.relType, r.relClass, r.source, r.target, r.dataInfo.trust from result r"); Dataset<Row> execVerification = spark
.sql("select r.relType, r.relClass, r.source, r.target, r.dataInfo.trust from result r");
// verify that we have equal number of bi-directional relations // verify that we have equal number of bi-directional relations
Assertions.assertEquals(8, execVerification Assertions
.assertEquals(
8, execVerification
.filter( .filter(
"relClass='" + ModelConstants.HAS_AUTHOR_INSTITUTION + "'") "relClass='" + ModelConstants.HAS_AUTHOR_INSTITUTION + "'")
.collectAsList() .collectAsList()
.size()); .size());
Assertions.assertEquals(8, execVerification Assertions
.assertEquals(
8, execVerification
.filter( .filter(
"relClass='" + ModelConstants.IS_AUTHOR_INSTITUTION_OF + "'") "relClass='" + ModelConstants.IS_AUTHOR_INSTITUTION_OF + "'")
.collectAsList() .collectAsList()
@ -123,13 +129,17 @@ public class PrepareAffiliationRelationsTest {
// check confidence value of a specific relation // check confidence value of a specific relation
String sourceDOI = "10.1105/tpc.8.3.343"; String sourceDOI = "10.1105/tpc.8.3.343";
final String sourceOpenaireId = ID_PREFIX + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", sourceDOI)); final String sourceOpenaireId = ID_PREFIX
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", sourceDOI));
Assertions.assertEquals("0.7071067812", execVerification Assertions
.assertEquals(
"0.7071067812", execVerification
.filter( .filter(
"source='" + sourceOpenaireId + "'") "source='" + sourceOpenaireId + "'")
.collectAsList().get(0).getString(4)); .collectAsList()
.get(0)
.getString(4));
} }
} }