-

2021-12-14 11:12:17 +01:00 · 2021-12-14 11:12:17 +01:00 · 4eb8276493
parent 936578aaf1
commit 4eb8276493
8 changed files with 185 additions and 252 deletions
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/Constants.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/Constants.java
@ -0,0 +1,49 @@
+
+package eu.dnetlib.dhp.actionmanager.bipfinder;
+
+import java.util.Optional;
+
+import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SparkSession;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+
+public class Constants {
+	// Constants and static helpers declared for the actionmanager.bipfinder package.
+	public static final String DOI = "doi"; // presumably matched against pid.qualifier.classid — verify against callers
+	// NOTE(review): SparkAtomicActionScoreJob passes identically named UPDATE_* constants to OafMapperUtils.dataInfo/qualifier but statically imports createunresolvedentities.Constants — confirm which class is meant.
+	public static final String UPDATE_DATA_INFO_TYPE = "update";
+	public static final String UPDATE_SUBJECT_FOS_CLASS_ID = "subject:fos";
+	public static final String UPDATE_CLASS_NAME = "Inferred by OpenAIRE";
+	public static final String UPDATE_MEASURE_BIP_CLASS_ID = "measure:bip";
+	// Identifier and display name for the Fields of Science and Technology (FOS) classification.
+	public static final String FOS_CLASS_ID = "FOS";
+	public static final String FOS_CLASS_NAME = "Fields of Science and Technology classification";
+	// Literal "NULL" string — NOTE(review): usage is not visible here; presumably a placeholder for missing values, confirm.
+	public static final String NULL = "NULL";
+	// Single ObjectMapper instance shared by readPath below.
+	public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+	// Private constructor: the class only exposes static members and is not meant to be instantiated.
+	private Constants() {
+	}
+	/** Returns the "isSparkSessionManaged" argument converted with Boolean.valueOf, or TRUE when the parser yields null for it. */
+	public static Boolean isSparkSessionManaged(ArgumentApplicationParser parser) {
+		return Optional
+			.ofNullable(parser.get("isSparkSessionManaged"))
+			.map(Boolean::valueOf)
+			.orElse(Boolean.TRUE);
+	}
+	/** Reads inputPath as text and deserializes each text record with OBJECT_MAPPER into clazz, returning a bean-encoded Dataset. */
+	public static <R> Dataset<R> readPath(
+		SparkSession spark, String inputPath, Class<R> clazz) {
+		return spark
+			.read()
+			.textFile(inputPath)
+			.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
+	}
+
+}
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/PreparedResult.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/PreparedResult.java
@ -1,28 +0,0 @@
-
-package eu.dnetlib.dhp.actionmanager.bipfinder;
-
-import java.io.Serializable;
-
-/**
- * Subset of the information of the generic results that are needed to create the atomic action
- */
-public class PreparedResult implements Serializable {
-	private String id; // openaire id
-	private String value; // doi
-
-	public String getId() {
-		return id;
-	}
-
-	public void setId(String id) {
-		this.id = id;
-	}
-
-	public String getValue() {
-		return value;
-	}
-
-	public void setValue(String value) {
-		this.value = value;
-	}
-}
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java
@ -1,6 +1,7 @@

 package eu.dnetlib.dhp.actionmanager.bipfinder;

+import static eu.dnetlib.dhp.actionmanager.createunresolvedentities.Constants.*;
 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

 import java.io.Serializable;
@ -18,6 +19,7 @@ import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.api.java.function.MapGroupsFunction;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.Row;
 import org.apache.spark.sql.SparkSession;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@ -27,8 +29,10 @@ import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.common.HdfsSupport;
 import eu.dnetlib.dhp.schema.action.AtomicAction;
+import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.oaf.*;
 import eu.dnetlib.dhp.schema.oaf.KeyValue;
+import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
 import scala.Tuple2;

 /**
@ -105,32 +109,29 @@ public class SparkAtomicActionScoreJob implements Serializable {

 		results.createOrReplaceTempView("result");

-		Dataset<PreparedResult> preparedResult = spark
+		Dataset<Row> preparedResult = spark
 			.sql(
-				"select pIde.value value, id " +
+				"select id " +
 					"from result " +
-					"lateral view explode (pid) p as pIde " +
-					"where dataInfo.deletedbyinference = false and pIde.qualifier.classid = '" + DOI + "'")
-			.as(Encoders.bean(PreparedResult.class));
+
+					"where dataInfo.deletedbyinference = false ");

 		bipScores
 			.joinWith(
-				preparedResult, bipScores.col("id").equalTo(preparedResult.col("value")),
+				preparedResult, bipScores.col("id").equalTo(preparedResult.col("id")),
 				"inner")
-			.map((MapFunction<Tuple2<BipScore, PreparedResult>, BipScore>) value -> {
+			.map((MapFunction<Tuple2<BipScore, Row>, BipScore>) value -> {
 				BipScore ret = value._1();
-				ret.setId(value._2().getId());
+				ret.setId(value._1().getId());
 				return ret;
 			}, Encoders.bean(BipScore.class))
-			.groupByKey((MapFunction<BipScore, String>) BipScore::getId, Encoders.STRING())
-			.mapGroups((MapGroupsFunction<String, BipScore, Result>) (k, it) -> {
-				Result ret = new Result();
-				ret.setDataInfo(getDataInfo());
-				BipScore first = it.next();
-				ret.setId(first.getId());

-				ret.setMeasures(getMeasure(first));
-				it.forEachRemaining(value -> ret.getMeasures().addAll(getMeasure(value)));
+			.map((MapFunction<BipScore, Result>) bs -> {
+				Result ret = new Result();
+
+				ret.setId(bs.getId());
+
+				ret.setMeasures(getMeasure(bs));

 				return ret;
 			}, Encoders.bean(Result.class))
@ -159,7 +160,21 @@ public class SparkAtomicActionScoreJob implements Serializable {
 								KeyValue kv = new KeyValue();
 								kv.setValue(unit.getValue());
 								kv.setKey(unit.getKey());
-								kv.setDataInfo(getDataInfo());
+								kv
+									.setDataInfo(
+										OafMapperUtils
+											.dataInfo(
+												false,
+												UPDATE_DATA_INFO_TYPE,
+												true,
+												false,
+												OafMapperUtils
+													.qualifier(
+														UPDATE_MEASURE_BIP_CLASS_ID,
+														UPDATE_CLASS_NAME,
+														ModelConstants.DNET_PROVENANCE_ACTIONS,
+														ModelConstants.DNET_PROVENANCE_ACTIONS),
+												""));
 								return kv;
 							})
 							.collect(Collectors.toList()));
@ -168,21 +183,6 @@ public class SparkAtomicActionScoreJob implements Serializable {
 			.collect(Collectors.toList());
 	}

-	private static DataInfo getDataInfo() {
-		DataInfo di = new DataInfo();
-		di.setInferred(false);
-		di.setInvisible(false);
-		di.setDeletedbyinference(false);
-		di.setTrust("");
-		Qualifier qualifier = new Qualifier();
-		qualifier.setClassid("sysimport:actionset");
-		qualifier.setClassname("Harvested");
-		qualifier.setSchemename("dnet:provenanceActions");
-		qualifier.setSchemeid("dnet:provenanceActions");
-		di.setProvenanceaction(qualifier);
-		return di;
-	}
-
 	private static void removeOutputDir(SparkSession spark, String path) {
 		HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
 	}
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/oozie_app/config-default.xml
@ -0,0 +1,30 @@
+<configuration> <!-- Default property values for the bipfinder oozie_app workflow. -->
+    <property>
+        <name>jobTracker</name>
+        <value>yarnRM</value>
+    </property>
+    <property>
+        <name>nameNode</name>
+        <value>hdfs://nameservice1</value>
+    </property>
+    <property>
+        <name>oozie.use.system.libpath</name>
+        <value>true</value>
+    </property>
+    <property>
+        <name>hiveMetastoreUris</name>
+        <value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value> <!-- NOTE(review): host name suggests a test cluster; confirm this default is intended. -->
+    </property>
+    <property>
+        <name>hiveJdbcUrl</name>
+        <value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000</value> <!-- NOTE(review): same test-cluster host as hiveMetastoreUris; confirm. -->
+    </property>
+    <property>
+        <name>hiveDbName</name>
+        <value>openaire</value>
+    </property>
+    <property>
+        <name>oozie.launcher.mapreduce.user.classpath.first</name>
+        <value>true</value>
+    </property>
+</configuration>
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/oozie_app/workflow.xml
@ -13,6 +13,40 @@
            <name>outputPath</name>
            <description>the path where to store the actionset</description>
        </property>
+        <property>
+            <name>sparkDriverMemory</name>
+            <description>memory for driver process</description>
+        </property>
+        <property>
+            <name>sparkExecutorMemory</name>
+            <description>memory for individual executor</description>
+        </property>
+        <property>
+            <name>sparkExecutorCores</name>
+            <description>number of cores used by single executor</description>
+        </property>
+        <property>
+            <name>oozieActionShareLibForSpark2</name>
+            <description>oozie action sharelib for spark 2.*</description>
+        </property>
+        <property>
+            <name>spark2ExtraListeners</name>
+            <value>com.cloudera.spark.lineage.NavigatorAppListener</value>
+            <description>spark 2.* extra listeners classname</description>
+        </property>
+        <property>
+            <name>spark2SqlQueryExecutionListeners</name>
+            <value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
+            <description>spark 2.* sql query execution listeners classname</description>
+        </property>
+        <property>
+            <name>spark2YarnHistoryServerAddress</name>
+            <description>spark 2.* yarn history server address</description>
+        </property>
+        <property>
+            <name>spark2EventLogDir</name>
+            <description>spark 2.* event log dir location</description>
+        </property>
    </parameters>

    <start to="deleteoutputpath"/>
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJobTest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJobTest.java
@ -28,6 +28,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;

 import eu.dnetlib.dhp.schema.action.AtomicAction;
 import eu.dnetlib.dhp.schema.oaf.Publication;
+import eu.dnetlib.dhp.schema.oaf.Result;

 public class SparkAtomicActionScoreJobTest {

@ -69,13 +70,13 @@ public class SparkAtomicActionScoreJobTest {
 	}

 	@Test
-	void matchOne() throws Exception {
+	void testMatch() throws Exception {
 		String bipScoresPath = getClass()
-			.getResource("/eu/dnetlib/dhp/actionmanager/bipfinder/bip_scores.json")
+			.getResource("/eu/dnetlib/dhp/actionmanager/bipfinder/bip_scores_oid.json")
 			.getPath();
 		String inputPath = getClass()
 			.getResource(
-				"/eu/dnetlib/dhp/actionmanager/bipfinder/publication.json")
+				"/eu/dnetlib/dhp/actionmanager/bipfinder/publicationoaid.json")
 			.getPath();

 		SparkAtomicActionScoreJob
@ -95,223 +96,48 @@ public class SparkAtomicActionScoreJobTest {

 		final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

-		JavaRDD<Publication> tmp = sc
+		JavaRDD<Result> tmp = sc
 			.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
 			.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
-			.map(aa -> ((Publication) aa.getPayload()));
+			.map(aa -> ((Result) aa.getPayload()));

-		assertEquals(1, tmp.count());
+		assertEquals(4, tmp.count());

-		Dataset<Publication> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(Publication.class));
-		verificationDataset.createOrReplaceTempView("publication");
+		Dataset<Result> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(Result.class));
+		verificationDataset.createOrReplaceTempView("result");

 		Dataset<Row> execVerification = spark
 			.sql(
-				"Select p.id oaid, mes.id, mUnit.value from publication p " +
+				"Select p.id oaid, mes.id, mUnit.value from result p " +
 					"lateral view explode(measures) m as mes " +
 					"lateral view explode(mes.unit) u as mUnit ");

-		Assertions.assertEquals(2, execVerification.count());
-
+		Assertions.assertEquals(12, execVerification.count());
 		Assertions
 			.assertEquals(
-				"50|355e65625b88::ffa5bad14f4adc0c9a15c00efbbccddb",
-				execVerification.select("oaid").collectAsList().get(0).getString(0));
-
-		Assertions
-			.assertEquals(
-				"1.47565045883e-08",
-				execVerification.filter("id = 'influence'").select("value").collectAsList().get(0).getString(0));
-
-		Assertions
-			.assertEquals(
-				"0.227515392",
-				execVerification.filter("id = 'popularity'").select("value").collectAsList().get(0).getString(0));
-
-	}
-
-	@Test
-	void matchOneWithTwo() throws Exception {
-		String bipScoresPath = getClass()
-			.getResource("/eu/dnetlib/dhp/actionmanager/bipfinder/bip_scores.json")
-			.getPath();
-		String inputPath = getClass()
-			.getResource(
-				"/eu/dnetlib/dhp/actionmanager/bipfinder/publication_2.json")
-			.getPath();
-
-		SparkAtomicActionScoreJob
-			.main(
-				new String[] {
-					"-isSparkSessionManaged",
-					Boolean.FALSE.toString(),
-					"-inputPath",
-					inputPath,
-					"-bipScorePath",
-					bipScoresPath,
-					"-resultTableName",
-					"eu.dnetlib.dhp.schema.oaf.Publication",
-					"-outputPath",
-					workingDir.toString() + "/actionSet"
-				});
-
-		final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
-
-		JavaRDD<Publication> tmp = sc
-			.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
-			.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
-			.map(aa -> ((Publication) aa.getPayload()));
-
-		assertEquals(1, tmp.count());
-
-		Dataset<Publication> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(Publication.class));
-		verificationDataset.createOrReplaceTempView("publication");
-
-		Dataset<Row> execVerification = spark
-			.sql(
-				"Select p.id oaid, mes.id, mUnit.value from publication p " +
-					"lateral view explode(measures) m as mes " +
-					"lateral view explode(mes.unit) u as mUnit ");
-
-		Assertions.assertEquals(4, execVerification.count());
-
-		Assertions
-			.assertEquals(
-				"50|355e65625b88::ffa5bad14f4adc0c9a15c00efbbccddb",
-				execVerification.select("oaid").collectAsList().get(0).getString(0));
-
-		Assertions
-			.assertEquals(
-				2,
-				execVerification.filter("id = 'influence'").count());
-
-		Assertions
-			.assertEquals(
-				2,
-				execVerification.filter("id = 'popularity'").count());
-
-		List<Row> tmp_ds = execVerification.filter("id = 'influence'").select("value").collectAsList();
-		String tmp_influence = tmp_ds.get(0).getString(0);
-		assertTrue(
-			"1.47565045883e-08".equals(tmp_influence) ||
-				"1.98956540239e-08".equals(tmp_influence));
-
-		tmp_influence = tmp_ds.get(1).getString(0);
-		assertTrue(
-			"1.47565045883e-08".equals(tmp_influence) ||
-				"1.98956540239e-08".equals(tmp_influence));
-
-		assertNotEquals(tmp_ds.get(1).getString(0), tmp_ds.get(0).getString(0));
-
-	}
-
-	@Test
-	void matchTwo() throws Exception {
-		String bipScoresPath = getClass()
-			.getResource("/eu/dnetlib/dhp/actionmanager/bipfinder/bip_scores.json")
-			.getPath();
-		String inputPath = getClass()
-			.getResource(
-				"/eu/dnetlib/dhp/actionmanager/bipfinder/publication_3.json")
-			.getPath();
-
-		SparkAtomicActionScoreJob
-			.main(
-				new String[] {
-					"-isSparkSessionManaged",
-					Boolean.FALSE.toString(),
-					"-inputPath",
-					inputPath,
-					"-bipScorePath",
-					bipScoresPath,
-					"-resultTableName",
-					"eu.dnetlib.dhp.schema.oaf.Publication",
-					"-outputPath",
-					workingDir.toString() + "/actionSet"
-				});
-
-		final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
-
-		JavaRDD<Publication> tmp = sc
-			.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
-			.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
-			.map(aa -> ((Publication) aa.getPayload()));
-
-		assertEquals(2, tmp.count());
-
-		Dataset<Publication> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(Publication.class));
-		verificationDataset.createOrReplaceTempView("publication");
-
-		Dataset<Row> execVerification = spark
-			.sql(
-				"Select p.id oaid, mes.id, mUnit.value from publication p " +
-					"lateral view explode(measures) m as mes " +
-					"lateral view explode(mes.unit) u as mUnit ");
-
-		Assertions.assertEquals(4, execVerification.count());
-
-		Assertions
-			.assertEquals(
-				2,
-				execVerification.filter("oaid = '50|355e65625b88::ffa5bad14f4adc0c9a15c00efbbccddb'").count());
-
-		Assertions
-			.assertEquals(
-				2,
-				execVerification.filter("oaid = '50|acm_________::faed5b7a1bd8f51118d13ed29cfaee09'").count());
-
-		Assertions
-			.assertEquals(
-				2,
-				execVerification.filter("id = 'influence'").count());
-
-		Assertions
-			.assertEquals(
-				2,
-				execVerification.filter("id = 'popularity'").count());
-
-		Assertions
-			.assertEquals(
-				"1.47565045883e-08",
-				execVerification
+				"6.63451994567e-09", execVerification
 					.filter(
-						"oaid = '50|355e65625b88::ffa5bad14f4adc0c9a15c00efbbccddb' " +
+						"oaid='50|arXiv_dedup_::4a2d5fd8d71daec016c176ec71d957b1' " +
 							"and id = 'influence'")
 					.select("value")
 					.collectAsList()
 					.get(0)
 					.getString(0));
-
 		Assertions
 			.assertEquals(
-				"1.98956540239e-08",
-				execVerification
+				"0.348694533145", execVerification
 					.filter(
-						"oaid = '50|acm_________::faed5b7a1bd8f51118d13ed29cfaee09' " +
-							"and id = 'influence'")
+						"oaid='50|arXiv_dedup_::4a2d5fd8d71daec016c176ec71d957b1' " +
+							"and id = 'popularity_alt'")
 					.select("value")
 					.collectAsList()
 					.get(0)
 					.getString(0));
-
 		Assertions
 			.assertEquals(
-				"0.282046161584",
-				execVerification
+				"2.16094680115e-09", execVerification
 					.filter(
-						"oaid = '50|acm_________::faed5b7a1bd8f51118d13ed29cfaee09' " +
-							"and id = 'popularity'")
-					.select("value")
-					.collectAsList()
-					.get(0)
-					.getString(0));
-
-		Assertions
-			.assertEquals(
-				"0.227515392",
-				execVerification
-					.filter(
-						"oaid = '50|355e65625b88::ffa5bad14f4adc0c9a15c00efbbccddb' " +
+						"oaid='50|arXiv_dedup_::4a2d5fd8d71daec016c176ec71d957b1' " +
 							"and id = 'popularity'")
 					.select("value")
 					.collectAsList()
--- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipfinder/bip_scores_oid.json
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipfinder/bip_scores_oid.json
@ -0,0 +1,4 @@
+{"50|arXiv_dedup_::4a2d5fd8d71daec016c176ec71d957b1": [{"id": "influence", "unit": [{"value": "6.63451994567e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.348694533145", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "2.16094680115e-09", "key": "score"}]}]}
+{"50|dedup_wf_001::05b1f8ce98702f69d07aa5f0429de1e3": [{"id": "influence", "unit": [{"value": "6.25057357279e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "7.0208", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "2.40234462244e-08", "key": "score"}]}]}
+{"50|dedup_wf_001::08823c8f5c3ca2eae523817036cdda67": [{"id": "influence", "unit": [{"value": "5.54921449123e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "3.53012887452e-10", "key": "score"}]}]}
+{"50|dedup_wf_001::0e72b399325d6efcbe3271891a1dfe4c": [{"id": "influence", "unit": [{"value": "1.63466096315e-08", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "20.9870879741", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "5.49501495323e-08", "key": "score"}]}]}
--- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipfinder/publicationoaid.json
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipfinder/publicationoaid.json