-

2021-12-14 11:12:17 +01:00 · 2021-12-14 11:12:17 +01:00 · 4eb8276493
parent 936578aaf1
commit 4eb8276493
8 changed files with 185 additions and 252 deletions
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/Constants.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/Constants.java
@ -0,0 +1,49 @@
 package eu.dnetlib.dhp.actionmanager.bipfinder;
 import java.util.Optional;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SparkSession;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 /**
  * Constants and static helpers shared by the bipfinder action-manager jobs. The class only exposes
  * static members and cannot be instantiated.
  */
 public class Constants {
 	public static final String DOI = "doi";
 	public static final String UPDATE_DATA_INFO_TYPE = "update";
 	public static final String UPDATE_SUBJECT_FOS_CLASS_ID = "subject:fos";
 	public static final String UPDATE_CLASS_NAME = "Inferred by OpenAIRE";
 	public static final String UPDATE_MEASURE_BIP_CLASS_ID = "measure:bip";
 	public static final String FOS_CLASS_ID = "FOS";
 	public static final String FOS_CLASS_NAME = "Fields of Science and Technology classification";
 	public static final String NULL = "NULL";
 	/** Single mapper instance used by {@link #readPath} to parse every input line. */
 	public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
 	// static-only holder: no instances
 	private Constants() {
 	}
 	/**
 	 * Reads the {@code isSparkSessionManaged} argument from the parser.
 	 *
 	 * @param parser the parsed job arguments
 	 * @return the argument converted with {@link Boolean#valueOf(String)}, or {@link Boolean#TRUE} when the
 	 *         parser yields {@code null} for it
 	 */
 	public static Boolean isSparkSessionManaged(ArgumentApplicationParser parser) {
 		final String flag = parser.get("isSparkSessionManaged");
 		return flag == null ? Boolean.TRUE : Boolean.valueOf(flag);
 	}
 	/**
 	 * Loads a text file and turns each of its lines into an instance of {@code clazz}.
 	 *
 	 * @param spark the session used to read the file
 	 * @param inputPath location of the text file, one serialized record per line
 	 * @param clazz bean class each line is deserialized into via {@link #OBJECT_MAPPER}
 	 * @param <R> type of the produced records
 	 * @return a dataset of {@code clazz} instances encoded with a bean encoder
 	 */
 	public static <R> Dataset<R> readPath(
 		SparkSession spark, String inputPath, Class<R> clazz) {
 		final Dataset<String> lines = spark.read().textFile(inputPath);
 		final MapFunction<String, R> parseLine = line -> OBJECT_MAPPER.readValue(line, clazz);
 		return lines.map(parseLine, Encoders.bean(clazz));
 	}
 }
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/PreparedResult.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/PreparedResult.java
@ -1,28 +0,0 @@
 package eu.dnetlib.dhp.actionmanager.bipfinder;
 import java.io.Serializable;
 /**
  * Minimal projection of a generic result: only the two pieces of information that are needed to
  * create the atomic action. Both properties are plain strings and start out as {@code null}.
  */
 public class PreparedResult implements Serializable {
 	private String id; // openaire id
 	private String value; // doi
 	/** @return the openaire id, or {@code null} if it was never set */
 	public String getId() {
 		return this.id;
 	}
 	/** @return the doi, or {@code null} if it was never set */
 	public String getValue() {
 		return this.value;
 	}
 	/** @param id the openaire id to store */
 	public void setId(final String id) {
 		this.id = id;
 	}
 	/** @param value the doi to store */
 	public void setValue(final String value) {
 		this.value = value;
 	}
 }
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java
@ -1,6 +1,7 @@
 package eu.dnetlib.dhp.actionmanager.bipfinder;
 import static eu.dnetlib.dhp.actionmanager.createunresolvedentities.Constants.*;
 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
 import java.io.Serializable;
@ -18,6 +19,7 @@ import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.api.java.function.MapGroupsFunction;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.Row;
 import org.apache.spark.sql.SparkSession;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@ -27,8 +29,10 @@ import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.common.HdfsSupport;
 import eu.dnetlib.dhp.schema.action.AtomicAction;
 import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.oaf.*;
 import eu.dnetlib.dhp.schema.oaf.KeyValue;
 import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
 import scala.Tuple2;
 /**
@ -105,32 +109,29 @@ public class SparkAtomicActionScoreJob implements Serializable {
 		results.createOrReplaceTempView("result");
-		Dataset<PreparedResult> preparedResult = spark
+		Dataset<Row> preparedResult = spark
 			.sql(
-				"select pIde.value value, id " +
+				"select id " +
 					"from result " +
-					"lateral view explode (pid) p as pIde " +
+
-					"where dataInfo.deletedbyinference = false and pIde.qualifier.classid = '" + DOI + "'")
+					"where dataInfo.deletedbyinference = false ");
 			.as(Encoders.bean(PreparedResult.class));
 		bipScores
 			.joinWith(
-				preparedResult, bipScores.col("id").equalTo(preparedResult.col("value")),
+				preparedResult, bipScores.col("id").equalTo(preparedResult.col("id")),
 				"inner")
-			.map((MapFunction<Tuple2<BipScore, PreparedResult>, BipScore>) value -> {
+			.map((MapFunction<Tuple2<BipScore, Row>, BipScore>) value -> {
 				BipScore ret = value._1();
-				ret.setId(value._2().getId());
+				ret.setId(value._1().getId());
 				return ret;
 			}, Encoders.bean(BipScore.class))
 			.groupByKey((MapFunction<BipScore, String>) BipScore::getId, Encoders.STRING())
 			.mapGroups((MapGroupsFunction<String, BipScore, Result>) (k, it) -> {
 				Result ret = new Result();
 				ret.setDataInfo(getDataInfo());
 				BipScore first = it.next();
 				ret.setId(first.getId());
-				ret.setMeasures(getMeasure(first));
+			.map((MapFunction<BipScore, Result>) bs -> {
-				it.forEachRemaining(value -> ret.getMeasures().addAll(getMeasure(value)));
+				Result ret = new Result();
 				ret.setId(bs.getId());
 				ret.setMeasures(getMeasure(bs));
 				return ret;
 			}, Encoders.bean(Result.class))
@ -159,7 +160,21 @@ public class SparkAtomicActionScoreJob implements Serializable {
 								KeyValue kv = new KeyValue();
 								kv.setValue(unit.getValue());
 								kv.setKey(unit.getKey());
-								kv.setDataInfo(getDataInfo());
+								kv
 									.setDataInfo(
 										OafMapperUtils
 											.dataInfo(
 												false,
 												UPDATE_DATA_INFO_TYPE,
 												true,
 												false,
 												OafMapperUtils
 													.qualifier(
 														UPDATE_MEASURE_BIP_CLASS_ID,
 														UPDATE_CLASS_NAME,
 														ModelConstants.DNET_PROVENANCE_ACTIONS,
 														ModelConstants.DNET_PROVENANCE_ACTIONS),
 												""));
 								return kv;
 							})
 							.collect(Collectors.toList()));
@ -168,21 +183,6 @@ public class SparkAtomicActionScoreJob implements Serializable {
 			.collect(Collectors.toList());
 	}
 	/**
 	 * Creates the {@link DataInfo} attached to the produced records: inferred, invisible and
 	 * deletedbyinference are all {@code false}, trust is the empty string, and the provenance action is the
 	 * {@code sysimport:actionset} / {@code Harvested} qualifier of the {@code dnet:provenanceActions} scheme.
 	 *
 	 * @return a freshly allocated DataInfo on every call
 	 */
 	private static DataInfo getDataInfo() {
 		// provenance qualifier first, so the DataInfo can be filled in one pass
 		final Qualifier provenance = new Qualifier();
 		provenance.setSchemeid("dnet:provenanceActions");
 		provenance.setSchemename("dnet:provenanceActions");
 		provenance.setClassid("sysimport:actionset");
 		provenance.setClassname("Harvested");
 		final DataInfo info = new DataInfo();
 		info.setProvenanceaction(provenance);
 		info.setTrust("");
 		info.setDeletedbyinference(false);
 		info.setInvisible(false);
 		info.setInferred(false);
 		return info;
 	}
 	/**
 	 * Hands {@code path} to {@code HdfsSupport.remove} together with the Hadoop configuration taken from the
 	 * Spark context of the given session.
 	 * NOTE(review): presumably this deletes a previous output directory so the job can write it again —
 	 * confirm against HdfsSupport.
 	 *
 	 * @param spark session whose Spark context supplies the Hadoop configuration
 	 * @param path location passed to {@code HdfsSupport.remove}
 	 */
 	private static void removeOutputDir(SparkSession spark, String path) {
 		HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
 	}
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/oozie_app/config-default.xml
@ -0,0 +1,30 @@
 <configuration>
    <property>
        <name>jobTracker</name>
        <value>yarnRM</value>
    </property>
    <property>
        <name>nameNode</name>
        <value>hdfs://nameservice1</value>
    </property>
    <property>
        <name>oozie.use.system.libpath</name>
        <value>true</value>
    </property>
    <property>
        <name>hiveMetastoreUris</name>
        <value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
    </property>
    <property>
        <name>hiveJdbcUrl</name>
        <value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000</value>
    </property>
    <property>
        <name>hiveDbName</name>
        <value>openaire</value>
    </property>
    <property>
        <name>oozie.launcher.mapreduce.user.classpath.first</name>
        <value>true</value>
    </property>
 </configuration>
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/oozie_app/workflow.xml
@ -13,6 +13,40 @@
            <name>outputPath</name>
            <description>the path where to store the actionset</description>
        </property>
        <property>
            <name>sparkDriverMemory</name>
            <description>memory for driver process</description>
        </property>
        <property>
            <name>sparkExecutorMemory</name>
            <description>memory for individual executor</description>
        </property>
        <property>
            <name>sparkExecutorCores</name>
            <description>number of cores used by single executor</description>
        </property>
        <property>
            <name>oozieActionShareLibForSpark2</name>
            <description>oozie action sharelib for spark 2.*</description>
        </property>
        <property>
            <name>spark2ExtraListeners</name>
            <value>com.cloudera.spark.lineage.NavigatorAppListener</value>
            <description>spark 2.* extra listeners classname</description>
        </property>
        <property>
            <name>spark2SqlQueryExecutionListeners</name>
            <value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
            <description>spark 2.* sql query execution listeners classname</description>
        </property>
        <property>
            <name>spark2YarnHistoryServerAddress</name>
            <description>spark 2.* yarn history server address</description>
        </property>
        <property>
            <name>spark2EventLogDir</name>
            <description>spark 2.* event log dir location</description>
        </property>
    </parameters>
    <start to="deleteoutputpath"/>
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJobTest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJobTest.java
@ -28,6 +28,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.schema.action.AtomicAction;
 import eu.dnetlib.dhp.schema.oaf.Publication;
 import eu.dnetlib.dhp.schema.oaf.Result;
 public class SparkAtomicActionScoreJobTest {
@ -69,13 +70,13 @@ public class SparkAtomicActionScoreJobTest {
 	}
 	@Test
-	void matchOne() throws Exception {
+	void testMatch() throws Exception {
 		String bipScoresPath = getClass()
-			.getResource("/eu/dnetlib/dhp/actionmanager/bipfinder/bip_scores.json")
+			.getResource("/eu/dnetlib/dhp/actionmanager/bipfinder/bip_scores_oid.json")
 			.getPath();
 		String inputPath = getClass()
 			.getResource(
-				"/eu/dnetlib/dhp/actionmanager/bipfinder/publication.json")
+				"/eu/dnetlib/dhp/actionmanager/bipfinder/publicationoaid.json")
 			.getPath();
 		SparkAtomicActionScoreJob
@ -95,223 +96,48 @@ public class SparkAtomicActionScoreJobTest {
 		final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
-		JavaRDD<Publication> tmp = sc
+		JavaRDD<Result> tmp = sc
 			.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
 			.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
-			.map(aa -> ((Publication) aa.getPayload()));
+			.map(aa -> ((Result) aa.getPayload()));
-		assertEquals(1, tmp.count());
+		assertEquals(4, tmp.count());
-		Dataset<Publication> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(Publication.class));
+		Dataset<Result> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(Result.class));
-		verificationDataset.createOrReplaceTempView("publication");
+		verificationDataset.createOrReplaceTempView("result");
 		Dataset<Row> execVerification = spark
 			.sql(
-				"Select p.id oaid, mes.id, mUnit.value from publication p " +
+				"Select p.id oaid, mes.id, mUnit.value from result p " +
 					"lateral view explode(measures) m as mes " +
 					"lateral view explode(mes.unit) u as mUnit ");
-		Assertions.assertEquals(2, execVerification.count());
+		Assertions.assertEquals(12, execVerification.count());
 		Assertions
 			.assertEquals(
-				"50|355e65625b88::ffa5bad14f4adc0c9a15c00efbbccddb",
+				"6.63451994567e-09", execVerification
 				execVerification.select("oaid").collectAsList().get(0).getString(0));
 		Assertions
 			.assertEquals(
 				"1.47565045883e-08",
 				execVerification.filter("id = 'influence'").select("value").collectAsList().get(0).getString(0));
 		Assertions
 			.assertEquals(
 				"0.227515392",
 				execVerification.filter("id = 'popularity'").select("value").collectAsList().get(0).getString(0));
 	}
 	/**
 	 * Runs the job on {@code bip_scores.json} and {@code publication_2.json} and verifies the action set it
 	 * writes: exactly one Publication payload whose measures explode to four unit rows, two for
 	 * {@code influence} and two for {@code popularity}.
 	 * NOTE(review): the name suggests a single publication matched by two score records — confirm against
 	 * the fixture files.
 	 */
 	@Test
 	void matchOneWithTwo() throws Exception {
 		// test fixtures are resolved from the classpath
 		String bipScoresPath = getClass()
 			.getResource("/eu/dnetlib/dhp/actionmanager/bipfinder/bip_scores.json")
 			.getPath();
 		String inputPath = getClass()
 			.getResource(
 				"/eu/dnetlib/dhp/actionmanager/bipfinder/publication_2.json")
 			.getPath();
 		// run the job with isSparkSessionManaged=false; output goes to <workingDir>/actionSet
 		SparkAtomicActionScoreJob
 			.main(
 				new String[] {
 					"-isSparkSessionManaged",
 					Boolean.FALSE.toString(),
 					"-inputPath",
 					inputPath,
 					"-bipScorePath",
 					bipScoresPath,
 					"-resultTableName",
 					"eu.dnetlib.dhp.schema.oaf.Publication",
 					"-outputPath",
 					workingDir.toString() + "/actionSet"
 				});
 		final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
 		// read the sequence file back: each value is parsed as an AtomicAction and its payload cast to Publication
 		JavaRDD<Publication> tmp = sc
 			.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
 			.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
 			.map(aa -> ((Publication) aa.getPayload()));
 		assertEquals(1, tmp.count());
 		Dataset<Publication> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(Publication.class));
 		// the temp view must be registered before the SQL below can reference "publication"
 		verificationDataset.createOrReplaceTempView("publication");
 		// one row per (publication, measure, unit): columns oaid, id (measure id) and value (unit value)
 		Dataset<Row> execVerification = spark
 			.sql(
 				"Select p.id oaid, mes.id, mUnit.value from publication p " +
 					"lateral view explode(measures) m as mes " +
 					"lateral view explode(mes.unit) u as mUnit ");
 		Assertions.assertEquals(4, execVerification.count());
 		Assertions
 			.assertEquals(
 				"50|355e65625b88::ffa5bad14f4adc0c9a15c00efbbccddb",
 				execVerification.select("oaid").collectAsList().get(0).getString(0));
 		Assertions
 			.assertEquals(
 				2,
 				execVerification.filter("id = 'influence'").count());
 		Assertions
 			.assertEquals(
 				2,
 				execVerification.filter("id = 'popularity'").count());
 		// the two influence values may come back in either order, so each is checked against both
 		// candidates and then the two are required to differ
 		List<Row> tmp_ds = execVerification.filter("id = 'influence'").select("value").collectAsList();
 		String tmp_influence = tmp_ds.get(0).getString(0);
 		assertTrue(
 			"1.47565045883e-08".equals(tmp_influence) ||
 				"1.98956540239e-08".equals(tmp_influence));
 		tmp_influence = tmp_ds.get(1).getString(0);
 		assertTrue(
 			"1.47565045883e-08".equals(tmp_influence) ||
 				"1.98956540239e-08".equals(tmp_influence));
 		assertNotEquals(tmp_ds.get(1).getString(0), tmp_ds.get(0).getString(0));
 	}
 	@Test
 	void matchTwo() throws Exception {
 		String bipScoresPath = getClass()
 			.getResource("/eu/dnetlib/dhp/actionmanager/bipfinder/bip_scores.json")
 			.getPath();
 		String inputPath = getClass()
 			.getResource(
 				"/eu/dnetlib/dhp/actionmanager/bipfinder/publication_3.json")
 			.getPath();
 		SparkAtomicActionScoreJob
 			.main(
 				new String[] {
 					"-isSparkSessionManaged",
 					Boolean.FALSE.toString(),
 					"-inputPath",
 					inputPath,
 					"-bipScorePath",
 					bipScoresPath,
 					"-resultTableName",
 					"eu.dnetlib.dhp.schema.oaf.Publication",
 					"-outputPath",
 					workingDir.toString() + "/actionSet"
 				});
 		final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
 		JavaRDD<Publication> tmp = sc
 			.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
 			.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
 			.map(aa -> ((Publication) aa.getPayload()));
 		assertEquals(2, tmp.count());
 		Dataset<Publication> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(Publication.class));
 		verificationDataset.createOrReplaceTempView("publication");
 		Dataset<Row> execVerification = spark
 			.sql(
 				"Select p.id oaid, mes.id, mUnit.value from publication p " +
 					"lateral view explode(measures) m as mes " +
 					"lateral view explode(mes.unit) u as mUnit ");
 		Assertions.assertEquals(4, execVerification.count());
 		Assertions
 			.assertEquals(
 				2,
 				execVerification.filter("oaid = '50|355e65625b88::ffa5bad14f4adc0c9a15c00efbbccddb'").count());
 		Assertions
 			.assertEquals(
 				2,
 				execVerification.filter("oaid = '50|acm_________::faed5b7a1bd8f51118d13ed29cfaee09'").count());
 		Assertions
 			.assertEquals(
 				2,
 				execVerification.filter("id = 'influence'").count());
 		Assertions
 			.assertEquals(
 				2,
 				execVerification.filter("id = 'popularity'").count());
 		Assertions
 			.assertEquals(
 				"1.47565045883e-08",
 				execVerification
 					.filter(
-						"oaid = '50|355e65625b88::ffa5bad14f4adc0c9a15c00efbbccddb' " +
+						"oaid='50|arXiv_dedup_::4a2d5fd8d71daec016c176ec71d957b1' " +
 							"and id = 'influence'")
 					.select("value")
 					.collectAsList()
 					.get(0)
 					.getString(0));
 		Assertions
 			.assertEquals(
-				"1.98956540239e-08",
+				"0.348694533145", execVerification
 				execVerification
 					.filter(
-						"oaid = '50|acm_________::faed5b7a1bd8f51118d13ed29cfaee09' " +
+						"oaid='50|arXiv_dedup_::4a2d5fd8d71daec016c176ec71d957b1' " +
-							"and id = 'influence'")
+							"and id = 'popularity_alt'")
 					.select("value")
 					.collectAsList()
 					.get(0)
 					.getString(0));
 		Assertions
 			.assertEquals(
-				"0.282046161584",
+				"2.16094680115e-09", execVerification
 				execVerification
 					.filter(
-						"oaid = '50|acm_________::faed5b7a1bd8f51118d13ed29cfaee09' " +
+						"oaid='50|arXiv_dedup_::4a2d5fd8d71daec016c176ec71d957b1' " +
 							"and id = 'popularity'")
 					.select("value")
 					.collectAsList()
 					.get(0)
 					.getString(0));
 		Assertions
 			.assertEquals(
 				"0.227515392",
 				execVerification
 					.filter(
 						"oaid = '50|355e65625b88::ffa5bad14f4adc0c9a15c00efbbccddb' " +
 							"and id = 'popularity'")
 					.select("value")
 					.collectAsList()
--- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipfinder/bip_scores_oid.json
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipfinder/bip_scores_oid.json
@ -0,0 +1,4 @@
 {"50|arXiv_dedup_::4a2d5fd8d71daec016c176ec71d957b1": [{"id": "influence", "unit": [{"value": "6.63451994567e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.348694533145", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "2.16094680115e-09", "key": "score"}]}]}
 {"50|dedup_wf_001::05b1f8ce98702f69d07aa5f0429de1e3": [{"id": "influence", "unit": [{"value": "6.25057357279e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "7.0208", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "2.40234462244e-08", "key": "score"}]}]}
 {"50|dedup_wf_001::08823c8f5c3ca2eae523817036cdda67": [{"id": "influence", "unit": [{"value": "5.54921449123e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "3.53012887452e-10", "key": "score"}]}]}
 {"50|dedup_wf_001::0e72b399325d6efcbe3271891a1dfe4c": [{"id": "influence", "unit": [{"value": "1.63466096315e-08", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "20.9870879741", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "5.49501495323e-08", "key": "score"}]}]}
--- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipfinder/publicationoaid.json
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipfinder/publicationoaid.json