forked from D-Net/dnet-hadoop
This commit is contained in:
parent
936578aaf1
commit
4eb8276493
|
@ -0,0 +1,49 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.actionmanager.bipfinder;
|
||||||
|
|
||||||
|
import java.util.Optional;
|
||||||
|
|
||||||
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
|
import org.apache.spark.sql.Dataset;
|
||||||
|
import org.apache.spark.sql.Encoders;
|
||||||
|
import org.apache.spark.sql.SparkSession;
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
|
||||||
|
public class Constants {
|
||||||
|
|
||||||
|
public static final String DOI = "doi";
|
||||||
|
|
||||||
|
public static final String UPDATE_DATA_INFO_TYPE = "update";
|
||||||
|
public static final String UPDATE_SUBJECT_FOS_CLASS_ID = "subject:fos";
|
||||||
|
public static final String UPDATE_CLASS_NAME = "Inferred by OpenAIRE";
|
||||||
|
public static final String UPDATE_MEASURE_BIP_CLASS_ID = "measure:bip";
|
||||||
|
|
||||||
|
public static final String FOS_CLASS_ID = "FOS";
|
||||||
|
public static final String FOS_CLASS_NAME = "Fields of Science and Technology classification";
|
||||||
|
|
||||||
|
public static final String NULL = "NULL";
|
||||||
|
|
||||||
|
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||||
|
|
||||||
|
private Constants() {
|
||||||
|
}
|
||||||
|
|
||||||
|
public static Boolean isSparkSessionManaged(ArgumentApplicationParser parser) {
|
||||||
|
return Optional
|
||||||
|
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||||
|
.map(Boolean::valueOf)
|
||||||
|
.orElse(Boolean.TRUE);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static <R> Dataset<R> readPath(
|
||||||
|
SparkSession spark, String inputPath, Class<R> clazz) {
|
||||||
|
return spark
|
||||||
|
.read()
|
||||||
|
.textFile(inputPath)
|
||||||
|
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -1,28 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.actionmanager.bipfinder;
|
|
||||||
|
|
||||||
import java.io.Serializable;
|
|
||||||
|
|
||||||
/**
 * Subset of the information of a generic result required to create the atomic action.
 */
public class PreparedResult implements Serializable {

	/** The OpenAIRE identifier. */
	private String id;

	/** The DOI. */
	private String value;

	/**
	 * @return the OpenAIRE identifier, or {@code null} when it was never set
	 */
	public String getId() {
		return this.id;
	}

	/**
	 * @param openaireId the OpenAIRE identifier to store
	 */
	public void setId(final String openaireId) {
		this.id = openaireId;
	}

	/**
	 * @return the DOI, or {@code null} when it was never set
	 */
	public String getValue() {
		return this.value;
	}

	/**
	 * @param doi the DOI to store
	 */
	public void setValue(final String doi) {
		this.value = doi;
	}
}
|
|
|
@ -1,6 +1,7 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.actionmanager.bipfinder;
|
package eu.dnetlib.dhp.actionmanager.bipfinder;
|
||||||
|
|
||||||
|
import static eu.dnetlib.dhp.actionmanager.createunresolvedentities.Constants.*;
|
||||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
|
@ -18,6 +19,7 @@ import org.apache.spark.api.java.function.MapFunction;
|
||||||
import org.apache.spark.api.java.function.MapGroupsFunction;
|
import org.apache.spark.api.java.function.MapGroupsFunction;
|
||||||
import org.apache.spark.sql.Dataset;
|
import org.apache.spark.sql.Dataset;
|
||||||
import org.apache.spark.sql.Encoders;
|
import org.apache.spark.sql.Encoders;
|
||||||
|
import org.apache.spark.sql.Row;
|
||||||
import org.apache.spark.sql.SparkSession;
|
import org.apache.spark.sql.SparkSession;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
@ -27,8 +29,10 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||||
import eu.dnetlib.dhp.schema.action.AtomicAction;
|
import eu.dnetlib.dhp.schema.action.AtomicAction;
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -105,32 +109,29 @@ public class SparkAtomicActionScoreJob implements Serializable {
|
||||||
|
|
||||||
results.createOrReplaceTempView("result");
|
results.createOrReplaceTempView("result");
|
||||||
|
|
||||||
Dataset<PreparedResult> preparedResult = spark
|
Dataset<Row> preparedResult = spark
|
||||||
.sql(
|
.sql(
|
||||||
"select pIde.value value, id " +
|
"select id " +
|
||||||
"from result " +
|
"from result " +
|
||||||
"lateral view explode (pid) p as pIde " +
|
|
||||||
"where dataInfo.deletedbyinference = false and pIde.qualifier.classid = '" + DOI + "'")
|
"where dataInfo.deletedbyinference = false ");
|
||||||
.as(Encoders.bean(PreparedResult.class));
|
|
||||||
|
|
||||||
bipScores
|
bipScores
|
||||||
.joinWith(
|
.joinWith(
|
||||||
preparedResult, bipScores.col("id").equalTo(preparedResult.col("value")),
|
preparedResult, bipScores.col("id").equalTo(preparedResult.col("id")),
|
||||||
"inner")
|
"inner")
|
||||||
.map((MapFunction<Tuple2<BipScore, PreparedResult>, BipScore>) value -> {
|
.map((MapFunction<Tuple2<BipScore, Row>, BipScore>) value -> {
|
||||||
BipScore ret = value._1();
|
BipScore ret = value._1();
|
||||||
ret.setId(value._2().getId());
|
ret.setId(value._1().getId());
|
||||||
return ret;
|
return ret;
|
||||||
}, Encoders.bean(BipScore.class))
|
}, Encoders.bean(BipScore.class))
|
||||||
.groupByKey((MapFunction<BipScore, String>) BipScore::getId, Encoders.STRING())
|
|
||||||
.mapGroups((MapGroupsFunction<String, BipScore, Result>) (k, it) -> {
|
|
||||||
Result ret = new Result();
|
|
||||||
ret.setDataInfo(getDataInfo());
|
|
||||||
BipScore first = it.next();
|
|
||||||
ret.setId(first.getId());
|
|
||||||
|
|
||||||
ret.setMeasures(getMeasure(first));
|
.map((MapFunction<BipScore, Result>) bs -> {
|
||||||
it.forEachRemaining(value -> ret.getMeasures().addAll(getMeasure(value)));
|
Result ret = new Result();
|
||||||
|
|
||||||
|
ret.setId(bs.getId());
|
||||||
|
|
||||||
|
ret.setMeasures(getMeasure(bs));
|
||||||
|
|
||||||
return ret;
|
return ret;
|
||||||
}, Encoders.bean(Result.class))
|
}, Encoders.bean(Result.class))
|
||||||
|
@ -159,7 +160,21 @@ public class SparkAtomicActionScoreJob implements Serializable {
|
||||||
KeyValue kv = new KeyValue();
|
KeyValue kv = new KeyValue();
|
||||||
kv.setValue(unit.getValue());
|
kv.setValue(unit.getValue());
|
||||||
kv.setKey(unit.getKey());
|
kv.setKey(unit.getKey());
|
||||||
kv.setDataInfo(getDataInfo());
|
kv
|
||||||
|
.setDataInfo(
|
||||||
|
OafMapperUtils
|
||||||
|
.dataInfo(
|
||||||
|
false,
|
||||||
|
UPDATE_DATA_INFO_TYPE,
|
||||||
|
true,
|
||||||
|
false,
|
||||||
|
OafMapperUtils
|
||||||
|
.qualifier(
|
||||||
|
UPDATE_MEASURE_BIP_CLASS_ID,
|
||||||
|
UPDATE_CLASS_NAME,
|
||||||
|
ModelConstants.DNET_PROVENANCE_ACTIONS,
|
||||||
|
ModelConstants.DNET_PROVENANCE_ACTIONS),
|
||||||
|
""));
|
||||||
return kv;
|
return kv;
|
||||||
})
|
})
|
||||||
.collect(Collectors.toList()));
|
.collect(Collectors.toList()));
|
||||||
|
@ -168,21 +183,6 @@ public class SparkAtomicActionScoreJob implements Serializable {
|
||||||
.collect(Collectors.toList());
|
.collect(Collectors.toList());
|
||||||
}
|
}
|
||||||
|
|
||||||
private static DataInfo getDataInfo() {
|
|
||||||
DataInfo di = new DataInfo();
|
|
||||||
di.setInferred(false);
|
|
||||||
di.setInvisible(false);
|
|
||||||
di.setDeletedbyinference(false);
|
|
||||||
di.setTrust("");
|
|
||||||
Qualifier qualifier = new Qualifier();
|
|
||||||
qualifier.setClassid("sysimport:actionset");
|
|
||||||
qualifier.setClassname("Harvested");
|
|
||||||
qualifier.setSchemename("dnet:provenanceActions");
|
|
||||||
qualifier.setSchemeid("dnet:provenanceActions");
|
|
||||||
di.setProvenanceaction(qualifier);
|
|
||||||
return di;
|
|
||||||
}
|
|
||||||
|
|
||||||
private static void removeOutputDir(SparkSession spark, String path) {
|
private static void removeOutputDir(SparkSession spark, String path) {
|
||||||
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
|
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,30 @@
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>jobTracker</name>
|
||||||
|
<value>yarnRM</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>nameNode</name>
|
||||||
|
<value>hdfs://nameservice1</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.use.system.libpath</name>
|
||||||
|
<value>true</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>hiveMetastoreUris</name>
|
||||||
|
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>hiveJdbcUrl</name>
|
||||||
|
<value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>hiveDbName</name>
|
||||||
|
<value>openaire</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||||
|
<value>true</value>
|
||||||
|
</property>
|
||||||
|
</configuration>
|
|
@ -13,6 +13,40 @@
|
||||||
<name>outputPath</name>
|
<name>outputPath</name>
|
||||||
<description>the path where to store the actionset</description>
|
<description>the path where to store the actionset</description>
|
||||||
</property>
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>sparkDriverMemory</name>
|
||||||
|
<description>memory for driver process</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>sparkExecutorMemory</name>
|
||||||
|
<description>memory for individual executor</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>sparkExecutorCores</name>
|
||||||
|
<description>number of cores used by single executor</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozieActionShareLibForSpark2</name>
|
||||||
|
<description>oozie action sharelib for spark 2.*</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>spark2ExtraListeners</name>
|
||||||
|
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
|
||||||
|
<description>spark 2.* extra listeners classname</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>spark2SqlQueryExecutionListeners</name>
|
||||||
|
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
|
||||||
|
<description>spark 2.* sql query execution listeners classname</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>spark2YarnHistoryServerAddress</name>
|
||||||
|
<description>spark 2.* yarn history server address</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>spark2EventLogDir</name>
|
||||||
|
<description>spark 2.* event log dir location</description>
|
||||||
|
</property>
|
||||||
</parameters>
|
</parameters>
|
||||||
|
|
||||||
<start to="deleteoutputpath"/>
|
<start to="deleteoutputpath"/>
|
||||||
|
|
|
@ -28,6 +28,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.action.AtomicAction;
|
import eu.dnetlib.dhp.schema.action.AtomicAction;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
|
|
||||||
public class SparkAtomicActionScoreJobTest {
|
public class SparkAtomicActionScoreJobTest {
|
||||||
|
|
||||||
|
@ -69,13 +70,13 @@ public class SparkAtomicActionScoreJobTest {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void matchOne() throws Exception {
|
void testMatch() throws Exception {
|
||||||
String bipScoresPath = getClass()
|
String bipScoresPath = getClass()
|
||||||
.getResource("/eu/dnetlib/dhp/actionmanager/bipfinder/bip_scores.json")
|
.getResource("/eu/dnetlib/dhp/actionmanager/bipfinder/bip_scores_oid.json")
|
||||||
.getPath();
|
.getPath();
|
||||||
String inputPath = getClass()
|
String inputPath = getClass()
|
||||||
.getResource(
|
.getResource(
|
||||||
"/eu/dnetlib/dhp/actionmanager/bipfinder/publication.json")
|
"/eu/dnetlib/dhp/actionmanager/bipfinder/publicationoaid.json")
|
||||||
.getPath();
|
.getPath();
|
||||||
|
|
||||||
SparkAtomicActionScoreJob
|
SparkAtomicActionScoreJob
|
||||||
|
@ -95,223 +96,48 @@ public class SparkAtomicActionScoreJobTest {
|
||||||
|
|
||||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||||
|
|
||||||
JavaRDD<Publication> tmp = sc
|
JavaRDD<Result> tmp = sc
|
||||||
.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
|
.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
|
||||||
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
|
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
|
||||||
.map(aa -> ((Publication) aa.getPayload()));
|
.map(aa -> ((Result) aa.getPayload()));
|
||||||
|
|
||||||
assertEquals(1, tmp.count());
|
assertEquals(4, tmp.count());
|
||||||
|
|
||||||
Dataset<Publication> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(Publication.class));
|
Dataset<Result> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(Result.class));
|
||||||
verificationDataset.createOrReplaceTempView("publication");
|
verificationDataset.createOrReplaceTempView("result");
|
||||||
|
|
||||||
Dataset<Row> execVerification = spark
|
Dataset<Row> execVerification = spark
|
||||||
.sql(
|
.sql(
|
||||||
"Select p.id oaid, mes.id, mUnit.value from publication p " +
|
"Select p.id oaid, mes.id, mUnit.value from result p " +
|
||||||
"lateral view explode(measures) m as mes " +
|
"lateral view explode(measures) m as mes " +
|
||||||
"lateral view explode(mes.unit) u as mUnit ");
|
"lateral view explode(mes.unit) u as mUnit ");
|
||||||
|
|
||||||
Assertions.assertEquals(2, execVerification.count());
|
Assertions.assertEquals(12, execVerification.count());
|
||||||
|
|
||||||
Assertions
|
Assertions
|
||||||
.assertEquals(
|
.assertEquals(
|
||||||
"50|355e65625b88::ffa5bad14f4adc0c9a15c00efbbccddb",
|
"6.63451994567e-09", execVerification
|
||||||
execVerification.select("oaid").collectAsList().get(0).getString(0));
|
|
||||||
|
|
||||||
Assertions
|
|
||||||
.assertEquals(
|
|
||||||
"1.47565045883e-08",
|
|
||||||
execVerification.filter("id = 'influence'").select("value").collectAsList().get(0).getString(0));
|
|
||||||
|
|
||||||
Assertions
|
|
||||||
.assertEquals(
|
|
||||||
"0.227515392",
|
|
||||||
execVerification.filter("id = 'popularity'").select("value").collectAsList().get(0).getString(0));
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
void matchOneWithTwo() throws Exception {
|
|
||||||
String bipScoresPath = getClass()
|
|
||||||
.getResource("/eu/dnetlib/dhp/actionmanager/bipfinder/bip_scores.json")
|
|
||||||
.getPath();
|
|
||||||
String inputPath = getClass()
|
|
||||||
.getResource(
|
|
||||||
"/eu/dnetlib/dhp/actionmanager/bipfinder/publication_2.json")
|
|
||||||
.getPath();
|
|
||||||
|
|
||||||
SparkAtomicActionScoreJob
|
|
||||||
.main(
|
|
||||||
new String[] {
|
|
||||||
"-isSparkSessionManaged",
|
|
||||||
Boolean.FALSE.toString(),
|
|
||||||
"-inputPath",
|
|
||||||
inputPath,
|
|
||||||
"-bipScorePath",
|
|
||||||
bipScoresPath,
|
|
||||||
"-resultTableName",
|
|
||||||
"eu.dnetlib.dhp.schema.oaf.Publication",
|
|
||||||
"-outputPath",
|
|
||||||
workingDir.toString() + "/actionSet"
|
|
||||||
});
|
|
||||||
|
|
||||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
|
||||||
|
|
||||||
JavaRDD<Publication> tmp = sc
|
|
||||||
.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
|
|
||||||
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
|
|
||||||
.map(aa -> ((Publication) aa.getPayload()));
|
|
||||||
|
|
||||||
assertEquals(1, tmp.count());
|
|
||||||
|
|
||||||
Dataset<Publication> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(Publication.class));
|
|
||||||
verificationDataset.createOrReplaceTempView("publication");
|
|
||||||
|
|
||||||
Dataset<Row> execVerification = spark
|
|
||||||
.sql(
|
|
||||||
"Select p.id oaid, mes.id, mUnit.value from publication p " +
|
|
||||||
"lateral view explode(measures) m as mes " +
|
|
||||||
"lateral view explode(mes.unit) u as mUnit ");
|
|
||||||
|
|
||||||
Assertions.assertEquals(4, execVerification.count());
|
|
||||||
|
|
||||||
Assertions
|
|
||||||
.assertEquals(
|
|
||||||
"50|355e65625b88::ffa5bad14f4adc0c9a15c00efbbccddb",
|
|
||||||
execVerification.select("oaid").collectAsList().get(0).getString(0));
|
|
||||||
|
|
||||||
Assertions
|
|
||||||
.assertEquals(
|
|
||||||
2,
|
|
||||||
execVerification.filter("id = 'influence'").count());
|
|
||||||
|
|
||||||
Assertions
|
|
||||||
.assertEquals(
|
|
||||||
2,
|
|
||||||
execVerification.filter("id = 'popularity'").count());
|
|
||||||
|
|
||||||
List<Row> tmp_ds = execVerification.filter("id = 'influence'").select("value").collectAsList();
|
|
||||||
String tmp_influence = tmp_ds.get(0).getString(0);
|
|
||||||
assertTrue(
|
|
||||||
"1.47565045883e-08".equals(tmp_influence) ||
|
|
||||||
"1.98956540239e-08".equals(tmp_influence));
|
|
||||||
|
|
||||||
tmp_influence = tmp_ds.get(1).getString(0);
|
|
||||||
assertTrue(
|
|
||||||
"1.47565045883e-08".equals(tmp_influence) ||
|
|
||||||
"1.98956540239e-08".equals(tmp_influence));
|
|
||||||
|
|
||||||
assertNotEquals(tmp_ds.get(1).getString(0), tmp_ds.get(0).getString(0));
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
void matchTwo() throws Exception {
|
|
||||||
String bipScoresPath = getClass()
|
|
||||||
.getResource("/eu/dnetlib/dhp/actionmanager/bipfinder/bip_scores.json")
|
|
||||||
.getPath();
|
|
||||||
String inputPath = getClass()
|
|
||||||
.getResource(
|
|
||||||
"/eu/dnetlib/dhp/actionmanager/bipfinder/publication_3.json")
|
|
||||||
.getPath();
|
|
||||||
|
|
||||||
SparkAtomicActionScoreJob
|
|
||||||
.main(
|
|
||||||
new String[] {
|
|
||||||
"-isSparkSessionManaged",
|
|
||||||
Boolean.FALSE.toString(),
|
|
||||||
"-inputPath",
|
|
||||||
inputPath,
|
|
||||||
"-bipScorePath",
|
|
||||||
bipScoresPath,
|
|
||||||
"-resultTableName",
|
|
||||||
"eu.dnetlib.dhp.schema.oaf.Publication",
|
|
||||||
"-outputPath",
|
|
||||||
workingDir.toString() + "/actionSet"
|
|
||||||
});
|
|
||||||
|
|
||||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
|
||||||
|
|
||||||
JavaRDD<Publication> tmp = sc
|
|
||||||
.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
|
|
||||||
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
|
|
||||||
.map(aa -> ((Publication) aa.getPayload()));
|
|
||||||
|
|
||||||
assertEquals(2, tmp.count());
|
|
||||||
|
|
||||||
Dataset<Publication> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(Publication.class));
|
|
||||||
verificationDataset.createOrReplaceTempView("publication");
|
|
||||||
|
|
||||||
Dataset<Row> execVerification = spark
|
|
||||||
.sql(
|
|
||||||
"Select p.id oaid, mes.id, mUnit.value from publication p " +
|
|
||||||
"lateral view explode(measures) m as mes " +
|
|
||||||
"lateral view explode(mes.unit) u as mUnit ");
|
|
||||||
|
|
||||||
Assertions.assertEquals(4, execVerification.count());
|
|
||||||
|
|
||||||
Assertions
|
|
||||||
.assertEquals(
|
|
||||||
2,
|
|
||||||
execVerification.filter("oaid = '50|355e65625b88::ffa5bad14f4adc0c9a15c00efbbccddb'").count());
|
|
||||||
|
|
||||||
Assertions
|
|
||||||
.assertEquals(
|
|
||||||
2,
|
|
||||||
execVerification.filter("oaid = '50|acm_________::faed5b7a1bd8f51118d13ed29cfaee09'").count());
|
|
||||||
|
|
||||||
Assertions
|
|
||||||
.assertEquals(
|
|
||||||
2,
|
|
||||||
execVerification.filter("id = 'influence'").count());
|
|
||||||
|
|
||||||
Assertions
|
|
||||||
.assertEquals(
|
|
||||||
2,
|
|
||||||
execVerification.filter("id = 'popularity'").count());
|
|
||||||
|
|
||||||
Assertions
|
|
||||||
.assertEquals(
|
|
||||||
"1.47565045883e-08",
|
|
||||||
execVerification
|
|
||||||
.filter(
|
.filter(
|
||||||
"oaid = '50|355e65625b88::ffa5bad14f4adc0c9a15c00efbbccddb' " +
|
"oaid='50|arXiv_dedup_::4a2d5fd8d71daec016c176ec71d957b1' " +
|
||||||
"and id = 'influence'")
|
"and id = 'influence'")
|
||||||
.select("value")
|
.select("value")
|
||||||
.collectAsList()
|
.collectAsList()
|
||||||
.get(0)
|
.get(0)
|
||||||
.getString(0));
|
.getString(0));
|
||||||
|
|
||||||
Assertions
|
Assertions
|
||||||
.assertEquals(
|
.assertEquals(
|
||||||
"1.98956540239e-08",
|
"0.348694533145", execVerification
|
||||||
execVerification
|
|
||||||
.filter(
|
.filter(
|
||||||
"oaid = '50|acm_________::faed5b7a1bd8f51118d13ed29cfaee09' " +
|
"oaid='50|arXiv_dedup_::4a2d5fd8d71daec016c176ec71d957b1' " +
|
||||||
"and id = 'influence'")
|
"and id = 'popularity_alt'")
|
||||||
.select("value")
|
.select("value")
|
||||||
.collectAsList()
|
.collectAsList()
|
||||||
.get(0)
|
.get(0)
|
||||||
.getString(0));
|
.getString(0));
|
||||||
|
|
||||||
Assertions
|
Assertions
|
||||||
.assertEquals(
|
.assertEquals(
|
||||||
"0.282046161584",
|
"2.16094680115e-09", execVerification
|
||||||
execVerification
|
|
||||||
.filter(
|
.filter(
|
||||||
"oaid = '50|acm_________::faed5b7a1bd8f51118d13ed29cfaee09' " +
|
"oaid='50|arXiv_dedup_::4a2d5fd8d71daec016c176ec71d957b1' " +
|
||||||
"and id = 'popularity'")
|
|
||||||
.select("value")
|
|
||||||
.collectAsList()
|
|
||||||
.get(0)
|
|
||||||
.getString(0));
|
|
||||||
|
|
||||||
Assertions
|
|
||||||
.assertEquals(
|
|
||||||
"0.227515392",
|
|
||||||
execVerification
|
|
||||||
.filter(
|
|
||||||
"oaid = '50|355e65625b88::ffa5bad14f4adc0c9a15c00efbbccddb' " +
|
|
||||||
"and id = 'popularity'")
|
"and id = 'popularity'")
|
||||||
.select("value")
|
.select("value")
|
||||||
.collectAsList()
|
.collectAsList()
|
||||||
|
|
|
@ -0,0 +1,4 @@
|
||||||
|
{"50|arXiv_dedup_::4a2d5fd8d71daec016c176ec71d957b1": [{"id": "influence", "unit": [{"value": "6.63451994567e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.348694533145", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "2.16094680115e-09", "key": "score"}]}]}
|
||||||
|
{"50|dedup_wf_001::05b1f8ce98702f69d07aa5f0429de1e3": [{"id": "influence", "unit": [{"value": "6.25057357279e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "7.0208", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "2.40234462244e-08", "key": "score"}]}]}
|
||||||
|
{"50|dedup_wf_001::08823c8f5c3ca2eae523817036cdda67": [{"id": "influence", "unit": [{"value": "5.54921449123e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "3.53012887452e-10", "key": "score"}]}]}
|
||||||
|
{"50|dedup_wf_001::0e72b399325d6efcbe3271891a1dfe4c": [{"id": "influence", "unit": [{"value": "1.63466096315e-08", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "20.9870879741", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "5.49501495323e-08", "key": "score"}]}]}
|
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue