1
0
Fork 0

[Bypass Action Set] creation of unresolved entities

This commit is contained in:
Miriam Baglioni 2021-11-11 16:11:25 +01:00
parent c371b23077
commit 935062edec
36 changed files with 1033 additions and 1335 deletions

View File

@ -0,0 +1,55 @@
package eu.dnetlib.dhp.actionmanager.createunresolvedentities;
import java.util.Optional;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
/**
 * Constants and small static helpers shared by the jobs of the
 * {@code createunresolvedentities} package.
 */
public class Constants {

	// pieces used to assemble the identifier of an unresolved, DOI-based entity
	public static final String UNREOSLVED_PREFIX = "unresolved:";
	public static final String UNREOSLVED_POSTFIX_DOI = ":doi";
	public static final String DOI = "doi";

	// values used to fill the data info / provenance of the produced records
	public static final String UPDATE_DATA_INFO_TYPE = "update";
	public static final String UPDATE_SUBJECT_FOS_CLASS_ID = "subject:fos";
	public static final String UPDATE_CLASS_NAME = "Inferred by OpenAIRE";
	public static final String UPDATE_MEASURE_BIP_CLASS_ID = "measure:bip";

	// qualifier of the FOS subjects
	public static final String FOS_CLASS_ID = "FOS";
	public static final String FOS_CLASS_NAME = "Fields of Science and Technology classification";

	// literal marking a missing value in the input
	public final static String NULL = "NULL";

	public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

	/**
	 * Reads the {@code isSparkSessionManaged} argument.
	 *
	 * @param parser the parser holding the job arguments
	 * @return the parsed flag, or {@link Boolean#TRUE} when the argument is absent
	 */
	public static Boolean isSparkSessionManaged(ArgumentApplicationParser parser) {
		final String flag = parser.get("isSparkSessionManaged");
		if (flag == null) {
			return Boolean.TRUE;
		}
		return Boolean.valueOf(flag);
	}

	/**
	 * Loads a text file and deserializes each line with {@link #OBJECT_MAPPER}.
	 *
	 * @param spark     the active session
	 * @param inputPath the location of the text file(s) to read
	 * @param clazz     the bean class each line is mapped to
	 * @param <R>       the type of the elements of the returned dataset
	 * @return the dataset of deserialized beans
	 */
	public static <R> Dataset<R> readPath(
		SparkSession spark, String inputPath, Class<R> clazz) {
		final MapFunction<String, R> parseLine = line -> OBJECT_MAPPER.readValue(line, clazz);
		return spark
			.read()
			.textFile(inputPath)
			.map(parseLine, Encoders.bean(clazz));
	}

	/**
	 * Wraps a DOI between {@link #UNREOSLVED_PREFIX} and {@link #UNREOSLVED_POSTFIX_DOI}.
	 *
	 * @param doi the DOI to wrap
	 * @return the identifier of the unresolved entity, e.g. {@code unresolved:<doi>:doi}
	 */
	public static String getUnresolvedDoiIndentifier(String doi) {
		return UNREOSLVED_PREFIX + doi + UNREOSLVED_POSTFIX_DOI;
	}
}

View File

@ -1,7 +1,9 @@
package eu.dnetlib.dhp.bypassactionset.fos;
package eu.dnetlib.dhp.actionmanager.createunresolvedentities;
import java.io.*;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Serializable;
import java.util.Objects;
import java.util.Optional;
@ -29,17 +31,17 @@ public class GetFOSData implements Serializable {
.requireNonNull(
GetFOSData.class
.getResourceAsStream(
"/eu/dnetlib/dhp/bypassactionset/get_fos_parameters.json"))));
"/eu/dnetlib/dhp/actionmanager/createunresolvedentities/get_fos_parameters.json"))));
parser.parseArgument(args);
// the path where the original fos csv file is stored
final String inputPath = parser.get("inputPath");
log.info("inputPath {}", inputPath);
final String sourcePath = parser.get("sourcePath");
log.info("sourcePath {}", sourcePath);
// the path where to put the file as json
final String outputFile = parser.get("outputFile");
log.info("outputFile {}", outputFile);
final String outputPath = parser.get("outputPath");
log.info("outputPath {}", outputPath);
final String hdfsNameNode = parser.get("hdfsNameNode");
log.info("hdfsNameNode {}", hdfsNameNode);
@ -58,7 +60,7 @@ public class GetFOSData implements Serializable {
FileSystem fileSystem = FileSystem.get(conf);
new GetFOSData().doRewrite(inputPath, outputFile, classForName, delimiter, fileSystem);
new GetFOSData().doRewrite(sourcePath, outputPath, classForName, delimiter, fileSystem);
}

View File

@ -0,0 +1,144 @@
package eu.dnetlib.dhp.actionmanager.createunresolvedentities;
import static eu.dnetlib.dhp.actionmanager.createunresolvedentities.Constants.*;
import static eu.dnetlib.dhp.actionmanager.createunresolvedentities.Constants.UPDATE_CLASS_NAME;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.hdfs.client.HdfsUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.BipDeserialize;
import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.BipScore;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Measure;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
public class PrepareBipFinder implements Serializable {
private static final Logger log = LoggerFactory.getLogger(PrepareBipFinder.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static <I extends Result> void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
PrepareBipFinder.class
.getResourceAsStream(
"/eu/dnetlib/dhp/actionmanager/createunresolvedentities/bip_prepare_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String sourcePath = parser.get("sourcePath");
log.info("sourcePath {}: ", sourcePath);
final String outputPath = parser.get("outputPath");
log.info("outputPath {}: ", outputPath);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
HdfsSupport.remove(outputPath, spark.sparkContext().hadoopConfiguration());
prepareResults(spark, sourcePath, outputPath);
});
}
private static <I extends Result> void prepareResults(SparkSession spark, String inputPath, String outputPath) {
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<BipDeserialize> bipDeserializeJavaRDD = sc
.textFile(inputPath)
.map(item -> OBJECT_MAPPER.readValue(item, BipDeserialize.class));
spark
.createDataset(bipDeserializeJavaRDD.flatMap(entry -> entry.keySet().stream().map(key -> {
BipScore bs = new BipScore();
bs.setId(key);
bs.setScoreList(entry.get(key));
return bs;
}).collect(Collectors.toList()).iterator()).rdd(), Encoders.bean(BipScore.class))
.map((MapFunction<BipScore, Result>) v -> {
Result r = new Result();
r.setId(getUnresolvedDoiIndentifier(v.getId()));
r.setMeasures(getMeasure(v));
return r;
}, Encoders.bean(Result.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath + "/bip");
}
private static List<Measure> getMeasure(BipScore value) {
return value
.getScoreList()
.stream()
.map(score -> {
Measure m = new Measure();
m.setId(score.getId());
m
.setUnit(
score
.getUnit()
.stream()
.map(unit -> {
KeyValue kv = new KeyValue();
kv.setValue(unit.getValue());
kv.setKey(unit.getKey());
kv
.setDataInfo(
OafMapperUtils
.dataInfo(
false,
UPDATE_DATA_INFO_TYPE,
true,
false,
OafMapperUtils
.qualifier(
UPDATE_MEASURE_BIP_CLASS_ID,
UPDATE_CLASS_NAME,
ModelConstants.DNET_PROVENANCE_ACTIONS,
ModelConstants.DNET_PROVENANCE_ACTIONS),
""));
return kv;
})
.collect(Collectors.toList()));
return m;
})
.collect(Collectors.toList());
}
}

View File

@ -1,30 +1,30 @@
package eu.dnetlib.dhp.bypassactionset.fos;
package eu.dnetlib.dhp.actionmanager.createunresolvedentities;
import static eu.dnetlib.dhp.PropagationConstant.*;
import static eu.dnetlib.dhp.bypassactionset.Utils.getIdentifier;
import static eu.dnetlib.dhp.actionmanager.createunresolvedentities.Constants.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.FOSDataModel;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.bypassactionset.model.FOSDataModel;
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
public class PrepareFOSSparkJob implements Serializable {
private static final Logger log = LoggerFactory.getLogger(PrepareFOSSparkJob.class);
@ -35,7 +35,7 @@ public class PrepareFOSSparkJob implements Serializable {
.toString(
PrepareFOSSparkJob.class
.getResourceAsStream(
"/eu/dnetlib/dhp/bypassactionset/distribute_fos_parameters.json"));
"/eu/dnetlib/dhp/actionmanager/createunresolvedentities/distribute_fos_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
@ -55,7 +55,6 @@ public class PrepareFOSSparkJob implements Serializable {
conf,
isSparkSessionManaged,
spark -> {
removeOutputDir(spark, outputPath);
distributeFOSdois(
spark,
sourcePath,
@ -74,13 +73,60 @@ public class PrepareFOSSparkJob implements Serializable {
final String level3 = v.getLevel3();
Arrays
.stream(v.getDoi().split("\u0002"))
.forEach(d -> fosList.add(FOSDataModel.newInstance(getIdentifier(d), level1, level2, level3)));
.forEach(d -> fosList.add(FOSDataModel.newInstance(d, level1, level2, level3)));
return fosList.iterator();
}, Encoders.bean(FOSDataModel.class))
.map((MapFunction<FOSDataModel, Result>) value -> {
Result r = new Result();
r.setId(getUnresolvedDoiIndentifier(value.getDoi()));
r.setSubject(getSubjects(value));
return r;
}, Encoders.bean(Result.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath);
.json(outputPath + "/fos");
}
/**
 * Builds the subjects for the three FOS levels of the given entry, leaving out the levels
 * for which {@code getSubject} returns {@code null}.
 *
 * @param fos the FOS entry providing level1, level2 and level3
 * @return the subjects in level order
 */
private static List<StructuredProperty> getSubjects(FOSDataModel fos) {
	final List<StructuredProperty> subjects = new ArrayList<>();
	for (String level : Arrays.asList(fos.getLevel1(), fos.getLevel2(), fos.getLevel3())) {
		final StructuredProperty subject = getSubject(level);
		if (subject != null) {
			subjects.add(subject);
		}
	}
	return subjects;
}
/**
 * Creates the FOS subject for a single classification level.
 *
 * @param sbj the label of the level; may be {@code null} or the {@code NULL} placeholder
 * @return the subject carrying the label with the FOS qualifier and the "update" data info,
 *         or {@code null} when the level is missing ({@code null} or equal to {@code NULL})
 */
private static StructuredProperty getSubject(String sbj) {
	// constant-first comparison: a level absent from the input must be skipped, not raise an NPE
	if (sbj == null || NULL.equals(sbj)) {
		return null;
	}
	StructuredProperty sp = new StructuredProperty();
	sp.setValue(sbj);
	sp
		.setQualifier(
			OafMapperUtils
				.qualifier(
					FOS_CLASS_ID,
					FOS_CLASS_NAME,
					ModelConstants.DNET_SUBJECT_TYPOLOGIES,
					ModelConstants.DNET_SUBJECT_TYPOLOGIES));
	sp
		.setDataInfo(
			OafMapperUtils
				.dataInfo(
					false,
					UPDATE_DATA_INFO_TYPE,
					true,
					false,
					OafMapperUtils
						.qualifier(
							UPDATE_SUBJECT_FOS_CLASS_ID,
							UPDATE_CLASS_NAME,
							ModelConstants.DNET_PROVENANCE_ACTIONS,
							ModelConstants.DNET_PROVENANCE_ACTIONS),
					""));
	return sp;
}
}

View File

@ -0,0 +1,79 @@
package eu.dnetlib.dhp.actionmanager.createunresolvedentities;
import static eu.dnetlib.dhp.actionmanager.createunresolvedentities.Constants.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Result;
public class SparkSaveUnresolved implements Serializable {
private static final Logger log = LoggerFactory.getLogger(PrepareFOSSparkJob.class);
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
PrepareFOSSparkJob.class
.getResourceAsStream(
"/eu/dnetlib/dhp/actionmanager/createunresolvedentities/produce_unresolved_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
String sourcePath = parser.get("sourcePath");
log.info("sourcePath: {}", sourcePath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
saveUnresolved(
spark,
sourcePath,
outputPath);
});
}
private static void saveUnresolved(SparkSession spark, String sourcePath, String outputPath) {
spark
.read()
.textFile(sourcePath + "/*")
.map(
(MapFunction<String, Result>) l -> OBJECT_MAPPER.readValue(l, Result.class),
Encoders.bean(Result.class))
.groupByKey((MapFunction<Result, String>) r -> r.getId(), Encoders.STRING())
.mapGroups((MapGroupsFunction<String, Result, Result>) (k, it) -> {
Result ret = it.next();
it.forEachRemaining(r -> ret.mergeFrom(r));
return ret;
}, Encoders.bean(Result.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath);
}
}

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.bypassactionset.model;
package eu.dnetlib.dhp.actionmanager.createunresolvedentities.model;
import java.io.Serializable;
import java.util.ArrayList;

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.bypassactionset.model;
package eu.dnetlib.dhp.actionmanager.createunresolvedentities.model;
import java.io.Serializable;
import java.util.List;

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.bypassactionset.model;
package eu.dnetlib.dhp.actionmanager.createunresolvedentities.model;
import java.io.Serializable;

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.bypassactionset.model;
package eu.dnetlib.dhp.actionmanager.createunresolvedentities.model;
import java.io.Serializable;

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.bypassactionset.model;
package eu.dnetlib.dhp.actionmanager.createunresolvedentities.model;
import java.io.Serializable;
import java.util.List;

View File

@ -6,8 +6,8 @@
"paramRequired": false
},
{
"paramName": "ip",
"paramLongName": "inputPath",
"paramName": "sp",
"paramLongName": "sourcePath",
"paramDescription": "the URL from where to get the programme file",
"paramRequired": true
},

View File

@ -23,5 +23,11 @@
"paramLongName": "hdfsNameNode",
"paramDescription": "the path used to store the HostedByMap",
"paramRequired": true
},
{
"paramName": "cfn",
"paramLongName": "classForName",
"paramDescription": "the fully qualified name of the class the rows of the input file are mapped to",
"paramRequired": true
}
]

View File

@ -0,0 +1,30 @@
<!-- Default property values: cluster endpoints (job tracker, name node), Hive connection settings and Oozie flags. -->
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<!-- NOTE(review): the Hive settings below point to a test host; confirm they are actually needed here. -->
<property>
<name>hiveMetastoreUris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property>
<property>
<name>hiveJdbcUrl</name>
<value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000</value>
</property>
<property>
<name>hiveDbName</name>
<value>openaire</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
</configuration>

View File

@ -0,0 +1,155 @@
<workflow-app name="UnresolvedEntities" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>fosPath</name>
<description>the path of the original FOS csv file to be processed</description>
</property>
<property>
<name>bipScorePath</name>
<description>the path where to find the bipFinder scores</description>
</property>
<property>
<name>outputPath</name>
<description>the path where to store the actionset</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
<property>
<name>oozieActionShareLibForSpark2</name>
<description>oozie action sharelib for spark 2.*</description>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
<description>spark 2.* extra listeners classname</description>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
<description>spark 2.* sql query execution listeners classname</description>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<description>spark 2.* yarn history server address</description>
</property>
<property>
<name>spark2EventLogDir</name>
<description>spark 2.* event log dir location</description>
</property>
</parameters>
<start to="prepareInfo"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<fork name="prepareInfo">
<path start="prepareBip"/>
<path start="getFOS"/>
</fork>
<!-- First branch of the fork: converts the bipFinder scores into unresolved results. -->
<!-- PrepareBipFinder appends "/bip" to the output path it receives and removes that output path before writing. -->
<action name="prepareBip">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Produces the unresolved from bip finder!</name>
<class>eu.dnetlib.dhp.actionmanager.createunresolvedentities.PrepareBipFinder</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${bipScorePath}</arg>
<arg>--outputPath</arg><arg>${workingDir}/prepared/bip</arg>
</spark>
<ok to="join"/>
<error to="Kill"/>
</action>
<!-- Second branch of the fork, step 1: rewrites the FOS file found at fosPath into the working dir, -->
<!-- mapping its rows to FOSDataModel; on success the flow continues with prepareFos. -->
<action name="getFOS">
<java>
<main-class>eu.dnetlib.dhp.actionmanager.createunresolvedentities.GetFOSData</main-class>
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
<arg>--sourcePath</arg><arg>${fosPath}</arg>
<arg>--outputPath</arg><arg>${workingDir}/input/fos</arg>
<arg>--classForName</arg><arg>eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.FOSDataModel</arg>
</java>
<ok to="prepareFos"/>
<error to="Kill"/>
</action>
<!-- Second branch of the fork, step 2: converts the FOS entries written by getFOS into unresolved results. -->
<!-- The class must be PrepareFOSSparkJob of this package (it appends "/fos" to the output path it receives). -->
<action name="prepareFos">
    <spark xmlns="uri:oozie:spark-action:0.2">
        <master>yarn</master>
        <mode>cluster</mode>
        <name>Produces the unresolved from FOS!</name>
        <class>eu.dnetlib.dhp.actionmanager.createunresolvedentities.PrepareFOSSparkJob</class>
        <jar>dhp-aggregation-${projectVersion}.jar</jar>
        <spark-opts>
            --executor-memory=${sparkExecutorMemory}
            --executor-cores=${sparkExecutorCores}
            --driver-memory=${sparkDriverMemory}
            --conf spark.extraListeners=${spark2ExtraListeners}
            --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
            --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
            --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
            --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
        </spark-opts>
        <arg>--sourcePath</arg><arg>${workingDir}/input/fos</arg>
        <arg>--outputPath</arg><arg>${workingDir}/prepared/fos</arg>
    </spark>
    <ok to="join"/>
    <error to="Kill"/>
</action>
<join name="join" to="produceUnresolved"/>
<!-- Runs after the join: merges the results prepared for bip and fos that share the same id. -->
<!-- SparkSaveUnresolved reads its input location from the sourcePath argument. -->
<action name="produceUnresolved">
    <spark xmlns="uri:oozie:spark-action:0.2">
        <master>yarn</master>
        <mode>cluster</mode>
        <name>Saves the result produced for bip and fos by grouping results with the same id</name>
        <class>eu.dnetlib.dhp.actionmanager.createunresolvedentities.SparkSaveUnresolved</class>
        <jar>dhp-aggregation-${projectVersion}.jar</jar>
        <spark-opts>
            --executor-memory=${sparkExecutorMemory}
            --executor-cores=${sparkExecutorCores}
            --driver-memory=${sparkDriverMemory}
            --conf spark.extraListeners=${spark2ExtraListeners}
            --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
            --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
            --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
            --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
        </spark-opts>
        <arg>--sourcePath</arg><arg>${workingDir}/prepared</arg>
        <arg>--outputPath</arg><arg>${outputPath}</arg>
    </spark>
    <ok to="End"/>
    <error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -6,8 +6,8 @@
"paramRequired": false
},
{
"paramName": "ip",
"paramLongName": "inputPath",
"paramName": "sp",
"paramLongName": "sourcePath",
"paramDescription": "the URL from where to get the programme file",
"paramRequired": true
},
@ -16,17 +16,5 @@
"paramLongName": "outputPath",
"paramDescription": "the path of the new ActionSet",
"paramRequired": true
},
{
"paramName": "rtn",
"paramLongName": "resultTableName",
"paramDescription": "the path of the new ActionSet",
"paramRequired": true
},
{
"paramName": "fp",
"paramLongName": "fosPath",
"paramDescription": "the path of the new ActionSet",
"paramRequired": true
}
]

View File

@ -0,0 +1,250 @@
package eu.dnetlib.dhp.actionmanager.createunresolvedentities;
import static org.junit.jupiter.api.Assertions.*;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.stream.Collectors;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.FOSDataModel;
import eu.dnetlib.dhp.common.collection.CollectorException;
import eu.dnetlib.dhp.schema.oaf.Result;
/**
 * Tests for the preparation steps of the unresolved entities: the conversion of the bipFinder scores
 * ({@link PrepareBipFinder}), the rewriting of the FOS csv ({@link GetFOSData}) and the conversion of the FOS
 * entries ({@link PrepareFOSSparkJob}).
 */
public class PrepareTest {

	private static final Logger log = LoggerFactory.getLogger(PrepareTest.class);

	private static Path workingDir;
	private static SparkSession spark;
	private static LocalFileSystem fs;
	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

	@BeforeAll
	public static void beforeAll() throws IOException {
		workingDir = Files.createTempDirectory(PrepareTest.class.getSimpleName());

		fs = FileSystem.getLocal(new Configuration());
		log.info("using work dir {}", workingDir);

		SparkConf conf = new SparkConf();
		conf.setAppName(PrepareTest.class.getSimpleName());

		conf.setMaster("local[*]");
		conf.set("spark.driver.host", "localhost");
		conf.set("hive.metastore.local", "true");
		conf.set("spark.ui.enabled", "false");
		conf.set("spark.sql.warehouse.dir", workingDir.toString());
		conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());

		spark = SparkSession
			.builder()
			.appName(PrepareTest.class.getSimpleName())
			.config(conf)
			.getOrCreate();
	}

	@AfterAll
	public static void afterAll() throws IOException {
		FileUtils.deleteDirectory(workingDir.toFile());
		spark.stop();
	}

	/**
	 * Returns the value of the first unit of the first measure of {@code result} having the given id.
	 */
	private static String firstUnitValue(Result result, String measureId) {
		return result
			.getMeasures()
			.stream()
			.filter(sl -> sl.getId().equals(measureId))
			.collect(Collectors.toList())
			.get(0)
			.getUnit()
			.get(0)
			.getValue();
	}

	/**
	 * Returns the values of all the subjects of the results having the given id.
	 */
	private static java.util.List<String> subjectValues(JavaRDD<Result> results, String id) {
		return results
			.filter(r -> r.getId().equals(id))
			.flatMap(r -> r.getSubject().iterator())
			.map(sbj -> sbj.getValue())
			.collect();
	}

	@Test
	void bipPrepareTest() throws Exception {
		final String sourcePath = getClass()
			.getResource("/eu/dnetlib/dhp/actionmanager/createunresolvedentities/bip/bip.json")
			.getPath();

		PrepareBipFinder
			.main(
				new String[] {
					"--isSparkSessionManaged", Boolean.FALSE.toString(),
					"--sourcePath", sourcePath,
					"--outputPath", workingDir.toString() + "/work"
				});

		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

		JavaRDD<Result> tmp = sc
			.textFile(workingDir.toString() + "/work/bip")
			.map(item -> OBJECT_MAPPER.readValue(item, Result.class));

		assertEquals(86, tmp.count());

		String doi1 = "unresolved:10.0000/096020199389707:doi";

		assertEquals(1, tmp.filter(r -> r.getId().equals(doi1)).count());

		// collected once and reused by the assertions below
		final Result doi1Result = tmp.filter(r -> r.getId().equals(doi1)).collect().get(0);

		assertEquals(3, doi1Result.getMeasures().size());
		assertEquals("6.34596412687e-09", firstUnitValue(doi1Result, "influence"));
		assertEquals("0.641151896994", firstUnitValue(doi1Result, "popularity_alt"));
		assertEquals("2.33375102921e-09", firstUnitValue(doi1Result, "popularity"));
	}

	@Test
	void getFOSFileTest() throws CollectorException, IOException, ClassNotFoundException {

		final String sourcePath = getClass()
			.getResource("/eu/dnetlib/dhp/actionmanager/createunresolvedentities/fos/h2020_fos_sbs.csv")
			.getPath();
		final String outputPath = workingDir.toString() + "/fos.json";

		new GetFOSData()
			.doRewrite(
				sourcePath, outputPath, "eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.FOSDataModel",
				'\t', fs);

		int count = 0;
		// try-with-resources: the stream opened on the produced file is always released
		try (BufferedReader in = new BufferedReader(
			new InputStreamReader(fs.open(new org.apache.hadoop.fs.Path(outputPath))))) {
			String line;
			while ((line = in.readLine()) != null) {
				// every line must be a valid serialization of FOSDataModel
				FOSDataModel fos = OBJECT_MAPPER.readValue(line, FOSDataModel.class);
				log.info("{}", OBJECT_MAPPER.writeValueAsString(fos));
				count += 1;
			}
		}

		assertEquals(38, count);
	}

	@Test
	void fosPrepareTest() throws Exception {
		final String sourcePath = getClass()
			.getResource("/eu/dnetlib/dhp/actionmanager/createunresolvedentities/fos/fos.json")
			.getPath();

		PrepareFOSSparkJob
			.main(
				new String[] {
					"--isSparkSessionManaged", Boolean.FALSE.toString(),
					"--sourcePath", sourcePath,
					"--outputPath", workingDir.toString() + "/work"
				});

		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

		JavaRDD<Result> tmp = sc
			.textFile(workingDir.toString() + "/work/fos")
			.map(item -> OBJECT_MAPPER.readValue(item, Result.class));

		String doi1 = "unresolved:10.3390/s18072310:doi";

		assertEquals(50, tmp.count());
		assertEquals(1, tmp.filter(row -> row.getId().equals(doi1)).count());

		final java.util.List<String> doi1Subjects = subjectValues(tmp, doi1);
		assertTrue(doi1Subjects.contains("engineering and technology"));
		assertTrue(doi1Subjects.contains("nano-technology"));
		assertTrue(doi1Subjects.contains("nanoscience & nanotechnology"));

		String doi = "unresolved:10.1111/1365-2656.12831:doi";
		assertEquals(1, tmp.filter(row -> row.getId().equals(doi)).count());

		final java.util.List<String> doiSubjects = subjectValues(tmp, doi);
		assertTrue(doiSubjects.contains("psychology and cognitive sciences"));
		assertTrue(doiSubjects.contains("social sciences"));
		// the NULL placeholder of a missing level must not become a subject
		assertFalse(doiSubjects.contains("NULL"));
	}
}

View File

@ -0,0 +1,234 @@
package eu.dnetlib.dhp.actionmanager.createunresolvedentities;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*;
public class ProduceTest {
private static final Logger log = LoggerFactory.getLogger(ProduceTest.class);
private static Path workingDir;
private static SparkSession spark;
private static LocalFileSystem fs;
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static final String ID_PREFIX = "50|doi_________";
@BeforeAll
public static void beforeAll() throws IOException {
workingDir = Files.createTempDirectory(ProduceTest.class.getSimpleName());
fs = FileSystem.getLocal(new Configuration());
log.info("using work dir {}", workingDir);
SparkConf conf = new SparkConf();
conf.setAppName(ProduceTest.class.getSimpleName());
conf.setMaster("local[*]");
conf.set("spark.driver.host", "localhost");
conf.set("hive.metastore.local", "true");
conf.set("spark.ui.enabled", "false");
conf.set("spark.sql.warehouse.dir", workingDir.toString());
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
spark = SparkSession
.builder()
.appName(ProduceTest.class.getSimpleName())
.config(conf)
.getOrCreate();
}
@AfterAll
public static void afterAll() throws IOException {
FileUtils.deleteDirectory(workingDir.toFile());
spark.stop();
}
@Test
void produceTest() throws Exception {
final String bipPath = getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/createunresolvedentities/bip/bip.json")
.getPath();
PrepareBipFinder
.main(
new String[] {
"--isSparkSessionManaged", Boolean.FALSE.toString(),
"--sourcePath", bipPath,
"--outputPath", workingDir.toString() + "/work"
});
final String fosPath = getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/createunresolvedentities/fos/fos.json")
.getPath();
PrepareFOSSparkJob
.main(
new String[] {
"--isSparkSessionManaged", Boolean.FALSE.toString(),
"--sourcePath", fosPath,
"-outputPath", workingDir.toString() + "/work"
});
SparkSaveUnresolved.main(new String[] {
"--isSparkSessionManaged", Boolean.FALSE.toString(),
"--sourcePath", workingDir.toString() + "/work",
"-outputPath", workingDir.toString() + "/unresolved"
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Result> tmp = sc
.textFile(workingDir.toString() + "/unresolved")
.map(item -> OBJECT_MAPPER.readValue(item, Result.class));
Assertions.assertEquals(135, tmp.count());
Assertions.assertEquals(1, tmp.filter(row -> row.getId().equals("unresolved:10.3390/s18072310:doi")).count());
Assertions
.assertEquals(
3, tmp
.filter(row -> row.getId().equals("unresolved:10.3390/s18072310:doi"))
.collect()
.get(0)
.getSubject()
.size());
Assertions
.assertEquals(
3, tmp
.filter(row -> row.getId().equals("unresolved:10.3390/s18072310:doi"))
.collect()
.get(0)
.getMeasures()
.size());
List<StructuredProperty> sbjs = tmp
.filter(row -> row.getId().equals("unresolved:10.3390/s18072310:doi"))
.flatMap(row -> row.getSubject().iterator())
.collect();
sbjs.forEach(sbj -> Assertions.assertEquals("FOS", sbj.getQualifier().getClassid()));
sbjs
.forEach(
sbj -> Assertions
.assertEquals(
"Fields of Science and Technology classification", sbj.getQualifier().getClassname()));
sbjs
.forEach(
sbj -> Assertions
.assertEquals(ModelConstants.DNET_SUBJECT_TYPOLOGIES, sbj.getQualifier().getSchemeid()));
sbjs
.forEach(
sbj -> Assertions
.assertEquals(ModelConstants.DNET_SUBJECT_TYPOLOGIES, sbj.getQualifier().getSchemename()));
sbjs.forEach(sbj -> Assertions.assertEquals(false, sbj.getDataInfo().getDeletedbyinference()));
sbjs.forEach(sbj -> Assertions.assertEquals(true, sbj.getDataInfo().getInferred()));
sbjs.forEach(sbj -> Assertions.assertEquals(false, sbj.getDataInfo().getInvisible()));
sbjs.forEach(sbj -> Assertions.assertEquals("", sbj.getDataInfo().getTrust()));
sbjs.forEach(sbj -> Assertions.assertEquals("update", sbj.getDataInfo().getInferenceprovenance()));
sbjs
.forEach(
sbj -> Assertions.assertEquals("subject:fos", sbj.getDataInfo().getProvenanceaction().getClassid()));
sbjs
.forEach(
sbj -> Assertions
.assertEquals("Inferred by OpenAIRE", sbj.getDataInfo().getProvenanceaction().getClassname()));
sbjs
.forEach(
sbj -> Assertions
.assertEquals(
ModelConstants.DNET_PROVENANCE_ACTIONS, sbj.getDataInfo().getProvenanceaction().getSchemeid()));
sbjs
.forEach(
sbj -> Assertions
.assertEquals(
ModelConstants.DNET_PROVENANCE_ACTIONS,
sbj.getDataInfo().getProvenanceaction().getSchemename()));
sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("engineering and technology"));
sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("nano-technology"));
sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("nanoscience & nanotechnology"));
List<Measure> measures = tmp
.filter(row -> row.getId().equals("unresolved:10.3390/s18072310:doi"))
.flatMap(row -> row.getMeasures().iterator())
.collect();
Assertions
.assertEquals(
"7.5597134689e-09", measures
.stream()
.filter(mes -> mes.getId().equals("influence"))
.collect(Collectors.toList())
.get(0)
.getUnit()
.get(0)
.getValue());
Assertions
.assertEquals(
"4.903880192", measures
.stream()
.filter(mes -> mes.getId().equals("popularity_alt"))
.collect(Collectors.toList())
.get(0)
.getUnit()
.get(0)
.getValue());
Assertions
.assertEquals(
"1.17977512835e-08", measures
.stream()
.filter(mes -> mes.getId().equals("popularity"))
.collect(Collectors.toList())
.get(0)
.getUnit()
.get(0)
.getValue());
Assertions
.assertEquals(
49, tmp
.filter(row -> !row.getId().equals("unresolved:10.3390/s18072310:doi"))
.filter(row -> row.getSubject() != null)
.count());
Assertions
.assertEquals(
85,
tmp
.filter(row -> !row.getId().equals("unresolved:10.3390/s18072310:doi"))
.filter(r -> r.getMeasures() != null)
.count());
}
}

View File

@ -1,4 +1,4 @@
{"10.0000/000000": [{"id": "influence", "unit": [{"value": "7.5597134689e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "4.903880192", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "1.17977512835e-08", "key": "score"}]}]}
{"10.3390/s18072310": [{"id": "influence", "unit": [{"value": "7.5597134689e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "4.903880192", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "1.17977512835e-08", "key": "score"}]}]}
{"10.0000/096020199389707": [{"id": "influence", "unit": [{"value": "6.34596412687e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.641151896994", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "2.33375102921e-09", "key": "score"}]}]}
{"10.00000/jpmc.2017.106": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "5.39172290649e-09", "key": "score"}]}]}
{"10.0000/9781845416881": [{"id": "influence", "unit": [{"value": "5.96492048955e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "1.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "1.12641925838e-08", "key": "score"}]}]}

View File

@ -1,17 +0,0 @@
package eu.dnetlib.dhp.bypassactionset;
import org.jetbrains.annotations.NotNull;
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
/**
 * Static helper for deriving result identifiers from DOIs.
 */
public final class Utils {
	private static final String ID_PREFIX = "50|doi_________";

	/** Static utility holder, not meant to be instantiated. */
	private Utils() {
	}

	/**
	 * Builds the identifier associated with a DOI: {@code ID_PREFIX} followed by the MD5 of the value
	 * returned by {@link CleaningFunctions#normalizePidValue} for the pid type {@code "doi"}.
	 *
	 * @param d the DOI
	 * @return the identifier built from the DOI
	 */
	@NotNull
	public static String getIdentifier(String d) {
		return ID_PREFIX + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", d));
	}
}

View File

@ -1,89 +0,0 @@
package eu.dnetlib.dhp.bypassactionset.bipfinder;
import static eu.dnetlib.dhp.bypassactionset.Utils.getIdentifier;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.bypassactionset.model.BipDeserialize;
import eu.dnetlib.dhp.bypassactionset.model.BipScore;
import eu.dnetlib.dhp.schema.oaf.Result;
/**
 * Reads the BIP score file (one JSON object per line, keyed by DOI) and writes one {@link BipScore}
 * per DOI, with the DOI replaced by the identifier computed by {@code Utils.getIdentifier}.
 */
public class PrepareBipFinder implements Serializable {
	// bound to this class: the original logged under SparkUpdateBip by copy-paste
	private static final Logger log = LoggerFactory.getLogger(PrepareBipFinder.class);
	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

	/**
	 * Job entry point.
	 *
	 * @param args command line arguments: {@code isSparkSessionManaged} (optional), {@code inputPath},
	 *            {@code outputPath}
	 * @throws Exception if the arguments cannot be parsed or the job fails
	 */
	public static <I extends Result> void main(String[] args) throws Exception {
		String jsonConfiguration = IOUtils
			.toString(
				PrepareBipFinder.class
					.getResourceAsStream(
						"/eu/dnetlib/dhp/bypassactionset/bip_prepare_parameters.json"));

		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
		parser.parseArgument(args);

		Boolean isSparkSessionManaged = Optional
			.ofNullable(parser.get("isSparkSessionManaged"))
			.map(Boolean::valueOf)
			.orElse(Boolean.TRUE);
		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

		final String inputPath = parser.get("inputPath");
		log.info("inputPath: {}", inputPath);

		final String outputPath = parser.get("outputPath");
		log.info("outputPath: {}", outputPath);

		SparkConf conf = new SparkConf();
		runWithSparkSession(
			conf,
			isSparkSessionManaged,
			spark -> prepareResults(spark, inputPath, outputPath));
	}

	/**
	 * Turns every (DOI, score list) entry of the input into a {@link BipScore} and stores the result
	 * as gzip-compressed JSON, overwriting any previous content of {@code outputPath}.
	 */
	private static void prepareResults(SparkSession spark, String inputPath, String outputPath) {
		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

		JavaRDD<BipDeserialize> bipDeserializeJavaRDD = sc
			.textFile(inputPath)
			.map(item -> OBJECT_MAPPER.readValue(item, BipDeserialize.class));

		// one input line may carry several DOIs, each of which becomes its own BipScore
		JavaRDD<BipScore> bipScores = bipDeserializeJavaRDD
			.flatMap(entry -> entry.keySet().stream().map(key -> {
				BipScore bs = new BipScore();
				bs.setId(getIdentifier(key));
				bs.setScoreList(entry.get(key));
				return bs;
			}).iterator());

		spark
			.createDataset(bipScores.rdd(), Encoders.bean(BipScore.class))
			.write()
			.mode(SaveMode.Overwrite)
			.option("compression", "gzip")
			.json(outputPath);
	}
}

View File

@ -1,131 +0,0 @@
package eu.dnetlib.dhp.bypassactionset.bipfinder;
import static eu.dnetlib.dhp.PropagationConstant.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.bypassactionset.model.BipScore;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*;
import scala.Tuple2;
/**
 * Adds the BIP scores to the results of a given type: the results read from {@code inputPath} are
 * left-joined by id with the {@link BipScore} records read from {@code bipScorePath}, and the measures
 * of every matched result are set from its score list. The output is written under
 * {@code outputPath + "/bip"}.
 */
public class SparkUpdateBip implements Serializable {
	private static final Logger log = LoggerFactory.getLogger(SparkUpdateBip.class);

	/**
	 * Job entry point.
	 *
	 * @param args command line arguments: {@code isSparkSessionManaged} (optional), {@code inputPath},
	 *            {@code outputPath}, {@code bipScorePath} and {@code resultTableName} (the name of the
	 *            result class, loaded with {@link Class#forName(String)})
	 * @throws Exception if the arguments cannot be parsed, the class cannot be loaded or the job fails
	 */
	public static <I extends Result> void main(String[] args) throws Exception {
		String jsonConfiguration = IOUtils
			.toString(
				SparkUpdateBip.class
					.getResourceAsStream(
						"/eu/dnetlib/dhp/bypassactionset/bip_update_parameters.json"));

		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
		parser.parseArgument(args);

		Boolean isSparkSessionManaged = Optional
			.ofNullable(parser.get("isSparkSessionManaged"))
			.map(Boolean::valueOf)
			.orElse(Boolean.TRUE);
		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

		final String inputPath = parser.get("inputPath");
		log.info("inputPath: {}", inputPath);

		final String outputPath = parser.get("outputPath");
		log.info("outputPath: {}", outputPath);

		final String bipScorePath = parser.get("bipScorePath");
		log.info("bipScorePath: {}", bipScorePath);

		final String resultClassName = parser.get("resultTableName");
		log.info("resultTableName: {}", resultClassName);

		// the class name comes from the command line, so the cast cannot be checked by the compiler
		@SuppressWarnings("unchecked")
		Class<I> inputClazz = (Class<I>) Class.forName(resultClassName);

		SparkConf conf = new SparkConf();
		runWithSparkSession(
			conf,
			isSparkSessionManaged,
			spark -> updateBipFinder(spark, inputPath, outputPath, bipScorePath, inputClazz));
	}

	/**
	 * Joins results and scores and writes the results as gzip-compressed JSON, overwriting previous
	 * output. Results without a matching score are written unchanged.
	 */
	private static <I extends Result> void updateBipFinder(SparkSession spark, String inputPath, String outputPath,
		String bipScorePath, Class<I> inputClazz) {

		Dataset<I> results = readPath(spark, inputPath, inputClazz);
		Dataset<BipScore> bipScores = readPath(spark, bipScorePath, BipScore.class);

		results
			.joinWith(bipScores, results.col("id").equalTo(bipScores.col("id")), "left")
			.map((MapFunction<Tuple2<I, BipScore>, I>) value -> {
				I result = value._1();
				BipScore score = value._2();
				// left join: the right side is null when the result has no score
				if (score != null) {
					result.setMeasures(getMeasure(score));
				}
				return result;
			}, Encoders.bean(inputClazz))
			.write()
			.mode(SaveMode.Overwrite)
			.option("compression", "gzip")
			.json(outputPath + "/bip");
	}

	/**
	 * Converts the score list of a {@link BipScore} into measures, copying id, key and value and
	 * attaching the data info built from the update constants to every unit.
	 */
	private static List<Measure> getMeasure(BipScore value) {
		return value
			.getScoreList()
			.stream()
			.map(score -> {
				Measure m = new Measure();
				m.setId(score.getId());
				m
					.setUnit(
						score
							.getUnit()
							.stream()
							.map(unit -> {
								KeyValue kv = new KeyValue();
								kv.setValue(unit.getValue());
								kv.setKey(unit.getKey());
								kv
									.setDataInfo(
										getDataInfo(
											UPDATE_DATA_INFO_TYPE,
											UPDATE_MEASURE_BIP_CLASS_ID,
											UPDATE_CLASS_NAME,
											ModelConstants.DNET_PROVENANCE_ACTIONS, ""));
								return kv;
							})
							.collect(Collectors.toList()));
				return m;
			})
			.collect(Collectors.toList());
	}
}

View File

@ -1,121 +0,0 @@
package eu.dnetlib.dhp.bypassactionset.fos;
import static eu.dnetlib.dhp.PropagationConstant.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.bypassactionset.model.FOSDataModel;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import scala.Tuple2;
/**
 * Adds the FOS subjects to the results of a given type: the results read from {@code inputPath} are
 * left-joined with the {@link FOSDataModel} records read from {@code fosPath} (result {@code id}
 * against FOS {@code doi}), and the subjects built from the three FOS levels are appended to every
 * matched result.
 */
public class SparkUpdateFOS implements Serializable {
	private static final Logger log = LoggerFactory.getLogger(SparkUpdateFOS.class);

	/**
	 * Job entry point.
	 *
	 * @param args command line arguments: {@code isSparkSessionManaged} (optional), {@code inputPath},
	 *            {@code outputPath}, {@code fosPath} and {@code resultTableName} (the name of the result
	 *            class, loaded with {@link Class#forName(String)})
	 * @throws Exception if the arguments cannot be parsed, the class cannot be loaded or the job fails
	 */
	public static <I extends Result> void main(String[] args) throws Exception {
		String jsonConfiguration = IOUtils
			.toString(
				SparkUpdateFOS.class
					.getResourceAsStream(
						"/eu/dnetlib/dhp/bypassactionset/fos_update_parameters.json"));

		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
		parser.parseArgument(args);

		Boolean isSparkSessionManaged = Optional
			.ofNullable(parser.get("isSparkSessionManaged"))
			.map(Boolean::valueOf)
			.orElse(Boolean.TRUE);
		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

		final String inputPath = parser.get("inputPath");
		log.info("inputPath: {}", inputPath);

		final String outputPath = parser.get("outputPath");
		log.info("outputPath: {}", outputPath);

		final String fosPath = parser.get("fosPath");
		log.info("fosPath: {}", fosPath);

		final String resultClassName = parser.get("resultTableName");
		log.info("resultTableName: {}", resultClassName);

		// the class name comes from the command line, so the cast cannot be checked by the compiler
		@SuppressWarnings("unchecked")
		Class<I> inputClazz = (Class<I>) Class.forName(resultClassName);

		SparkConf conf = new SparkConf();
		runWithSparkSession(
			conf,
			isSparkSessionManaged,
			spark -> updateFos(spark, inputPath, outputPath, fosPath, inputClazz));
	}

	/**
	 * Joins results and FOS records and writes the results as gzip-compressed JSON, overwriting
	 * previous output. Results without a matching FOS record are written unchanged.
	 */
	private static <I extends Result> void updateFos(SparkSession spark, String inputPath, String outputPath,
		String fosPath, Class<I> inputClazz) {

		Dataset<I> results = readPath(spark, inputPath, inputClazz);
		Dataset<FOSDataModel> fosDataModelDataset = readPath(spark, fosPath, FOSDataModel.class);

		results
			.joinWith(fosDataModelDataset, results.col("id").equalTo(fosDataModelDataset.col("doi")), "left")
			.map((MapFunction<Tuple2<I, FOSDataModel>, I>) value -> {
				I result = value._1();
				FOSDataModel fos = value._2();
				// left join: the right side is null when the result has no FOS classification
				if (fos == null) {
					return result;
				}
				List<StructuredProperty> fosSubjects = getSubjects(fos);
				if (result.getSubject() == null) {
					// a result without any subject would otherwise fail with a NullPointerException
					result.setSubject(new ArrayList<>(fosSubjects));
				} else {
					result.getSubject().addAll(fosSubjects);
				}
				return result;
			}, Encoders.bean(inputClazz))
			.write()
			.mode(SaveMode.Overwrite)
			.option("compression", "gzip")
			.json(outputPath);
	}

	/** Builds the subjects for the three FOS levels, leaving out the levels that are not set. */
	private static List<StructuredProperty> getSubjects(FOSDataModel fos) {
		return Arrays
			.asList(getSubject(fos.getLevel1()), getSubject(fos.getLevel2()), getSubject(fos.getLevel3()))
			.stream()
			.filter(Objects::nonNull)
			.collect(Collectors.toList());
	}

	/**
	 * Builds the FOS subject for one level.
	 *
	 * @param sbj the level value
	 * @return the subject, or {@code null} when the level is missing or equal to the {@code NULL} marker
	 */
	private static StructuredProperty getSubject(String sbj) {
		if (sbj == null || NULL.equals(sbj)) {
			return null;
		}
		StructuredProperty sp = new StructuredProperty();
		sp.setValue(sbj);
		sp.setQualifier(getQualifier(FOS_CLASS_ID, FOS_CLASS_NAME, ModelConstants.DNET_SUBJECT_TYPOLOGIES));
		sp
			.setDataInfo(
				getDataInfo(
					UPDATE_DATA_INFO_TYPE,
					UPDATE_SUBJECT_FOS_CLASS_ID,
					UPDATE_CLASS_NAME,
					ModelConstants.DNET_PROVENANCE_ACTIONS, ""));
		return sp;
	}
}

View File

@ -1,93 +0,0 @@
package eu.dnetlib.dhp.bypassactionset.opencitations;
import java.io.*;
import java.io.Serializable;
import java.util.Objects;
import java.util.zip.GZIPOutputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import org.apache.commons.cli.ParseException;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
/**
 * Extracts the zip archives listed in the {@code inputFile} parameter (semicolon separated, read from
 * {@code workingPath + "/Original/"}) and stores every contained file, gzip-compressed, under
 * {@code workingPath + "/COCI/"}.
 */
public class GetOpenCitationsRefs implements Serializable {
	private static final Logger log = LoggerFactory.getLogger(GetOpenCitationsRefs.class);

	/**
	 * Job entry point.
	 *
	 * @param args command line arguments: {@code inputFile}, {@code workingPath}, {@code hdfsNameNode}
	 * @throws IOException if an archive cannot be read or an output file cannot be written
	 * @throws ParseException if the arguments cannot be parsed
	 */
	public static void main(final String[] args) throws IOException, ParseException {
		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
			IOUtils
				.toString(
					Objects
						.requireNonNull(
							GetOpenCitationsRefs.class
								.getResourceAsStream(
									"/eu/dnetlib/dhp/bypassactionset/opencitations/input_parameters.json"))));
		parser.parseArgument(args);

		final String[] inputFile = parser.get("inputFile").split(";");
		// log the file names: toString() on the array only printed its identity hash
		log.info("inputFile {}", String.join(";", inputFile));

		final String workingPath = parser.get("workingPath");
		log.info("workingPath {}", workingPath);

		final String hdfsNameNode = parser.get("hdfsNameNode");
		log.info("hdfsNameNode {}", hdfsNameNode);

		Configuration conf = new Configuration();
		conf.set("fs.defaultFS", hdfsNameNode);
		FileSystem fileSystem = FileSystem.get(conf);

		GetOpenCitationsRefs ocr = new GetOpenCitationsRefs();
		for (String file : inputFile) {
			ocr.doExtract(workingPath + "/Original/" + file, workingPath, fileSystem);
		}
	}

	/**
	 * Writes every non-directory entry of the archive to {@code workingPath + "/COCI/<name>_<n>.gz"},
	 * where {@code <name>} is the entry name up to its first {@code 'T'} and {@code <n>} a counter
	 * starting at 1 for each archive.
	 */
	private void doExtract(String inputFile, String workingPath, FileSystem fileSystem)
		throws IOException {

		int count = 1;
		// both streams are managed by the try, so the input is closed even if the zip wrapper fails
		try (FSDataInputStream ocZip = fileSystem.open(new Path(inputFile));
			ZipInputStream zis = new ZipInputStream(ocZip)) {
			ZipEntry entry;
			while ((entry = zis.getNextEntry()) != null) {
				if (entry.isDirectory()) {
					continue;
				}
				String entryName = entry.getName();
				int cut = entryName.indexOf("T");
				// entries without a 'T' keep their full name instead of failing on substring(0, -1)
				String fileName = (cut >= 0 ? entryName.substring(0, cut) : entryName) + "_" + count;
				count++;
				try (
					FSDataOutputStream out = fileSystem
						.create(new Path(workingPath + "/COCI/" + fileName + ".gz"));
					GZIPOutputStream gzipOs = new GZIPOutputStream(new BufferedOutputStream(out))) {
					IOUtils.copy(zis, gzipOs);
				}
			}
		}
	}
}

View File

@ -1,150 +0,0 @@
package eu.dnetlib.dhp.bypassactionset.opencitations;
import static eu.dnetlib.dhp.PropagationConstant.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.IOException;
import java.io.Serializable;
import java.util.*;
import org.apache.commons.cli.ParseException;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
/**
 * Creates the citation relations from comma-separated input lines: the second and third fields of
 * each line are taken as citing and cited DOI, and for each pair a {@code Cites} and an
 * {@code IsCitedBy} relation are produced and appended to {@code outputPath}.
 */
public class SparkUpdateOCRels implements Serializable {
	private static final Logger log = LoggerFactory.getLogger(SparkUpdateOCRels.class);
	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

	/**
	 * Job entry point.
	 *
	 * @param args command line arguments: {@code isSparkSessionManaged} (optional), {@code inputPath},
	 *            {@code outputPath}, {@code shouldDuplicateRels} (optional, defaults to false)
	 * @throws IOException if the parameter definition cannot be read
	 * @throws ParseException if the arguments cannot be parsed
	 */
	public static void main(final String[] args) throws IOException, ParseException {
		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
			IOUtils
				.toString(
					Objects
						.requireNonNull(
							SparkUpdateOCRels.class
								.getResourceAsStream(
									"/eu/dnetlib/dhp/actionmanager/opencitations/as_parameters.json"))));
		parser.parseArgument(args);

		Boolean isSparkSessionManaged = Optional
			.ofNullable(parser.get("isSparkSessionManaged"))
			.map(Boolean::valueOf)
			.orElse(Boolean.TRUE);
		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

		final String inputPath = parser.get("inputPath");
		log.info("inputPath {}", inputPath);

		final String outputPath = parser.get("outputPath");
		log.info("outputPath {}", outputPath);

		final boolean shouldDuplicateRels = Optional
			.ofNullable(parser.get("shouldDuplicateRels"))
			.map(Boolean::valueOf)
			.orElse(Boolean.FALSE);
		log.info("shouldDuplicateRels {}", shouldDuplicateRels);

		SparkConf conf = new SparkConf();
		runWithSparkSession(
			conf,
			isSparkSessionManaged,
			spark -> addOCRelations(spark, inputPath, outputPath, shouldDuplicateRels));
	}

	/**
	 * Reads every file under {@code inputPath}, converts its lines to relations and appends them, as
	 * gzip-compressed JSON, to {@code outputPath}.
	 */
	private static void addOCRelations(SparkSession spark, String inputPath, String outputPath,
		boolean shouldDuplicateRels) {
		spark
			.sqlContext()
			.createDataset(spark.sparkContext().textFile(inputPath + "/*", 6000), Encoders.STRING())
			.flatMap(
				(FlatMapFunction<String, Relation>) value -> createRelation(value, shouldDuplicateRels).iterator(),
				Encoders.bean(Relation.class))
			.filter((FilterFunction<Relation>) Objects::nonNull)
			.write()
			.mode(SaveMode.Append)
			.option("compression", "gzip")
			.json(outputPath);
	}

	/**
	 * Builds the relations for one input line.
	 *
	 * @param value the comma-separated line
	 * @param duplicate when true and the citing DOI ends with {@code REF_DOI}, the relations are also
	 *            created for the citing DOI with that suffix removed
	 * @return the relations, empty when the line has fewer than three fields or its second field does
	 *         not start with {@code "10."}
	 */
	private static List<Relation> createRelation(String value, boolean duplicate) {
		String[] line = value.split(",");
		// a malformed line is skipped: indexing it would abort the job with ArrayIndexOutOfBoundsException
		if (line.length < 3 || !line[1].startsWith("10.")) {
			return new ArrayList<>();
		}

		List<Relation> relationList = new ArrayList<>();
		final String citingDoi = line[1];
		final String cited = ID_PREFIX + IdentifierFactory.md5(CleaningFunctions.normalizePidValue(DOI, line[2]));

		String citing = ID_PREFIX + IdentifierFactory.md5(CleaningFunctions.normalizePidValue(DOI, citingDoi));
		relationList.addAll(getRelations(citing, cited));

		if (duplicate && citingDoi.endsWith(REF_DOI)) {
			citing = ID_PREFIX + IdentifierFactory
				.md5(CleaningFunctions.normalizePidValue(DOI, citingDoi.substring(0, citingDoi.indexOf(REF_DOI))));
			relationList.addAll(getRelations(citing, cited));
		}

		return relationList;
	}

	/** Returns the relation from citing to cited together with its inverse. */
	private static Collection<Relation> getRelations(String citing, String cited) {
		return Arrays
			.asList(
				getRelation(citing, cited, ModelConstants.CITES),
				getRelation(cited, citing, ModelConstants.IS_CITED_BY));
	}

	/**
	 * Creates a result-result citation relation.
	 *
	 * @param source the identifier of the source of the relation
	 * @param target the identifier of the target of the relation
	 * @param relclass the relation class
	 * @return the relation, with collected-from and data info set
	 */
	public static Relation getRelation(
		String source,
		String target,
		String relclass) {
		Relation r = new Relation();
		r.setCollectedfrom(getCollectedFrom());
		r.setSource(source);
		r.setTarget(target);
		r.setRelClass(relclass);
		r.setRelType(ModelConstants.RESULT_RESULT);
		r.setSubRelType(ModelConstants.CITATION);
		r
			.setDataInfo(
				getDataInfo(
					UPDATE_DATA_INFO_TYPE, OPENCITATIONS_CLASSID, OPENCITATIONS_CLASSNAME,
					ModelConstants.DNET_PROVENANCE_ACTIONS, OC_TRUST, false));
		return r;
	}

	/** Returns the single-element collected-from list built from the OpenCitations id and name constants. */
	public static List<KeyValue> getCollectedFrom() {
		KeyValue kv = new KeyValue();
		kv.setKey(ModelConstants.OPENOCITATIONS_ID);
		kv.setValue(ModelConstants.OPENOCITATIONS_NAME);
		return Arrays.asList(kv);
	}
}

View File

@ -1,32 +0,0 @@
[
{
"paramName": "issm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "when true will stop SparkSession after job execution",
"paramRequired": false
},
{
"paramName": "ip",
"paramLongName": "inputPath",
"paramDescription": "the path of the results to be updated",
"paramRequired": true
},
{
"paramName": "o",
"paramLongName": "outputPath",
"paramDescription": "the path of the new ActionSet",
"paramRequired": true
},
{
"paramName": "rtn",
"paramLongName": "resultTableName",
"paramDescription": "the fully qualified name of the result class to be processed",
"paramRequired": true
},
{
"paramName": "bsp",
"paramLongName": "bipScorePath",
"paramDescription": "the path of the prepared bip scores",
"paramRequired": true
}
]

View File

@ -1,250 +0,0 @@
package eu.dnetlib.dhp.bypassactionset;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import java.util.stream.Collectors;
import eu.dnetlib.dhp.schema.oaf.Author;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.neethi.Assertion;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.bypassactionset.bipfinder.PrepareBipFinder;
import eu.dnetlib.dhp.bypassactionset.bipfinder.SparkUpdateBip;
import eu.dnetlib.dhp.bypassactionset.model.BipScore;
import eu.dnetlib.dhp.countrypropagation.CountryPropagationJobTest;
import eu.dnetlib.dhp.schema.oaf.Measure;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
/**
 * Tests for the BIP score jobs: {@link PrepareBipFinder}, which remaps the DOIs of the score file to
 * identifiers, and {@link SparkUpdateBip}, which adds the scores to the publications as measures.
 */
public class BipTest {
	// logger and Spark application are named after this class: the original reused other tests' names
	private static final Logger log = LoggerFactory.getLogger(BipTest.class);

	private static Path workingDir;
	private static SparkSession spark;
	private static LocalFileSystem fs;
	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
	private static final String ID_PREFIX = "50|doi_________";

	/** Identifier of the publication expected to receive measures in the "match" fixture. */
	private static final String MATCHING_ID = "50|doi_________b24ab3e127aa67e2a1017292988d571f";

	/**
	 * Creates the temporary working directory and the local Spark session shared by the tests.
	 */
	@BeforeAll
	public static void beforeAll() throws IOException {
		workingDir = Files.createTempDirectory(BipTest.class.getSimpleName());
		fs = FileSystem.getLocal(new Configuration());
		log.info("using work dir {}", workingDir);

		SparkConf conf = new SparkConf();
		conf.setAppName(BipTest.class.getSimpleName());
		conf.setMaster("local[*]");
		conf.set("spark.driver.host", "localhost");
		conf.set("hive.metastore.local", "true");
		conf.set("spark.ui.enabled", "false");
		conf.set("spark.sql.warehouse.dir", workingDir.toString());
		conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());

		spark = SparkSession
			.builder()
			.appName(BipTest.class.getSimpleName())
			.config(conf)
			.getOrCreate();
	}

	/**
	 * Stops the Spark session and removes the working directory; the directory is removed in a finally
	 * block so that neither clean-up step is skipped if the other fails.
	 */
	@AfterAll
	public static void afterAll() throws IOException {
		try {
			spark.stop();
		} finally {
			FileUtils.deleteDirectory(workingDir.toFile());
		}
	}

	@Test
	void prepareBipTest() throws Exception {
		final String sourcePath = getClass()
			.getResource("/eu/dnetlib/dhp/bypassactionset/bip/bip.json")
			.getPath();

		PrepareBipFinder
			.main(
				new String[] {
					"--isSparkSessionManaged", Boolean.FALSE.toString(),
					"--inputPath", sourcePath,
					"--outputPath", workingDir.toString() + "/remapDoi"
				});

		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

		JavaRDD<BipScore> tmp = sc
			.textFile(workingDir.toString() + "/remapDoi")
			.map(item -> OBJECT_MAPPER.readValue(item, BipScore.class));

		Assertions.assertEquals(86, tmp.count());

		String doi1 = ID_PREFIX +
			IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.0000/096020199389707"));

		// collect the record once instead of re-running the same Spark job for every assertion
		List<BipScore> matching = tmp.filter(r -> r.getId().equals(doi1)).collect();
		Assertions.assertEquals(1, matching.size());
		BipScore score = matching.get(0);
		Assertions.assertEquals(3, score.getScoreList().size());

		Assertions.assertEquals("6.34596412687e-09", scoreValue(score, "influence"));
		Assertions.assertEquals("0.641151896994", scoreValue(score, "popularity_alt"));
		Assertions.assertEquals("2.33375102921e-09", scoreValue(score, "popularity"));
	}

	@Test
	void updateResult() throws Exception {
		final String bipScorePath = getClass()
			.getResource("/eu/dnetlib/dhp/bypassactionset/bip/preparedbip.json")
			.getPath();

		final String inputPath = getClass()
			.getResource("/eu/dnetlib/dhp/bypassactionset/bip/publicationnomatch.json")
			.getPath();

		// each test writes to its own directory so that the tests do not depend on execution order
		final String outputPath = workingDir.toString() + "/publicationnomatch";

		SparkUpdateBip
			.main(
				new String[] {
					"--isSparkSessionManaged", Boolean.FALSE.toString(),
					"--bipScorePath", bipScorePath,
					"--inputPath", inputPath,
					"--outputPath", outputPath,
					"--resultTableName", "eu.dnetlib.dhp.schema.oaf.Publication"
				});

		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

		JavaRDD<Publication> tmp = sc
			.textFile(outputPath + "/bip")
			.map(item -> OBJECT_MAPPER.readValue(item, Publication.class));

		Assertions.assertEquals(6, tmp.count());
		Assertions.assertEquals(0, tmp.filter(r -> r.getMeasures() != null).count());
		tmp.foreach(r -> Assertions.assertEquals("publication", r.getResulttype().getClassid()));
	}

	@Test
	void updateResultMatchCheckMeasures() throws Exception {
		final String bipScorePath = getClass()
			.getResource("/eu/dnetlib/dhp/bypassactionset/bip/preparedbip.json")
			.getPath();

		final String inputPath = getClass()
			.getResource("/eu/dnetlib/dhp/bypassactionset/bip/publicationmatch.json")
			.getPath();

		// each test writes to its own directory so that the tests do not depend on execution order
		final String outputPath = workingDir.toString() + "/publicationmatch";

		SparkUpdateBip
			.main(
				new String[] {
					"--isSparkSessionManaged", Boolean.FALSE.toString(),
					"--bipScorePath", bipScorePath,
					"--inputPath", inputPath,
					"--outputPath", outputPath,
					"--resultTableName", "eu.dnetlib.dhp.schema.oaf.Publication"
				});

		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

		JavaRDD<Publication> tmp = sc
			.textFile(outputPath + "/bip")
			.map(item -> OBJECT_MAPPER.readValue(item, Publication.class));

		Assertions.assertEquals(6, tmp.count());
		Assertions.assertEquals(1, tmp.filter(r -> r.getMeasures() != null).count());

		List<Publication> matching = tmp.filter(r -> MATCHING_ID.equals(r.getId())).collect();
		Assertions.assertEquals(1, matching.size());

		Publication enriched = matching.get(0);
		Assertions.assertNotNull(enriched.getMeasures());
		Assertions.assertEquals(3, enriched.getMeasures().size());

		Assertions.assertEquals("5.91019644836e-09", measureValue(enriched, "influence"));
		Assertions.assertEquals("0.0", measureValue(enriched, "popularity_alt"));
		Assertions.assertEquals("9.88840807598e-09", measureValue(enriched, "popularity"));
	}

	/** Returns the value of the first unit of the score with the given id, failing if it is missing. */
	private static String scoreValue(BipScore score, String scoreId) {
		return score
			.getScoreList()
			.stream()
			.filter(sl -> scoreId.equals(sl.getId()))
			.findFirst()
			.orElseThrow(() -> new AssertionError("score not found: " + scoreId))
			.getUnit()
			.get(0)
			.getValue();
	}

	/** Returns the value of the first unit of the measure with the given id, failing if it is missing. */
	private static String measureValue(Publication publication, String measureId) {
		List<Measure> measures = publication.getMeasures();
		return measures
			.stream()
			.filter(m -> measureId.equals(m.getId()))
			.findFirst()
			.orElseThrow(() -> new AssertionError("measure not found: " + measureId))
			.getUnit()
			.get(0)
			.getValue();
	}
}

View File

@ -1,253 +0,0 @@
package eu.dnetlib.dhp.bypassactionset;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import eu.dnetlib.dhp.PropagationConstant;
import eu.dnetlib.dhp.bypassactionset.fos.SparkUpdateFOS;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.neethi.Assertion;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.bypassactionset.fos.GetFOSData;
import eu.dnetlib.dhp.bypassactionset.fos.PrepareFOSSparkJob;
import eu.dnetlib.dhp.bypassactionset.model.FOSDataModel;
import eu.dnetlib.dhp.common.collection.CollectorException;
import eu.dnetlib.dhp.countrypropagation.CountryPropagationJobTest;
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
/**
 * Tests for the FOS (Fields of Science and Technology classification) jobs of the
 * bypass-action-set module: conversion of the source CSV to JSON ({@link GetFOSData}),
 * distribution of the classification over single DOIs ({@link PrepareFOSSparkJob}) and
 * enrichment of results with FOS subjects ({@link SparkUpdateFOS}).
 *
 * <p>All tests share one local Spark session and one temporary working directory, created in
 * {@link #beforeAll()} and released in {@link #afterAll()}.
 */
public class FOSTest {

	private static final Logger log = LoggerFactory.getLogger(FOSTest.class);

	/** Temporary directory used as Spark warehouse and as output location of the jobs under test. */
	private static Path workingDir;

	private static SparkSession spark;

	private static LocalFileSystem fs;

	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

	/** Prefix of the identifiers built from a DOI in the fixtures used by these tests. */
	private static final String ID_PREFIX = "50|doi_________";

	/**
	 * Creates the temporary working directory, the local Hadoop file system and the local Spark
	 * session shared by all the tests.
	 *
	 * @throws IOException if the working directory or the local file system cannot be created
	 */
	@BeforeAll
	public static void beforeAll() throws IOException {
		workingDir = Files.createTempDirectory(FOSTest.class.getSimpleName());
		fs = FileSystem.getLocal(new Configuration());
		log.info("using work dir {}", workingDir);

		SparkConf conf = new SparkConf();
		conf.setAppName(FOSTest.class.getSimpleName());
		conf.setMaster("local[*]");
		conf.set("spark.driver.host", "localhost");
		conf.set("hive.metastore.local", "true");
		conf.set("spark.ui.enabled", "false");
		conf.set("spark.sql.warehouse.dir", workingDir.toString());
		conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());

		spark = SparkSession
			.builder()
			.appName(FOSTest.class.getSimpleName())
			.config(conf)
			.getOrCreate();
	}

	/**
	 * Stops the Spark session and removes the working directory.
	 *
	 * <p>The session is stopped first, and the directory is removed in a {@code finally} block, so
	 * that a failure in one clean-up step does not prevent the other. Both steps are guarded
	 * against {@code null} because {@link #beforeAll()} may have failed half-way.
	 *
	 * @throws IOException if the working directory cannot be deleted
	 */
	@AfterAll
	public static void afterAll() throws IOException {
		try {
			if (spark != null) {
				spark.stop();
			}
		} finally {
			if (workingDir != null) {
				FileUtils.deleteDirectory(workingDir.toFile());
			}
		}
	}

	/**
	 * Rewrites the tab-separated FOS source file as JSON lines and verifies that every produced
	 * line can be read back as a {@link FOSDataModel} and that 38 lines are produced.
	 */
	@Test
	void getFOSFileTest() throws CollectorException, IOException, ClassNotFoundException {
		final String sourcePath = getClass()
			.getResource("/eu/dnetlib/dhp/bypassactionset/fos/h2020_fos_sbs.csv")
			.getPath();
		final String outputPath = workingDir.toString() + "/fos.json";

		// The class name is derived from the imported type, so it cannot drift from the real package.
		new GetFOSData()
			.doRewrite(sourcePath, outputPath, FOSDataModel.class.getName(), '\t', fs);

		int count = 0;
		// try-with-resources: the reader (and the underlying Hadoop stream) is always closed.
		try (BufferedReader in = new BufferedReader(
			new InputStreamReader(
				fs.open(new org.apache.hadoop.fs.Path(outputPath)),
				java.nio.charset.StandardCharsets.UTF_8))) {
			String line;
			while ((line = in.readLine()) != null) {
				// Round trip through Jackson: fails if a line is not a valid FOSDataModel.
				FOSDataModel fos = OBJECT_MAPPER.readValue(line, FOSDataModel.class);
				System.out.println(OBJECT_MAPPER.writeValueAsString(fos));
				count += 1;
			}
		}

		assertEquals(38, count);
	}

	/**
	 * Runs {@link PrepareFOSSparkJob} on the JSON fixture and checks the number of produced
	 * records and the three classification levels associated to two sample DOIs.
	 */
	@Test
	void distributeDoiTest() throws Exception {
		final String sourcePath = getClass()
			.getResource("/eu/dnetlib/dhp/bypassactionset/fos/fos.json")
			.getPath();
		final String outputPath = workingDir.toString() + "/distribute";

		PrepareFOSSparkJob
			.main(
				new String[] {
					"--isSparkSessionManaged", Boolean.FALSE.toString(),
					"--sourcePath", sourcePath,
					"--outputPath", outputPath
				});

		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

		JavaRDD<FOSDataModel> tmp = sc
			.textFile(outputPath)
			.map(item -> OBJECT_MAPPER.readValue(item, FOSDataModel.class));

		assertEquals(50, tmp.count());

		// Each sample DOI is looked up once; the assertions work on the collected result.
		final String doi1 = ID_PREFIX +
			IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.3390/s18072310"));
		List<FOSDataModel> forDoi1 = tmp.filter(row -> row.getDoi().equals(doi1)).collect();
		assertEquals(1, forDoi1.size());
		assertEquals("engineering and technology", forDoi1.get(0).getLevel1());
		assertEquals("nano-technology", forDoi1.get(0).getLevel2());
		assertEquals("nanoscience & nanotechnology", forDoi1.get(0).getLevel3());

		final String doi = ID_PREFIX +
			IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1111/1365-2656.12831"));
		List<FOSDataModel> forDoi = tmp.filter(row -> row.getDoi().equals(doi)).collect();
		assertEquals(1, forDoi.size());
		assertEquals("social sciences", forDoi.get(0).getLevel1());
		assertEquals("psychology and cognitive sciences", forDoi.get(0).getLevel2());
		assertEquals("NULL", forDoi.get(0).getLevel3());
	}

	/**
	 * Runs {@link SparkUpdateFOS} on publications that do not match any prepared FOS record and
	 * verifies that all six publications are written and none of them carries a FOS subject.
	 */
	@Test
	void updateResult() throws Exception {
		final String fosPath = getClass()
			.getResource("/eu/dnetlib/dhp/bypassactionset/fos/fos_prepared.json")
			.getPath();
		// NOTE(review): this fixture is read from the bip/ folder, not fos/ - confirm it is intended.
		final String inputPath = getClass()
			.getResource("/eu/dnetlib/dhp/bypassactionset/bip/publicationnomatch.json")
			.getPath();
		final String outputPath = workingDir.toString() + "/publication";

		SparkUpdateFOS
			.main(
				new String[] {
					"--isSparkSessionManaged", Boolean.FALSE.toString(),
					"--fosPath", fosPath,
					"--inputPath", inputPath,
					"--outputPath", outputPath,
					"--resultTableName", "eu.dnetlib.dhp.schema.oaf.Publication"
				});

		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

		JavaRDD<Publication> tmp = sc
			.textFile(outputPath)
			.map(item -> OBJECT_MAPPER.readValue(item, Publication.class));

		Assertions.assertEquals(6, tmp.count());

		// Counted on the driver, so a failure is reported as a plain assertion error instead of
		// an exception raised inside a Spark task.
		long fosSubjects = tmp
			.filter(r -> r.getSubject() != null)
			.flatMap(p -> p.getSubject().iterator())
			.filter(sbj -> "FOS".equals(sbj.getQualifier().getClassid()))
			.count();
		Assertions.assertEquals(0, fosSubjects);
	}

	/**
	 * Runs {@link SparkUpdateFOS} on publications among which one matches a prepared FOS record
	 * and verifies the subjects added to that publication together with their provenance.
	 */
	@Test
	void updateResultMatch() throws Exception {
		final String fosPath = getClass()
			.getResource("/eu/dnetlib/dhp/bypassactionset/fos/fos_prepared.json")
			.getPath();
		final String inputPath = getClass()
			.getResource("/eu/dnetlib/dhp/bypassactionset/fos/publicationmatch.json")
			.getPath();
		final String outputPath = workingDir.toString() + "/publication";

		SparkUpdateFOS
			.main(
				new String[] {
					"--isSparkSessionManaged", Boolean.FALSE.toString(),
					"--fosPath", fosPath,
					"--inputPath", inputPath,
					"--outputPath", outputPath,
					"--resultTableName", "eu.dnetlib.dhp.schema.oaf.Publication"
				});

		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

		JavaRDD<Publication> tmp = sc
			.textFile(outputPath)
			.map(item -> OBJECT_MAPPER.readValue(item, Publication.class));

		Assertions.assertEquals(6, tmp.count());

		Assertions
			.assertEquals(
				3,
				tmp
					.filter(r -> r.getSubject() != null)
					.flatMap(p -> p.getSubject().iterator())
					.filter(sbj -> sbj.getQualifier().getClassid().equals("FOS"))
					.count());

		List<StructuredProperty> sbjs = tmp
			.filter(r -> r.getId().equals("50|doi_________b24ab3e127aa67e2a1017292988d571f"))
			.map(p -> p.getSubject())
			.collect()
			.get(0);
		Assertions.assertEquals(12, sbjs.size());

		// Materialised as a List: a java.util.stream.Stream can be consumed only once, while the
		// FOS subjects are inspected several times below.
		List<StructuredProperty> fosSubjs = sbjs
			.stream()
			.filter(sbj -> sbj.getQualifier().getClassid().equals("FOS"))
			.collect(Collectors.toList());

		List<String> fosValues = fosSubjs
			.stream()
			.map(StructuredProperty::getValue)
			.collect(Collectors.toList());
		Assertions.assertTrue(fosValues.contains("engineering and technology"));
		Assertions.assertTrue(fosValues.contains("nano-technology"));
		Assertions.assertTrue(fosValues.contains("nanoscience & nanotechnology"));

		// Every FOS subject must carry the same provenance information.
		fosSubjs.forEach(sbj -> {
			Assertions.assertEquals("update", sbj.getDataInfo().getInferenceprovenance());
			Assertions.assertEquals("subject:fos", sbj.getDataInfo().getProvenanceaction().getClassid());
			Assertions
				.assertEquals("Inferred by OpenAIRE", sbj.getDataInfo().getProvenanceaction().getClassname());
			Assertions.assertEquals("", sbj.getDataInfo().getTrust());
			Assertions.assertEquals(false, sbj.getDataInfo().getDeletedbyinference());
			Assertions.assertEquals(false, sbj.getDataInfo().getInvisible());
			Assertions.assertEquals(true, sbj.getDataInfo().getInferred());
		});
	}
}

View File

@ -1,86 +0,0 @@
{"id":"50|doi_________63848be3afd635374828253a6f974f11","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"9.88840807598e-09"}]}]}
{"id":"50|doi_________5da9060b89165e3f61a0806dcf2c2696","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"6.26204125721e-09"}]}]}
{"id":"50|doi_________9ea0266b8ddd471eddb20635b89273bb","scoreList":[{"id":"influence","unit":[{"key":"score","value":"7.5597134689e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"4.903880192"}]},{"id":"popularity","unit":[{"key":"score","value":"1.17977512835e-08"}]}]}
{"id":"50|doi_________ab797495de07d4f4a25f08b84fca6021","scoreList":[{"id":"influence","unit":[{"key":"score","value":"6.34596412687e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.641151896994"}]},{"id":"popularity","unit":[{"key":"score","value":"2.33375102921e-09"}]}]}
{"id":"50|doi_________df5ade937e177dcfb82bc5ec3139fefe","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"5.39172290649e-09"}]}]}
{"id":"50|doi_________357976ea7d4e744fe21966607334952c","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"5.39172290649e-09"}]}]}
{"id":"50|doi_________872982548f741b89eb5d30f509316dde","scoreList":[{"id":"influence","unit":[{"key":"score","value":"6.32078461509e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"1.6"}]},{"id":"popularity","unit":[{"key":"score","value":"8.3168486939e-09"}]}]}
{"id":"50|doi_________19c6da2771befb83f9f2715fc84f9edf","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.96492048955e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"1.0"}]},{"id":"popularity","unit":[{"key":"score","value":"1.12641925838e-08"}]}]}
{"id":"50|doi_________46e18e9e477d6fc71e002eae47cfc390","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"6.26204125721e-09"}]}]}
{"id":"50|doi_________5cc344a6da53a8f2bac11faf7607e0b0","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"3.76260934675e-10"}]}]}
{"id":"50|doi_________fdfadf5cefdb8b63f90d5a3475a95959","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"5.39172290649e-09"}]}]}
{"id":"50|doi_________51fc7a9c7b9f1cf6705c7afb1de58967","scoreList":[{"id":"influence","unit":[{"key":"score","value":"6.93311506443e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.002176782336"}]},{"id":"popularity","unit":[{"key":"score","value":"1.7668105708e-09"}]}]}
{"id":"50|doi_________bbf7a94696ab67e4c29429522c31c0ef","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"5.39172290649e-09"}]}]}
{"id":"50|doi_________232498b3aed64dd0f0db87b09b61d5cd","scoreList":[{"id":"influence","unit":[{"key":"score","value":"6.26777280882e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.406656"}]},{"id":"popularity","unit":[{"key":"score","value":"3.39745193285e-09"}]}]}
{"id":"50|doi_________03749bed57efa24ab607527cf1eb94c5","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"6.26204125721e-09"}]}]}
{"id":"50|doi_________65074998446928c1c18e25313dcf974b","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"8.48190886761e-09"}]}]}
{"id":"50|doi_________2b62feaed6ad14760096c2a59f82836e","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"6.26204125721e-09"}]}]}
{"id":"50|doi_________d2d133a6fdfd44f34f96e225b5573447","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"4.01810569717e-09"}]}]}
{"id":"50|doi_________3dd3cccbfcad3f206d1239a580af011f","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"6.26204125721e-09"}]}]}
{"id":"50|doi_________84852a20e9bbf1daf89803b5fcfc9c34","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"3.47956715615e-09"}]}]}
{"id":"50|doi_________84c5824b3a10a28a8bc26fed6223a08a","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"6.26204125721e-09"}]}]}
{"id":"50|doi_________8658fda7574016daa0d71c88eb6bab5f","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"3.47956715615e-09"}]}]}
{"id":"50|doi_________ab52c136b88151f0c28f6ca4a5ef2a71","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"6.26204125721e-09"}]}]}
{"id":"50|doi_________e8a2912fe3d70b13124e436294411378","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"6.26204125721e-09"}]}]}
{"id":"50|doi_________cf3470c86f338ccf232f2869d0fa6ea2","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"4.01810569717e-09"}]}]}
{"id":"50|doi_________16437c0064576207ba1438bf07fb9e21","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"6.26204125721e-09"}]}]}
{"id":"50|doi_________f0ae1dc0f1dbb7fceaccc02d6e667f24","scoreList":[{"id":"influence","unit":[{"key":"score","value":"6.40470414877e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.6"}]},{"id":"popularity","unit":[{"key":"score","value":"7.89465099068e-09"}]}]}
{"id":"50|doi_________93d03d99bbc0f542fa2679e74882a096","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"3.47956715615e-09"}]}]}
{"id":"50|doi_________fcfce4bc985f22baf2dc1dc733a862e7","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"6.26204125721e-09"}]}]}
{"id":"50|doi_________3b7fd390517f45b0d37e0ec0adbb98b1","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"3.47956715615e-09"}]}]}
{"id":"50|doi_________31306ba727c26ba04c0d4bbd0899d433","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"6.26204125721e-09"}]}]}
{"id":"50|doi_________3e6151aa77865a1fdae56392f68b81f1","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"4.65008652949e-09"}]}]}
{"id":"50|doi_________1cd6e2c4cf189819fa8a011629b5744c","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"7.28336930301e-09"}]}]}
{"id":"50|doi_________45c855d81df747ff3c2033f6c8cf7bca","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"6.26204125721e-09"}]}]}
{"id":"50|doi_________24ac4c0c4b143e37c2551723e9a55f36","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"4.01810569717e-09"}]}]}
{"id":"50|doi_________5f9e3f1a076c2f587165f8ccd68286e5","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"6.26204125721e-09"}]}]}
{"id":"50|doi_________b637731c913efe3c1f283183ef4a59cc","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"6.26204125721e-09"}]}]}
{"id":"50|doi_________490c0c23e4ed269f807e72e71cad09ae","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"4.01810569717e-09"}]}]}
{"id":"50|doi_________15584e5a3a5bbdc487c85ab18d8a7c22","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"7.28336930301e-09"}]}]}
{"id":"50|doi_________a7fd268447553e7e0fe06c19db28ea85","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"4.01810569717e-09"}]}]}
{"id":"50|doi_________cf1b8db4480aa0e281a4cc122d3b2416","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"7.28336930301e-09"}]}]}
{"id":"50|doi_________8880fd8fa9cafbde2431583dfbab1e1f","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"4.01810569717e-09"}]}]}
{"id":"50|doi_________22f8b4f61c49de05dcc0d94bf5e147d8","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"7.28336930301e-09"}]}]}
{"id":"50|doi_________96e2b076fab4931ec2b1b9f43c9c31f9","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"7.28336930301e-09"}]}]}
{"id":"50|doi_________1ac665f912877f93f9084dc38bc8174b","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"4.01810569717e-09"}]}]}
{"id":"50|doi_________394fd70c30b21b1336b5a0405496fbe0","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"7.28336930301e-09"}]}]}
{"id":"50|doi_________73caf44494091b089f8be65427fb4341","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"4.01810569717e-09"}]}]}
{"id":"50|doi_________9b996963325dfae10dba5c3043938fc1","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"7.28336930301e-09"}]}]}
{"id":"50|doi_________1f03d5cab86f4963fbd3e0f1284cb9b8","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"7.28336930301e-09"}]}]}
{"id":"50|doi_________87a59f3918f9ac2c394b2509173fb3e1","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"4.01810569717e-09"}]}]}
{"id":"50|doi_________9a8465f5343a1c845b6dd91496395c7e","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"7.28336930301e-09"}]}]}
{"id":"50|doi_________b03c0a657c669d24f45844307b9ad6bd","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"4.01810569717e-09"}]}]}
{"id":"50|doi_________04aeabd709d8a486bde733ffc5ef53ae","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"4.01810569717e-09"}]}]}
{"id":"50|doi_________6c21d2d2133477141ad74226f8fd75b0","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"7.28336930301e-09"}]}]}
{"id":"50|doi_________b746ffb04cd9f815d5880c827c55040e","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"8.48190886761e-09"}]}]}
{"id":"50|doi_________0457b2e62a69f0e104db7cc575f2241a","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"4.65008652949e-09"}]}]}
{"id":"50|doi_________2fa691f5394020f7cb2df4f272a062e7","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"8.48190886761e-09"}]}]}
{"id":"50|doi_________aa2a77dc605567f5f2305b5f38a1a4b7","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"4.65008652949e-09"}]}]}
{"id":"50|doi_________cdb6f10588d85495208cbc6bf4a75000","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"8.48190886761e-09"}]}]}
{"id":"50|doi_________6630766c2ad4783ebd01a9d7e9607eb1","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"4.65008652949e-09"}]}]}
{"id":"50|doi_________389315d5f74765688d34b8ee86c2514f","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"8.48190886761e-09"}]}]}
{"id":"50|doi_________beef7316dd6665798120698451446d74","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"4.65008652949e-09"}]}]}
{"id":"50|doi_________525a46f117c946b8ae1caea542e24d0c","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"8.48190886761e-09"}]}]}
{"id":"50|doi_________fe3d2250ebaa65dae09aa372945c917f","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"4.65008652949e-09"}]}]}
{"id":"50|doi_________72d2ec1744cda7d49cf35707ff3e11c1","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"8.48190886761e-09"}]}]}
{"id":"50|doi_________d7e0c5fd1f00d8a90879f168d73b9e0b","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"4.65008652949e-09"}]}]}
{"id":"50|doi_________6e0e19ee1f9c1dbadf202ccd9919e6a7","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"8.48190886761e-09"}]}]}
{"id":"50|doi_________6b306286f9b4d207f81bc9164713243c","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"4.65008652949e-09"}]}]}
{"id":"50|doi_________acf5c0c1a125cd78fd9193589edc9152","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"8.48190886761e-09"}]}]}
{"id":"50|doi_________66f645f86cdd9592953a1106e0dbf191","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"4.65008652949e-09"}]}]}
{"id":"50|doi_________77ff842449308dd907ec46ebf9ccd539","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"8.48190886761e-09"}]}]}
{"id":"50|doi_________a3d8fc73d1ac1308c117a61b4eb14ab5","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"8.48190886761e-09"}]}]}
{"id":"50|doi_________9ff57cb17abfcd338c2ca768c85e452a","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"5.39172290649e-09"}]}]}
{"id":"50|doi_________f903c5c3305fc7b1598a0ed989bb4f83","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"4.65008652949e-09"}]}]}
{"id":"50|doi_________2260d2cdf6fbcee10d61d5e0f1428ebd","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"8.48190886761e-09"}]}]}
{"id":"50|doi_________549c5c8f66c36cab609b7221eae37040","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"8.48190886761e-09"}]}]}
{"id":"50|doi_________388a95b886946d771a7f38e86d58f65e","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"8.48190886761e-09"}]}]}
{"id":"50|doi_________4141ef3f730fc811930a66c088486c34","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"9.88840807598e-09"}]}]}
{"id":"50|doi_________e3b32800bba3187fbf7f470df08755ea","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"8.48190886761e-09"}]}]}
{"id":"50|doi_________d8d02aa2c2347ecc628724265c5cae27","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"9.88840807598e-09"}]}]}
{"id":"50|doi_________08f1db95b06e62e34948f2b47ac449af","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"9.88840807598e-09"}]}]}
{"id":"50|doi_________cbbab3f7306b0573664b94615f5c04e2","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"9.88840807598e-09"}]}]}
{"id":"50|doi_________b24ab3e127aa67e2a1017292988d571f","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"9.88840807598e-09"}]}]}
{"id":"50|doi_________eb9a4f92bd6934727e2722d22340884a","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"9.88840807598e-09"}]}]}
{"id":"50|doi_________727484ef97b31ecb0806cc959869aa97","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"9.88840807598e-09"}]}]}
{"id":"50|doi_________224c865ef04d10c117913fa08eeb1be4","scoreList":[{"id":"influence","unit":[{"key":"score","value":"5.91019644836e-09"}]},{"id":"popularity_alt","unit":[{"key":"score","value":"0.0"}]},{"id":"popularity","unit":[{"key":"score","value":"9.88840807598e-09"}]}]}

View File

@ -1,50 +0,0 @@
{"doi":"50|doi_________b24ab3e127aa67e2a1017292988d571f","level1":"engineering and technology","level2":"nano-technology","level3":"nanoscience & nanotechnology"}
{"doi":"50|doi_________f648499be6ba8e83226834167f22a7cb","level1":"social sciences","level2":"psychology and cognitive sciences","level3":"NULL"}
{"doi":"50|doi_________439b0885db30c66ecf62a2a6a4451116","level1":"social sciences","level2":"psychology and cognitive sciences","level3":"NULL"}
{"doi":"50|doi_________c22f065c9ab34fee87a3952fb79f5ee6","level1":"natural sciences","level2":"NULL","level3":"NULL"}
{"doi":"50|doi_________9ea3d8140aaa8add5e929be92f0dab13","level1":"natural sciences","level2":"NULL","level3":"NULL"}
{"doi":"50|doi_________12f446645a131ab55fc5dabf6692da31","level1":"medical and health sciences","level2":"clinical medicine","level3":"oncology & carcinogenesis"}
{"doi":"50|doi_________550a40c9b57fcc974c127ad69dd60df2","level1":"natural sciences","level2":"earth and related environmental sciences","level3":"environmental sciences"}
{"doi":"50|doi_________6b95eee8d0eff26b7cca8e960968ad62","level1":"medical and health sciences","level2":"clinical medicine","level3":"oncology & carcinogenesis"}
{"doi":"50|doi_________5ff24d570b3984ddcf04f58b771ad635","level1":"natural sciences","level2":"physical sciences","level3":"NULL"}
{"doi":"50|doi_________b909cd0953065bc611bac1d89e96bf6c","level1":"engineering and technology","level2":"other engineering and technologies","level3":"building & construction"}
{"doi":"50|doi_________d56d9dc21f317b3e009d5b6c8ea87212","level1":"natural sciences","level2":"physical sciences","level3":"nuclear & particles physics"}
{"doi":"50|doi_________3ea4d5309ce7934ed281342dd1f70867","level1":"natural sciences","level2":"NULL","level3":"NULL"}
{"doi":"50|doi_________f23029f6f070b4b2e1c852ce7f9f5f32","level1":"medical and health sciences","level2":"other medical science","level3":"health policy & services"}
{"doi":"50|doi_________2ea2433fb314647e72cab828dd529f00","level1":"natural sciences","level2":"biological sciences","level3":"plant biology & botany"}
{"doi":"50|doi_________ffe10c217b3a96f7584cf09fdad579e1","level1":"engineering and technology","level2":"NULL","level3":"NULL"}
{"doi":"50|doi_________45e682be5a57e2fabea2c02ba0752f1a","level1":"natural sciences","level2":"NULL","level3":"NULL"}
{"doi":"50|doi_________312f5db945545c66e6f18c62faf6290c","level1":"medical and health sciences","level2":"health sciences","level3":"NULL"}
{"doi":"50|doi_________d53bd3a4e48921415e554a4edc91d6db","level1":"natural sciences","level2":"physical sciences","level3":"NULL"}
{"doi":"50|doi_________2819592482582ff33363069d116abd64","level1":"natural sciences","level2":"physical sciences","level3":"nuclear & particles physics"}
{"doi":"50|doi_________e41a47550fed93904e443123b752bc2b","level1":"engineering and technology","level2":"electrical engineering, electronic engineering, information engineering","level3":"electrical & electronic engineering"}
{"doi":"50|doi_________43e8bf6e95cd3043f510771cf0b0c984","level1":"engineering and technology","level2":"mechanical engineering","level3":"mechanical engineering & transports"}
{"doi":"50|doi_________000c1dc14e99b89fc52976533338fe4c","level1":"engineering and technology","level2":"mechanical engineering","level3":"mechanical engineering & transports"}
{"doi":"50|doi_________5cf55e49aebd633f1326d28451d1f6e5","level1":"engineering and technology","level2":"electrical engineering, electronic engineering, information engineering","level3":"computer hardware & architecture"}
{"doi":"50|doi_________5b79bd7bd9f87361b4a4abc3cbb2df75","level1":"natural sciences","level2":"mathematics","level3":"numerical & computational mathematics"}
{"doi":"50|doi_________51a7b4738d332570beb98be13e95d369","level1":"natural sciences","level2":"chemical sciences","level3":"NULL"}
{"doi":"50|doi_________6a9271220296585204ff81d651215d63","level1":"medical and health sciences","level2":"basic medicine","level3":"NULL"}
{"doi":"50|doi_________a2b8554df106bb29a0035e2ea56046a2","level1":"medical and health sciences","level2":"health sciences","level3":"biochemistry & molecular biology"}
{"doi":"50|doi_________fa2b92d5140f67a6c69c970a9e8d2cf0","level1":"natural sciences","level2":"chemical sciences","level3":"physical chemistry"}
{"doi":"50|doi_________b5f6c78a31abbb806e1e375a73231dd1","level1":"natural sciences","level2":"chemical sciences","level3":"physical chemistry"}
{"doi":"50|doi_________358777f48b491554cdee63c4dcabe6aa","level1":"natural sciences","level2":"chemical sciences","level3":"physical chemistry"}
{"doi":"50|doi_________809f282159a9f46a41f92b6fc8d1095f","level1":"natural sciences","level2":"biological sciences","level3":"marine biology & hydrobiology"}
{"doi":"50|doi_________94f1b0b1509700d4370cbe7571000bfc","level1":"engineering and technology","level2":"industrial biotechnology","level3":"industrial engineering & automation"}
{"doi":"50|doi_________33390d9d5163da63f73acd173ef704e0","level1":"medical and health sciences","level2":"basic medicine","level3":"NULL"}
{"doi":"50|doi_________3149f0a909641720480f62faf1f886b9","level1":"medical and health sciences","level2":"basic medicine","level3":"NULL"}
{"doi":"50|doi_________1e29deb6473bdb2c84e5966fc4c557bb","level1":"medical and health sciences","level2":"basic medicine","level3":"NULL"}
{"doi":"50|doi_________effab60be43d653f2cccf1cf6de461df","level1":"engineering and technology","level2":"electrical engineering, electronic engineering, information engineering","level3":"electrical & electronic engineering"}
{"doi":"50|doi_________14a6ce8c28da6e82412720f1330e826d","level1":"natural sciences","level2":"biological sciences","level3":"NULL"}
{"doi":"50|doi_________f59abf7f96244398a465b6e9a7375311","level1":"natural sciences","level2":"earth and related environmental sciences","level3":"NULL"}
{"doi":"50|doi_________b1b7b5c7251fa1901583ec4840e2d3ec","level1":"medical and health sciences","level2":"basic medicine","level3":"biochemistry & molecular biology"}
{"doi":"50|doi_________e024d1b738df3b24bc58fa0228542571","level1":"natural sciences","level2":"physical sciences","level3":"nuclear & particles physics"}
{"doi":"50|doi_________0e03d3592dca1baedfc74f1fbe0b9f22","level1":"natural sciences","level2":"NULL","level3":"NULL"}
{"doi":"50|doi_________0e1777e31f32984e5a261205be28da69","level1":"engineering and technology","level2":"electrical engineering, electronic engineering, information engineering","level3":"networking & telecommunications"}
{"doi":"50|doi_________7b464a5eb02a959aeedc7b051fe0f89f","level1":"engineering and technology","level2":"electrical engineering, electronic engineering, information engineering","level3":"networking & telecommunications"}
{"doi":"50|doi_________32fed3ec7c9ac157b73e050ff1c158d3","level1":"engineering and technology","level2":"electrical engineering, electronic engineering, information engineering","level3":"networking & telecommunications"}
{"doi":"50|doi_________8302f548fe724bc58e53e2bd329cb3c3","level1":"medical and health sciences","level2":"health sciences","level3":"genetics & heredity"}
{"doi":"50|doi_________44fbef7e57e4123ef44f10bb073f9ef8","level1":"medical and health sciences","level2":"health sciences","level3":"genetics & heredity"}
{"doi":"50|doi_________c8e7d24b1649024e85bcf9a4c520a65b","level1":"medical and health sciences","level2":"health sciences","level3":"genetics & heredity"}
{"doi":"50|doi_________877a3c4a72d2a6b4aa60a2da016237df","level1":"medical and health sciences","level2":"health sciences","level3":"genetics & heredity"}
{"doi":"50|doi_________7af7cda00a082fa8624493d74357d3f7","level1":"engineering and technology","level2":"other engineering and technologies","level3":"building & construction"}
{"doi":"50|doi_________20201cc71fd003dae0d58016dd4ef4af","level1":"agricultural and veterinary sciences","level2":"agriculture, forestry, and fisheries","level3":"agronomy & agriculture"}

View File

@ -331,7 +331,6 @@ public class DumpJobTest {
Assertions
.assertEquals(
Constants.accessRightsCoarMap.get(ModelConstants.ACCESS_RIGHT_OPEN), gr.getBestaccessright().getCode());
Assertions.assertEquals(null, gr.getBestaccessright().getOpenAccessRoute());
Assertions.assertEquals("One Ecosystem", gr.getContainer().getName());
Assertions.assertEquals("2367-8194", gr.getContainer().getIssnOnline());