1
0
Fork 0

Merge pull request '[UnresolvedEntities] changing in the creation of the unresolved entities' (#346) from fos into beta

Reviewed-on: D-Net/dnet-hadoop#346
This commit is contained in:
Claudio Atzori 2023-10-10 15:10:21 +02:00
commit ecea58a41c
12 changed files with 296 additions and 401 deletions

View File

@ -40,6 +40,7 @@ public class Constants {
public static final String SDG_CLASS_NAME = "Sustainable Development Goals";
public static final String NULL = "NULL";
public static final String NA = "N/A";
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
@ -61,10 +62,16 @@ public class Constants {
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
}
public static Subject getSubject(String sbj, String classid, String classname,
String diqualifierclassid) {
if (sbj == null || sbj.equals(NULL))
public static Subject getSubject(String sbj, String classid, String classname, String diqualifierclassid,
Boolean split) {
if (sbj == null || sbj.equals(NULL) || sbj.startsWith(NA))
return null;
String trust = "";
String subject = sbj;
if (split) {
sbj = subject.split("@@")[0];
trust = subject.split("@@")[1];
}
Subject s = new Subject();
s.setValue(sbj);
s
@ -89,9 +96,14 @@ public class Constants {
UPDATE_CLASS_NAME,
ModelConstants.DNET_PROVENANCE_ACTIONS,
ModelConstants.DNET_PROVENANCE_ACTIONS),
""));
trust));
return s;
}
public static Subject getSubject(String sbj, String classid, String classname,
String diqualifierclassid) {
return getSubject(sbj, classid, classname, diqualifierclassid, false);
}

View File

@ -75,9 +75,12 @@ public class GetFOSSparkJob implements Serializable {
fosData.map((MapFunction<Row, FOSDataModel>) r -> {
FOSDataModel fosDataModel = new FOSDataModel();
fosDataModel.setDoi(r.getString(0).toLowerCase());
fosDataModel.setLevel1(r.getString(1));
fosDataModel.setLevel2(r.getString(2));
fosDataModel.setLevel3(r.getString(3));
fosDataModel.setLevel1(r.getString(2));
fosDataModel.setLevel2(r.getString(3));
fosDataModel.setLevel3(r.getString(4));
fosDataModel.setLevel4(r.getString(5));
fosDataModel.setScoreL3(String.valueOf(r.getDouble(6)));
fosDataModel.setScoreL4(String.valueOf(r.getDouble(7)));
return fosDataModel;
}, Encoders.bean(FOSDataModel.class))
.write()

View File

@ -1,178 +0,0 @@
package eu.dnetlib.dhp.actionmanager.createunresolvedentities;
import static eu.dnetlib.dhp.actionmanager.Constants.*;
import static eu.dnetlib.dhp.actionmanager.Constants.UPDATE_CLASS_NAME;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.Arrays;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.actionmanager.bipmodel.BipScore;
import eu.dnetlib.dhp.actionmanager.bipmodel.score.deserializers.BipResultModel;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Instance;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Measure;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import eu.dnetlib.dhp.utils.DHPUtils;
public class PrepareBipFinder implements Serializable {
private static final Logger log = LoggerFactory.getLogger(PrepareBipFinder.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
PrepareBipFinder.class
.getResourceAsStream(
"/eu/dnetlib/dhp/actionmanager/createunresolvedentities/prepare_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String sourcePath = parser.get("sourcePath");
log.info("sourcePath {}: ", sourcePath);
final String outputPath = parser.get("outputPath");
log.info("outputPath {}: ", outputPath);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
HdfsSupport.remove(outputPath, spark.sparkContext().hadoopConfiguration());
prepareResults(spark, sourcePath, outputPath);
});
}
private static void prepareResults(SparkSession spark, String inputPath, String outputPath) {
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<BipResultModel> bipDeserializeJavaRDD = sc
.textFile(inputPath)
.map(item -> OBJECT_MAPPER.readValue(item, BipResultModel.class));
spark
.createDataset(bipDeserializeJavaRDD.flatMap(entry -> entry.keySet().stream().map(key -> {
BipScore bs = new BipScore();
bs.setId(key);
bs.setScoreList(entry.get(key));
return bs;
}).collect(Collectors.toList()).iterator()).rdd(), Encoders.bean(BipScore.class))
.map((MapFunction<BipScore, Result>) v -> {
Result r = new Result();
final String cleanedPid = CleaningFunctions.normalizePidValue(DOI, v.getId());
r.setId(DHPUtils.generateUnresolvedIdentifier(v.getId(), DOI));
Instance inst = new Instance();
inst.setMeasures(getMeasure(v));
inst
.setPid(
Arrays
.asList(
OafMapperUtils
.structuredProperty(
cleanedPid,
OafMapperUtils
.qualifier(
DOI, DOI_CLASSNAME,
ModelConstants.DNET_PID_TYPES,
ModelConstants.DNET_PID_TYPES),
null)));
r.setInstance(Arrays.asList(inst));
r
.setDataInfo(
OafMapperUtils
.dataInfo(
false, null, true,
false,
OafMapperUtils
.qualifier(
ModelConstants.PROVENANCE_ENRICH,
null,
ModelConstants.DNET_PROVENANCE_ACTIONS,
ModelConstants.DNET_PROVENANCE_ACTIONS),
null));
return r;
}, Encoders.bean(Result.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath + "/bip");
}
private static List<Measure> getMeasure(BipScore value) {
return value
.getScoreList()
.stream()
.map(score -> {
Measure m = new Measure();
m.setId(score.getId());
m
.setUnit(
score
.getUnit()
.stream()
.map(unit -> {
KeyValue kv = new KeyValue();
kv.setValue(unit.getValue());
kv.setKey(unit.getKey());
kv
.setDataInfo(
OafMapperUtils
.dataInfo(
false,
UPDATE_DATA_INFO_TYPE,
true,
false,
OafMapperUtils
.qualifier(
UPDATE_MEASURE_BIP_CLASS_ID,
UPDATE_CLASS_NAME,
ModelConstants.DNET_PROVENANCE_ACTIONS,
ModelConstants.DNET_PROVENANCE_ACTIONS),
""));
return kv;
})
.collect(Collectors.toList()));
return m;
})
.collect(Collectors.toList());
}
}

View File

@ -78,12 +78,20 @@ public class PrepareFOSSparkJob implements Serializable {
HashSet<String> level1 = new HashSet<>();
HashSet<String> level2 = new HashSet<>();
HashSet<String> level3 = new HashSet<>();
addLevels(level1, level2, level3, first);
it.forEachRemaining(v -> addLevels(level1, level2, level3, v));
HashSet<String> level4 = new HashSet<>();
addLevels(level1, level2, level3, level4, first);
it.forEachRemaining(v -> addLevels(level1, level2, level3, level4, v));
List<Subject> sbjs = new ArrayList<>();
level1.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID)));
level2.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID)));
level3.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID)));
level1
.forEach(l -> add(sbjs, getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID)));
level2
.forEach(l -> add(sbjs, getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID)));
level3
.forEach(
l -> add(sbjs, getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID, true)));
level4
.forEach(
l -> add(sbjs, getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID, true)));
r.setSubject(sbjs);
r
.setDataInfo(
@ -106,11 +114,18 @@ public class PrepareFOSSparkJob implements Serializable {
.json(outputPath + "/fos");
}
private static void add(List<Subject> sbsjs, Subject sbj) {
if (sbj != null)
sbsjs.add(sbj);
}
private static void addLevels(HashSet<String> level1, HashSet<String> level2, HashSet<String> level3,
HashSet<String> level4,
FOSDataModel first) {
level1.add(first.getLevel1());
level2.add(first.getLevel2());
level3.add(first.getLevel3());
level3.add(first.getLevel3() + "@@" + first.getScoreL3());
level4.add(first.getLevel4() + "@@" + first.getScoreL4());
}
}

View File

@ -69,9 +69,9 @@ public class SparkSaveUnresolved implements Serializable {
.mapGroups((MapGroupsFunction<String, Result, Result>) (k, it) -> {
Result ret = it.next();
it.forEachRemaining(r -> {
if (r.getInstance() != null) {
ret.setInstance(r.getInstance());
}
// if (r.getInstance() != null) {
// ret.setInstance(r.getInstance());
// }
if (r.getSubject() != null) {
if (ret.getSubject() != null)
ret.getSubject().addAll(r.getSubject());

View File

@ -11,21 +11,43 @@ public class FOSDataModel implements Serializable {
private String doi;
@CsvBindByPosition(position = 1)
// @CsvBindByName(column = "doi")
private String oaid;
@CsvBindByPosition(position = 2)
// @CsvBindByName(column = "level1")
private String level1;
@CsvBindByPosition(position = 2)
@CsvBindByPosition(position = 3)
// @CsvBindByName(column = "level2")
private String level2;
@CsvBindByPosition(position = 3)
@CsvBindByPosition(position = 4)
// @CsvBindByName(column = "level3")
private String level3;
@CsvBindByPosition(position = 5)
// @CsvBindByName(column = "level3")
private String level4;
@CsvBindByPosition(position = 6)
private String scoreL3;
@CsvBindByPosition(position = 7)
private String scoreL4;
public FOSDataModel() {
}
public FOSDataModel(String doi, String level1, String level2, String level3, String level4, String l3score,
String l4score) {
this.doi = doi;
this.level1 = level1;
this.level2 = level2;
this.level3 = level3;
this.level4 = level4;
this.scoreL3 = l3score;
this.scoreL4 = l4score;
}
public FOSDataModel(String doi, String level1, String level2, String level3) {
this.doi = doi;
this.level1 = level1;
@ -33,8 +55,41 @@ public class FOSDataModel implements Serializable {
this.level3 = level3;
}
public static FOSDataModel newInstance(String d, String level1, String level2, String level3) {
return new FOSDataModel(d, level1, level2, level3);
public static FOSDataModel newInstance(String d, String level1, String level2, String level3, String level4,
String scorel3, String scorel4) {
return new FOSDataModel(d, level1, level2, level3, level4, scorel3, scorel4);
}
public String getOaid() {
return oaid;
}
public void setOaid(String oaid) {
this.oaid = oaid;
}
public String getLevel4() {
return level4;
}
public void setLevel4(String level4) {
this.level4 = level4;
}
public String getScoreL3() {
return scoreL3;
}
public void setScoreL3(String scoreL3) {
this.scoreL3 = scoreL3;
}
public String getScoreL4() {
return scoreL4;
}
public void setScoreL4(String scoreL4) {
this.scoreL4 = scoreL4;
}
public String getDoi() {

View File

@ -5,11 +5,6 @@
<name>fosPath</name>
<description>the input path of the resources to be extended</description>
</property>
<property>
<name>bipScorePath</name>
<description>the path where to find the bipFinder scores</description>
</property>
<property>
<name>outputPath</name>
<description>the path where to store the actionset</description>
@ -77,35 +72,10 @@
<fork name="prepareInfo">
<path start="prepareBip"/>
<path start="getFOS"/>
<path start="getSDG"/>
</fork>
<action name="prepareBip">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Produces the unresolved from BIP! Finder</name>
<class>eu.dnetlib.dhp.actionmanager.createunresolvedentities.PrepareBipFinder</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${bipScorePath}</arg>
<arg>--outputPath</arg><arg>${workingDir}/prepared</arg>
</spark>
<ok to="join"/>
<error to="Kill"/>
</action>
<action name="getFOS">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
@ -125,6 +95,7 @@
</spark-opts>
<arg>--sourcePath</arg><arg>${fosPath}</arg>
<arg>--outputPath</arg><arg>${workingDir}/input/fos</arg>
<arg>--delimiter</arg><arg>${delimiter}</arg>
</spark>
<ok to="prepareFos"/>
<error to="Kill"/>

View File

@ -13,10 +13,7 @@ import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -68,6 +65,7 @@ public class GetFosTest {
}
@Test
@Disabled
void test3() throws Exception {
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/createunresolvedentities/fos/fos_sbs.tsv")
@ -96,4 +94,37 @@ public class GetFosTest {
tmp.foreach(t -> Assertions.assertTrue(t.getLevel3() != null));
}
@Test
void test4() throws Exception {
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/createunresolvedentities/fos/fos_sbs2.csv")
.getPath();
final String outputPath = workingDir.toString() + "/fos.json";
GetFOSSparkJob
.main(
new String[] {
"--isSparkSessionManaged", Boolean.FALSE.toString(),
"--sourcePath", sourcePath,
"--delimiter", ",",
"-outputPath", outputPath
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<FOSDataModel> tmp = sc
.textFile(outputPath)
.map(item -> OBJECT_MAPPER.readValue(item, FOSDataModel.class));
tmp.foreach(t -> Assertions.assertTrue(t.getDoi() != null));
tmp.foreach(t -> Assertions.assertTrue(t.getLevel1() != null));
tmp.foreach(t -> Assertions.assertTrue(t.getLevel2() != null));
tmp.foreach(t -> Assertions.assertTrue(t.getLevel3() != null));
tmp.foreach(t -> Assertions.assertTrue(t.getLevel4() != null));
tmp.foreach(t -> Assertions.assertTrue(t.getScoreL3() != null));
tmp.foreach(t -> Assertions.assertTrue(t.getScoreL4() != null));
}
}

View File

@ -67,92 +67,6 @@ public class PrepareTest {
spark.stop();
}
@Test
void bipPrepareTest() throws Exception {
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/createunresolvedentities/bip/bip.json")
.getPath();
PrepareBipFinder
.main(
new String[] {
"--isSparkSessionManaged", Boolean.FALSE.toString(),
"--sourcePath", sourcePath,
"--outputPath", workingDir.toString() + "/work"
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Result> tmp = sc
.textFile(workingDir.toString() + "/work/bip")
.map(item -> OBJECT_MAPPER.readValue(item, Result.class));
Assertions.assertEquals(86, tmp.count());
String doi1 = "unresolved::10.0000/096020199389707::doi";
Assertions.assertEquals(1, tmp.filter(r -> r.getId().equals(doi1)).count());
Assertions.assertEquals(1, tmp.filter(r -> r.getId().equals(doi1)).collect().get(0).getInstance().size());
Assertions
.assertEquals(
3, tmp.filter(r -> r.getId().equals(doi1)).collect().get(0).getInstance().get(0).getMeasures().size());
Assertions
.assertEquals(
"6.34596412687e-09", tmp
.filter(r -> r.getId().equals(doi1))
.collect()
.get(0)
.getInstance()
.get(0)
.getMeasures()
.stream()
.filter(sl -> sl.getId().equals("influence"))
.collect(Collectors.toList())
.get(0)
.getUnit()
.get(0)
.getValue());
Assertions
.assertEquals(
"0.641151896994", tmp
.filter(r -> r.getId().equals(doi1))
.collect()
.get(0)
.getInstance()
.get(0)
.getMeasures()
.stream()
.filter(sl -> sl.getId().equals("popularity_alt"))
.collect(Collectors.toList())
.get(0)
.getUnit()
.get(0)
.getValue());
Assertions
.assertEquals(
"2.33375102921e-09", tmp
.filter(r -> r.getId().equals(doi1))
.collect()
.get(0)
.getInstance()
.get(0)
.getMeasures()
.stream()
.filter(sl -> sl.getId().equals("popularity"))
.collect(Collectors.toList())
.get(0)
.getUnit()
.get(0)
.getValue());
final String doi2 = "unresolved::10.3390/s18072310::doi";
Assertions.assertEquals(1, tmp.filter(r -> r.getId().equals(doi2)).count());
Assertions.assertEquals(1, tmp.filter(r -> r.getId().equals(doi2)).collect().get(0).getInstance().size());
}
@Test
void fosPrepareTest() throws Exception {
final String sourcePath = getClass()
@ -222,6 +136,76 @@ public class PrepareTest {
}
@Test
void fosPrepareTest2() throws Exception {
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/createunresolvedentities/fos/fos_sbs_2.json")
.getPath();
PrepareFOSSparkJob
.main(
new String[] {
"--isSparkSessionManaged", Boolean.FALSE.toString(),
"--sourcePath", sourcePath,
"-outputPath", workingDir.toString() + "/work"
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Result> tmp = sc
.textFile(workingDir.toString() + "/work/fos")
.map(item -> OBJECT_MAPPER.readValue(item, Result.class));
String doi1 = "unresolved::10.1016/j.revmed.2006.07.012::doi";
assertEquals(13, tmp.count());
assertEquals(1, tmp.filter(row -> row.getId().equals(doi1)).count());
Result result = tmp
.filter(r -> r.getId().equals(doi1))
.first();
result.getSubject().forEach(s -> System.out.println(s.getValue() + " trust = " + s.getDataInfo().getTrust()));
Assertions.assertEquals(6, result.getSubject().size());
assertTrue(
result
.getSubject()
.stream()
.anyMatch(
s -> s.getValue().contains("03 medical and health sciences")
&& s.getDataInfo().getTrust().equals("")));
assertTrue(
result
.getSubject()
.stream()
.anyMatch(
s -> s.getValue().contains("0302 clinical medicine") && s.getDataInfo().getTrust().equals("")));
assertTrue(
result
.getSubject()
.stream()
.anyMatch(
s -> s
.getValue()
.contains("030204 cardiovascular system & hematology")
&& s.getDataInfo().getTrust().equals("0.5101401805877686")));
assertTrue(
result
.getSubject()
.stream()
.anyMatch(
s -> s
.getValue()
.contains("03020409 Hematology/Coagulopathies")
&& s.getDataInfo().getTrust().equals("0.0546871414174914")));
}
@Test
void sdgPrepareTest() throws Exception {
final String sourcePath = getClass()
@ -268,57 +252,4 @@ public class PrepareTest {
}
// @Test
// void test3() throws Exception {
// final String sourcePath = "/Users/miriam.baglioni/Downloads/doi_fos_results_20_12_2021.csv.gz";
//
// final String outputPath = workingDir.toString() + "/fos.json";
// GetFOSSparkJob
// .main(
// new String[] {
// "--isSparkSessionManaged", Boolean.FALSE.toString(),
// "--sourcePath", sourcePath,
//
// "-outputPath", outputPath
//
// });
//
// final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
//
// JavaRDD<FOSDataModel> tmp = sc
// .textFile(outputPath)
// .map(item -> OBJECT_MAPPER.readValue(item, FOSDataModel.class));
//
// tmp.foreach(t -> Assertions.assertTrue(t.getDoi() != null));
// tmp.foreach(t -> Assertions.assertTrue(t.getLevel1() != null));
// tmp.foreach(t -> Assertions.assertTrue(t.getLevel2() != null));
// tmp.foreach(t -> Assertions.assertTrue(t.getLevel3() != null));
//
// }
//
// @Test
// void test4() throws Exception {
// final String sourcePath = "/Users/miriam.baglioni/Downloads/doi_sdg_results_20_12_21.csv.gz";
//
// final String outputPath = workingDir.toString() + "/sdg.json";
// GetSDGSparkJob
// .main(
// new String[] {
// "--isSparkSessionManaged", Boolean.FALSE.toString(),
// "--sourcePath", sourcePath,
//
// "-outputPath", outputPath
//
// });
//
// final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
//
// JavaRDD<SDGDataModel> tmp = sc
// .textFile(outputPath)
// .map(item -> OBJECT_MAPPER.readValue(item, SDGDataModel.class));
//
// tmp.foreach(t -> Assertions.assertTrue(t.getDoi() != null));
// tmp.foreach(t -> Assertions.assertTrue(t.getSbj() != null));
//
// }
}

View File

@ -340,18 +340,7 @@ public class ProduceTest {
}
private JavaRDD<Result> getResultJavaRDD() throws Exception {
final String bipPath = getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/createunresolvedentities/bip/bip.json")
.getPath();
PrepareBipFinder
.main(
new String[] {
"--isSparkSessionManaged", Boolean.FALSE.toString(),
"--sourcePath", bipPath,
"--outputPath", workingDir.toString() + "/work"
});
final String fosPath = getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/createunresolvedentities/fos/fos.json")
.getPath();
@ -379,6 +368,40 @@ public class ProduceTest {
.map(item -> OBJECT_MAPPER.readValue(item, Result.class));
}
@Test
public JavaRDD<Result> getResultFosJavaRDD() throws Exception {
final String fosPath = getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/createunresolvedentities/fos/fos_sbs_2.json")
.getPath();
PrepareFOSSparkJob
.main(
new String[] {
"--isSparkSessionManaged", Boolean.FALSE.toString(),
"--sourcePath", fosPath,
"-outputPath", workingDir.toString() + "/work"
});
SparkSaveUnresolved.main(new String[] {
"--isSparkSessionManaged", Boolean.FALSE.toString(),
"--sourcePath", workingDir.toString() + "/work",
"-outputPath", workingDir.toString() + "/unresolved"
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Result> tmp = sc
.textFile(workingDir.toString() + "/unresolved")
.map(item -> OBJECT_MAPPER.readValue(item, Result.class));
tmp.foreach(r -> System.out.println(new ObjectMapper().writeValueAsString(r)));
return tmp;
}
@Test
void prepareTest5Subjects() throws Exception {
final String doi = "unresolved::10.1063/5.0032658::doi";
@ -415,18 +438,7 @@ public class ProduceTest {
}
private JavaRDD<Result> getResultJavaRDDPlusSDG() throws Exception {
final String bipPath = getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/createunresolvedentities/bip/bip.json")
.getPath();
PrepareBipFinder
.main(
new String[] {
"--isSparkSessionManaged", Boolean.FALSE.toString(),
"--sourcePath", bipPath,
"--outputPath", workingDir.toString() + "/work"
});
final String fosPath = getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/createunresolvedentities/fos/fos.json")
.getPath();
@ -483,14 +495,6 @@ public class ProduceTest {
.filter(row -> row.getSubject() != null)
.count());
Assertions
.assertEquals(
85,
tmp
.filter(row -> !row.getId().equals(doi))
.filter(r -> r.getInstance() != null && r.getInstance().size() > 0)
.count());
}
@Test

View File

@ -0,0 +1,26 @@
DOI,OAID,level1,level2,level3,level4,score_for_L3,score_for_L4
10.1016/j.anucene.2006.02.004,doi_________::00059d9963edf633bec756fb21b5bd72,02 engineering and technology,"0202 electrical engineering, electronic engineering, information engineering",020209 energy,02020908 Climate change policy/Ethanol fuel,0.5,0.5
10.1016/j.anucene.2006.02.004,doi_________::00059d9963edf633bec756fb21b5bd72,02 engineering and technology,0211 other engineering and technologies,021108 energy,02110808 Climate change policy/Ethanol fuel,0.5,0.5
10.1016/j.revmed.2006.07.010,doi_________::0026476c1651a92c933d752ff12496c7,03 medical and health sciences,0302 clinical medicine,030220 oncology & carcinogenesis,N/A,0.5036656856536865,0.0
10.1016/j.revmed.2006.07.010,doi_________::0026476c1651a92c933d752ff12496c7,03 medical and health sciences,0302 clinical medicine,030212 general & internal medicine,N/A,0.4963343143463135,0.0
10.20965/jrm.2006.p0312,doi_________::0028336a2f3826cc83c47dbefac71543,02 engineering and technology,0209 industrial biotechnology,020901 industrial engineering & automation,02090104 Robotics/Robots,0.6111094951629639,0.5053805979936855
10.20965/jrm.2006.p0312,doi_________::0028336a2f3826cc83c47dbefac71543,01 natural sciences,0104 chemical sciences,010401 analytical chemistry,N/A,0.3888905048370361,0.0
10.1111/j.1747-7379.2006.040_1.x,doi_________::002c7077e7c114a8304eb90f59e45fa4,05 social sciences,0506 political science,050602 political science & public administration,05060202 Ethnic groups/Ethnicity,0.6159052848815918,0.7369035568037298
10.1111/j.1747-7379.2006.040_1.x,doi_________::002c7077e7c114a8304eb90f59e45fa4,05 social sciences,0502 economics and business,050207 economics,N/A,0.3840946555137634,0.0
10.1007/s10512-006-0049-9,doi_________::003f29f9254819cf4c78558b1bc25f10,02 engineering and technology,"0202 electrical engineering, electronic engineering, information engineering",020209 energy,02020908 Climate change policy/Ethanol fuel,0.5,0.5
10.1007/s10512-006-0049-9,doi_________::003f29f9254819cf4c78558b1bc25f10,02 engineering and technology,0211 other engineering and technologies,021108 energy,02110808 Climate change policy/Ethanol fuel,0.5,0.5
10.1111/j.1365-2621.2005.01045.x,doi_________::00419355b4c3e0646bd0e1b301164c8e,04 agricultural and veterinary sciences,0404 agricultural biotechnology,040401 food science,04040102 Food science/Food industry,0.5,0.5
10.1111/j.1365-2621.2005.01045.x,doi_________::00419355b4c3e0646bd0e1b301164c8e,04 agricultural and veterinary sciences,0405 other agricultural sciences,040502 food science,04050202 Food science/Food industry,0.5,0.5
10.1002/chin.200617262,doi_________::004c8cef80668904961b9e62841793c8,01 natural sciences,0104 chemical sciences,010405 organic chemistry,01040508 Functional groups/Ethers,0.5566747188568115,0.5582916736602783
10.1002/chin.200617262,doi_________::004c8cef80668904961b9e62841793c8,01 natural sciences,0104 chemical sciences,010402 general chemistry,01040207 Chemical synthesis/Total synthesis,0.4433253407478332,0.4417082965373993
10.1016/j.revmed.2006.07.012,doi_________::005b1d0fb650b680abaf6cfe26a21604,03 medical and health sciences,0302 clinical medicine,030204 cardiovascular system & hematology,03020409 Hematology/Coagulopathies,0.5101401805877686,0.0546871414174914
10.1016/j.revmed.2006.07.012,doi_________::005b1d0fb650b680abaf6cfe26a21604,03 medical and health sciences,0301 basic medicine,030105 genetics & heredity,N/A,0.4898599088191986,0.0
10.4109/jslab.17.132,doi_________::00889baa06de363e37930daaf8e800c0,03 medical and health sciences,0301 basic medicine,030104 developmental biology,N/A,0.5,0.0
10.4109/jslab.17.132,doi_________::00889baa06de363e37930daaf8e800c0,03 medical and health sciences,0303 health sciences,030304 developmental biology,N/A,0.5,0.0
10.1108/00251740610715687,doi_________::0092cb1b1920d556719385a26363ecaa,05 social sciences,0502 economics and business,050203 business & management,05020311 International business/International trade,0.605047881603241,0.2156608108845153
10.1108/00251740610715687,doi_________::0092cb1b1920d556719385a26363ecaa,05 social sciences,0502 economics and business,050211 marketing,N/A,0.394952118396759,0.0
10.1080/03067310500248098,doi_________::00a76678d230e3f20b6356804448028f,04 agricultural and veterinary sciences,0404 agricultural biotechnology,040401 food science,04040102 Food science/Food industry,0.5,0.5
10.1080/03067310500248098,doi_________::00a76678d230e3f20b6356804448028f,04 agricultural and veterinary sciences,0405 other agricultural sciences,040502 food science,04050202 Food science/Food industry,0.5,0.5
10.3152/147154306781778533,doi_________::00acc520f3939e5a6675343881fed4f2,05 social sciences,0502 economics and business,050203 business & management,05020307 Innovation/Product management,0.5293408632278442,0.5326762795448303
10.3152/147154306781778533,doi_________::00acc520f3939e5a6675343881fed4f2,05 social sciences,0509 other social sciences,050905 science studies,05090502 Social philosophy/Capitalism,0.4706590473651886,0.4673237204551697
10.1785/0120050806,doi_________::00d5831d329e7ae4523d78bfc3042e98,02 engineering and technology,0211 other engineering and technologies,021101 geological & geomatics engineering,02110103 Concrete/Building materials,0.5343400835990906,0.3285667930180677
1 DOI OAID level1 level2 level3 level4 score_for_L3 score_for_L4
2 10.1016/j.anucene.2006.02.004 doi_________::00059d9963edf633bec756fb21b5bd72 02 engineering and technology 0202 electrical engineering, electronic engineering, information engineering 020209 energy 02020908 Climate change policy/Ethanol fuel 0.5 0.5
3 10.1016/j.anucene.2006.02.004 doi_________::00059d9963edf633bec756fb21b5bd72 02 engineering and technology 0211 other engineering and technologies 021108 energy 02110808 Climate change policy/Ethanol fuel 0.5 0.5
4 10.1016/j.revmed.2006.07.010 doi_________::0026476c1651a92c933d752ff12496c7 03 medical and health sciences 0302 clinical medicine 030220 oncology & carcinogenesis N/A 0.5036656856536865 0.0
5 10.1016/j.revmed.2006.07.010 doi_________::0026476c1651a92c933d752ff12496c7 03 medical and health sciences 0302 clinical medicine 030212 general & internal medicine N/A 0.4963343143463135 0.0
6 10.20965/jrm.2006.p0312 doi_________::0028336a2f3826cc83c47dbefac71543 02 engineering and technology 0209 industrial biotechnology 020901 industrial engineering & automation 02090104 Robotics/Robots 0.6111094951629639 0.5053805979936855
7 10.20965/jrm.2006.p0312 doi_________::0028336a2f3826cc83c47dbefac71543 01 natural sciences 0104 chemical sciences 010401 analytical chemistry N/A 0.3888905048370361 0.0
8 10.1111/j.1747-7379.2006.040_1.x doi_________::002c7077e7c114a8304eb90f59e45fa4 05 social sciences 0506 political science 050602 political science & public administration 05060202 Ethnic groups/Ethnicity 0.6159052848815918 0.7369035568037298
9 10.1111/j.1747-7379.2006.040_1.x doi_________::002c7077e7c114a8304eb90f59e45fa4 05 social sciences 0502 economics and business 050207 economics N/A 0.3840946555137634 0.0
10 10.1007/s10512-006-0049-9 doi_________::003f29f9254819cf4c78558b1bc25f10 02 engineering and technology 0202 electrical engineering, electronic engineering, information engineering 020209 energy 02020908 Climate change policy/Ethanol fuel 0.5 0.5
11 10.1007/s10512-006-0049-9 doi_________::003f29f9254819cf4c78558b1bc25f10 02 engineering and technology 0211 other engineering and technologies 021108 energy 02110808 Climate change policy/Ethanol fuel 0.5 0.5
12 10.1111/j.1365-2621.2005.01045.x doi_________::00419355b4c3e0646bd0e1b301164c8e 04 agricultural and veterinary sciences 0404 agricultural biotechnology 040401 food science 04040102 Food science/Food industry 0.5 0.5
13 10.1111/j.1365-2621.2005.01045.x doi_________::00419355b4c3e0646bd0e1b301164c8e 04 agricultural and veterinary sciences 0405 other agricultural sciences 040502 food science 04050202 Food science/Food industry 0.5 0.5
14 10.1002/chin.200617262 doi_________::004c8cef80668904961b9e62841793c8 01 natural sciences 0104 chemical sciences 010405 organic chemistry 01040508 Functional groups/Ethers 0.5566747188568115 0.5582916736602783
15 10.1002/chin.200617262 doi_________::004c8cef80668904961b9e62841793c8 01 natural sciences 0104 chemical sciences 010402 general chemistry 01040207 Chemical synthesis/Total synthesis 0.4433253407478332 0.4417082965373993
16 10.1016/j.revmed.2006.07.012 doi_________::005b1d0fb650b680abaf6cfe26a21604 03 medical and health sciences 0302 clinical medicine 030204 cardiovascular system & hematology 03020409 Hematology/Coagulopathies 0.5101401805877686 0.0546871414174914
17 10.1016/j.revmed.2006.07.012 doi_________::005b1d0fb650b680abaf6cfe26a21604 03 medical and health sciences 0301 basic medicine 030105 genetics & heredity N/A 0.4898599088191986 0.0
18 10.4109/jslab.17.132 doi_________::00889baa06de363e37930daaf8e800c0 03 medical and health sciences 0301 basic medicine 030104 developmental biology N/A 0.5 0.0
19 10.4109/jslab.17.132 doi_________::00889baa06de363e37930daaf8e800c0 03 medical and health sciences 0303 health sciences 030304 developmental biology N/A 0.5 0.0
20 10.1108/00251740610715687 doi_________::0092cb1b1920d556719385a26363ecaa 05 social sciences 0502 economics and business 050203 business & management 05020311 International business/International trade 0.605047881603241 0.2156608108845153
21 10.1108/00251740610715687 doi_________::0092cb1b1920d556719385a26363ecaa 05 social sciences 0502 economics and business 050211 marketing N/A 0.394952118396759 0.0
22 10.1080/03067310500248098 doi_________::00a76678d230e3f20b6356804448028f 04 agricultural and veterinary sciences 0404 agricultural biotechnology 040401 food science 04040102 Food science/Food industry 0.5 0.5
23 10.1080/03067310500248098 doi_________::00a76678d230e3f20b6356804448028f 04 agricultural and veterinary sciences 0405 other agricultural sciences 040502 food science 04050202 Food science/Food industry 0.5 0.5
24 10.3152/147154306781778533 doi_________::00acc520f3939e5a6675343881fed4f2 05 social sciences 0502 economics and business 050203 business & management 05020307 Innovation/Product management 0.5293408632278442 0.5326762795448303
25 10.3152/147154306781778533 doi_________::00acc520f3939e5a6675343881fed4f2 05 social sciences 0509 other social sciences 050905 science studies 05090502 Social philosophy/Capitalism 0.4706590473651886 0.4673237204551697
26 10.1785/0120050806 doi_________::00d5831d329e7ae4523d78bfc3042e98 02 engineering and technology 0211 other engineering and technologies 021101 geological & geomatics engineering 02110103 Concrete/Building materials 0.5343400835990906 0.3285667930180677

View File

@ -0,0 +1,25 @@
{"doi":"10.1016/j.anucene.2006.02.004","level1":"02 engineering and technology","level2":"0202 electrical engineering, electronic engineering, information engineering","level3":"020209 energy","level4":"02020908 Climate change policy/Ethanol fuel","scoreL3":"0.5","scoreL4":"0.5"}
{"doi":"10.1016/j.anucene.2006.02.004","level1":"02 engineering and technology","level2":"0211 other engineering and technologies","level3":"021108 energy","level4":"02110808 Climate change policy/Ethanol fuel","scoreL3":"0.5","scoreL4":"0.5"}
{"doi":"10.1016/j.revmed.2006.07.010","level1":"03 medical and health sciences","level2":"0302 clinical medicine","level3":"030220 oncology & carcinogenesis","level4":"N/A","scoreL3":"0.5036656856536865","scoreL4":"0.0"}
{"doi":"10.1016/j.revmed.2006.07.010","level1":"03 medical and health sciences","level2":"0302 clinical medicine","level3":"030212 general & internal medicine","level4":"N/A","scoreL3":"0.4963343143463135","scoreL4":"0.0"}
{"doi":"10.20965/jrm.2006.p0312","level1":"02 engineering and technology","level2":"0209 industrial biotechnology","level3":"020901 industrial engineering & automation","level4":"02090104 Robotics/Robots","scoreL3":"0.6111094951629639","scoreL4":"0.5053805979936855"}
{"doi":"10.20965/jrm.2006.p0312","level1":"01 natural sciences","level2":"0104 chemical sciences","level3":"010401 analytical chemistry","level4":"N/A","scoreL3":"0.3888905048370361","scoreL4":"0.0"}
{"doi":"10.1111/j.1747-7379.2006.040_1.x","level1":"05 social sciences","level2":"0506 political science","level3":"050602 political science & public administration","level4":"05060202 Ethnic groups/Ethnicity","scoreL3":"0.6159052848815918","scoreL4":"0.7369035568037298"}
{"doi":"10.1111/j.1747-7379.2006.040_1.x","level1":"05 social sciences","level2":"0502 economics and business","level3":"050207 economics","level4":"N/A","scoreL3":"0.3840946555137634","scoreL4":"0.0"}
{"doi":"10.1007/s10512-006-0049-9","level1":"02 engineering and technology","level2":"0202 electrical engineering, electronic engineering, information engineering","level3":"020209 energy","level4":"02020908 Climate change policy/Ethanol fuel","scoreL3":"0.5","scoreL4":"0.5"}
{"doi":"10.1007/s10512-006-0049-9","level1":"02 engineering and technology","level2":"0211 other engineering and technologies","level3":"021108 energy","level4":"02110808 Climate change policy/Ethanol fuel","scoreL3":"0.5","scoreL4":"0.5"}
{"doi":"10.1111/j.1365-2621.2005.01045.x","level1":"04 agricultural and veterinary sciences","level2":"0404 agricultural biotechnology","level3":"040401 food science","level4":"04040102 Food science/Food industry","scoreL3":"0.5","scoreL4":"0.5"}
{"doi":"10.1111/j.1365-2621.2005.01045.x","level1":"04 agricultural and veterinary sciences","level2":"0405 other agricultural sciences","level3":"040502 food science","level4":"04050202 Food science/Food industry","scoreL3":"0.5","scoreL4":"0.5"}
{"doi":"10.1002/chin.200617262","level1":"01 natural sciences","level2":"0104 chemical sciences","level3":"010405 organic chemistry","level4":"01040508 Functional groups/Ethers","scoreL3":"0.5566747188568115","scoreL4":"0.5582916736602783"}
{"doi":"10.1002/chin.200617262","level1":"01 natural sciences","level2":"0104 chemical sciences","level3":"010402 general chemistry","level4":"01040207 Chemical synthesis/Total synthesis","scoreL3":"0.4433253407478332","scoreL4":"0.4417082965373993"}
{"doi":"10.1016/j.revmed.2006.07.012","level1":"03 medical and health sciences","level2":"0302 clinical medicine","level3":"030204 cardiovascular system & hematology","level4":"03020409 Hematology/Coagulopathies","scoreL3":"0.5101401805877686","scoreL4":"0.0546871414174914"}
{"doi":"10.1016/j.revmed.2006.07.012","level1":"03 medical and health sciences","level2":"0301 basic medicine","level3":"030105 genetics & heredity","level4":"N/A","scoreL3":"0.4898599088191986","scoreL4":"0.0"}
{"doi":"10.4109/jslab.17.132","level1":"03 medical and health sciences","level2":"0301 basic medicine","level3":"030104 developmental biology","level4":"N/A","scoreL3":"0.5","scoreL4":"0.0"}
{"doi":"10.4109/jslab.17.132","level1":"03 medical and health sciences","level2":"0303 health sciences","level3":"030304 developmental biology","level4":"N/A","scoreL3":"0.5","scoreL4":"0.0"}
{"doi":"10.1108/00251740610715687","level1":"05 social sciences","level2":"0502 economics and business","level3":"050203 business & management","level4":"05020311 International business/International trade","scoreL3":"0.605047881603241","scoreL4":"0.2156608108845153"}
{"doi":"10.1108/00251740610715687","level1":"05 social sciences","level2":"0502 economics and business","level3":"050211 marketing","level4":"N/A","scoreL3":"0.394952118396759","scoreL4":"0.0"}
{"doi":"10.1080/03067310500248098","level1":"04 agricultural and veterinary sciences","level2":"0404 agricultural biotechnology","level3":"040401 food science","level4":"04040102 Food science/Food industry","scoreL3":"0.5","scoreL4":"0.5"}
{"doi":"10.1080/03067310500248098","level1":"04 agricultural and veterinary sciences","level2":"0405 other agricultural sciences","level3":"040502 food science","level4":"04050202 Food science/Food industry","scoreL3":"0.5","scoreL4":"0.5"}
{"doi":"10.3152/147154306781778533","level1":"05 social sciences","level2":"0502 economics and business","level3":"050203 business & management","level4":"05020307 Innovation/Product management","scoreL3":"0.5293408632278442","scoreL4":"0.5326762795448303"}
{"doi":"10.3152/147154306781778533","level1":"05 social sciences","level2":"0509 other social sciences","level3":"050905 science studies","level4":"05090502 Social philosophy/Capitalism","scoreL3":"0.4706590473651886","scoreL4":"0.4673237204551697"}
{"doi":"10.1785/0120050806","level1":"02 engineering and technology","level2":"0211 other engineering and technologies","level3":"021101 geological & geomatics engineering","level4":"02110103 Concrete/Building materials","scoreL3":"0.5343400835990906","scoreL4":"0.3285667930180677"}