forked from D-Net/dnet-hadoop
Extend the FOS model to include level4 and the scores for level3 and level4. Removed the BIP indicators from the instance.
This commit is contained in:
parent
ef833840c3
commit
110ce4b40f
|
@ -40,6 +40,7 @@ public class Constants {
|
||||||
public static final String SDG_CLASS_NAME = "Sustainable Development Goals";
|
public static final String SDG_CLASS_NAME = "Sustainable Development Goals";
|
||||||
|
|
||||||
public static final String NULL = "NULL";
|
public static final String NULL = "NULL";
|
||||||
|
public static final String NA = "N/A";
|
||||||
|
|
||||||
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||||
|
|
||||||
|
@ -61,10 +62,16 @@ public class Constants {
|
||||||
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
|
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Subject getSubject(String sbj, String classid, String classname,
|
public static Subject getSubject(String sbj, String classid, String classname, String diqualifierclassid,
|
||||||
String diqualifierclassid) {
|
Boolean split) {
|
||||||
if (sbj == null || sbj.equals(NULL))
|
if (sbj == null || sbj.equals(NULL) || sbj.startsWith(NA))
|
||||||
return null;
|
return null;
|
||||||
|
String trust = "";
|
||||||
|
String subject = sbj;
|
||||||
|
if (split) {
|
||||||
|
sbj = subject.split("@@")[0];
|
||||||
|
trust = subject.split("@@")[1];
|
||||||
|
}
|
||||||
Subject s = new Subject();
|
Subject s = new Subject();
|
||||||
s.setValue(sbj);
|
s.setValue(sbj);
|
||||||
s
|
s
|
||||||
|
@ -89,9 +96,14 @@ public class Constants {
|
||||||
UPDATE_CLASS_NAME,
|
UPDATE_CLASS_NAME,
|
||||||
ModelConstants.DNET_PROVENANCE_ACTIONS,
|
ModelConstants.DNET_PROVENANCE_ACTIONS,
|
||||||
ModelConstants.DNET_PROVENANCE_ACTIONS),
|
ModelConstants.DNET_PROVENANCE_ACTIONS),
|
||||||
""));
|
trust));
|
||||||
|
|
||||||
return s;
|
return s;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static Subject getSubject(String sbj, String classid, String classname,
|
||||||
|
String diqualifierclassid) {
|
||||||
|
return getSubject(sbj, classid, classname, diqualifierclassid, false);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -75,9 +75,12 @@ public class GetFOSSparkJob implements Serializable {
|
||||||
fosData.map((MapFunction<Row, FOSDataModel>) r -> {
|
fosData.map((MapFunction<Row, FOSDataModel>) r -> {
|
||||||
FOSDataModel fosDataModel = new FOSDataModel();
|
FOSDataModel fosDataModel = new FOSDataModel();
|
||||||
fosDataModel.setDoi(r.getString(0).toLowerCase());
|
fosDataModel.setDoi(r.getString(0).toLowerCase());
|
||||||
fosDataModel.setLevel1(r.getString(1));
|
fosDataModel.setLevel1(r.getString(2));
|
||||||
fosDataModel.setLevel2(r.getString(2));
|
fosDataModel.setLevel2(r.getString(3));
|
||||||
fosDataModel.setLevel3(r.getString(3));
|
fosDataModel.setLevel3(r.getString(4));
|
||||||
|
fosDataModel.setLevel4(r.getString(5));
|
||||||
|
fosDataModel.setScoreL3(String.valueOf(r.getDouble(6)));
|
||||||
|
fosDataModel.setScoreL4(String.valueOf(r.getDouble(7)));
|
||||||
return fosDataModel;
|
return fosDataModel;
|
||||||
}, Encoders.bean(FOSDataModel.class))
|
}, Encoders.bean(FOSDataModel.class))
|
||||||
.write()
|
.write()
|
||||||
|
|
|
@ -78,12 +78,20 @@ public class PrepareFOSSparkJob implements Serializable {
|
||||||
HashSet<String> level1 = new HashSet<>();
|
HashSet<String> level1 = new HashSet<>();
|
||||||
HashSet<String> level2 = new HashSet<>();
|
HashSet<String> level2 = new HashSet<>();
|
||||||
HashSet<String> level3 = new HashSet<>();
|
HashSet<String> level3 = new HashSet<>();
|
||||||
addLevels(level1, level2, level3, first);
|
HashSet<String> level4 = new HashSet<>();
|
||||||
it.forEachRemaining(v -> addLevels(level1, level2, level3, v));
|
addLevels(level1, level2, level3, level4, first);
|
||||||
|
it.forEachRemaining(v -> addLevels(level1, level2, level3, level4, v));
|
||||||
List<Subject> sbjs = new ArrayList<>();
|
List<Subject> sbjs = new ArrayList<>();
|
||||||
level1.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID)));
|
level1
|
||||||
level2.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID)));
|
.forEach(l -> add(sbjs, getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID)));
|
||||||
level3.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID)));
|
level2
|
||||||
|
.forEach(l -> add(sbjs, getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID)));
|
||||||
|
level3
|
||||||
|
.forEach(
|
||||||
|
l -> add(sbjs, getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID, true)));
|
||||||
|
level4
|
||||||
|
.forEach(
|
||||||
|
l -> add(sbjs, getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID, true)));
|
||||||
r.setSubject(sbjs);
|
r.setSubject(sbjs);
|
||||||
r
|
r
|
||||||
.setDataInfo(
|
.setDataInfo(
|
||||||
|
@ -106,11 +114,18 @@ public class PrepareFOSSparkJob implements Serializable {
|
||||||
.json(outputPath + "/fos");
|
.json(outputPath + "/fos");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static void add(List<Subject> sbsjs, Subject sbj) {
|
||||||
|
if (sbj != null)
|
||||||
|
sbsjs.add(sbj);
|
||||||
|
}
|
||||||
|
|
||||||
private static void addLevels(HashSet<String> level1, HashSet<String> level2, HashSet<String> level3,
|
private static void addLevels(HashSet<String> level1, HashSet<String> level2, HashSet<String> level3,
|
||||||
|
HashSet<String> level4,
|
||||||
FOSDataModel first) {
|
FOSDataModel first) {
|
||||||
level1.add(first.getLevel1());
|
level1.add(first.getLevel1());
|
||||||
level2.add(first.getLevel2());
|
level2.add(first.getLevel2());
|
||||||
level3.add(first.getLevel3());
|
level3.add(first.getLevel3() + "@@" + first.getScoreL3());
|
||||||
|
level4.add(first.getLevel4() + "@@" + first.getScoreL4());
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -69,9 +69,9 @@ public class SparkSaveUnresolved implements Serializable {
|
||||||
.mapGroups((MapGroupsFunction<String, Result, Result>) (k, it) -> {
|
.mapGroups((MapGroupsFunction<String, Result, Result>) (k, it) -> {
|
||||||
Result ret = it.next();
|
Result ret = it.next();
|
||||||
it.forEachRemaining(r -> {
|
it.forEachRemaining(r -> {
|
||||||
if (r.getInstance() != null) {
|
// if (r.getInstance() != null) {
|
||||||
ret.setInstance(r.getInstance());
|
// ret.setInstance(r.getInstance());
|
||||||
}
|
// }
|
||||||
if (r.getSubject() != null) {
|
if (r.getSubject() != null) {
|
||||||
if (ret.getSubject() != null)
|
if (ret.getSubject() != null)
|
||||||
ret.getSubject().addAll(r.getSubject());
|
ret.getSubject().addAll(r.getSubject());
|
||||||
|
|
|
@ -11,21 +11,43 @@ public class FOSDataModel implements Serializable {
|
||||||
private String doi;
|
private String doi;
|
||||||
|
|
||||||
@CsvBindByPosition(position = 1)
|
@CsvBindByPosition(position = 1)
|
||||||
|
// @CsvBindByName(column = "doi")
|
||||||
|
private String oaid;
|
||||||
|
@CsvBindByPosition(position = 2)
|
||||||
// @CsvBindByName(column = "level1")
|
// @CsvBindByName(column = "level1")
|
||||||
private String level1;
|
private String level1;
|
||||||
|
|
||||||
@CsvBindByPosition(position = 2)
|
@CsvBindByPosition(position = 3)
|
||||||
// @CsvBindByName(column = "level2")
|
// @CsvBindByName(column = "level2")
|
||||||
private String level2;
|
private String level2;
|
||||||
|
|
||||||
@CsvBindByPosition(position = 3)
|
@CsvBindByPosition(position = 4)
|
||||||
// @CsvBindByName(column = "level3")
|
// @CsvBindByName(column = "level3")
|
||||||
private String level3;
|
private String level3;
|
||||||
|
|
||||||
|
@CsvBindByPosition(position = 5)
|
||||||
|
// @CsvBindByName(column = "level4")
|
||||||
|
private String level4;
|
||||||
|
@CsvBindByPosition(position = 6)
|
||||||
|
private String scoreL3;
|
||||||
|
@CsvBindByPosition(position = 7)
|
||||||
|
private String scoreL4;
|
||||||
|
|
||||||
public FOSDataModel() {
|
public FOSDataModel() {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public FOSDataModel(String doi, String level1, String level2, String level3, String level4, String l3score,
|
||||||
|
String l4score) {
|
||||||
|
this.doi = doi;
|
||||||
|
this.level1 = level1;
|
||||||
|
this.level2 = level2;
|
||||||
|
this.level3 = level3;
|
||||||
|
this.level4 = level4;
|
||||||
|
this.scoreL3 = l3score;
|
||||||
|
this.scoreL4 = l4score;
|
||||||
|
}
|
||||||
|
|
||||||
public FOSDataModel(String doi, String level1, String level2, String level3) {
|
public FOSDataModel(String doi, String level1, String level2, String level3) {
|
||||||
this.doi = doi;
|
this.doi = doi;
|
||||||
this.level1 = level1;
|
this.level1 = level1;
|
||||||
|
@ -33,8 +55,41 @@ public class FOSDataModel implements Serializable {
|
||||||
this.level3 = level3;
|
this.level3 = level3;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static FOSDataModel newInstance(String d, String level1, String level2, String level3) {
|
public static FOSDataModel newInstance(String d, String level1, String level2, String level3, String level4,
|
||||||
return new FOSDataModel(d, level1, level2, level3);
|
String scorel3, String scorel4) {
|
||||||
|
return new FOSDataModel(d, level1, level2, level3, level4, scorel3, scorel4);
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getOaid() {
|
||||||
|
return oaid;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setOaid(String oaid) {
|
||||||
|
this.oaid = oaid;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getLevel4() {
|
||||||
|
return level4;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setLevel4(String level4) {
|
||||||
|
this.level4 = level4;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getScoreL3() {
|
||||||
|
return scoreL3;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setScoreL3(String scoreL3) {
|
||||||
|
this.scoreL3 = scoreL3;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getScoreL4() {
|
||||||
|
return scoreL4;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setScoreL4(String scoreL4) {
|
||||||
|
this.scoreL4 = scoreL4;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getDoi() {
|
public String getDoi() {
|
||||||
|
|
|
@ -7,7 +7,6 @@ import java.io.IOException;
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.*;
|
|
||||||
import org.apache.commons.cli.ParseException;
|
import org.apache.commons.cli.ParseException;
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.hadoop.io.Text;
|
import org.apache.hadoop.io.Text;
|
||||||
|
@ -30,6 +29,7 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.schema.action.AtomicAction;
|
import eu.dnetlib.dhp.schema.action.AtomicAction;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.utils.*;
|
||||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
|
||||||
|
|
|
@ -6,10 +6,10 @@
|
||||||
<description>the input path of the resources to be extended</description>
|
<description>the input path of the resources to be extended</description>
|
||||||
</property>
|
</property>
|
||||||
|
|
||||||
<property>
|
<!-- <property>-->
|
||||||
<name>bipScorePath</name>
|
<!-- <name>bipScorePath</name>-->
|
||||||
<description>the path where to find the bipFinder scores</description>
|
<!-- <description>the path where to find the bipFinder scores</description>-->
|
||||||
</property>
|
<!-- </property>-->
|
||||||
<property>
|
<property>
|
||||||
<name>outputPath</name>
|
<name>outputPath</name>
|
||||||
<description>the path where to store the actionset</description>
|
<description>the path where to store the actionset</description>
|
||||||
|
@ -77,34 +77,34 @@
|
||||||
|
|
||||||
|
|
||||||
<fork name="prepareInfo">
|
<fork name="prepareInfo">
|
||||||
<path start="prepareBip"/>
|
<!-- <path start="prepareBip"/>-->
|
||||||
<path start="getFOS"/>
|
<path start="getFOS"/>
|
||||||
<path start="getSDG"/>
|
<path start="getSDG"/>
|
||||||
</fork>
|
</fork>
|
||||||
|
|
||||||
<action name="prepareBip">
|
<!-- <action name="prepareBip">-->
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
<!-- <spark xmlns="uri:oozie:spark-action:0.2">-->
|
||||||
<master>yarn</master>
|
<!-- <master>yarn</master>-->
|
||||||
<mode>cluster</mode>
|
<!-- <mode>cluster</mode>-->
|
||||||
<name>Produces the unresolved from BIP! Finder</name>
|
<!-- <name>Produces the unresolved from BIP! Finder</name>-->
|
||||||
<class>eu.dnetlib.dhp.actionmanager.createunresolvedentities.PrepareBipFinder</class>
|
<!-- <class>eu.dnetlib.dhp.actionmanager.createunresolvedentities.PrepareBipFinder</class>-->
|
||||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
<!-- <jar>dhp-aggregation-${projectVersion}.jar</jar>-->
|
||||||
<spark-opts>
|
<!-- <spark-opts>-->
|
||||||
--executor-memory=${sparkExecutorMemory}
|
<!-- --executor-memory=${sparkExecutorMemory}-->
|
||||||
--executor-cores=${sparkExecutorCores}
|
<!-- --executor-cores=${sparkExecutorCores}-->
|
||||||
--driver-memory=${sparkDriverMemory}
|
<!-- --driver-memory=${sparkDriverMemory}-->
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
<!-- --conf spark.extraListeners=${spark2ExtraListeners}-->
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
<!-- --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}-->
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
<!-- --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}-->
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
<!-- --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}-->
|
||||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
<!-- --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}-->
|
||||||
</spark-opts>
|
<!-- </spark-opts>-->
|
||||||
<arg>--sourcePath</arg><arg>${bipScorePath}</arg>
|
<!-- <arg>--sourcePath</arg><arg>${bipScorePath}</arg>-->
|
||||||
<arg>--outputPath</arg><arg>${workingDir}/prepared</arg>
|
<!-- <arg>--outputPath</arg><arg>${workingDir}/prepared</arg>-->
|
||||||
</spark>
|
<!-- </spark>-->
|
||||||
<ok to="join"/>
|
<!-- <ok to="join"/>-->
|
||||||
<error to="Kill"/>
|
<!-- <error to="Kill"/>-->
|
||||||
</action>
|
<!-- </action>-->
|
||||||
|
|
||||||
<action name="getFOS">
|
<action name="getFOS">
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
|
|
@ -13,10 +13,7 @@ import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.JavaRDD;
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
import org.apache.spark.sql.SparkSession;
|
import org.apache.spark.sql.SparkSession;
|
||||||
import org.junit.jupiter.api.AfterAll;
|
import org.junit.jupiter.api.*;
|
||||||
import org.junit.jupiter.api.Assertions;
|
|
||||||
import org.junit.jupiter.api.BeforeAll;
|
|
||||||
import org.junit.jupiter.api.Test;
|
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
@ -68,6 +65,7 @@ public class GetFosTest {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@Disabled
|
||||||
void test3() throws Exception {
|
void test3() throws Exception {
|
||||||
final String sourcePath = getClass()
|
final String sourcePath = getClass()
|
||||||
.getResource("/eu/dnetlib/dhp/actionmanager/createunresolvedentities/fos/fos_sbs.tsv")
|
.getResource("/eu/dnetlib/dhp/actionmanager/createunresolvedentities/fos/fos_sbs.tsv")
|
||||||
|
@ -96,4 +94,37 @@ public class GetFosTest {
|
||||||
tmp.foreach(t -> Assertions.assertTrue(t.getLevel3() != null));
|
tmp.foreach(t -> Assertions.assertTrue(t.getLevel3() != null));
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void test4() throws Exception {
|
||||||
|
final String sourcePath = getClass()
|
||||||
|
.getResource("/eu/dnetlib/dhp/actionmanager/createunresolvedentities/fos/fos_sbs2.csv")
|
||||||
|
.getPath();
|
||||||
|
|
||||||
|
final String outputPath = workingDir.toString() + "/fos.json";
|
||||||
|
GetFOSSparkJob
|
||||||
|
.main(
|
||||||
|
new String[] {
|
||||||
|
"--isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||||
|
"--sourcePath", sourcePath,
|
||||||
|
"--delimiter", ",",
|
||||||
|
"-outputPath", outputPath
|
||||||
|
|
||||||
|
});
|
||||||
|
|
||||||
|
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||||
|
|
||||||
|
JavaRDD<FOSDataModel> tmp = sc
|
||||||
|
.textFile(outputPath)
|
||||||
|
.map(item -> OBJECT_MAPPER.readValue(item, FOSDataModel.class));
|
||||||
|
|
||||||
|
tmp.foreach(t -> Assertions.assertTrue(t.getDoi() != null));
|
||||||
|
tmp.foreach(t -> Assertions.assertTrue(t.getLevel1() != null));
|
||||||
|
tmp.foreach(t -> Assertions.assertTrue(t.getLevel2() != null));
|
||||||
|
tmp.foreach(t -> Assertions.assertTrue(t.getLevel3() != null));
|
||||||
|
tmp.foreach(t -> Assertions.assertTrue(t.getLevel4() != null));
|
||||||
|
tmp.foreach(t -> Assertions.assertTrue(t.getScoreL3() != null));
|
||||||
|
tmp.foreach(t -> Assertions.assertTrue(t.getScoreL4() != null));
|
||||||
|
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -222,6 +222,76 @@ public class PrepareTest {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void fosPrepareTest2() throws Exception {
|
||||||
|
final String sourcePath = getClass()
|
||||||
|
.getResource("/eu/dnetlib/dhp/actionmanager/createunresolvedentities/fos/fos_sbs_2.json")
|
||||||
|
.getPath();
|
||||||
|
|
||||||
|
PrepareFOSSparkJob
|
||||||
|
.main(
|
||||||
|
new String[] {
|
||||||
|
"--isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||||
|
"--sourcePath", sourcePath,
|
||||||
|
|
||||||
|
"-outputPath", workingDir.toString() + "/work"
|
||||||
|
|
||||||
|
});
|
||||||
|
|
||||||
|
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||||
|
|
||||||
|
JavaRDD<Result> tmp = sc
|
||||||
|
.textFile(workingDir.toString() + "/work/fos")
|
||||||
|
.map(item -> OBJECT_MAPPER.readValue(item, Result.class));
|
||||||
|
|
||||||
|
String doi1 = "unresolved::10.1016/j.revmed.2006.07.012::doi";
|
||||||
|
|
||||||
|
assertEquals(13, tmp.count());
|
||||||
|
assertEquals(1, tmp.filter(row -> row.getId().equals(doi1)).count());
|
||||||
|
|
||||||
|
Result result = tmp
|
||||||
|
.filter(r -> r.getId().equals(doi1))
|
||||||
|
.first();
|
||||||
|
|
||||||
|
result.getSubject().forEach(s -> System.out.println(s.getValue() + " trust = " + s.getDataInfo().getTrust()));
|
||||||
|
Assertions.assertEquals(6, result.getSubject().size());
|
||||||
|
|
||||||
|
assertTrue(
|
||||||
|
result
|
||||||
|
.getSubject()
|
||||||
|
.stream()
|
||||||
|
.anyMatch(
|
||||||
|
s -> s.getValue().contains("03 medical and health sciences")
|
||||||
|
&& s.getDataInfo().getTrust().equals("")));
|
||||||
|
|
||||||
|
assertTrue(
|
||||||
|
result
|
||||||
|
.getSubject()
|
||||||
|
.stream()
|
||||||
|
.anyMatch(
|
||||||
|
s -> s.getValue().contains("0302 clinical medicine") && s.getDataInfo().getTrust().equals("")));
|
||||||
|
|
||||||
|
assertTrue(
|
||||||
|
result
|
||||||
|
.getSubject()
|
||||||
|
.stream()
|
||||||
|
.anyMatch(
|
||||||
|
s -> s
|
||||||
|
.getValue()
|
||||||
|
.contains("030204 cardiovascular system & hematology")
|
||||||
|
&& s.getDataInfo().getTrust().equals("0.5101401805877686")));
|
||||||
|
assertTrue(
|
||||||
|
result
|
||||||
|
.getSubject()
|
||||||
|
.stream()
|
||||||
|
.anyMatch(
|
||||||
|
s -> s
|
||||||
|
.getValue()
|
||||||
|
.contains("03020409 Hematology/Coagulopathies")
|
||||||
|
&& s.getDataInfo().getTrust().equals("0.0546871414174914")));
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void sdgPrepareTest() throws Exception {
|
void sdgPrepareTest() throws Exception {
|
||||||
final String sourcePath = getClass()
|
final String sourcePath = getClass()
|
||||||
|
|
|
@ -379,6 +379,40 @@ public class ProduceTest {
|
||||||
.map(item -> OBJECT_MAPPER.readValue(item, Result.class));
|
.map(item -> OBJECT_MAPPER.readValue(item, Result.class));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public JavaRDD<Result> getResultFosJavaRDD() throws Exception {
|
||||||
|
|
||||||
|
final String fosPath = getClass()
|
||||||
|
.getResource("/eu/dnetlib/dhp/actionmanager/createunresolvedentities/fos/fos_sbs_2.json")
|
||||||
|
.getPath();
|
||||||
|
|
||||||
|
PrepareFOSSparkJob
|
||||||
|
.main(
|
||||||
|
new String[] {
|
||||||
|
"--isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||||
|
"--sourcePath", fosPath,
|
||||||
|
"-outputPath", workingDir.toString() + "/work"
|
||||||
|
});
|
||||||
|
|
||||||
|
SparkSaveUnresolved.main(new String[] {
|
||||||
|
"--isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||||
|
"--sourcePath", workingDir.toString() + "/work",
|
||||||
|
|
||||||
|
"-outputPath", workingDir.toString() + "/unresolved"
|
||||||
|
|
||||||
|
});
|
||||||
|
|
||||||
|
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||||
|
|
||||||
|
JavaRDD<Result> tmp = sc
|
||||||
|
.textFile(workingDir.toString() + "/unresolved")
|
||||||
|
.map(item -> OBJECT_MAPPER.readValue(item, Result.class));
|
||||||
|
tmp.foreach(r -> System.out.println(new ObjectMapper().writeValueAsString(r)));
|
||||||
|
|
||||||
|
return tmp;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
void prepareTest5Subjects() throws Exception {
|
void prepareTest5Subjects() throws Exception {
|
||||||
final String doi = "unresolved::10.1063/5.0032658::doi";
|
final String doi = "unresolved::10.1063/5.0032658::doi";
|
||||||
|
|
|
@ -0,0 +1,26 @@
|
||||||
|
DOI,OAID,level1,level2,level3,level4,score_for_L3,score_for_L4
|
||||||
|
10.1016/j.anucene.2006.02.004,doi_________::00059d9963edf633bec756fb21b5bd72,02 engineering and technology,"0202 electrical engineering, electronic engineering, information engineering",020209 energy,02020908 Climate change policy/Ethanol fuel,0.5,0.5
|
||||||
|
10.1016/j.anucene.2006.02.004,doi_________::00059d9963edf633bec756fb21b5bd72,02 engineering and technology,0211 other engineering and technologies,021108 energy,02110808 Climate change policy/Ethanol fuel,0.5,0.5
|
||||||
|
10.1016/j.revmed.2006.07.010,doi_________::0026476c1651a92c933d752ff12496c7,03 medical and health sciences,0302 clinical medicine,030220 oncology & carcinogenesis,N/A,0.5036656856536865,0.0
|
||||||
|
10.1016/j.revmed.2006.07.010,doi_________::0026476c1651a92c933d752ff12496c7,03 medical and health sciences,0302 clinical medicine,030212 general & internal medicine,N/A,0.4963343143463135,0.0
|
||||||
|
10.20965/jrm.2006.p0312,doi_________::0028336a2f3826cc83c47dbefac71543,02 engineering and technology,0209 industrial biotechnology,020901 industrial engineering & automation,02090104 Robotics/Robots,0.6111094951629639,0.5053805979936855
|
||||||
|
10.20965/jrm.2006.p0312,doi_________::0028336a2f3826cc83c47dbefac71543,01 natural sciences,0104 chemical sciences,010401 analytical chemistry,N/A,0.3888905048370361,0.0
|
||||||
|
10.1111/j.1747-7379.2006.040_1.x,doi_________::002c7077e7c114a8304eb90f59e45fa4,05 social sciences,0506 political science,050602 political science & public administration,05060202 Ethnic groups/Ethnicity,0.6159052848815918,0.7369035568037298
|
||||||
|
10.1111/j.1747-7379.2006.040_1.x,doi_________::002c7077e7c114a8304eb90f59e45fa4,05 social sciences,0502 economics and business,050207 economics,N/A,0.3840946555137634,0.0
|
||||||
|
10.1007/s10512-006-0049-9,doi_________::003f29f9254819cf4c78558b1bc25f10,02 engineering and technology,"0202 electrical engineering, electronic engineering, information engineering",020209 energy,02020908 Climate change policy/Ethanol fuel,0.5,0.5
|
||||||
|
10.1007/s10512-006-0049-9,doi_________::003f29f9254819cf4c78558b1bc25f10,02 engineering and technology,0211 other engineering and technologies,021108 energy,02110808 Climate change policy/Ethanol fuel,0.5,0.5
|
||||||
|
10.1111/j.1365-2621.2005.01045.x,doi_________::00419355b4c3e0646bd0e1b301164c8e,04 agricultural and veterinary sciences,0404 agricultural biotechnology,040401 food science,04040102 Food science/Food industry,0.5,0.5
|
||||||
|
10.1111/j.1365-2621.2005.01045.x,doi_________::00419355b4c3e0646bd0e1b301164c8e,04 agricultural and veterinary sciences,0405 other agricultural sciences,040502 food science,04050202 Food science/Food industry,0.5,0.5
|
||||||
|
10.1002/chin.200617262,doi_________::004c8cef80668904961b9e62841793c8,01 natural sciences,0104 chemical sciences,010405 organic chemistry,01040508 Functional groups/Ethers,0.5566747188568115,0.5582916736602783
|
||||||
|
10.1002/chin.200617262,doi_________::004c8cef80668904961b9e62841793c8,01 natural sciences,0104 chemical sciences,010402 general chemistry,01040207 Chemical synthesis/Total synthesis,0.4433253407478332,0.4417082965373993
|
||||||
|
10.1016/j.revmed.2006.07.012,doi_________::005b1d0fb650b680abaf6cfe26a21604,03 medical and health sciences,0302 clinical medicine,030204 cardiovascular system & hematology,03020409 Hematology/Coagulopathies,0.5101401805877686,0.0546871414174914
|
||||||
|
10.1016/j.revmed.2006.07.012,doi_________::005b1d0fb650b680abaf6cfe26a21604,03 medical and health sciences,0301 basic medicine,030105 genetics & heredity,N/A,0.4898599088191986,0.0
|
||||||
|
10.4109/jslab.17.132,doi_________::00889baa06de363e37930daaf8e800c0,03 medical and health sciences,0301 basic medicine,030104 developmental biology,N/A,0.5,0.0
|
||||||
|
10.4109/jslab.17.132,doi_________::00889baa06de363e37930daaf8e800c0,03 medical and health sciences,0303 health sciences,030304 developmental biology,N/A,0.5,0.0
|
||||||
|
10.1108/00251740610715687,doi_________::0092cb1b1920d556719385a26363ecaa,05 social sciences,0502 economics and business,050203 business & management,05020311 International business/International trade,0.605047881603241,0.2156608108845153
|
||||||
|
10.1108/00251740610715687,doi_________::0092cb1b1920d556719385a26363ecaa,05 social sciences,0502 economics and business,050211 marketing,N/A,0.394952118396759,0.0
|
||||||
|
10.1080/03067310500248098,doi_________::00a76678d230e3f20b6356804448028f,04 agricultural and veterinary sciences,0404 agricultural biotechnology,040401 food science,04040102 Food science/Food industry,0.5,0.5
|
||||||
|
10.1080/03067310500248098,doi_________::00a76678d230e3f20b6356804448028f,04 agricultural and veterinary sciences,0405 other agricultural sciences,040502 food science,04050202 Food science/Food industry,0.5,0.5
|
||||||
|
10.3152/147154306781778533,doi_________::00acc520f3939e5a6675343881fed4f2,05 social sciences,0502 economics and business,050203 business & management,05020307 Innovation/Product management,0.5293408632278442,0.5326762795448303
|
||||||
|
10.3152/147154306781778533,doi_________::00acc520f3939e5a6675343881fed4f2,05 social sciences,0509 other social sciences,050905 science studies,05090502 Social philosophy/Capitalism,0.4706590473651886,0.4673237204551697
|
||||||
|
10.1785/0120050806,doi_________::00d5831d329e7ae4523d78bfc3042e98,02 engineering and technology,0211 other engineering and technologies,021101 geological & geomatics engineering,02110103 Concrete/Building materials,0.5343400835990906,0.3285667930180677
|
|
|
@ -0,0 +1,25 @@
|
||||||
|
{"doi":"10.1016/j.anucene.2006.02.004","level1":"02 engineering and technology","level2":"0202 electrical engineering, electronic engineering, information engineering","level3":"020209 energy","level4":"02020908 Climate change policy/Ethanol fuel","scoreL3":"0.5","scoreL4":"0.5"}
|
||||||
|
{"doi":"10.1016/j.anucene.2006.02.004","level1":"02 engineering and technology","level2":"0211 other engineering and technologies","level3":"021108 energy","level4":"02110808 Climate change policy/Ethanol fuel","scoreL3":"0.5","scoreL4":"0.5"}
|
||||||
|
{"doi":"10.1016/j.revmed.2006.07.010","level1":"03 medical and health sciences","level2":"0302 clinical medicine","level3":"030220 oncology & carcinogenesis","level4":"N/A","scoreL3":"0.5036656856536865","scoreL4":"0.0"}
|
||||||
|
{"doi":"10.1016/j.revmed.2006.07.010","level1":"03 medical and health sciences","level2":"0302 clinical medicine","level3":"030212 general & internal medicine","level4":"N/A","scoreL3":"0.4963343143463135","scoreL4":"0.0"}
|
||||||
|
{"doi":"10.20965/jrm.2006.p0312","level1":"02 engineering and technology","level2":"0209 industrial biotechnology","level3":"020901 industrial engineering & automation","level4":"02090104 Robotics/Robots","scoreL3":"0.6111094951629639","scoreL4":"0.5053805979936855"}
|
||||||
|
{"doi":"10.20965/jrm.2006.p0312","level1":"01 natural sciences","level2":"0104 chemical sciences","level3":"010401 analytical chemistry","level4":"N/A","scoreL3":"0.3888905048370361","scoreL4":"0.0"}
|
||||||
|
{"doi":"10.1111/j.1747-7379.2006.040_1.x","level1":"05 social sciences","level2":"0506 political science","level3":"050602 political science & public administration","level4":"05060202 Ethnic groups/Ethnicity","scoreL3":"0.6159052848815918","scoreL4":"0.7369035568037298"}
|
||||||
|
{"doi":"10.1111/j.1747-7379.2006.040_1.x","level1":"05 social sciences","level2":"0502 economics and business","level3":"050207 economics","level4":"N/A","scoreL3":"0.3840946555137634","scoreL4":"0.0"}
|
||||||
|
{"doi":"10.1007/s10512-006-0049-9","level1":"02 engineering and technology","level2":"0202 electrical engineering, electronic engineering, information engineering","level3":"020209 energy","level4":"02020908 Climate change policy/Ethanol fuel","scoreL3":"0.5","scoreL4":"0.5"}
|
||||||
|
{"doi":"10.1007/s10512-006-0049-9","level1":"02 engineering and technology","level2":"0211 other engineering and technologies","level3":"021108 energy","level4":"02110808 Climate change policy/Ethanol fuel","scoreL3":"0.5","scoreL4":"0.5"}
|
||||||
|
{"doi":"10.1111/j.1365-2621.2005.01045.x","level1":"04 agricultural and veterinary sciences","level2":"0404 agricultural biotechnology","level3":"040401 food science","level4":"04040102 Food science/Food industry","scoreL3":"0.5","scoreL4":"0.5"}
|
||||||
|
{"doi":"10.1111/j.1365-2621.2005.01045.x","level1":"04 agricultural and veterinary sciences","level2":"0405 other agricultural sciences","level3":"040502 food science","level4":"04050202 Food science/Food industry","scoreL3":"0.5","scoreL4":"0.5"}
|
||||||
|
{"doi":"10.1002/chin.200617262","level1":"01 natural sciences","level2":"0104 chemical sciences","level3":"010405 organic chemistry","level4":"01040508 Functional groups/Ethers","scoreL3":"0.5566747188568115","scoreL4":"0.5582916736602783"}
|
||||||
|
{"doi":"10.1002/chin.200617262","level1":"01 natural sciences","level2":"0104 chemical sciences","level3":"010402 general chemistry","level4":"01040207 Chemical synthesis/Total synthesis","scoreL3":"0.4433253407478332","scoreL4":"0.4417082965373993"}
|
||||||
|
{"doi":"10.1016/j.revmed.2006.07.012","level1":"03 medical and health sciences","level2":"0302 clinical medicine","level3":"030204 cardiovascular system & hematology","level4":"03020409 Hematology/Coagulopathies","scoreL3":"0.5101401805877686","scoreL4":"0.0546871414174914"}
|
||||||
|
{"doi":"10.1016/j.revmed.2006.07.012","level1":"03 medical and health sciences","level2":"0301 basic medicine","level3":"030105 genetics & heredity","level4":"N/A","scoreL3":"0.4898599088191986","scoreL4":"0.0"}
|
||||||
|
{"doi":"10.4109/jslab.17.132","level1":"03 medical and health sciences","level2":"0301 basic medicine","level3":"030104 developmental biology","level4":"N/A","scoreL3":"0.5","scoreL4":"0.0"}
|
||||||
|
{"doi":"10.4109/jslab.17.132","level1":"03 medical and health sciences","level2":"0303 health sciences","level3":"030304 developmental biology","level4":"N/A","scoreL3":"0.5","scoreL4":"0.0"}
|
||||||
|
{"doi":"10.1108/00251740610715687","level1":"05 social sciences","level2":"0502 economics and business","level3":"050203 business & management","level4":"05020311 International business/International trade","scoreL3":"0.605047881603241","scoreL4":"0.2156608108845153"}
|
||||||
|
{"doi":"10.1108/00251740610715687","level1":"05 social sciences","level2":"0502 economics and business","level3":"050211 marketing","level4":"N/A","scoreL3":"0.394952118396759","scoreL4":"0.0"}
|
||||||
|
{"doi":"10.1080/03067310500248098","level1":"04 agricultural and veterinary sciences","level2":"0404 agricultural biotechnology","level3":"040401 food science","level4":"04040102 Food science/Food industry","scoreL3":"0.5","scoreL4":"0.5"}
|
||||||
|
{"doi":"10.1080/03067310500248098","level1":"04 agricultural and veterinary sciences","level2":"0405 other agricultural sciences","level3":"040502 food science","level4":"04050202 Food science/Food industry","scoreL3":"0.5","scoreL4":"0.5"}
|
||||||
|
{"doi":"10.3152/147154306781778533","level1":"05 social sciences","level2":"0502 economics and business","level3":"050203 business & management","level4":"05020307 Innovation/Product management","scoreL3":"0.5293408632278442","scoreL4":"0.5326762795448303"}
|
||||||
|
{"doi":"10.3152/147154306781778533","level1":"05 social sciences","level2":"0509 other social sciences","level3":"050905 science studies","level4":"05090502 Social philosophy/Capitalism","scoreL3":"0.4706590473651886","scoreL4":"0.4673237204551697"}
|
||||||
|
{"doi":"10.1785/0120050806","level1":"02 engineering and technology","level2":"0211 other engineering and technologies","level3":"021101 geological & geomatics engineering","level4":"02110103 Concrete/Building materials","scoreL3":"0.5343400835990906","scoreL4":"0.3285667930180677"}
|
|
@ -33,15 +33,15 @@ case class mappingAuthor(
|
||||||
|
|
||||||
case class funderInfo(id: String, uri: String, name: String, synonym: List[String]) {}
|
case class funderInfo(id: String, uri: String, name: String, synonym: List[String]) {}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
case class mappingFunder(name: String, DOI: Option[String], award: Option[List[String]]) {}
|
case class mappingFunder(name: String, DOI: Option[String], award: Option[List[String]]) {}
|
||||||
|
|
||||||
case object Crossref2Oaf {
|
case object Crossref2Oaf {
|
||||||
val logger: Logger = LoggerFactory.getLogger(Crossref2Oaf.getClass)
|
val logger: Logger = LoggerFactory.getLogger(Crossref2Oaf.getClass)
|
||||||
|
|
||||||
val irishFunder: List[funderInfo] = {
|
val irishFunder: List[funderInfo] = {
|
||||||
val s = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/crossref/irish_funder.json")).mkString
|
val s = Source
|
||||||
|
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/crossref/irish_funder.json"))
|
||||||
|
.mkString
|
||||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||||
lazy val json: org.json4s.JValue = parse(s)
|
lazy val json: org.json4s.JValue = parse(s)
|
||||||
json.extract[List[funderInfo]]
|
json.extract[List[funderInfo]]
|
||||||
|
@ -102,7 +102,9 @@ case object Crossref2Oaf {
|
||||||
|
|
||||||
def getIrishId(doi: String): Option[String] = {
|
def getIrishId(doi: String): Option[String] = {
|
||||||
val id = doi.split("/").last
|
val id = doi.split("/").last
|
||||||
irishFunder.find(f => id.equalsIgnoreCase(f.id) || (f.synonym.nonEmpty && f.synonym.exists(s => s.equalsIgnoreCase(id)))).map(f => f.id)
|
irishFunder
|
||||||
|
.find(f => id.equalsIgnoreCase(f.id) || (f.synonym.nonEmpty && f.synonym.exists(s => s.equalsIgnoreCase(id))))
|
||||||
|
.map(f => f.id)
|
||||||
}
|
}
|
||||||
|
|
||||||
def mappingResult(result: Result, json: JValue, cobjCategory: String): Result = {
|
def mappingResult(result: Result, json: JValue, cobjCategory: String): Result = {
|
||||||
|
|
Loading…
Reference in New Issue