extend the fos model to include the level4 and the scores for level3 and level4. removed bip indicators from the instance

This commit is contained in:
Miriam Baglioni 2023-10-10 09:46:40 +02:00
parent ef833840c3
commit 110ce4b40f
14 changed files with 334 additions and 61 deletions

View File

@ -40,6 +40,7 @@ public class Constants {
public static final String SDG_CLASS_NAME = "Sustainable Development Goals"; public static final String SDG_CLASS_NAME = "Sustainable Development Goals";
public static final String NULL = "NULL"; public static final String NULL = "NULL";
public static final String NA = "N/A";
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
@ -61,10 +62,16 @@ public class Constants {
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz)); .map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
} }
public static Subject getSubject(String sbj, String classid, String classname, public static Subject getSubject(String sbj, String classid, String classname, String diqualifierclassid,
String diqualifierclassid) { Boolean split) {
if (sbj == null || sbj.equals(NULL)) if (sbj == null || sbj.equals(NULL) || sbj.startsWith(NA))
return null; return null;
String trust = "";
String subject = sbj;
if (split) {
sbj = subject.split("@@")[0];
trust = subject.split("@@")[1];
}
Subject s = new Subject(); Subject s = new Subject();
s.setValue(sbj); s.setValue(sbj);
s s
@ -89,9 +96,14 @@ public class Constants {
UPDATE_CLASS_NAME, UPDATE_CLASS_NAME,
ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS,
ModelConstants.DNET_PROVENANCE_ACTIONS), ModelConstants.DNET_PROVENANCE_ACTIONS),
"")); trust));
return s; return s;
}
public static Subject getSubject(String sbj, String classid, String classname,
String diqualifierclassid) {
return getSubject(sbj, classid, classname, diqualifierclassid, false);
} }

View File

@ -75,9 +75,12 @@ public class GetFOSSparkJob implements Serializable {
fosData.map((MapFunction<Row, FOSDataModel>) r -> { fosData.map((MapFunction<Row, FOSDataModel>) r -> {
FOSDataModel fosDataModel = new FOSDataModel(); FOSDataModel fosDataModel = new FOSDataModel();
fosDataModel.setDoi(r.getString(0).toLowerCase()); fosDataModel.setDoi(r.getString(0).toLowerCase());
fosDataModel.setLevel1(r.getString(1)); fosDataModel.setLevel1(r.getString(2));
fosDataModel.setLevel2(r.getString(2)); fosDataModel.setLevel2(r.getString(3));
fosDataModel.setLevel3(r.getString(3)); fosDataModel.setLevel3(r.getString(4));
fosDataModel.setLevel4(r.getString(5));
fosDataModel.setScoreL3(String.valueOf(r.getDouble(6)));
fosDataModel.setScoreL4(String.valueOf(r.getDouble(7)));
return fosDataModel; return fosDataModel;
}, Encoders.bean(FOSDataModel.class)) }, Encoders.bean(FOSDataModel.class))
.write() .write()

View File

@ -78,12 +78,20 @@ public class PrepareFOSSparkJob implements Serializable {
HashSet<String> level1 = new HashSet<>(); HashSet<String> level1 = new HashSet<>();
HashSet<String> level2 = new HashSet<>(); HashSet<String> level2 = new HashSet<>();
HashSet<String> level3 = new HashSet<>(); HashSet<String> level3 = new HashSet<>();
addLevels(level1, level2, level3, first); HashSet<String> level4 = new HashSet<>();
it.forEachRemaining(v -> addLevels(level1, level2, level3, v)); addLevels(level1, level2, level3, level4, first);
it.forEachRemaining(v -> addLevels(level1, level2, level3, level4, v));
List<Subject> sbjs = new ArrayList<>(); List<Subject> sbjs = new ArrayList<>();
level1.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID))); level1
level2.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID))); .forEach(l -> add(sbjs, getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID)));
level3.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID))); level2
.forEach(l -> add(sbjs, getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID)));
level3
.forEach(
l -> add(sbjs, getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID, true)));
level4
.forEach(
l -> add(sbjs, getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID, true)));
r.setSubject(sbjs); r.setSubject(sbjs);
r r
.setDataInfo( .setDataInfo(
@ -106,11 +114,18 @@ public class PrepareFOSSparkJob implements Serializable {
.json(outputPath + "/fos"); .json(outputPath + "/fos");
} }
private static void add(List<Subject> sbsjs, Subject sbj) {
if (sbj != null)
sbsjs.add(sbj);
}
private static void addLevels(HashSet<String> level1, HashSet<String> level2, HashSet<String> level3, private static void addLevels(HashSet<String> level1, HashSet<String> level2, HashSet<String> level3,
HashSet<String> level4,
FOSDataModel first) { FOSDataModel first) {
level1.add(first.getLevel1()); level1.add(first.getLevel1());
level2.add(first.getLevel2()); level2.add(first.getLevel2());
level3.add(first.getLevel3()); level3.add(first.getLevel3() + "@@" + first.getScoreL3());
level4.add(first.getLevel4() + "@@" + first.getScoreL4());
} }
} }

View File

@ -69,9 +69,9 @@ public class SparkSaveUnresolved implements Serializable {
.mapGroups((MapGroupsFunction<String, Result, Result>) (k, it) -> { .mapGroups((MapGroupsFunction<String, Result, Result>) (k, it) -> {
Result ret = it.next(); Result ret = it.next();
it.forEachRemaining(r -> { it.forEachRemaining(r -> {
if (r.getInstance() != null) { // if (r.getInstance() != null) {
ret.setInstance(r.getInstance()); // ret.setInstance(r.getInstance());
} // }
if (r.getSubject() != null) { if (r.getSubject() != null) {
if (ret.getSubject() != null) if (ret.getSubject() != null)
ret.getSubject().addAll(r.getSubject()); ret.getSubject().addAll(r.getSubject());

View File

@ -11,21 +11,43 @@ public class FOSDataModel implements Serializable {
private String doi; private String doi;
@CsvBindByPosition(position = 1) @CsvBindByPosition(position = 1)
// @CsvBindByName(column = "doi")
private String oaid;
@CsvBindByPosition(position = 2)
// @CsvBindByName(column = "level1") // @CsvBindByName(column = "level1")
private String level1; private String level1;
@CsvBindByPosition(position = 2) @CsvBindByPosition(position = 3)
// @CsvBindByName(column = "level2") // @CsvBindByName(column = "level2")
private String level2; private String level2;
@CsvBindByPosition(position = 3) @CsvBindByPosition(position = 4)
// @CsvBindByName(column = "level3") // @CsvBindByName(column = "level3")
private String level3; private String level3;
@CsvBindByPosition(position = 5)
// @CsvBindByName(column = "level3")
private String level4;
@CsvBindByPosition(position = 6)
private String scoreL3;
@CsvBindByPosition(position = 7)
private String scoreL4;
public FOSDataModel() { public FOSDataModel() {
} }
public FOSDataModel(String doi, String level1, String level2, String level3, String level4, String l3score,
String l4score) {
this.doi = doi;
this.level1 = level1;
this.level2 = level2;
this.level3 = level3;
this.level4 = level4;
this.scoreL3 = l3score;
this.scoreL4 = l4score;
}
public FOSDataModel(String doi, String level1, String level2, String level3) { public FOSDataModel(String doi, String level1, String level2, String level3) {
this.doi = doi; this.doi = doi;
this.level1 = level1; this.level1 = level1;
@ -33,8 +55,41 @@ public class FOSDataModel implements Serializable {
this.level3 = level3; this.level3 = level3;
} }
public static FOSDataModel newInstance(String d, String level1, String level2, String level3) { public static FOSDataModel newInstance(String d, String level1, String level2, String level3, String level4,
return new FOSDataModel(d, level1, level2, level3); String scorel3, String scorel4) {
return new FOSDataModel(d, level1, level2, level3, level4, scorel3, scorel4);
}
public String getOaid() {
return oaid;
}
public void setOaid(String oaid) {
this.oaid = oaid;
}
public String getLevel4() {
return level4;
}
public void setLevel4(String level4) {
this.level4 = level4;
}
public String getScoreL3() {
return scoreL3;
}
public void setScoreL3(String scoreL3) {
this.scoreL3 = scoreL3;
}
public String getScoreL4() {
return scoreL4;
}
public void setScoreL4(String scoreL4) {
this.scoreL4 = scoreL4;
} }
public String getDoi() { public String getDoi() {

View File

@ -7,7 +7,6 @@ import java.io.IOException;
import java.io.Serializable; import java.io.Serializable;
import java.util.*; import java.util.*;
import eu.dnetlib.dhp.schema.oaf.utils.*;
import org.apache.commons.cli.ParseException; import org.apache.commons.cli.ParseException;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Text;
@ -30,6 +29,7 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.action.AtomicAction; import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.*;
import eu.dnetlib.dhp.utils.DHPUtils; import eu.dnetlib.dhp.utils.DHPUtils;
import scala.Tuple2; import scala.Tuple2;

View File

@ -6,10 +6,10 @@
<description>the input path of the resources to be extended</description> <description>the input path of the resources to be extended</description>
</property> </property>
<property> <!-- <property>-->
<name>bipScorePath</name> <!-- <name>bipScorePath</name>-->
<description>the path where to find the bipFinder scores</description> <!-- <description>the path where to find the bipFinder scores</description>-->
</property> <!-- </property>-->
<property> <property>
<name>outputPath</name> <name>outputPath</name>
<description>the path where to store the actionset</description> <description>the path where to store the actionset</description>
@ -77,34 +77,34 @@
<fork name="prepareInfo"> <fork name="prepareInfo">
<path start="prepareBip"/> <!-- <path start="prepareBip"/>-->
<path start="getFOS"/> <path start="getFOS"/>
<path start="getSDG"/> <path start="getSDG"/>
</fork> </fork>
<action name="prepareBip"> <!-- <action name="prepareBip">-->
<spark xmlns="uri:oozie:spark-action:0.2"> <!-- <spark xmlns="uri:oozie:spark-action:0.2">-->
<master>yarn</master> <!-- <master>yarn</master>-->
<mode>cluster</mode> <!-- <mode>cluster</mode>-->
<name>Produces the unresolved from BIP! Finder</name> <!-- <name>Produces the unresolved from BIP! Finder</name>-->
<class>eu.dnetlib.dhp.actionmanager.createunresolvedentities.PrepareBipFinder</class> <!-- <class>eu.dnetlib.dhp.actionmanager.createunresolvedentities.PrepareBipFinder</class>-->
<jar>dhp-aggregation-${projectVersion}.jar</jar> <!-- <jar>dhp-aggregation-${projectVersion}.jar</jar>-->
<spark-opts> <!-- <spark-opts>-->
--executor-memory=${sparkExecutorMemory} <!-- &#45;&#45;executor-memory=${sparkExecutorMemory}-->
--executor-cores=${sparkExecutorCores} <!-- &#45;&#45;executor-cores=${sparkExecutorCores}-->
--driver-memory=${sparkDriverMemory} <!-- &#45;&#45;driver-memory=${sparkDriverMemory}-->
--conf spark.extraListeners=${spark2ExtraListeners} <!-- &#45;&#45;conf spark.extraListeners=${spark2ExtraListeners}-->
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} <!-- &#45;&#45;conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}-->
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} <!-- &#45;&#45;conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}-->
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} <!-- &#45;&#45;conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}-->
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} <!-- &#45;&#45;conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}-->
</spark-opts> <!-- </spark-opts>-->
<arg>--sourcePath</arg><arg>${bipScorePath}</arg> <!-- <arg>&#45;&#45;sourcePath</arg><arg>${bipScorePath}</arg>-->
<arg>--outputPath</arg><arg>${workingDir}/prepared</arg> <!-- <arg>&#45;&#45;outputPath</arg><arg>${workingDir}/prepared</arg>-->
</spark> <!-- </spark>-->
<ok to="join"/> <!-- <ok to="join"/>-->
<error to="Kill"/> <!-- <error to="Kill"/>-->
</action> <!-- </action>-->
<action name="getFOS"> <action name="getFOS">
<spark xmlns="uri:oozie:spark-action:0.2"> <spark xmlns="uri:oozie:spark-action:0.2">

View File

@ -13,10 +13,7 @@ import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.*;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -68,6 +65,7 @@ public class GetFosTest {
} }
@Test @Test
@Disabled
void test3() throws Exception { void test3() throws Exception {
final String sourcePath = getClass() final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/createunresolvedentities/fos/fos_sbs.tsv") .getResource("/eu/dnetlib/dhp/actionmanager/createunresolvedentities/fos/fos_sbs.tsv")
@ -96,4 +94,37 @@ public class GetFosTest {
tmp.foreach(t -> Assertions.assertTrue(t.getLevel3() != null)); tmp.foreach(t -> Assertions.assertTrue(t.getLevel3() != null));
} }
@Test
void test4() throws Exception {
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/createunresolvedentities/fos/fos_sbs2.csv")
.getPath();
final String outputPath = workingDir.toString() + "/fos.json";
GetFOSSparkJob
.main(
new String[] {
"--isSparkSessionManaged", Boolean.FALSE.toString(),
"--sourcePath", sourcePath,
"--delimiter", ",",
"-outputPath", outputPath
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<FOSDataModel> tmp = sc
.textFile(outputPath)
.map(item -> OBJECT_MAPPER.readValue(item, FOSDataModel.class));
tmp.foreach(t -> Assertions.assertTrue(t.getDoi() != null));
tmp.foreach(t -> Assertions.assertTrue(t.getLevel1() != null));
tmp.foreach(t -> Assertions.assertTrue(t.getLevel2() != null));
tmp.foreach(t -> Assertions.assertTrue(t.getLevel3() != null));
tmp.foreach(t -> Assertions.assertTrue(t.getLevel4() != null));
tmp.foreach(t -> Assertions.assertTrue(t.getScoreL3() != null));
tmp.foreach(t -> Assertions.assertTrue(t.getScoreL4() != null));
}
} }

View File

@ -222,6 +222,76 @@ public class PrepareTest {
} }
@Test
void fosPrepareTest2() throws Exception {
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/createunresolvedentities/fos/fos_sbs_2.json")
.getPath();
PrepareFOSSparkJob
.main(
new String[] {
"--isSparkSessionManaged", Boolean.FALSE.toString(),
"--sourcePath", sourcePath,
"-outputPath", workingDir.toString() + "/work"
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Result> tmp = sc
.textFile(workingDir.toString() + "/work/fos")
.map(item -> OBJECT_MAPPER.readValue(item, Result.class));
String doi1 = "unresolved::10.1016/j.revmed.2006.07.012::doi";
assertEquals(13, tmp.count());
assertEquals(1, tmp.filter(row -> row.getId().equals(doi1)).count());
Result result = tmp
.filter(r -> r.getId().equals(doi1))
.first();
result.getSubject().forEach(s -> System.out.println(s.getValue() + " trust = " + s.getDataInfo().getTrust()));
Assertions.assertEquals(6, result.getSubject().size());
assertTrue(
result
.getSubject()
.stream()
.anyMatch(
s -> s.getValue().contains("03 medical and health sciences")
&& s.getDataInfo().getTrust().equals("")));
assertTrue(
result
.getSubject()
.stream()
.anyMatch(
s -> s.getValue().contains("0302 clinical medicine") && s.getDataInfo().getTrust().equals("")));
assertTrue(
result
.getSubject()
.stream()
.anyMatch(
s -> s
.getValue()
.contains("030204 cardiovascular system & hematology")
&& s.getDataInfo().getTrust().equals("0.5101401805877686")));
assertTrue(
result
.getSubject()
.stream()
.anyMatch(
s -> s
.getValue()
.contains("03020409 Hematology/Coagulopathies")
&& s.getDataInfo().getTrust().equals("0.0546871414174914")));
}
@Test @Test
void sdgPrepareTest() throws Exception { void sdgPrepareTest() throws Exception {
final String sourcePath = getClass() final String sourcePath = getClass()

View File

@ -379,6 +379,40 @@ public class ProduceTest {
.map(item -> OBJECT_MAPPER.readValue(item, Result.class)); .map(item -> OBJECT_MAPPER.readValue(item, Result.class));
} }
@Test
public JavaRDD<Result> getResultFosJavaRDD() throws Exception {
final String fosPath = getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/createunresolvedentities/fos/fos_sbs_2.json")
.getPath();
PrepareFOSSparkJob
.main(
new String[] {
"--isSparkSessionManaged", Boolean.FALSE.toString(),
"--sourcePath", fosPath,
"-outputPath", workingDir.toString() + "/work"
});
SparkSaveUnresolved.main(new String[] {
"--isSparkSessionManaged", Boolean.FALSE.toString(),
"--sourcePath", workingDir.toString() + "/work",
"-outputPath", workingDir.toString() + "/unresolved"
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Result> tmp = sc
.textFile(workingDir.toString() + "/unresolved")
.map(item -> OBJECT_MAPPER.readValue(item, Result.class));
tmp.foreach(r -> System.out.println(new ObjectMapper().writeValueAsString(r)));
return tmp;
}
@Test @Test
void prepareTest5Subjects() throws Exception { void prepareTest5Subjects() throws Exception {
final String doi = "unresolved::10.1063/5.0032658::doi"; final String doi = "unresolved::10.1063/5.0032658::doi";

View File

@ -0,0 +1,26 @@
DOI,OAID,level1,level2,level3,level4,score_for_L3,score_for_L4
10.1016/j.anucene.2006.02.004,doi_________::00059d9963edf633bec756fb21b5bd72,02 engineering and technology,"0202 electrical engineering, electronic engineering, information engineering",020209 energy,02020908 Climate change policy/Ethanol fuel,0.5,0.5
10.1016/j.anucene.2006.02.004,doi_________::00059d9963edf633bec756fb21b5bd72,02 engineering and technology,0211 other engineering and technologies,021108 energy,02110808 Climate change policy/Ethanol fuel,0.5,0.5
10.1016/j.revmed.2006.07.010,doi_________::0026476c1651a92c933d752ff12496c7,03 medical and health sciences,0302 clinical medicine,030220 oncology & carcinogenesis,N/A,0.5036656856536865,0.0
10.1016/j.revmed.2006.07.010,doi_________::0026476c1651a92c933d752ff12496c7,03 medical and health sciences,0302 clinical medicine,030212 general & internal medicine,N/A,0.4963343143463135,0.0
10.20965/jrm.2006.p0312,doi_________::0028336a2f3826cc83c47dbefac71543,02 engineering and technology,0209 industrial biotechnology,020901 industrial engineering & automation,02090104 Robotics/Robots,0.6111094951629639,0.5053805979936855
10.20965/jrm.2006.p0312,doi_________::0028336a2f3826cc83c47dbefac71543,01 natural sciences,0104 chemical sciences,010401 analytical chemistry,N/A,0.3888905048370361,0.0
10.1111/j.1747-7379.2006.040_1.x,doi_________::002c7077e7c114a8304eb90f59e45fa4,05 social sciences,0506 political science,050602 political science & public administration,05060202 Ethnic groups/Ethnicity,0.6159052848815918,0.7369035568037298
10.1111/j.1747-7379.2006.040_1.x,doi_________::002c7077e7c114a8304eb90f59e45fa4,05 social sciences,0502 economics and business,050207 economics,N/A,0.3840946555137634,0.0
10.1007/s10512-006-0049-9,doi_________::003f29f9254819cf4c78558b1bc25f10,02 engineering and technology,"0202 electrical engineering, electronic engineering, information engineering",020209 energy,02020908 Climate change policy/Ethanol fuel,0.5,0.5
10.1007/s10512-006-0049-9,doi_________::003f29f9254819cf4c78558b1bc25f10,02 engineering and technology,0211 other engineering and technologies,021108 energy,02110808 Climate change policy/Ethanol fuel,0.5,0.5
10.1111/j.1365-2621.2005.01045.x,doi_________::00419355b4c3e0646bd0e1b301164c8e,04 agricultural and veterinary sciences,0404 agricultural biotechnology,040401 food science,04040102 Food science/Food industry,0.5,0.5
10.1111/j.1365-2621.2005.01045.x,doi_________::00419355b4c3e0646bd0e1b301164c8e,04 agricultural and veterinary sciences,0405 other agricultural sciences,040502 food science,04050202 Food science/Food industry,0.5,0.5
10.1002/chin.200617262,doi_________::004c8cef80668904961b9e62841793c8,01 natural sciences,0104 chemical sciences,010405 organic chemistry,01040508 Functional groups/Ethers,0.5566747188568115,0.5582916736602783
10.1002/chin.200617262,doi_________::004c8cef80668904961b9e62841793c8,01 natural sciences,0104 chemical sciences,010402 general chemistry,01040207 Chemical synthesis/Total synthesis,0.4433253407478332,0.4417082965373993
10.1016/j.revmed.2006.07.012,doi_________::005b1d0fb650b680abaf6cfe26a21604,03 medical and health sciences,0302 clinical medicine,030204 cardiovascular system & hematology,03020409 Hematology/Coagulopathies,0.5101401805877686,0.0546871414174914
10.1016/j.revmed.2006.07.012,doi_________::005b1d0fb650b680abaf6cfe26a21604,03 medical and health sciences,0301 basic medicine,030105 genetics & heredity,N/A,0.4898599088191986,0.0
10.4109/jslab.17.132,doi_________::00889baa06de363e37930daaf8e800c0,03 medical and health sciences,0301 basic medicine,030104 developmental biology,N/A,0.5,0.0
10.4109/jslab.17.132,doi_________::00889baa06de363e37930daaf8e800c0,03 medical and health sciences,0303 health sciences,030304 developmental biology,N/A,0.5,0.0
10.1108/00251740610715687,doi_________::0092cb1b1920d556719385a26363ecaa,05 social sciences,0502 economics and business,050203 business & management,05020311 International business/International trade,0.605047881603241,0.2156608108845153
10.1108/00251740610715687,doi_________::0092cb1b1920d556719385a26363ecaa,05 social sciences,0502 economics and business,050211 marketing,N/A,0.394952118396759,0.0
10.1080/03067310500248098,doi_________::00a76678d230e3f20b6356804448028f,04 agricultural and veterinary sciences,0404 agricultural biotechnology,040401 food science,04040102 Food science/Food industry,0.5,0.5
10.1080/03067310500248098,doi_________::00a76678d230e3f20b6356804448028f,04 agricultural and veterinary sciences,0405 other agricultural sciences,040502 food science,04050202 Food science/Food industry,0.5,0.5
10.3152/147154306781778533,doi_________::00acc520f3939e5a6675343881fed4f2,05 social sciences,0502 economics and business,050203 business & management,05020307 Innovation/Product management,0.5293408632278442,0.5326762795448303
10.3152/147154306781778533,doi_________::00acc520f3939e5a6675343881fed4f2,05 social sciences,0509 other social sciences,050905 science studies,05090502 Social philosophy/Capitalism,0.4706590473651886,0.4673237204551697
10.1785/0120050806,doi_________::00d5831d329e7ae4523d78bfc3042e98,02 engineering and technology,0211 other engineering and technologies,021101 geological & geomatics engineering,02110103 Concrete/Building materials,0.5343400835990906,0.3285667930180677
1 DOI OAID level1 level2 level3 level4 score_for_L3 score_for_L4
2 10.1016/j.anucene.2006.02.004 doi_________::00059d9963edf633bec756fb21b5bd72 02 engineering and technology 0202 electrical engineering, electronic engineering, information engineering 020209 energy 02020908 Climate change policy/Ethanol fuel 0.5 0.5
3 10.1016/j.anucene.2006.02.004 doi_________::00059d9963edf633bec756fb21b5bd72 02 engineering and technology 0211 other engineering and technologies 021108 energy 02110808 Climate change policy/Ethanol fuel 0.5 0.5
4 10.1016/j.revmed.2006.07.010 doi_________::0026476c1651a92c933d752ff12496c7 03 medical and health sciences 0302 clinical medicine 030220 oncology & carcinogenesis N/A 0.5036656856536865 0.0
5 10.1016/j.revmed.2006.07.010 doi_________::0026476c1651a92c933d752ff12496c7 03 medical and health sciences 0302 clinical medicine 030212 general & internal medicine N/A 0.4963343143463135 0.0
6 10.20965/jrm.2006.p0312 doi_________::0028336a2f3826cc83c47dbefac71543 02 engineering and technology 0209 industrial biotechnology 020901 industrial engineering & automation 02090104 Robotics/Robots 0.6111094951629639 0.5053805979936855
7 10.20965/jrm.2006.p0312 doi_________::0028336a2f3826cc83c47dbefac71543 01 natural sciences 0104 chemical sciences 010401 analytical chemistry N/A 0.3888905048370361 0.0
8 10.1111/j.1747-7379.2006.040_1.x doi_________::002c7077e7c114a8304eb90f59e45fa4 05 social sciences 0506 political science 050602 political science & public administration 05060202 Ethnic groups/Ethnicity 0.6159052848815918 0.7369035568037298
9 10.1111/j.1747-7379.2006.040_1.x doi_________::002c7077e7c114a8304eb90f59e45fa4 05 social sciences 0502 economics and business 050207 economics N/A 0.3840946555137634 0.0
10 10.1007/s10512-006-0049-9 doi_________::003f29f9254819cf4c78558b1bc25f10 02 engineering and technology 0202 electrical engineering, electronic engineering, information engineering 020209 energy 02020908 Climate change policy/Ethanol fuel 0.5 0.5
11 10.1007/s10512-006-0049-9 doi_________::003f29f9254819cf4c78558b1bc25f10 02 engineering and technology 0211 other engineering and technologies 021108 energy 02110808 Climate change policy/Ethanol fuel 0.5 0.5
12 10.1111/j.1365-2621.2005.01045.x doi_________::00419355b4c3e0646bd0e1b301164c8e 04 agricultural and veterinary sciences 0404 agricultural biotechnology 040401 food science 04040102 Food science/Food industry 0.5 0.5
13 10.1111/j.1365-2621.2005.01045.x doi_________::00419355b4c3e0646bd0e1b301164c8e 04 agricultural and veterinary sciences 0405 other agricultural sciences 040502 food science 04050202 Food science/Food industry 0.5 0.5
14 10.1002/chin.200617262 doi_________::004c8cef80668904961b9e62841793c8 01 natural sciences 0104 chemical sciences 010405 organic chemistry 01040508 Functional groups/Ethers 0.5566747188568115 0.5582916736602783
15 10.1002/chin.200617262 doi_________::004c8cef80668904961b9e62841793c8 01 natural sciences 0104 chemical sciences 010402 general chemistry 01040207 Chemical synthesis/Total synthesis 0.4433253407478332 0.4417082965373993
16 10.1016/j.revmed.2006.07.012 doi_________::005b1d0fb650b680abaf6cfe26a21604 03 medical and health sciences 0302 clinical medicine 030204 cardiovascular system & hematology 03020409 Hematology/Coagulopathies 0.5101401805877686 0.0546871414174914
17 10.1016/j.revmed.2006.07.012 doi_________::005b1d0fb650b680abaf6cfe26a21604 03 medical and health sciences 0301 basic medicine 030105 genetics & heredity N/A 0.4898599088191986 0.0
18 10.4109/jslab.17.132 doi_________::00889baa06de363e37930daaf8e800c0 03 medical and health sciences 0301 basic medicine 030104 developmental biology N/A 0.5 0.0
19 10.4109/jslab.17.132 doi_________::00889baa06de363e37930daaf8e800c0 03 medical and health sciences 0303 health sciences 030304 developmental biology N/A 0.5 0.0
20 10.1108/00251740610715687 doi_________::0092cb1b1920d556719385a26363ecaa 05 social sciences 0502 economics and business 050203 business & management 05020311 International business/International trade 0.605047881603241 0.2156608108845153
21 10.1108/00251740610715687 doi_________::0092cb1b1920d556719385a26363ecaa 05 social sciences 0502 economics and business 050211 marketing N/A 0.394952118396759 0.0
22 10.1080/03067310500248098 doi_________::00a76678d230e3f20b6356804448028f 04 agricultural and veterinary sciences 0404 agricultural biotechnology 040401 food science 04040102 Food science/Food industry 0.5 0.5
23 10.1080/03067310500248098 doi_________::00a76678d230e3f20b6356804448028f 04 agricultural and veterinary sciences 0405 other agricultural sciences 040502 food science 04050202 Food science/Food industry 0.5 0.5
24 10.3152/147154306781778533 doi_________::00acc520f3939e5a6675343881fed4f2 05 social sciences 0502 economics and business 050203 business & management 05020307 Innovation/Product management 0.5293408632278442 0.5326762795448303
25 10.3152/147154306781778533 doi_________::00acc520f3939e5a6675343881fed4f2 05 social sciences 0509 other social sciences 050905 science studies 05090502 Social philosophy/Capitalism 0.4706590473651886 0.4673237204551697
26 10.1785/0120050806 doi_________::00d5831d329e7ae4523d78bfc3042e98 02 engineering and technology 0211 other engineering and technologies 021101 geological & geomatics engineering 02110103 Concrete/Building materials 0.5343400835990906 0.3285667930180677

View File

@ -0,0 +1,25 @@
{"doi":"10.1016/j.anucene.2006.02.004","level1":"02 engineering and technology","level2":"0202 electrical engineering, electronic engineering, information engineering","level3":"020209 energy","level4":"02020908 Climate change policy/Ethanol fuel","scoreL3":"0.5","scoreL4":"0.5"}
{"doi":"10.1016/j.anucene.2006.02.004","level1":"02 engineering and technology","level2":"0211 other engineering and technologies","level3":"021108 energy","level4":"02110808 Climate change policy/Ethanol fuel","scoreL3":"0.5","scoreL4":"0.5"}
{"doi":"10.1016/j.revmed.2006.07.010","level1":"03 medical and health sciences","level2":"0302 clinical medicine","level3":"030220 oncology & carcinogenesis","level4":"N/A","scoreL3":"0.5036656856536865","scoreL4":"0.0"}
{"doi":"10.1016/j.revmed.2006.07.010","level1":"03 medical and health sciences","level2":"0302 clinical medicine","level3":"030212 general & internal medicine","level4":"N/A","scoreL3":"0.4963343143463135","scoreL4":"0.0"}
{"doi":"10.20965/jrm.2006.p0312","level1":"02 engineering and technology","level2":"0209 industrial biotechnology","level3":"020901 industrial engineering & automation","level4":"02090104 Robotics/Robots","scoreL3":"0.6111094951629639","scoreL4":"0.5053805979936855"}
{"doi":"10.20965/jrm.2006.p0312","level1":"01 natural sciences","level2":"0104 chemical sciences","level3":"010401 analytical chemistry","level4":"N/A","scoreL3":"0.3888905048370361","scoreL4":"0.0"}
{"doi":"10.1111/j.1747-7379.2006.040_1.x","level1":"05 social sciences","level2":"0506 political science","level3":"050602 political science & public administration","level4":"05060202 Ethnic groups/Ethnicity","scoreL3":"0.6159052848815918","scoreL4":"0.7369035568037298"}
{"doi":"10.1111/j.1747-7379.2006.040_1.x","level1":"05 social sciences","level2":"0502 economics and business","level3":"050207 economics","level4":"N/A","scoreL3":"0.3840946555137634","scoreL4":"0.0"}
{"doi":"10.1007/s10512-006-0049-9","level1":"02 engineering and technology","level2":"0202 electrical engineering, electronic engineering, information engineering","level3":"020209 energy","level4":"02020908 Climate change policy/Ethanol fuel","scoreL3":"0.5","scoreL4":"0.5"}
{"doi":"10.1007/s10512-006-0049-9","level1":"02 engineering and technology","level2":"0211 other engineering and technologies","level3":"021108 energy","level4":"02110808 Climate change policy/Ethanol fuel","scoreL3":"0.5","scoreL4":"0.5"}
{"doi":"10.1111/j.1365-2621.2005.01045.x","level1":"04 agricultural and veterinary sciences","level2":"0404 agricultural biotechnology","level3":"040401 food science","level4":"04040102 Food science/Food industry","scoreL3":"0.5","scoreL4":"0.5"}
{"doi":"10.1111/j.1365-2621.2005.01045.x","level1":"04 agricultural and veterinary sciences","level2":"0405 other agricultural sciences","level3":"040502 food science","level4":"04050202 Food science/Food industry","scoreL3":"0.5","scoreL4":"0.5"}
{"doi":"10.1002/chin.200617262","level1":"01 natural sciences","level2":"0104 chemical sciences","level3":"010405 organic chemistry","level4":"01040508 Functional groups/Ethers","scoreL3":"0.5566747188568115","scoreL4":"0.5582916736602783"}
{"doi":"10.1002/chin.200617262","level1":"01 natural sciences","level2":"0104 chemical sciences","level3":"010402 general chemistry","level4":"01040207 Chemical synthesis/Total synthesis","scoreL3":"0.4433253407478332","scoreL4":"0.4417082965373993"}
{"doi":"10.1016/j.revmed.2006.07.012","level1":"03 medical and health sciences","level2":"0302 clinical medicine","level3":"030204 cardiovascular system & hematology","level4":"03020409 Hematology/Coagulopathies","scoreL3":"0.5101401805877686","scoreL4":"0.0546871414174914"}
{"doi":"10.1016/j.revmed.2006.07.012","level1":"03 medical and health sciences","level2":"0301 basic medicine","level3":"030105 genetics & heredity","level4":"N/A","scoreL3":"0.4898599088191986","scoreL4":"0.0"}
{"doi":"10.4109/jslab.17.132","level1":"03 medical and health sciences","level2":"0301 basic medicine","level3":"030104 developmental biology","level4":"N/A","scoreL3":"0.5","scoreL4":"0.0"}
{"doi":"10.4109/jslab.17.132","level1":"03 medical and health sciences","level2":"0303 health sciences","level3":"030304 developmental biology","level4":"N/A","scoreL3":"0.5","scoreL4":"0.0"}
{"doi":"10.1108/00251740610715687","level1":"05 social sciences","level2":"0502 economics and business","level3":"050203 business & management","level4":"05020311 International business/International trade","scoreL3":"0.605047881603241","scoreL4":"0.2156608108845153"}
{"doi":"10.1108/00251740610715687","level1":"05 social sciences","level2":"0502 economics and business","level3":"050211 marketing","level4":"N/A","scoreL3":"0.394952118396759","scoreL4":"0.0"}
{"doi":"10.1080/03067310500248098","level1":"04 agricultural and veterinary sciences","level2":"0404 agricultural biotechnology","level3":"040401 food science","level4":"04040102 Food science/Food industry","scoreL3":"0.5","scoreL4":"0.5"}
{"doi":"10.1080/03067310500248098","level1":"04 agricultural and veterinary sciences","level2":"0405 other agricultural sciences","level3":"040502 food science","level4":"04050202 Food science/Food industry","scoreL3":"0.5","scoreL4":"0.5"}
{"doi":"10.3152/147154306781778533","level1":"05 social sciences","level2":"0502 economics and business","level3":"050203 business & management","level4":"05020307 Innovation/Product management","scoreL3":"0.5293408632278442","scoreL4":"0.5326762795448303"}
{"doi":"10.3152/147154306781778533","level1":"05 social sciences","level2":"0509 other social sciences","level3":"050905 science studies","level4":"05090502 Social philosophy/Capitalism","scoreL3":"0.4706590473651886","scoreL4":"0.4673237204551697"}
{"doi":"10.1785/0120050806","level1":"02 engineering and technology","level2":"0211 other engineering and technologies","level3":"021101 geological & geomatics engineering","level4":"02110103 Concrete/Building materials","scoreL3":"0.5343400835990906","scoreL4":"0.3285667930180677"}

View File

@ -33,15 +33,15 @@ case class mappingAuthor(
case class funderInfo(id: String, uri: String, name: String, synonym: List[String]) {} case class funderInfo(id: String, uri: String, name: String, synonym: List[String]) {}
case class mappingFunder(name: String, DOI: Option[String], award: Option[List[String]]) {} case class mappingFunder(name: String, DOI: Option[String], award: Option[List[String]]) {}
case object Crossref2Oaf { case object Crossref2Oaf {
val logger: Logger = LoggerFactory.getLogger(Crossref2Oaf.getClass) val logger: Logger = LoggerFactory.getLogger(Crossref2Oaf.getClass)
val irishFunder: List[funderInfo] = { val irishFunder: List[funderInfo] = {
val s = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/crossref/irish_funder.json")).mkString val s = Source
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/crossref/irish_funder.json"))
.mkString
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: org.json4s.JValue = parse(s) lazy val json: org.json4s.JValue = parse(s)
json.extract[List[funderInfo]] json.extract[List[funderInfo]]
@ -102,7 +102,9 @@ case object Crossref2Oaf {
def getIrishId(doi: String): Option[String] = { def getIrishId(doi: String): Option[String] = {
val id = doi.split("/").last val id = doi.split("/").last
irishFunder.find(f => id.equalsIgnoreCase(f.id) || (f.synonym.nonEmpty && f.synonym.exists(s => s.equalsIgnoreCase(id)))).map(f => f.id) irishFunder
.find(f => id.equalsIgnoreCase(f.id) || (f.synonym.nonEmpty && f.synonym.exists(s => s.equalsIgnoreCase(id))))
.map(f => f.id)
} }
def mappingResult(result: Result, json: JValue, cobjCategory: String): Result = { def mappingResult(result: Result, json: JValue, cobjCategory: String): Result = {