forked from antonis.lempesis/dnet-hadoop
improved crossref mapping
This commit is contained in:
parent
5d46ec7d5f
commit
e4b105cece
|
@ -2,8 +2,10 @@ package eu.dnetlib.doiboost.crossref
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.oaf._
|
import eu.dnetlib.dhp.schema.oaf._
|
||||||
import eu.dnetlib.dhp.utils.DHPUtils
|
import eu.dnetlib.dhp.utils.DHPUtils
|
||||||
|
import org.apache.commons.lang.StringUtils
|
||||||
import org.json4s
|
import org.json4s
|
||||||
import org.json4s.DefaultFormats
|
import org.json4s.DefaultFormats
|
||||||
|
import org.json4s.JsonAST._
|
||||||
import org.json4s.jackson.JsonMethods._
|
import org.json4s.jackson.JsonMethods._
|
||||||
import org.slf4j.Logger
|
import org.slf4j.Logger
|
||||||
|
|
||||||
|
@ -11,7 +13,7 @@ import scala.collection.JavaConverters._
|
||||||
|
|
||||||
class Crossref2Oaf {
|
class Crossref2Oaf {
|
||||||
|
|
||||||
//STATIC STRING
|
//STATIC STRING
|
||||||
val MAG = "MAG"
|
val MAG = "MAG"
|
||||||
val ORCID = "ORCID"
|
val ORCID = "ORCID"
|
||||||
val CROSSREF = "Crossref"
|
val CROSSREF = "Crossref"
|
||||||
|
@ -105,24 +107,79 @@ class Crossref2Oaf {
|
||||||
// Add DataInfo
|
// Add DataInfo
|
||||||
result.setDataInfo(generateDataInfo())
|
result.setDataInfo(generateDataInfo())
|
||||||
|
|
||||||
result.setLastupdatetimestamp((json \"indexed" \"timestamp").extract[Long])
|
result.setLastupdatetimestamp((json \ "indexed" \ "timestamp").extract[Long])
|
||||||
result.setDateofcollection((json \"indexed" \"date-time").extract[String])
|
result.setDateofcollection((json \ "indexed" \ "date-time").extract[String])
|
||||||
|
|
||||||
//result.setCollectedfrom()
|
result.setCollectedfrom(List(createCollectedFrom()).asJava)
|
||||||
|
|
||||||
|
// Publisher ( Name of work's publisher mapped into Result/Publisher)
|
||||||
|
val publisher = (json \ "publisher").extract[String]
|
||||||
|
result.setPublisher(asField(publisher))
|
||||||
|
|
||||||
|
// TITLE
|
||||||
|
val mainTitles = for {JString(title) <- json \ "title"} yield createSP(title, "main title", "dnet:dataCite_title")
|
||||||
|
val originalTitles = for {JString(title) <- json \ "original-title"} yield createSP(title, "alternative title", "dnet:dataCite_title")
|
||||||
|
val shortTitles = for {JString(title) <- json \ "short-title"} yield createSP(title, "alternative title", "dnet:dataCite_title")
|
||||||
|
result.setTitle((mainTitles ::: originalTitles ::: shortTitles).asJava)
|
||||||
|
|
||||||
|
// DESCRIPTION
|
||||||
|
val descriptionList = for {JString(description) <- json \ "abstract"} yield asField(description)
|
||||||
|
result.setDescription(descriptionList.asJava)
|
||||||
|
// Source
|
||||||
|
val sourceList = for {JString(source) <- json \ "source"} yield asField(source)
|
||||||
|
|
||||||
|
result.setSource(sourceList.asJava)
|
||||||
|
|
||||||
|
|
||||||
|
//RELEVANT DATE Mapping
|
||||||
|
val createdDate =generateDate((json \ "created" \"date-time").extract[String],(json \ "created"\"date-parts").extract[List[List[Int]]],"created", "dnet:dataCite_date" )
|
||||||
|
val postedDate =generateDate((json \ "posted" \"date-time").extractOrElse[String](null),(json \ "posted"\"date-parts").extract[List[List[Int]]],"available", "dnet:dataCite_date" )
|
||||||
|
val acceptedDate =generateDate((json \ "accepted" \"date-time").extractOrElse[String](null),(json \ "accepted"\"date-parts").extract[List[List[Int]]],"accepted", "dnet:dataCite_date" )
|
||||||
|
val publishedPrintDate =generateDate((json \ "published-print" \"date-time").extractOrElse[String](null),(json \ "published-print"\"date-parts").extract[List[List[Int]]],"published-print", "dnet:dataCite_date" )
|
||||||
|
val publishedOnlineDate =generateDate((json \ "published-online" \"date-time").extractOrElse[String](null),(json \ "published-online"\"date-parts").extract[List[List[Int]]],"published-online", "dnet:dataCite_date" )
|
||||||
|
|
||||||
|
result.setRelevantdate(List(createdDate ,postedDate, acceptedDate,publishedOnlineDate, publishedPrintDate).asJava)
|
||||||
result
|
result
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def generateIdentifier(oaf: Result, doi:String): String = {
|
/**
  * Builds the StructuredProperty describing one date of a Crossref record.
  *
  * The textual timestamp is preferred; when it is blank the date is rebuilt
  * from the numeric date parts, but only if they consist of exactly one
  * `[year, month, day]` triple that renders as a 10-character `yyyy-MM-dd` string.
  *
  * @param dt       textual date (call sites pass the record's "date-time" value); may be null or blank
  * @param datePart numeric date parts (call sites pass the record's "date-parts" value); may be null
  * @param classId  class id forwarded to `createSP` for the resulting property
  * @param schemeId scheme id forwarded to `createSP` for the resulting property
  * @return the property built by `createSP`, or `null` when neither input yields a usable date
  */
def generateDate(dt: String, datePart: List[List[Int]], classId: String, schemeId: String): StructuredProperty = {
  if (StringUtils.isNotBlank(dt)) {
    // A textual timestamp is available: use it verbatim.
    createSP(dt, classId, schemeId)
  } else {
    // Fall back to the numeric parts. Partial dates (year only, or year and month)
    // and multi-entry lists are deliberately not converted, as before.
    val isoDate = Option(datePart).collect {
      case List(List(year, month, day)) => f"${year}-${month}%02d-${day}%02d"
    }.filter(_.length == 10) // rejects years that are not exactly four digits wide

    // NOTE(review): null is kept as the "no date" result for backward compatibility;
    // callers that collect these values into a list should filter the nulls out.
    isoDate.map(createSP(_, classId, schemeId)).orNull
  }
}
|
||||||
|
|
||||||
|
|
||||||
|
def generateIdentifier(oaf: Result, doi: String): String = {
|
||||||
val id = DHPUtils.md5(doi.toLowerCase)
|
val id = DHPUtils.md5(doi.toLowerCase)
|
||||||
if (oaf.isInstanceOf[Dataset])
|
if (oaf.isInstanceOf[Dataset])
|
||||||
return s"60|${doiBoostNSPREFIX}${SEPARATOR}${id}"
|
return s"60|${doiBoostNSPREFIX}${SEPARATOR}${id}"
|
||||||
s"50|${doiBoostNSPREFIX}${SEPARATOR}${id}"
|
s"50|${doiBoostNSPREFIX}${SEPARATOR}${id}"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
  * Wraps a plain value into a new `Field` instance.
  *
  * Only the value is set on the returned object; every other attribute of the
  * `Field` is left as its constructor initialised it.
  *
  * @param value the value to wrap
  * @tparam T type of the wrapped value
  * @return a new `Field[T]` carrying `value`
  */
def asField[T](value: T): Field[T] = {
  val field = new Field[T]()
  field.setValue(value)
  field
}
|
||||||
|
|
||||||
|
|
||||||
def generateDataInfo(): DataInfo = {
|
def generateDataInfo(): DataInfo = {
|
||||||
val di =new DataInfo
|
val di = new DataInfo
|
||||||
di.setDeletedbyinference(false)
|
di.setDeletedbyinference(false)
|
||||||
di.setInferred(false)
|
di.setInferred(false)
|
||||||
di.setInvisible(false)
|
di.setInvisible(false)
|
||||||
|
@ -140,7 +197,16 @@ class Crossref2Oaf {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
def createQualifier(cls:String, sch:String):Qualifier = {
|
/**
  * Creates the `KeyValue` that identifies Crossref as the source the record was collected from.
  *
  * The value is the `CROSSREF` constant; the key is the literal "10|" followed by
  * `OPENAIRE_PREFIX`, `SEPARATOR` and the MD5 of the string "crossref".
  *
  * @return a newly built `KeyValue` with key and value set
  */
def createCollectedFrom(): KeyValue = {
  val crossrefHash = DHPUtils.md5("crossref")
  val collectedFrom = new KeyValue
  collectedFrom.setKey(s"10|${OPENAIRE_PREFIX}${SEPARATOR}${crossrefHash}")
  collectedFrom.setValue(CROSSREF)
  collectedFrom
}
|
||||||
|
|
||||||
|
def createQualifier(cls: String, sch: String): Qualifier = {
|
||||||
val q = new Qualifier
|
val q = new Qualifier
|
||||||
q.setClassid(cls)
|
q.setClassid(cls)
|
||||||
q.setClassname(cls)
|
q.setClassname(cls)
|
||||||
|
@ -160,4 +226,4 @@ class Crossref2Oaf {
|
||||||
null
|
null
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
<workflow-app name="import Crossref from index into HDFS" xmlns="uri:oozie:workflow:0.5">
|
<workflow-app name="import ORCID from index into HDFS" xmlns="uri:oozie:workflow:0.5">
|
||||||
<parameters>
|
<parameters>
|
||||||
<property>
|
<property>
|
||||||
<name>workingPath</name>
|
<name>workingPath</name>
|
||||||
|
|
|
@ -1,68 +1,107 @@
|
||||||
package eu.dnetlib.doiboost;
|
package eu.dnetlib.doiboost;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.*;
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
import com.fasterxml.jackson.databind.SerializationFeature;
|
import com.fasterxml.jackson.databind.SerializationFeature;
|
||||||
import com.jayway.jsonpath.JsonPath;
|
import com.jayway.jsonpath.JsonPath;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
import eu.dnetlib.doiboost.crossref.Crossref2Oaf;
|
import eu.dnetlib.doiboost.crossref.Crossref2Oaf;
|
||||||
import eu.dnetlib.doiboost.crossref.SparkMapDumpIntoOAF;
|
import eu.dnetlib.doiboost.crossref.SparkMapDumpIntoOAF;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.List;
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.junit.jupiter.api.Assertions;
|
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
import org.junit.platform.commons.util.StringUtils;
|
import org.junit.platform.commons.util.StringUtils;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
public class DoiBoostTest {
|
public class DoiBoostTest {
|
||||||
|
|
||||||
Logger logger = LoggerFactory.getLogger(DoiBoostTest.class);
|
Logger logger = LoggerFactory.getLogger(DoiBoostTest.class);
|
||||||
|
|
||||||
|
|
||||||
public void test() throws Exception {
|
public void test() throws Exception {
|
||||||
|
|
||||||
//SparkDownloadContentFromCrossref.main(null);
|
// SparkDownloadContentFromCrossref.main(null);
|
||||||
//CrossrefImporter.main("-n file:///tmp -t file:///tmp/p.seq -ts 1586110000749".split(" "));
|
// CrossrefImporter.main("-n file:///tmp -t file:///tmp/p.seq -ts 1586110000749".split("
|
||||||
SparkMapDumpIntoOAF.main("-m local[*] -s file:///data/doiboost/crossref_dump.seq".split(" "));
|
// "));
|
||||||
|
SparkMapDumpIntoOAF.main(
|
||||||
|
"-m local[*] -s file:///data/doiboost/crossref_dump.seq".split(" "));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testConvertCrossRef2Oaf() throws IOException {
|
public void testConvertCrossRef2Oaf() throws IOException {
|
||||||
|
|
||||||
final String json = IOUtils.toString(getClass().getResourceAsStream("pc.json"));
|
final String json = IOUtils.toString(getClass().getResourceAsStream("pc.json"));
|
||||||
ObjectMapper mapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT);
|
ObjectMapper mapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT);
|
||||||
Assertions.assertNotNull(json);
|
assertNotNull(json);
|
||||||
Assertions.assertFalse(StringUtils.isBlank(json));
|
assertFalse(StringUtils.isBlank(json));
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Crossref2Oaf cf = new Crossref2Oaf();
|
Crossref2Oaf cf = new Crossref2Oaf();
|
||||||
final Result result = cf.convert(json, logger);
|
final Result result = cf.convert(json, logger);
|
||||||
Assertions.assertNotNull(result);
|
assertNotNull(result);
|
||||||
|
|
||||||
|
assertNotNull(result.getDataInfo(), "Datainfo test not null Failed");
|
||||||
|
assertNotNull(
|
||||||
|
result.getDataInfo().getProvenanceaction(),
|
||||||
|
"DataInfo/Provenance test not null Failed");
|
||||||
|
assertFalse(
|
||||||
|
StringUtils.isBlank(result.getDataInfo().getProvenanceaction().getClassid()),
|
||||||
|
"DataInfo/Provenance/classId test not null Failed");
|
||||||
|
assertFalse(
|
||||||
|
StringUtils.isBlank(result.getDataInfo().getProvenanceaction().getClassname()),
|
||||||
|
"DataInfo/Provenance/className test not null Failed");
|
||||||
|
assertFalse(
|
||||||
|
StringUtils.isBlank(result.getDataInfo().getProvenanceaction().getSchemeid()),
|
||||||
|
"DataInfo/Provenance/SchemeId test not null Failed");
|
||||||
|
assertFalse(
|
||||||
|
StringUtils.isBlank(result.getDataInfo().getProvenanceaction().getSchemename()),
|
||||||
|
"DataInfo/Provenance/SchemeName test not null Failed");
|
||||||
|
|
||||||
|
assertNotNull(result.getCollectedfrom(), "CollectedFrom test not null Failed");
|
||||||
|
assertTrue(result.getCollectedfrom().size() > 0);
|
||||||
|
assertTrue(
|
||||||
|
result.getCollectedfrom().stream()
|
||||||
|
.anyMatch(
|
||||||
|
c ->
|
||||||
|
c.getKey()
|
||||||
|
.equalsIgnoreCase(
|
||||||
|
"10|openaire____::081b82f96300b6a6e3d282bad31cb6e2")));
|
||||||
|
assertTrue(
|
||||||
|
result.getCollectedfrom().stream()
|
||||||
|
.anyMatch(c -> c.getValue().equalsIgnoreCase("crossref")));
|
||||||
|
|
||||||
|
assertTrue(
|
||||||
|
result.getRelevantdate().stream()
|
||||||
|
.anyMatch(d -> d.getQualifier().getClassid().equalsIgnoreCase("created")));
|
||||||
|
assertTrue(
|
||||||
|
result.getRelevantdate().stream()
|
||||||
|
.anyMatch(
|
||||||
|
d -> d.getQualifier().getClassid().equalsIgnoreCase("available")));
|
||||||
|
assertTrue(
|
||||||
|
result.getRelevantdate().stream()
|
||||||
|
.anyMatch(d -> d.getQualifier().getClassid().equalsIgnoreCase("accepted")));
|
||||||
|
assertTrue(
|
||||||
|
result.getRelevantdate().stream()
|
||||||
|
.anyMatch(
|
||||||
|
d ->
|
||||||
|
d.getQualifier()
|
||||||
|
.getClassid()
|
||||||
|
.equalsIgnoreCase("published-online")));
|
||||||
|
assertTrue(
|
||||||
|
result.getRelevantdate().stream()
|
||||||
|
.anyMatch(
|
||||||
|
d ->
|
||||||
|
d.getQualifier()
|
||||||
|
.getClassid()
|
||||||
|
.equalsIgnoreCase("published-print")));
|
||||||
|
|
||||||
logger.info(mapper.writeValueAsString(result));
|
logger.info(mapper.writeValueAsString(result));
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testPath() throws Exception {
|
public void testPath() throws Exception {
|
||||||
final String json = IOUtils.toString(getClass().getResourceAsStream("response.json"));
|
final String json = IOUtils.toString(getClass().getResourceAsStream("response.json"));
|
||||||
final List<String > res = JsonPath.read(json, "$.hits.hits[*]._source.blob");
|
final List<String> res = JsonPath.read(json, "$.hits.hits[*]._source.blob");
|
||||||
System.out.println(res.size());
|
System.out.println(res.size());
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -112,9 +112,35 @@
|
||||||
]
|
]
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
"published-print": {
|
||||||
|
"timestamp": 1446095513000,
|
||||||
|
"date-time": "2015-10-29T05:11:53Z",
|
||||||
|
"date-parts": [
|
||||||
|
[
|
||||||
|
2015,
|
||||||
|
2,
|
||||||
|
29
|
||||||
|
]
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"published-online": {
|
||||||
|
"date-parts": [
|
||||||
|
[
|
||||||
|
2015,
|
||||||
|
2,
|
||||||
|
2
|
||||||
|
]
|
||||||
|
]
|
||||||
|
},
|
||||||
"title": [
|
"title": [
|
||||||
"Genetic transformation of micropropagated shoots ofPinus radiataD.Don"
|
"Genetic transformation of micropropagated shoots ofPinus radiataD.Don"
|
||||||
],
|
],
|
||||||
|
"original-title": [
|
||||||
|
"OR TITLE"
|
||||||
|
],
|
||||||
|
"short-title": [
|
||||||
|
"SHORT TITLE"
|
||||||
|
],
|
||||||
"group-title": "Plant Biology",
|
"group-title": "Plant Biology",
|
||||||
"subtype": "preprint"
|
"subtype": "preprint"
|
||||||
}
|
}
|
Loading…
Reference in New Issue