enrichment steps #38

Merged
claudio.atzori merged 334 commits from miriam.baglioni/dnet-hadoop:master into enrichment_wfs 2020-08-11 16:40:26 +02:00
5 changed files with 559 additions and 97 deletions
Showing only changes of commit 157915988c - Show all commits

View File

@ -8,9 +8,11 @@ import org.json4s.DefaultFormats
import org.json4s.JsonAST._
import org.json4s.jackson.JsonMethods._
import org.slf4j.Logger
import scala.collection.JavaConverters._
case class mappingAffiliation(name:String)
case class mappingAuthor(given: Option[String], family: String, ORCID: Option[String], affiliation:Option[mappingAffiliation]) {}
class Crossref2Oaf {
//STATIC STRING
@ -81,26 +83,22 @@ class Crossref2Oaf {
"report" -> "0017 Report"
)
def convert(input: String, logger: Logger): Result = {
def mappingResult(result: Result, json: JValue, cobjCategory:String): Result = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(input)
val objectType = (json \ "type").extractOrElse[String](null)
val objectSubType = (json \ "subtype").extractOrElse[String](null)
if (objectType == null)
return null
val result = generateItemFromType(objectType, objectSubType)
if (result == null)
return result
val cOBJCategory = mappingCrossrefSubType.getOrElse(objectType, mappingCrossrefSubType.getOrElse(objectSubType, "0038 Other literature type"));
logger.debug(mappingCrossrefType(objectType))
logger.debug(cOBJCategory)
//MAPPING Crossref DOI into PID
val doi: String = (json \ "DOI").extract[String]
result.setPid(List(createSP(doi, "doi", PID_TYPES)).asJava)
//MAPPING Crossref DOI into OriginalId
result.setOriginalId(List(doi).asJava)
//and Other Original Identifier of dataset like clinical-trial-number
val clinicalTrialNumbers = for (JString(ctr) <- json \ "clinical-trial-number") yield ctr
val alternativeIds = for (JString(ids) <- json \ "alternative-id") yield ids
val tmp = clinicalTrialNumbers ::: alternativeIds ::: List(doi)
result.setOriginalId(tmp.filter(id => id != null).asJava)
//Set identifier as {50|60} | doiboost____::md5(DOI)
result.setId(generateIdentifier(result, doi))
@ -120,17 +118,16 @@ class Crossref2Oaf {
val mainTitles = for {JString(title) <- json \ "title"} yield createSP(title, "main title", "dnet:dataCite_title")
val originalTitles = for {JString(title) <- json \ "original-title"} yield createSP(title, "alternative title", "dnet:dataCite_title")
val shortTitles = for {JString(title) <- json \ "short-title"} yield createSP(title, "alternative title", "dnet:dataCite_title")
result.setTitle((mainTitles ::: originalTitles ::: shortTitles).asJava)
val subtitles = for {JString(title) <- json \ "subtitle"} yield createSP(title, "subtitle", "dnet:dataCite_title")
result.setTitle((mainTitles ::: originalTitles ::: shortTitles ::: subtitles).asJava)
// DESCRIPTION
val descriptionList = for {JString(description) <- json \ "abstract"} yield asField(description)
result.setDescription(descriptionList.asJava)
// Source
val sourceList = for {JString(source) <- json \ "source"} yield asField(source)
result.setSource(sourceList.asJava)
//RELEVANT DATE Mapping
val createdDate = generateDate((json \ "created" \ "date-time").extract[String], (json \ "created" \ "date-parts").extract[List[List[Int]]], "created", "dnet:dataCite_date")
val postedDate = generateDate((json \ "posted" \ "date-time").extractOrElse[String](null), (json \ "posted" \ "date-parts").extract[List[List[Int]]], "available", "dnet:dataCite_date")
@ -138,26 +135,145 @@ class Crossref2Oaf {
val publishedPrintDate = generateDate((json \ "published-print" \ "date-time").extractOrElse[String](null), (json \ "published-print" \ "date-parts").extract[List[List[Int]]], "published-print", "dnet:dataCite_date")
val publishedOnlineDate = generateDate((json \ "published-online" \ "date-time").extractOrElse[String](null), (json \ "published-online" \ "date-parts").extract[List[List[Int]]], "published-online", "dnet:dataCite_date")
result.setRelevantdate(List(createdDate ,postedDate, acceptedDate,publishedOnlineDate, publishedPrintDate).asJava)
val issuedDate = extractDate((json \ "issued" \ "date-time").extractOrElse[String](null), (json \ "issued" \ "date-parts").extract[List[List[Int]]])
if (StringUtils.isNotBlank(issuedDate)) {
result.setDateofacceptance(asField(issuedDate))
}
result.setRelevantdate(List(createdDate, postedDate, acceptedDate, publishedOnlineDate, publishedPrintDate).filter(p => p != null).asJava)
//Mapping Author
val authorList:List[mappingAuthor] = (json \ "author").extract[List[mappingAuthor]]
result.setAuthor(authorList.map(a => generateAuhtor(a.given.orNull, a.family, a.ORCID.orNull)).asJava)
// Mapping instance
val instance = new Instance()
val license = for {
JString(lic) <- json \ "license" \ "URL"
} yield asField(lic)
val l = license.filter(d => StringUtils.isNotBlank(d.getValue))
if (l.nonEmpty)
instance.setLicense(l.head)
instance.setAccessright(createQualifier("Restricted", "dnet:access_modes"))
result.setInstance(List(instance).asJava)
instance.setInstancetype(createQualifier(cobjCategory.substring(0,4), cobjCategory.substring(5), "dnet:publication_resource", "dnet:publication_resource"))
instance.setCollectedfrom(createCollectedFrom())
if (StringUtils.isNotBlank(issuedDate)) {
instance.setDateofacceptance(asField(issuedDate))
}
val s: String =(json \ "URL").extract[String]
val links:List[String] = ((for {JString(url) <-json \ "link" \ "URL"} yield url) ::: List(s)).filter(p =>p != null).distinct
if (links.nonEmpty)
instance.setUrl(links.asJava)
result
}
def generateDate(dt: String, datePart: List[List[Int]], classId: String, schemeId: String): StructuredProperty = {
/**
  * Builds an [[Author]] from the name parts and ORCID found in a Crossref record.
  *
  * @param given  given name; may be null (the caller passes `a.given.orNull`)
  * @param family family name
  * @param orcid  ORCID identifier; may be null or blank, in which case no pid is set
  * @return the populated author
  */
def generateAuhtor(given: String, family: String, orcid: String): Author = {
  val author = new Author
  author.setName(given)
  author.setSurname(family)

  // Interpolating a null `given` would produce the literal text "null <family>",
  // so the full name is assembled only from the parts that are actually present.
  val fullname = List(given, family).filter(part => StringUtils.isNotBlank(part)).mkString(" ")
  if (fullname.nonEmpty)
    author.setFullname(fullname)

  // ORCID and PID_TYPES are constants declared outside this block.
  if (StringUtils.isNotBlank(orcid))
    author.setPid(List(createSP(orcid, ORCID, PID_TYPES)).asJava)
  author
}
/**
  * Converts a Crossref JSON record into an OAF [[Result]].
  *
  * @param input  the Crossref record as a JSON string
  * @param logger logger used for debug output of the resolved types
  * @return the converted result, or null when the record has no "type" field
  *         or when `generateItemFromType` returns null for its type/subtype
  */
def convert(input: String, logger: Logger): Result = {
  implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
  lazy val json: json4s.JValue = parse(input)

  val objectType = (json \ "type").extractOrElse[String](null)
  val objectSubType = (json \ "subtype").extractOrElse[String](null)
  if (objectType == null)
    return null

  val result = generateItemFromType(objectType, objectSubType)
  if (result == null)
    return result

  val cOBJCategory = mappingCrossrefSubType.getOrElse(objectType, mappingCrossrefSubType.getOrElse(objectSubType, "0038 Other literature type"))
  logger.debug(mappingCrossrefType(objectType))
  logger.debug(cOBJCategory)

  // Fields common to every result type.
  mappingResult(result, json, cOBJCategory)

  // Type-specific enrichment. convertPublication needs the parsed record and the
  // resolved category, so both are passed along with the publication.
  result match {
    case publication: Publication => convertPublication(publication, json, cOBJCategory)
    case dataset: Dataset => convertDataset(dataset)
    case _ => // no type-specific mapping for other Result subtypes; avoids a MatchError
  }
  result
}
/** Dataset-specific mapping hook; currently performs no work. */
def convertDataset(dataset: Dataset): Unit = ()
/**
  * Applies publication-specific mapping to an already populated publication.
  *
  * For categories whose name contains "book", a source entry of the form
  * "<first container-title> ISBN: <first ISBN>" is appended to the publication's
  * sources when both values are present in the record.
  *
  * @param publication  the publication to enrich (mutated in place)
  * @param json         the parsed Crossref record
  * @param cobjCategory the resolved category label, e.g. "0038 Other literature type"
  */
def convertPublication(publication: Publication, json: JValue, cobjCategory: String): Unit = {
  val containerTitles = for {JString(ct) <- json \ "container-title"} yield ct

  //Mapping book
  if (cobjCategory.toLowerCase.contains("book")) {
    val isbnList = for {JString(isbn) <- json \ "ISBN"} yield isbn
    for {
      title <- containerTitles.headOption
      isbn <- isbnList.headOption
    } {
      val source = s"$title ISBN: $isbn"
      // getSource may be null on a freshly created publication; treat that as empty.
      val existing: List[Field[String]] = Option(publication.getSource).map(_.asScala.toList).getOrElse(Nil)
      publication.setSource((existing ::: List(asField(source))).asJava)
    }
  }
  // TODO: mapping for the other publication types is not implemented yet. The original
  // else-branch held only an unfinished `val issn =` statement, which did not compile.
}
/**
  * Extracts a date string from a Crossref date object.
  *
  * @param dt       the "date-time" value; returned unchanged when it is not blank
  * @param datePart the "date-parts" value; used only when it holds exactly one
  *                 entry of three numbers, formatted as year-MM-dd
  * @return the date string, or null when neither input yields a usable date
  */
def extractDate(dt: String, datePart: List[List[Int]]): String = {
  if (StringUtils.isNotBlank(dt))
    dt
  else if (datePart != null && datePart.size == 1 && datePart.head.size == 3) {
    val parts = datePart.head
    val dp = f"${parts.head}-${parts(1)}%02d-${parts(2)}%02d"
    // Month and day are zero-padded; the year is not, so the length check
    // accepts only four-digit years.
    if (dp.length == 10) dp else null
  }
  else
    null
}

/**
  * Builds a structured date property from a Crossref date object.
  *
  * @param dt       the "date-time" value, possibly null
  * @param datePart the "date-parts" value, possibly null
  * @param classId  class id passed to createSP
  * @param schemeId scheme id passed to createSP
  * @return the structured property, or null when no date could be extracted
  */
def generateDate(dt: String, datePart: List[List[Int]], classId: String, schemeId: String): StructuredProperty = {
  val dp = extractDate(dt, datePart)
  if (StringUtils.isNotBlank(dp))
    createSP(dp, classId, schemeId)
  else
    null
}
@ -206,15 +322,19 @@ class Crossref2Oaf {
}
/**
  * Creates a qualifier with distinct id and name values for class and scheme.
  *
  * @param clsName  value stored as the class id
  * @param clsValue value stored as the class name
  * @param schName  value stored as the scheme id
  * @param schValue value stored as the scheme name
  * @return the populated qualifier
  */
def createQualifier(clsName: String, clsValue: String, schName: String, schValue: String): Qualifier = {
  val q = new Qualifier
  q.setClassid(clsName)
  q.setClassname(clsValue)
  q.setSchemeid(schName)
  q.setSchemename(schValue)
  q
}

/**
  * Creates a qualifier whose class id and class name are both `cls`, and whose
  * scheme id and scheme name are both `sch`.
  */
def createQualifier(cls: String, sch: String): Qualifier = {
  createQualifier(cls, cls, sch, sch)
}
def generateItemFromType(objectType: String, objectSubType: String): Result = {
if (mappingCrossrefType.contains(objectType)) {

View File

@ -30,9 +30,9 @@ public class DoiBoostTest {
}
@Test
public void testConvertCrossRef2Oaf() throws IOException {
public void testConvertPreprintCrossRef2Oaf() throws IOException {
final String json = IOUtils.toString(getClass().getResourceAsStream("pc.json"));
final String json = IOUtils.toString(getClass().getResourceAsStream("article.json"));
ObjectMapper mapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT);
assertNotNull(json);
assertFalse(StringUtils.isBlank(json));
@ -73,13 +73,79 @@ public class DoiBoostTest {
assertTrue(
result.getRelevantdate().stream()
.anyMatch(d -> d.getQualifier().getClassid().equalsIgnoreCase("created")));
// assertTrue(
// result.getRelevantdate().stream()
// .anyMatch(
// d ->
// d.getQualifier().getClassid().equalsIgnoreCase("available")));
// assertTrue(
// result.getRelevantdate().stream()
// .anyMatch(d ->
// d.getQualifier().getClassid().equalsIgnoreCase("accepted")));
assertTrue(
result.getRelevantdate().stream()
.anyMatch(
d -> d.getQualifier().getClassid().equalsIgnoreCase("available")));
d ->
d.getQualifier()
.getClassid()
.equalsIgnoreCase("published-online")));
// assertTrue(
// result.getRelevantdate().stream()
// .anyMatch(
// d ->
// d.getQualifier()
// .getClassid()
// .equalsIgnoreCase("published-print")));
logger.info(mapper.writeValueAsString(result));
}
@Test
public void testConvertBooktCrossRef2Oaf() throws IOException {
final String json = IOUtils.toString(getClass().getResourceAsStream("book.json"));
ObjectMapper mapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT);
assertNotNull(json);
assertFalse(StringUtils.isBlank(json));
Crossref2Oaf cf = new Crossref2Oaf();
final Result result = cf.convert(json, logger);
assertNotNull(result);
logger.info(mapper.writeValueAsString(result));
assertNotNull(result.getDataInfo(), "Datainfo test not null Failed");
assertNotNull(
result.getDataInfo().getProvenanceaction(),
"DataInfo/Provenance test not null Failed");
assertFalse(
StringUtils.isBlank(result.getDataInfo().getProvenanceaction().getClassid()),
"DataInfo/Provenance/classId test not null Failed");
assertFalse(
StringUtils.isBlank(result.getDataInfo().getProvenanceaction().getClassname()),
"DataInfo/Provenance/className test not null Failed");
assertFalse(
StringUtils.isBlank(result.getDataInfo().getProvenanceaction().getSchemeid()),
"DataInfo/Provenance/SchemeId test not null Failed");
assertFalse(
StringUtils.isBlank(result.getDataInfo().getProvenanceaction().getSchemename()),
"DataInfo/Provenance/SchemeName test not null Failed");
assertNotNull(result.getCollectedfrom(), "CollectedFrom test not null Failed");
assertTrue(result.getCollectedfrom().size() > 0);
assertTrue(
result.getCollectedfrom().stream()
.anyMatch(
c ->
c.getKey()
.equalsIgnoreCase(
"10|openaire____::081b82f96300b6a6e3d282bad31cb6e2")));
assertTrue(
result.getCollectedfrom().stream()
.anyMatch(c -> c.getValue().equalsIgnoreCase("crossref")));
assertTrue(
result.getRelevantdate().stream()
.anyMatch(d -> d.getQualifier().getClassid().equalsIgnoreCase("accepted")));
.anyMatch(d -> d.getQualifier().getClassid().equalsIgnoreCase("created")));
assertTrue(
result.getRelevantdate().stream()
.anyMatch(
@ -94,8 +160,6 @@ public class DoiBoostTest {
d.getQualifier()
.getClassid()
.equalsIgnoreCase("published-print")));
logger.info(mapper.writeValueAsString(result));
}
@Test

View File

@ -0,0 +1,174 @@
{
"DOI": "10.26850/1678-4618eqj.v35.1.2010.p41-46",
"issued": {
"date-parts": [
[
2018,
1,
15
]
]
},
"abstract": "<jats:p>A qualitative spot-test and tandem quantitative analysis of dipyrone in the bulk drugand in pharmaceutical preparations is proposed. The formation of a reddish-violet\u00a0 color indicates a positive result. In sequence a quantitative procedure can be performed in the same flask. The quantitative results obtained were statistically compared with those obtained with the method indicated by the Brazilian\u00a0 Pharmacopoeia, using the Student\u2019s t and the F tests. Considering the concentration in a 100 \u03bcL aliquot, the qualitative visual limit of detection is about 5\u00d710-6 g; instrumental LOD \u2245 1.4\u00d710-4 mol L-1 ; LOQ \u2245 4.5\u00d710-4 mol L-1.</jats:p>",
"prefix": "10.26850",
"author": [
{
"authenticated-orcid": false,
"given": "Matthieu",
"family": "Tubino",
"sequence": "first",
"affiliation": [],
"ORCID": "http://orcid.org/0000-0002-1987-3907"
},
{
"affiliation": [],
"given": "A. C.",
"family": "Biondo",
"sequence": "additional"
},
{
"authenticated-orcid": false,
"given": "Marta Maria Duarte Carvalho",
"family": "Vila",
"sequence": "additional",
"affiliation": [],
"ORCID": "http://orcid.org/0000-0002-0198-7076"
},
{
"authenticated-orcid": false,
"given": "Leonardo",
"family": "Pezza",
"sequence": "additional",
"affiliation": [],
"ORCID": "http://orcid.org/0000-0003-0197-7369"
},
{
"authenticated-orcid": false,
"given": "Helena Redigolo",
"family": "Pezza",
"sequence": "additional",
"affiliation": [],
"ORCID": "http://orcid.org/0000-0001-5564-1639"
}
],
"reference-count": 0,
"ISSN": [
"1678-4618"
],
"member": "11395",
"source": "Crossref",
"score": 1.0,
"deposited": {
"timestamp": 1540823529000,
"date-time": "2018-10-29T14:32:09Z",
"date-parts": [
[
2018,
10,
29
]
]
},
"indexed": {
"timestamp": 1540825815212,
"date-time": "2018-10-29T15:10:15Z",
"date-parts": [
[
2018,
10,
29
]
]
},
"type": "journal-article",
"published-online": {
"date-parts": [
[
2018,
1,
15
]
]
},
"URL": "http://dx.doi.org/10.26850/1678-4618eqj.v35.1.2010.p41-46",
"is-referenced-by-count": 0,
"volume": "35",
"issn-type": [
{
"type": "electronic",
"value": "1678-4618"
}
],
"link": [
{
"URL": "http://revista.iq.unesp.br/ojs/index.php/ecletica/article/viewFile/191/149",
"intended-application": "text-mining",
"content-version": "vor",
"content-type": "application/pdf"
},
{
"URL": "http://revista.iq.unesp.br/ojs/index.php/ecletica/article/viewFile/191/149",
"intended-application": "similarity-checking",
"content-version": "vor",
"content-type": "unspecified"
}
],
"journal-issue": {
"issue": "1",
"published-online": {
"date-parts": [
[
2018,
1,
15
]
]
}
},
"references-count": 0,
"short-container-title": [
"Eclet. Quim. J."
],
"publisher": "Ecletica Quimica Journal",
"content-domain": {
"domain": [],
"crossmark-restriction": false
},
"license": [
{
"URL": "http://creativecommons.org/licenses/by/4.0",
"start": {
"timestamp": 1515974400000,
"date-time": "2018-01-15T00:00:00Z",
"date-parts": [
[
2018,
1,
15
]
]
},
"content-version": "unspecified",
"delay-in-days": 0
}
],
"created": {
"timestamp": 1517590842000,
"date-time": "2018-02-02T17:00:42Z",
"date-parts": [
[
2018,
2,
2
]
]
},
"issue": "1",
"title": [
"Spot-test identification and rapid quantitative sequential analys is of dipyrone"
],
"container-title": [
"Ecl\u00e9tica Qu\u00edmica Journal"
],
"page": "41"
}

View File

@ -0,0 +1,104 @@
{
"DOI": "10.17848/9780880992299.vol1ch4",
"ISBN": [
"9780880992299"
],
"issued": {
"date-parts": [
[
2001,
12,
1
]
]
},
"prefix": "10.17848",
"author": [
{
"affiliation": [
],
"given": "William E.",
"family": "Even",
"authenticated-orcid": false
},
{
"affiliation": [
],
"given": "David A.",
"family": "Macpherson"
}
],
"reference-count": 0,
"member": "7312",
"source": "Crossref",
"score": 1.0,
"deposited": {
"timestamp": 1461687244000,
"date-parts": [
[
2016,
4,
26
]
],
"date-time": "2016-04-26T16:14:04Z"
},
"indexed": {
"timestamp": 1502548826285,
"date-parts": [
[
2017,
8,
12
]
],
"date-time": "2017-08-12T14:40:26Z"
},
"type": "book-chapter",
"published-online": {
"date-parts": [
[
2010,
5,
27
]
]
},
"URL": "http://dx.doi.org/10.17848/9780880992299.vol1ch4",
"is-referenced-by-count": 0,
"download_ts": 1508079092.874343,
"published-print": {
"date-parts": [
[
2001,
12,
1
]
]
},
"references-count": 0,
"publisher": "W.E. Upjohn Institute",
"content-domain": {
"domain": [
],
"crossmark-restriction": false
},
"created": {
"timestamp": 1434034139000,
"date-parts": [
[
2015,
6,
11
]
],
"date-time": "2015-06-11T14:48:59Z"
},
"title": [
"Children\\'s Effects on Women\\'s Labor Market Attachment and Earnings"
],
"container-title": [
"Working Time in Comparative Perspective - Volume II: Life-Cycle Working Time and Nonstandard Hours"
],
"page": "99-128"
}