From 4ba386d99693915f76ce344b3846fe543842394e Mon Sep 17 00:00:00 2001 From: sandro Date: Thu, 23 Apr 2020 09:33:48 +0200 Subject: [PATCH] improved crossref mapping --- .../doiboost/crossref/Crossref2Oaf.scala | 86 +- .../doiboost/crossref/CrossrefImporter.java | 53 +- .../dnetlib/doiboost/crossref/ESClient.java | 63 +- .../crossref/SparkMapDumpIntoOAF.scala | 59 +- .../doiboost/orcid/json/JsonWriter.java | 22 +- .../doiboost/orcid/model/AuthorData.java | 81 +- .../doiboost/convert_map_to_oaf_params.json | 1 + .../eu/dnetlib/doiboost/DoiBoostTest.java | 100 +- .../eu/dnetlib/doiboost/article.json | 2 +- .../eu/dnetlib/doiboost/dataset.json | 105 ++ .../raw/AbstractMdRecordToOafMapper.java | 826 ++++++++------- .../raw/MigrateDbEntitiesApplication.java | 996 +++++++++--------- .../graph/raw/common/MigrationConstants.java | 37 +- 13 files changed, 1383 insertions(+), 1048 deletions(-) create mode 100644 dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/dataset.json diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala index eee0a4a72..157373db6 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala @@ -10,10 +10,11 @@ import org.json4s.jackson.JsonMethods._ import org.slf4j.Logger import scala.collection.JavaConverters._ -case class mappingAffiliation(name:String) -case class mappingAuthor(given: Option[String], family: String, ORCID: Option[String], affiliation:Option[mappingAffiliation]) {} +case class mappingAffiliation(name: String) +case class mappingAuthor(given: Option[String], family: String, ORCID: Option[String], affiliation: Option[mappingAffiliation]) {} -class Crossref2Oaf { + +case object Crossref2Oaf { //STATIC STRING val MAG = "MAG" @@ -28,7 +29,6 @@ class 
Crossref2Oaf { val DNET_LANGUAGES = "dnet:languages" val PID_TYPES = "dnet:pid_types" - val mappingCrossrefType = Map( "book-section" -> "publication", "book" -> "publication", @@ -84,7 +84,7 @@ class Crossref2Oaf { ) - def mappingResult(result: Result, json: JValue, cobjCategory:String): Result = { + def mappingResult(result: Result, json: JValue, cobjCategory: String): Result = { implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats //MAPPING Crossref DOI into PID @@ -111,7 +111,7 @@ class Crossref2Oaf { result.setCollectedfrom(List(createCollectedFrom()).asJava) // Publisher ( Name of work's publisher mapped into Result/Publisher) - val publisher = (json \ "publisher").extract[String] + val publisher = (json \ "publisher").extractOrElse[String](null) result.setPublisher(asField(publisher)) // TITLE @@ -144,7 +144,7 @@ class Crossref2Oaf { //Mapping AUthor - val authorList:List[mappingAuthor] = (json \ "author").extract[List[mappingAuthor]] + val authorList: List[mappingAuthor] = (json \ "author").extractOrElse[List[mappingAuthor]](List()) result.setAuthor(authorList.map(a => generateAuhtor(a.given.orNull, a.family, a.ORCID.orNull)).asJava) @@ -152,8 +152,8 @@ class Crossref2Oaf { val instance = new Instance() val license = for { - JString(lic) <- json \ "license" \ "URL" - } yield asField(lic) + JString(lic) <- json \ "license" \ "URL" + } yield asField(lic) val l = license.filter(d => StringUtils.isNotBlank(d.getValue)) if (l.nonEmpty) instance.setLicense(l.head) @@ -161,24 +161,22 @@ class Crossref2Oaf { instance.setAccessright(createQualifier("Restricted", "dnet:access_modes")) result.setInstance(List(instance).asJava) - instance.setInstancetype(createQualifier(cobjCategory.substring(0,4), cobjCategory.substring(5), "dnet:publication_resource", "dnet:publication_resource")) + instance.setInstancetype(createQualifier(cobjCategory.substring(0, 4), cobjCategory.substring(5), "dnet:publication_resource", "dnet:publication_resource")) 
instance.setCollectedfrom(createCollectedFrom()) if (StringUtils.isNotBlank(issuedDate)) { instance.setDateofacceptance(asField(issuedDate)) } - val s: String =(json \ "URL").extract[String] - val links:List[String] = ((for {JString(url) <-json \ "link" \ "URL"} yield url) ::: List(s)).filter(p =>p != null).distinct + val s: String = (json \ "URL").extract[String] + val links: List[String] = ((for {JString(url) <- json \ "link" \ "URL"} yield url) ::: List(s)).filter(p => p != null).distinct if (links.nonEmpty) instance.setUrl(links.asJava) result } - - - def generateAuhtor(given:String, family:String, orcid:String):Author = { - val a =new Author + def generateAuhtor(given: String, family: String, orcid: String): Author = { + val a = new Author a.setName(given) a.setSurname(family) a.setFullname(s"${given} ${family}") @@ -202,30 +200,28 @@ class Crossref2Oaf { if (result == null) return result val cOBJCategory = mappingCrossrefSubType.getOrElse(objectType, mappingCrossrefSubType.getOrElse(objectSubType, "0038 Other literature type")); - logger.debug(mappingCrossrefType(objectType)) - logger.debug(cOBJCategory) +// logger.debug(mappingCrossrefType(objectType)) +// logger.debug(cOBJCategory) mappingResult(result, json, cOBJCategory) result match { - case publication: Publication => convertPublication(publication) + case publication: Publication => convertPublication(publication, json, cOBJCategory) case dataset: Dataset => convertDataset(dataset) } - - - result } def convertDataset(dataset: Dataset): Unit = { - + //TODO probably we need to add relation and other stuff here } - def convertPublication(publication: Publication, json: JValue, cobjCategory:String): Unit = { + def convertPublication(publication: Publication, json: JValue, cobjCategory: String): Unit = { + implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats val containerTitles = for {JString(ct) <- json \ "container-title"} yield ct @@ -243,12 +239,44 @@ class Crossref2Oaf { 
publication.setSource(List(asField(source)).asJava) } } else { - val issn = + // Mapping Journal + + val issnInfos = for {JArray(issn_types) <- json \ "issn-type" + JObject(issn_type) <- issn_types + JField("type", JString(tp)) <- issn_type + JField("value", JString(vl)) <- issn_type + } yield Tuple2(tp, vl) + + val volume = (json \ "volume").extractOrElse[String] (null) + if (containerTitles.nonEmpty) { + val journal = new Journal + journal.setName(containerTitles.head) + if (issnInfos.nonEmpty) { + + issnInfos.foreach(tp => { + tp._1 match { + case "electronic" => journal.setIssnOnline(tp._2) + case "print" => journal.setIssnPrinted(tp._2) + } + }) + + } + journal.setVol(volume) + + val page = (json \ "page").extractOrElse[String] (null) + if(page!= null ) { + val pp = page.split("-") + journal.setSp(pp.head) + if (pp.size > 1) + journal.setEp(pp(1)) + } + + + publication.setJournal(journal) + } + } - // Mapping other types of publications - - } @@ -322,7 +350,7 @@ class Crossref2Oaf { } - def createQualifier(clsName: String,clsValue: String, schName: String, schValue: String): Qualifier = { + def createQualifier(clsName: String, clsValue: String, schName: String, schValue: String): Qualifier = { val q = new Qualifier q.setClassid(clsName) q.setClassname(clsValue) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/CrossrefImporter.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/CrossrefImporter.java index d279e4a46..f09719267 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/CrossrefImporter.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/CrossrefImporter.java @@ -1,6 +1,8 @@ package eu.dnetlib.doiboost.crossref; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import java.io.ByteArrayOutputStream; +import java.util.zip.Inflater; import org.apache.commons.codec.binary.Base64; import org.apache.commons.io.IOUtils; import 
org.apache.commons.lang3.StringUtils; @@ -12,30 +14,29 @@ import org.apache.hadoop.io.Text; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.ByteArrayOutputStream; -import java.util.zip.Inflater; - - public class CrossrefImporter { - - public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(CrossrefImporter.class.getResourceAsStream("/eu/dnetlib/dhp/doiboost/import_from_es.json"))); + final ArgumentApplicationParser parser = + new ArgumentApplicationParser( + IOUtils.toString( + CrossrefImporter.class.getResourceAsStream( + "/eu/dnetlib/dhp/doiboost/import_from_es.json"))); Logger logger = LoggerFactory.getLogger(CrossrefImporter.class); parser.parseArgument(args); final String hdfsuri = parser.get("namenode"); - logger.info("HDFS URI"+hdfsuri); + logger.info("HDFS URI" + hdfsuri); Path hdfswritepath = new Path(parser.get("targetPath")); - logger.info("TargetPath: "+hdfsuri); + logger.info("TargetPath: " + hdfsuri); - final Long timestamp = StringUtils.isNotBlank(parser.get("timestamp"))?Long.parseLong(parser.get("timestamp")):-1; - - if(timestamp>0) - logger.info("Timestamp added "+timestamp); + final Long timestamp = + StringUtils.isNotBlank(parser.get("timestamp")) + ? Long.parseLong(parser.get("timestamp")) + : -1; + if (timestamp > 0) logger.info("Timestamp added " + timestamp); // ====== Init HDFS File System Object Configuration conf = new Configuration(); @@ -45,16 +46,21 @@ public class CrossrefImporter { conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); + ESClient client = + timestamp > 0 + ? 
new ESClient( + "ip-90-147-167-25.ct1.garrservices.it", "crossref", timestamp) + : new ESClient("ip-90-147-167-25.ct1.garrservices.it", "crossref"); - - ESClient client = timestamp>0?new ESClient("ip-90-147-167-25.ct1.garrservices.it", "crossref", timestamp):new ESClient("ip-90-147-167-25.ct1.garrservices.it", "crossref"); - - try (SequenceFile.Writer writer = SequenceFile.createWriter(conf, - SequenceFile.Writer.file(hdfswritepath), SequenceFile.Writer.keyClass(IntWritable.class), - SequenceFile.Writer.valueClass(Text.class))) { + try (SequenceFile.Writer writer = + SequenceFile.createWriter( + conf, + SequenceFile.Writer.file(hdfswritepath), + SequenceFile.Writer.keyClass(IntWritable.class), + SequenceFile.Writer.valueClass(Text.class))) { int i = 0; - long start= System.currentTimeMillis(); + long start = System.currentTimeMillis(); long end = 0; final IntWritable key = new IntWritable(i); final Text value = new Text(); @@ -65,7 +71,10 @@ public class CrossrefImporter { if (i % 1000000 == 0) { end = System.currentTimeMillis(); final float time = (end - start) / 1000.0F; - logger.info(String.format("Imported %d records last 100000 imported in %f seconds", i, time)); + logger.info( + String.format( + "Imported %d records last 100000 imported in %f seconds", + i, time)); start = System.currentTimeMillis(); } } @@ -87,7 +96,7 @@ public class CrossrefImporter { decompresser.end(); return new String(unzippeddata); } catch (Throwable e) { - throw new RuntimeException("Wrong record:" + blob,e); + throw new RuntimeException("Wrong record:" + blob, e); } } } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/ESClient.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/ESClient.java index c7cc3a75a..5412a192a 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/ESClient.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/ESClient.java @@ -1,6 +1,9 @@ package 
eu.dnetlib.doiboost.crossref; import com.jayway.jsonpath.JsonPath; +import java.io.IOException; +import java.util.Iterator; +import java.util.List; import org.apache.commons.io.IOUtils; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpPost; @@ -10,18 +13,15 @@ import org.apache.http.impl.client.HttpClients; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.IOException; -import java.util.Iterator; -import java.util.List; - public class ESClient implements Iterator { - private final static Logger logger = LoggerFactory.getLogger(ESClient.class); + private static final Logger logger = LoggerFactory.getLogger(ESClient.class); - final static String blobPath = "$.hits[*].hits[*]._source.blob"; - final static String scrollIdPath = "$._scroll_id"; - final static String JSON_NO_TS ="{\"size\":1000}"; - final static String JSON_WITH_TS ="{\"size\":1000, \"query\":{\"range\":{\"timestamp\":{\"gte\":%d}}}}"; - final static String JSON_SCROLL = "{\"scroll_id\":\"%s\",\"scroll\" : \"1m\"}"; + static final String blobPath = "$.hits[*].hits[*]._source.blob"; + static final String scrollIdPath = "$._scroll_id"; + static final String JSON_NO_TS = "{\"size\":1000}"; + static final String JSON_WITH_TS = + "{\"size\":1000, \"query\":{\"range\":{\"timestamp\":{\"gte\":%d}}}}"; + static final String JSON_SCROLL = "{\"scroll_id\":\"%s\",\"scroll\" : \"1m\"}"; private final String scrollId; @@ -29,29 +29,34 @@ public class ESClient implements Iterator { private final String esHost; - public ESClient(final String esHost, final String esIndex) throws IOException { this.esHost = esHost; - final String body =getResponse(String.format("http://%s:9200/%s/_search?scroll=1m", esHost, esIndex), JSON_NO_TS); - scrollId= getJPathString(scrollIdPath, body); + final String body = + getResponse( + String.format("http://%s:9200/%s/_search?scroll=1m", esHost, esIndex), + JSON_NO_TS); + scrollId = getJPathString(scrollIdPath, 
body); buffer = getBlobs(body); } - - public ESClient(final String esHost, final String esIndex, final long timestamp) throws IOException { + public ESClient(final String esHost, final String esIndex, final long timestamp) + throws IOException { this.esHost = esHost; - final String body =getResponse(String.format("http://%s:9200/%s/_search?scroll=1m", esHost, esIndex), String.format(JSON_WITH_TS, timestamp)); - scrollId= getJPathString(scrollIdPath, body); + final String body = + getResponse( + String.format("http://%s:9200/%s/_search?scroll=1m", esHost, esIndex), + String.format(JSON_WITH_TS, timestamp)); + scrollId = getJPathString(scrollIdPath, body); buffer = getBlobs(body); } - private String getResponse(final String url,final String json ) { + private String getResponse(final String url, final String json) { CloseableHttpClient client = HttpClients.createDefault(); try { HttpPost httpPost = new HttpPost(url); - if (json!= null) { + if (json != null) { StringEntity entity = new StringEntity(json); httpPost.setEntity(entity); httpPost.setHeader("Accept", "application/json"); @@ -61,22 +66,20 @@ public class ESClient implements Iterator { return IOUtils.toString(response.getEntity().getContent()); } catch (Throwable e) { - throw new RuntimeException("Error on executing request ",e); + throw new RuntimeException("Error on executing request ", e); } finally { try { client.close(); } catch (IOException e) { - throw new RuntimeException("Unable to close client ",e); + throw new RuntimeException("Unable to close client ", e); } } - } - private String getJPathString(final String jsonPath, final String json) { + private String getJPathString(final String jsonPath, final String json) { try { Object o = JsonPath.read(json, jsonPath); - if (o instanceof String) - return (String) o; + if (o instanceof String) return (String) o; return null; } catch (Exception e) { return ""; @@ -84,14 +87,13 @@ public class ESClient implements Iterator { } private List getBlobs(final 
String body) { - final List res = JsonPath.read(body, "$.hits.hits[*]._source.blob"); + final List res = JsonPath.read(body, "$.hits.hits[*]._source.blob"); return res; } @Override public boolean hasNext() { - return (buffer!= null && !buffer.isEmpty()); - + return (buffer != null && !buffer.isEmpty()); } @Override @@ -100,11 +102,12 @@ public class ESClient implements Iterator { if (buffer.isEmpty()) { final String json_param = String.format(JSON_SCROLL, scrollId); - final String body =getResponse(String.format("http://%s:9200/_search/scroll", esHost), json_param); + final String body = + getResponse(String.format("http://%s:9200/_search/scroll", esHost), json_param); try { buffer = getBlobs(body); } catch (Throwable e) { - logger.error("Error on get next page: body:"+body); + logger.error("Error on get next page: body:" + body); } } return nextItem; diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/SparkMapDumpIntoOAF.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/SparkMapDumpIntoOAF.scala index 284106f81..3374f2969 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/SparkMapDumpIntoOAF.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/SparkMapDumpIntoOAF.scala @@ -1,22 +1,22 @@ package eu.dnetlib.doiboost.crossref import eu.dnetlib.dhp.application.ArgumentApplicationParser +import eu.dnetlib.dhp.schema.oaf.Publication import org.apache.commons.io.IOUtils import org.apache.hadoop.io.{IntWritable, Text} import org.apache.spark.SparkConf -import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.{Dataset, Encoders, SaveMode, SparkSession} import org.slf4j.{Logger, LoggerFactory} - -case class Reference(author:String, firstPage:String) {} +case class Reference(author: String, firstPage: String) {} object SparkMapDumpIntoOAF { def main(args: Array[String]): Unit = { - val logger:Logger = 
LoggerFactory.getLogger(SparkMapDumpIntoOAF.getClass) + val logger: Logger = LoggerFactory.getLogger(SparkMapDumpIntoOAF.getClass) val conf: SparkConf = new SparkConf() val parser = new ArgumentApplicationParser(IOUtils.toString(SparkMapDumpIntoOAF.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/convert_map_to_oaf_params.json"))) parser.parseArgument(args) @@ -26,33 +26,42 @@ object SparkMapDumpIntoOAF { .config(conf) .appName(SparkMapDumpIntoOAF.getClass.getSimpleName) .master(parser.get("master")).getOrCreate() + import spark.implicits._ + implicit val mapEncoder = Encoders.bean(classOf[Publication]) val sc = spark.sparkContext - val x: String = sc.sequenceFile(parser.get("sourcePath"), classOf[IntWritable], classOf[Text]) - .map(k => k._2.toString).first() - val item =CrossrefImporter.decompressBlob(x) + val total = sc.sequenceFile(parser.get("sourcePath"), classOf[IntWritable], classOf[Text]) + .map(k => k._2.toString).map(CrossrefImporter.decompressBlob) + .map(k => Crossref2Oaf.convert(k, logger)) + .filter(k => k != null && k.isInstanceOf[Publication]) + .map(k => k.asInstanceOf[Publication]) - logger.info(item) + val ds: Dataset[Publication] = spark.createDataset(total) + val targetPath = parser.get("targetPath") + ds.write.mode(SaveMode.Overwrite).save(s"${targetPath}/publication") -// lazy val json: json4s.JValue = parse(item) -// -// -// val references = for { -// JArray(references) <- json \\ "reference" -// JObject(reference) <- references -// JField("first-page", JString(firstPage)) <- reference -// JField("author", JString(author)) <- reference -// } yield Reference(author, firstPage) -// -// -// -// -// logger.info((json \ "created" \ "timestamp").extractOrElse("missing")) -// logger.info(references.toString()) -// -// logger.info((json \ "type").extractOrElse("missing")) + + logger.info(s"total Item :${total}") + + // lazy val json: json4s.JValue = parse(item) + // + // + // val references = for { + // JArray(references) <- json \\ 
"reference" + // JObject(reference) <- references + // JField("first-page", JString(firstPage)) <- reference + // JField("author", JString(author)) <- reference + // } yield Reference(author, firstPage) + // + // + // + // + // logger.info((json \ "created" \ "timestamp").extractOrElse("missing")) + // logger.info(references.toString()) + // + // logger.info((json \ "type").extractOrElse("missing")) } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/json/JsonWriter.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/json/JsonWriter.java index 7e795c1a0..395f0c0cd 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/json/JsonWriter.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/json/JsonWriter.java @@ -1,20 +1,18 @@ package eu.dnetlib.doiboost.orcid.json; import com.google.gson.JsonObject; - import eu.dnetlib.doiboost.orcid.model.AuthorData; - public class JsonWriter { - public static String create(AuthorData authorData) { - JsonObject author = new JsonObject(); - author.addProperty("oid", authorData.getOid()); - author.addProperty("name", authorData.getName()); - author.addProperty("surname", authorData.getSurname()); - if (authorData.getCreditName()!=null) { - author.addProperty("creditname", authorData.getCreditName()); - } - return author.toString(); - } + public static String create(AuthorData authorData) { + JsonObject author = new JsonObject(); + author.addProperty("oid", authorData.getOid()); + author.addProperty("name", authorData.getName()); + author.addProperty("surname", authorData.getSurname()); + if (authorData.getCreditName() != null) { + author.addProperty("creditname", authorData.getCreditName()); + } + return author.toString(); + } } diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model/AuthorData.java b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model/AuthorData.java index 
66997a57e..6f46b1161 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model/AuthorData.java +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/model/AuthorData.java @@ -2,40 +2,49 @@ package eu.dnetlib.doiboost.orcid.model; public class AuthorData { - private String oid; - private String name; - private String surname; - private String creditName; - private String errorCode; - - public String getErrorCode() { - return errorCode; - } - public void setErrorCode(String errorCode) { - this.errorCode = errorCode; - } - public String getName() { - return name; - } - public void setName(String name) { - this.name = name; - } - public String getSurname() { - return surname; - } - public void setSurname(String surname) { - this.surname = surname; - } - public String getCreditName() { - return creditName; - } - public void setCreditName(String creditName) { - this.creditName = creditName; - } - public String getOid() { - return oid; - } - public void setOid(String oid) { - this.oid = oid; - } + private String oid; + private String name; + private String surname; + private String creditName; + private String errorCode; + + public String getErrorCode() { + return errorCode; + } + + public void setErrorCode(String errorCode) { + this.errorCode = errorCode; + } + + public String getName() { + return name; + } + + public void setName(String name) { + this.name = name; + } + + public String getSurname() { + return surname; + } + + public void setSurname(String surname) { + this.surname = surname; + } + + public String getCreditName() { + return creditName; + } + + public void setCreditName(String creditName) { + this.creditName = creditName; + } + + public String getOid() { + return oid; + } + + public void setOid(String oid) { + this.oid = oid; + } } diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/convert_map_to_oaf_params.json 
b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/convert_map_to_oaf_params.json index 8bac47123..312bd0751 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/convert_map_to_oaf_params.json +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu/dnetlib/dhp/doiboost/convert_map_to_oaf_params.json @@ -1,5 +1,6 @@ [ {"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the path of the sequencial file to read", "paramRequired": true}, + {"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the working dir path", "paramRequired": true}, {"paramName":"m", "paramLongName":"master", "paramDescription": "the master name", "paramRequired": true} ] \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/DoiBoostTest.java b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/DoiBoostTest.java index 9658c8858..49f9ef912 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/DoiBoostTest.java +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/DoiBoostTest.java @@ -26,18 +26,98 @@ public class DoiBoostTest { // CrossrefImporter.main("-n file:///tmp -t file:///tmp/p.seq -ts 1586110000749".split(" // ")); SparkMapDumpIntoOAF.main( - "-m local[*] -s file:///data/doiboost/crossref_dump.seq".split(" ")); + "-m local[*] -s file:///data/doiboost/crossref_dump.seq -t /data/doiboost" + .split(" ")); + } + + @Test + public void testConvertDatasetCrossRef2Oaf() throws IOException { + final String json = IOUtils.toString(getClass().getResourceAsStream("dataset.json")); + ObjectMapper mapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT); + assertNotNull(json); + assertFalse(StringUtils.isBlank(json)); + final Result result = Crossref2Oaf.convert(json, logger); + + logger.info(mapper.writeValueAsString(result)); } @Test public void testConvertPreprintCrossRef2Oaf() throws IOException { - final String json 
= IOUtils.toString(getClass().getResourceAsStream("article.json")); + final String json = IOUtils.toString(getClass().getResourceAsStream("preprint.json")); ObjectMapper mapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT); assertNotNull(json); assertFalse(StringUtils.isBlank(json)); - Crossref2Oaf cf = new Crossref2Oaf(); - final Result result = cf.convert(json, logger); + + final Result result = Crossref2Oaf.convert(json, logger); + assertNotNull(result); + + assertNotNull(result.getDataInfo(), "Datainfo test not null Failed"); + assertNotNull( + result.getDataInfo().getProvenanceaction(), + "DataInfo/Provenance test not null Failed"); + assertFalse( + StringUtils.isBlank(result.getDataInfo().getProvenanceaction().getClassid()), + "DataInfo/Provenance/classId test not null Failed"); + assertFalse( + StringUtils.isBlank(result.getDataInfo().getProvenanceaction().getClassname()), + "DataInfo/Provenance/className test not null Failed"); + assertFalse( + StringUtils.isBlank(result.getDataInfo().getProvenanceaction().getSchemeid()), + "DataInfo/Provenance/SchemeId test not null Failed"); + assertFalse( + StringUtils.isBlank(result.getDataInfo().getProvenanceaction().getSchemename()), + "DataInfo/Provenance/SchemeName test not null Failed"); + + assertNotNull(result.getCollectedfrom(), "CollectedFrom test not null Failed"); + assertTrue(result.getCollectedfrom().size() > 0); + assertTrue( + result.getCollectedfrom().stream() + .anyMatch( + c -> + c.getKey() + .equalsIgnoreCase( + "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2"))); + assertTrue( + result.getCollectedfrom().stream() + .anyMatch(c -> c.getValue().equalsIgnoreCase("crossref"))); + + assertTrue( + result.getRelevantdate().stream() + .anyMatch(d -> d.getQualifier().getClassid().equalsIgnoreCase("created"))); + assertTrue( + result.getRelevantdate().stream() + .anyMatch( + d -> d.getQualifier().getClassid().equalsIgnoreCase("available"))); + assertTrue( + 
result.getRelevantdate().stream() + .anyMatch(d -> d.getQualifier().getClassid().equalsIgnoreCase("accepted"))); + assertTrue( + result.getRelevantdate().stream() + .anyMatch( + d -> + d.getQualifier() + .getClassid() + .equalsIgnoreCase("published-online"))); + assertTrue( + result.getRelevantdate().stream() + .anyMatch( + d -> + d.getQualifier() + .getClassid() + .equalsIgnoreCase("published-print"))); + + logger.info(mapper.writeValueAsString(result)); + } + + @Test + public void testConvertArticleCrossRef2Oaf() throws IOException { + + final String json = IOUtils.toString(getClass().getResourceAsStream("article.json")); + ObjectMapper mapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT); + assertNotNull(json); + assertFalse(StringUtils.isBlank(json)); + final Result result = Crossref2Oaf.convert(json, logger); assertNotNull(result); assertNotNull(result.getDataInfo(), "Datainfo test not null Failed"); @@ -73,15 +153,6 @@ public class DoiBoostTest { assertTrue( result.getRelevantdate().stream() .anyMatch(d -> d.getQualifier().getClassid().equalsIgnoreCase("created"))); - // assertTrue( - // result.getRelevantdate().stream() - // .anyMatch( - // d -> - // d.getQualifier().getClassid().equalsIgnoreCase("available"))); - // assertTrue( - // result.getRelevantdate().stream() - // .anyMatch(d -> - // d.getQualifier().getClassid().equalsIgnoreCase("accepted"))); assertTrue( result.getRelevantdate().stream() .anyMatch( @@ -107,8 +178,7 @@ public class DoiBoostTest { ObjectMapper mapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT); assertNotNull(json); assertFalse(StringUtils.isBlank(json)); - Crossref2Oaf cf = new Crossref2Oaf(); - final Result result = cf.convert(json, logger); + final Result result = Crossref2Oaf.convert(json, logger); assertNotNull(result); logger.info(mapper.writeValueAsString(result)); diff --git a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/article.json 
b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/article.json index e89d41ecd..afef13b69 100644 --- a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/article.json +++ b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/article.json @@ -170,5 +170,5 @@ "container-title": [ "Ecl\u00e9tica Qu\u00edmica Journal" ], - "page": "41" + "page": "41-50" } \ No newline at end of file diff --git a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/dataset.json b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/dataset.json new file mode 100644 index 000000000..5c4b8c5a2 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/dataset.json @@ -0,0 +1,105 @@ +{ + "DOI": "10.1037/e522512014-096", + "subtitle": [ + "(522512014-096)" + ], + "issued": { + "date-parts": [ + [ + 2012 + ] + ] + }, + "prefix": "10.1037", + "author": [ + { + "affiliation": [], + "given": "Jessica", + "family": "Trudeau", + "sequence": "first" + }, + { + "affiliation": [], + "given": "Amy", + "family": "McShane", + "sequence": "additional" + }, + { + "affiliation": [], + "given": "Renee", + "family": "McDonald", + "sequence": "additional" + } + ], + "reference-count": 0, + "member": "15", + "source": "Crossref", + "score": 1.0, + "deposited": { + "timestamp": 1413827035000, + "date-parts": [ + [ + 2014, + 10, + 20 + ] + ], + "date-time": "2014-10-20T17:43:55Z" + }, + "indexed": { + "timestamp": 1550142454710, + "date-parts": [ + [ + 2019, + 2, + 14 + ] + ], + "date-time": "2019-02-14T11:07:34Z" + }, + "type": "dataset", + "URL": "http://dx.doi.org/10.1037/e522512014-096", + "is-referenced-by-count": 0, + "published-print": { + "date-parts": [ + [ + 2012 + ] + ] + }, + "references-count": 0, + "institution": { + "acronym": [ + "APA" + ], + "place": [ + "-" + ], + "name": "American Psychological Association" + }, + "publisher": "American Psychological Association (APA)", + "content-domain": { + 
"domain": [], + "crossmark-restriction": false + }, + "created": { + "timestamp": 1413826121000, + "date-parts": [ + [ + 2014, + 10, + 20 + ] + ], + "date-time": "2014-10-20T17:28:41Z" + }, + "title": [ + "Project Support: A Randomized Control Study to Evaluate the Translation of an Evidence- Based Program" + ], + "alternative-id": [ + "522512014-096" + ], + "container-title": [ + "PsycEXTRA Dataset" + ] +} \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java index 5374a69e8..3519cd88d 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java @@ -10,19 +10,6 @@ import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.oaiIProvenance; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.qualifier; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Date; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -import org.apache.commons.lang3.StringUtils; -import org.dom4j.Document; -import org.dom4j.DocumentFactory; -import org.dom4j.DocumentHelper; -import org.dom4j.Node; - import eu.dnetlib.dhp.oa.graph.raw.common.MigrationConstants; import eu.dnetlib.dhp.schema.oaf.Author; import eu.dnetlib.dhp.schema.oaf.DataInfo; @@ -41,388 +28,439 @@ import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Software; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Date; +import java.util.HashMap; +import java.util.List; +import 
java.util.Map; +import org.apache.commons.lang3.StringUtils; +import org.dom4j.Document; +import org.dom4j.DocumentFactory; +import org.dom4j.DocumentHelper; +import org.dom4j.Node; public abstract class AbstractMdRecordToOafMapper { - protected final Map code2name; - - protected static final Qualifier MAIN_TITLE_QUALIFIER = - qualifier("main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title"); - - protected AbstractMdRecordToOafMapper(final Map code2name) { - this.code2name = code2name; - } - - public List processMdRecord(final String xml) { - try { - final Map nsContext = new HashMap<>(); - nsContext.put("dr", "http://www.driver-repository.eu/namespace/dr"); - nsContext.put("dri", "http://www.driver-repository.eu/namespace/dri"); - nsContext.put("oaf", "http://namespace.openaire.eu/oaf"); - nsContext.put("oai", "http://www.openarchives.org/OAI/2.0/"); - nsContext.put("prov", "http://www.openarchives.org/OAI/2.0/provenance"); - nsContext.put("dc", "http://purl.org/dc/elements/1.1/"); - nsContext.put("datacite", "http://datacite.org/schema/kernel-3"); - DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext); - - final Document doc = - DocumentHelper.parseText(xml.replaceAll("http://datacite.org/schema/kernel-4", "http://datacite.org/schema/kernel-3")); - - final String type = doc.valueOf("//dr:CobjCategory/@type"); - final KeyValue collectedFrom = - keyValue(createOpenaireId(10, doc.valueOf("//oaf:collectedFrom/@id"), true), doc.valueOf("//oaf:collectedFrom/@name")); - final KeyValue hostedBy = - StringUtils.isBlank(doc.valueOf("//oaf:hostedBy/@id")) - ? 
collectedFrom - : keyValue(createOpenaireId(10, doc.valueOf("//oaf:hostedBy/@id"), true), doc.valueOf("//oaf:hostedBy/@name")); - - final DataInfo info = prepareDataInfo(doc); - final long lastUpdateTimestamp = new Date().getTime(); - - return createOafs(doc, type, collectedFrom, hostedBy, info, lastUpdateTimestamp); - } catch (final Exception e) { - throw new RuntimeException(e); - } - } - - protected List createOafs( - final Document doc, - final String type, - final KeyValue collectedFrom, - final KeyValue hostedBy, - final DataInfo info, - final long lastUpdateTimestamp) { - - final List oafs = new ArrayList<>(); - - switch (type.toLowerCase()) { - case "": - case "publication": - final Publication p = new Publication(); - populateResultFields(p, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); - p.setResulttype(MigrationConstants.PUBLICATION_RESULTTYPE_QUALIFIER); - p.setJournal(prepareJournal(doc, info)); - oafs.add(p); - break; - case "dataset": - final Dataset d = new Dataset(); - populateResultFields(d, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); - d.setResulttype(MigrationConstants.DATASET_RESULTTYPE_QUALIFIER); - d.setStoragedate(prepareDatasetStorageDate(doc, info)); - d.setDevice(prepareDatasetDevice(doc, info)); - d.setSize(prepareDatasetSize(doc, info)); - d.setVersion(prepareDatasetVersion(doc, info)); - d.setLastmetadataupdate(prepareDatasetLastMetadataUpdate(doc, info)); - d.setMetadataversionnumber(prepareDatasetMetadataVersionNumber(doc, info)); - d.setGeolocation(prepareDatasetGeoLocations(doc, info)); - oafs.add(d); - break; - case "software": - final Software s = new Software(); - populateResultFields(s, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); - s.setResulttype(MigrationConstants.SOFTWARE_RESULTTYPE_QUALIFIER); - s.setDocumentationUrl(prepareSoftwareDocumentationUrls(doc, info)); - s.setLicense(prepareSoftwareLicenses(doc, info)); - s.setCodeRepositoryUrl(prepareSoftwareCodeRepositoryUrl(doc, info)); - 
s.setProgrammingLanguage(prepareSoftwareProgrammingLanguage(doc, info)); - oafs.add(s); - break; - case "otherresearchproducts": - default: - final OtherResearchProduct o = new OtherResearchProduct(); - populateResultFields(o, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); - o.setResulttype(MigrationConstants.OTHER_RESULTTYPE_QUALIFIER); - o.setContactperson(prepareOtherResearchProductContactPersons(doc, info)); - o.setContactgroup(prepareOtherResearchProductContactGroups(doc, info)); - o.setTool(prepareOtherResearchProductTools(doc, info)); - oafs.add(o); - break; - } - - if (!oafs.isEmpty()) { - oafs.addAll(addProjectRels(doc, collectedFrom, info, lastUpdateTimestamp)); - oafs.addAll(addOtherResultRels(doc, collectedFrom, info, lastUpdateTimestamp)); - } - - return oafs; - } - - private List addProjectRels( - final Document doc, - final KeyValue collectedFrom, - final DataInfo info, - final long lastUpdateTimestamp) { - - final List res = new ArrayList<>(); - - final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false); - - for (final Object o : doc.selectNodes("//oaf:projectid")) { - final String projectId = createOpenaireId(40, ((Node) o).getText(), true); - - final Relation r1 = new Relation(); - r1.setRelType("resultProject"); - r1.setSubRelType("outcome"); - r1.setRelClass("isProducedBy"); - r1.setSource(docId); - r1.setTarget(projectId); - r1.setCollectedfrom(Arrays.asList(collectedFrom)); - r1.setDataInfo(info); - r1.setLastupdatetimestamp(lastUpdateTimestamp); - res.add(r1); - - final Relation r2 = new Relation(); - r2.setRelType("resultProject"); - r2.setSubRelType("outcome"); - r2.setRelClass("produces"); - r2.setSource(projectId); - r2.setTarget(docId); - r2.setCollectedfrom(Arrays.asList(collectedFrom)); - r2.setDataInfo(info); - r2.setLastupdatetimestamp(lastUpdateTimestamp); - res.add(r2); - } - - return res; - } - - protected abstract List addOtherResultRels( - final Document doc, - final KeyValue collectedFrom, 
- final DataInfo info, - final long lastUpdateTimestamp); - - private void populateResultFields( - final Result r, - final Document doc, - final KeyValue collectedFrom, - final KeyValue hostedBy, - final DataInfo info, - final long lastUpdateTimestamp) { - r.setDataInfo(info); - r.setLastupdatetimestamp(lastUpdateTimestamp); - r.setId(createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false)); - r.setOriginalId(Arrays.asList(doc.valueOf("//dri:objIdentifier"))); - r.setCollectedfrom(Arrays.asList(collectedFrom)); - r.setPid(prepareListStructProps(doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info)); - r.setDateofcollection(doc.valueOf("//dr:dateOfCollection")); - r.setDateoftransformation(doc.valueOf("//dr:dateOfTransformation")); - r.setExtraInfo(new ArrayList<>()); // NOT PRESENT IN MDSTORES - r.setOaiprovenance(prepareOAIprovenance(doc)); - r.setAuthor(prepareAuthors(doc, info)); - r.setLanguage(prepareLanguages(doc)); - r.setCountry(new ArrayList<>()); // NOT PRESENT IN MDSTORES - r.setSubject(prepareSubjects(doc, info)); - r.setTitle(prepareTitles(doc, info)); - r.setRelevantdate(prepareRelevantDates(doc, info)); - r.setDescription(prepareDescriptions(doc, info)); - r.setDateofacceptance(prepareField(doc, "//oaf:dateAccepted", info)); - r.setPublisher(preparePublisher(doc, info)); - r.setEmbargoenddate(prepareField(doc, "//oaf:embargoenddate", info)); - r.setSource(prepareSources(doc, info)); - r.setFulltext(new ArrayList<>()); // NOT PRESENT IN MDSTORES - r.setFormat(prepareFormats(doc, info)); - r.setContributor(prepareContributors(doc, info)); - r.setResourcetype(prepareResourceType(doc, info)); - r.setCoverage(prepareCoverages(doc, info)); - r.setContext(new ArrayList<>()); // NOT PRESENT IN MDSTORES - r.setExternalReference(new ArrayList<>()); // NOT PRESENT IN MDSTORES - r.setInstance(prepareInstances(doc, info, collectedFrom, hostedBy)); - } - - protected abstract Qualifier prepareResourceType(Document doc, 
DataInfo info); - - protected abstract List prepareInstances( - Document doc, - DataInfo info, - KeyValue collectedfrom, - KeyValue hostedby); - - protected abstract List> prepareSources(Document doc, DataInfo info); - - protected abstract List prepareRelevantDates(Document doc, DataInfo info); - - protected abstract List> prepareCoverages(Document doc, DataInfo info); - - protected abstract List> prepareContributors(Document doc, DataInfo info); - - protected abstract List> prepareFormats(Document doc, DataInfo info); - - protected abstract Field preparePublisher(Document doc, DataInfo info); - - protected abstract List> prepareDescriptions(Document doc, DataInfo info); - - protected abstract List prepareTitles(Document doc, DataInfo info); - - protected abstract List prepareSubjects(Document doc, DataInfo info); - - protected abstract Qualifier prepareLanguages(Document doc); - - protected abstract List prepareAuthors(Document doc, DataInfo info); - - protected abstract List> prepareOtherResearchProductTools( - Document doc, - DataInfo info); - - protected abstract List> prepareOtherResearchProductContactGroups( - Document doc, - DataInfo info); - - protected abstract List> prepareOtherResearchProductContactPersons( - Document doc, - DataInfo info); - - protected abstract Qualifier prepareSoftwareProgrammingLanguage(Document doc, DataInfo info); - - protected abstract Field prepareSoftwareCodeRepositoryUrl(Document doc, DataInfo info); - - protected abstract List prepareSoftwareLicenses( - Document doc, - DataInfo info); - - protected abstract List> prepareSoftwareDocumentationUrls( - Document doc, - DataInfo info); - - protected abstract List prepareDatasetGeoLocations(Document doc, DataInfo info); - - protected abstract Field prepareDatasetMetadataVersionNumber( - Document doc, - DataInfo info); - - protected abstract Field prepareDatasetLastMetadataUpdate(Document doc, DataInfo info); - - protected abstract Field prepareDatasetVersion(Document doc, DataInfo 
info); - - protected abstract Field prepareDatasetSize(Document doc, DataInfo info); - - protected abstract Field prepareDatasetDevice(Document doc, DataInfo info); - - protected abstract Field prepareDatasetStorageDate(Document doc, DataInfo info); - - private Journal prepareJournal(final Document doc, final DataInfo info) { - final Node n = doc.selectSingleNode("//oaf:journal"); - if (n != null) { - final String name = n.getText(); - final String issnPrinted = n.valueOf("@issn"); - final String issnOnline = n.valueOf("@eissn"); - final String issnLinking = n.valueOf("@lissn"); - final String ep = n.valueOf("@ep"); - final String iss = n.valueOf("@iss"); - final String sp = n.valueOf("@sp"); - final String vol = n.valueOf("@vol"); - final String edition = n.valueOf("@edition"); - if (StringUtils.isNotBlank(name)) { return journal(name, issnPrinted, issnOnline, issnLinking, ep, iss, sp, vol, edition, null, null, info); } - } - return null; - } - - protected Qualifier prepareQualifier( - final Node node, - final String xpath, - final String schemeId, - final String schemeName) { - final String classId = node.valueOf(xpath); - final String className = code2name.get(classId); - return qualifier(classId, className, schemeId, schemeName); - } - - protected List prepareListStructProps( - final Node node, - final String xpath, - final String xpathClassId, - final String schemeId, - final String schemeName, - final DataInfo info) { - final List res = new ArrayList<>(); - for (final Object o : node.selectNodes(xpath)) { - final Node n = (Node) o; - final String classId = n.valueOf(xpathClassId); - final String className = code2name.get(classId); - res.add(structuredProperty(n.getText(), classId, className, schemeId, schemeName, info)); - } - return res; - } - - protected List prepareListStructProps( - final Node node, - final String xpath, - final Qualifier qualifier, - final DataInfo info) { - final List res = new ArrayList<>(); - for (final Object o : 
node.selectNodes(xpath)) { - final Node n = (Node) o; - res.add(structuredProperty(n.getText(), qualifier, info)); - } - return res; - } - - protected List prepareListStructProps( - final Node node, - final String xpath, - final DataInfo info) { - final List res = new ArrayList<>(); - for (final Object o : node.selectNodes(xpath)) { - final Node n = (Node) o; - res.add(structuredProperty(n.getText(), n.valueOf("@classid"), n.valueOf("@classname"), n.valueOf("@schemeid"), n.valueOf("@schemename"), info)); - } - return res; - } - - protected OAIProvenance prepareOAIprovenance(final Document doc) { - final Node n = - doc.selectSingleNode("//*[local-name()='provenance']/*[local-name()='originDescription']"); - - if (n == null) { return null; } - - final String identifier = n.valueOf("./*[local-name()='identifier']"); - final String baseURL = n.valueOf("./*[local-name()='baseURL']");; - final String metadataNamespace = n.valueOf("./*[local-name()='metadataNamespace']");; - final boolean altered = n.valueOf("@altered").equalsIgnoreCase("true"); - final String datestamp = n.valueOf("./*[local-name()='datestamp']");; - final String harvestDate = n.valueOf("@harvestDate");; - - return oaiIProvenance(identifier, baseURL, metadataNamespace, altered, datestamp, harvestDate); - } - - protected DataInfo prepareDataInfo(final Document doc) { - final Node n = doc.selectSingleNode("//oaf:datainfo"); - - if (n == null) { return dataInfo(false, null, false, false, MigrationConstants.REPOSITORY_PROVENANCE_ACTIONS, "0.9"); } - - final String paClassId = n.valueOf("./oaf:provenanceaction/@classid"); - final String paClassName = n.valueOf("./oaf:provenanceaction/@classname"); - final String paSchemeId = n.valueOf("./oaf:provenanceaction/@schemeid"); - final String paSchemeName = n.valueOf("./oaf:provenanceaction/@schemename"); - - final boolean deletedbyinference = - Boolean.parseBoolean(n.valueOf("./oaf:deletedbyinference")); - final String inferenceprovenance = 
n.valueOf("./oaf:inferenceprovenance"); - final Boolean inferred = Boolean.parseBoolean(n.valueOf("./oaf:inferred")); - final String trust = n.valueOf("./oaf:trust"); - - return dataInfo(deletedbyinference, inferenceprovenance, inferred, false, qualifier(paClassId, paClassName, paSchemeId, paSchemeName), trust); - } - - protected Field prepareField(final Node node, final String xpath, final DataInfo info) { - return field(node.valueOf(xpath), info); - } - - protected List> prepareListFields( - final Node node, - final String xpath, - final DataInfo info) { - return listFields(info, prepareListString(node, xpath)); - } - - protected List prepareListString(final Node node, final String xpath) { - final List res = new ArrayList<>(); - for (final Object o : node.selectNodes(xpath)) { - final String s = ((Node) o).getText().trim(); - if (StringUtils.isNotBlank(s)) { - res.add(s); - } - } - return res; - } + protected final Map code2name; + + protected static final Qualifier MAIN_TITLE_QUALIFIER = + qualifier("main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title"); + + protected AbstractMdRecordToOafMapper(final Map code2name) { + this.code2name = code2name; + } + + public List processMdRecord(final String xml) { + try { + final Map nsContext = new HashMap<>(); + nsContext.put("dr", "http://www.driver-repository.eu/namespace/dr"); + nsContext.put("dri", "http://www.driver-repository.eu/namespace/dri"); + nsContext.put("oaf", "http://namespace.openaire.eu/oaf"); + nsContext.put("oai", "http://www.openarchives.org/OAI/2.0/"); + nsContext.put("prov", "http://www.openarchives.org/OAI/2.0/provenance"); + nsContext.put("dc", "http://purl.org/dc/elements/1.1/"); + nsContext.put("datacite", "http://datacite.org/schema/kernel-3"); + DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext); + + final Document doc = + DocumentHelper.parseText( + xml.replaceAll( + "http://datacite.org/schema/kernel-4", + "http://datacite.org/schema/kernel-3")); + + final 
String type = doc.valueOf("//dr:CobjCategory/@type"); + final KeyValue collectedFrom = + keyValue( + createOpenaireId(10, doc.valueOf("//oaf:collectedFrom/@id"), true), + doc.valueOf("//oaf:collectedFrom/@name")); + final KeyValue hostedBy = + StringUtils.isBlank(doc.valueOf("//oaf:hostedBy/@id")) + ? collectedFrom + : keyValue( + createOpenaireId(10, doc.valueOf("//oaf:hostedBy/@id"), true), + doc.valueOf("//oaf:hostedBy/@name")); + + final DataInfo info = prepareDataInfo(doc); + final long lastUpdateTimestamp = new Date().getTime(); + + return createOafs(doc, type, collectedFrom, hostedBy, info, lastUpdateTimestamp); + } catch (final Exception e) { + throw new RuntimeException(e); + } + } + + protected List createOafs( + final Document doc, + final String type, + final KeyValue collectedFrom, + final KeyValue hostedBy, + final DataInfo info, + final long lastUpdateTimestamp) { + + final List oafs = new ArrayList<>(); + + switch (type.toLowerCase()) { + case "": + case "publication": + final Publication p = new Publication(); + populateResultFields(p, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); + p.setResulttype(MigrationConstants.PUBLICATION_RESULTTYPE_QUALIFIER); + p.setJournal(prepareJournal(doc, info)); + oafs.add(p); + break; + case "dataset": + final Dataset d = new Dataset(); + populateResultFields(d, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); + d.setResulttype(MigrationConstants.DATASET_RESULTTYPE_QUALIFIER); + d.setStoragedate(prepareDatasetStorageDate(doc, info)); + d.setDevice(prepareDatasetDevice(doc, info)); + d.setSize(prepareDatasetSize(doc, info)); + d.setVersion(prepareDatasetVersion(doc, info)); + d.setLastmetadataupdate(prepareDatasetLastMetadataUpdate(doc, info)); + d.setMetadataversionnumber(prepareDatasetMetadataVersionNumber(doc, info)); + d.setGeolocation(prepareDatasetGeoLocations(doc, info)); + oafs.add(d); + break; + case "software": + final Software s = new Software(); + populateResultFields(s, doc, 
collectedFrom, hostedBy, info, lastUpdateTimestamp); + s.setResulttype(MigrationConstants.SOFTWARE_RESULTTYPE_QUALIFIER); + s.setDocumentationUrl(prepareSoftwareDocumentationUrls(doc, info)); + s.setLicense(prepareSoftwareLicenses(doc, info)); + s.setCodeRepositoryUrl(prepareSoftwareCodeRepositoryUrl(doc, info)); + s.setProgrammingLanguage(prepareSoftwareProgrammingLanguage(doc, info)); + oafs.add(s); + break; + case "otherresearchproducts": + default: + final OtherResearchProduct o = new OtherResearchProduct(); + populateResultFields(o, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); + o.setResulttype(MigrationConstants.OTHER_RESULTTYPE_QUALIFIER); + o.setContactperson(prepareOtherResearchProductContactPersons(doc, info)); + o.setContactgroup(prepareOtherResearchProductContactGroups(doc, info)); + o.setTool(prepareOtherResearchProductTools(doc, info)); + oafs.add(o); + break; + } + + if (!oafs.isEmpty()) { + oafs.addAll(addProjectRels(doc, collectedFrom, info, lastUpdateTimestamp)); + oafs.addAll(addOtherResultRels(doc, collectedFrom, info, lastUpdateTimestamp)); + } + + return oafs; + } + + private List addProjectRels( + final Document doc, + final KeyValue collectedFrom, + final DataInfo info, + final long lastUpdateTimestamp) { + + final List res = new ArrayList<>(); + + final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false); + + for (final Object o : doc.selectNodes("//oaf:projectid")) { + final String projectId = createOpenaireId(40, ((Node) o).getText(), true); + + final Relation r1 = new Relation(); + r1.setRelType("resultProject"); + r1.setSubRelType("outcome"); + r1.setRelClass("isProducedBy"); + r1.setSource(docId); + r1.setTarget(projectId); + r1.setCollectedfrom(Arrays.asList(collectedFrom)); + r1.setDataInfo(info); + r1.setLastupdatetimestamp(lastUpdateTimestamp); + res.add(r1); + + final Relation r2 = new Relation(); + r2.setRelType("resultProject"); + r2.setSubRelType("outcome"); + r2.setRelClass("produces"); 
+ r2.setSource(projectId); + r2.setTarget(docId); + r2.setCollectedfrom(Arrays.asList(collectedFrom)); + r2.setDataInfo(info); + r2.setLastupdatetimestamp(lastUpdateTimestamp); + res.add(r2); + } + + return res; + } + + protected abstract List addOtherResultRels( + final Document doc, + final KeyValue collectedFrom, + final DataInfo info, + final long lastUpdateTimestamp); + + private void populateResultFields( + final Result r, + final Document doc, + final KeyValue collectedFrom, + final KeyValue hostedBy, + final DataInfo info, + final long lastUpdateTimestamp) { + r.setDataInfo(info); + r.setLastupdatetimestamp(lastUpdateTimestamp); + r.setId(createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false)); + r.setOriginalId(Arrays.asList(doc.valueOf("//dri:objIdentifier"))); + r.setCollectedfrom(Arrays.asList(collectedFrom)); + r.setPid( + prepareListStructProps( + doc, + "//oaf:identifier", + "@identifierType", + "dnet:pid_types", + "dnet:pid_types", + info)); + r.setDateofcollection(doc.valueOf("//dr:dateOfCollection")); + r.setDateoftransformation(doc.valueOf("//dr:dateOfTransformation")); + r.setExtraInfo(new ArrayList<>()); // NOT PRESENT IN MDSTORES + r.setOaiprovenance(prepareOAIprovenance(doc)); + r.setAuthor(prepareAuthors(doc, info)); + r.setLanguage(prepareLanguages(doc)); + r.setCountry(new ArrayList<>()); // NOT PRESENT IN MDSTORES + r.setSubject(prepareSubjects(doc, info)); + r.setTitle(prepareTitles(doc, info)); + r.setRelevantdate(prepareRelevantDates(doc, info)); + r.setDescription(prepareDescriptions(doc, info)); + r.setDateofacceptance(prepareField(doc, "//oaf:dateAccepted", info)); + r.setPublisher(preparePublisher(doc, info)); + r.setEmbargoenddate(prepareField(doc, "//oaf:embargoenddate", info)); + r.setSource(prepareSources(doc, info)); + r.setFulltext(new ArrayList<>()); // NOT PRESENT IN MDSTORES + r.setFormat(prepareFormats(doc, info)); + r.setContributor(prepareContributors(doc, info)); + r.setResourcetype(prepareResourceType(doc, 
info)); + r.setCoverage(prepareCoverages(doc, info)); + r.setContext(new ArrayList<>()); // NOT PRESENT IN MDSTORES + r.setExternalReference(new ArrayList<>()); // NOT PRESENT IN MDSTORES + r.setInstance(prepareInstances(doc, info, collectedFrom, hostedBy)); + } + + protected abstract Qualifier prepareResourceType(Document doc, DataInfo info); + + protected abstract List prepareInstances( + Document doc, DataInfo info, KeyValue collectedfrom, KeyValue hostedby); + + protected abstract List> prepareSources(Document doc, DataInfo info); + + protected abstract List prepareRelevantDates(Document doc, DataInfo info); + + protected abstract List> prepareCoverages(Document doc, DataInfo info); + + protected abstract List> prepareContributors(Document doc, DataInfo info); + + protected abstract List> prepareFormats(Document doc, DataInfo info); + + protected abstract Field preparePublisher(Document doc, DataInfo info); + + protected abstract List> prepareDescriptions(Document doc, DataInfo info); + + protected abstract List prepareTitles(Document doc, DataInfo info); + + protected abstract List prepareSubjects(Document doc, DataInfo info); + + protected abstract Qualifier prepareLanguages(Document doc); + + protected abstract List prepareAuthors(Document doc, DataInfo info); + + protected abstract List> prepareOtherResearchProductTools( + Document doc, DataInfo info); + + protected abstract List> prepareOtherResearchProductContactGroups( + Document doc, DataInfo info); + + protected abstract List> prepareOtherResearchProductContactPersons( + Document doc, DataInfo info); + + protected abstract Qualifier prepareSoftwareProgrammingLanguage(Document doc, DataInfo info); + + protected abstract Field prepareSoftwareCodeRepositoryUrl(Document doc, DataInfo info); + + protected abstract List prepareSoftwareLicenses( + Document doc, DataInfo info); + + protected abstract List> prepareSoftwareDocumentationUrls( + Document doc, DataInfo info); + + protected abstract List 
prepareDatasetGeoLocations(Document doc, DataInfo info); + + protected abstract Field prepareDatasetMetadataVersionNumber( + Document doc, DataInfo info); + + protected abstract Field prepareDatasetLastMetadataUpdate(Document doc, DataInfo info); + + protected abstract Field prepareDatasetVersion(Document doc, DataInfo info); + + protected abstract Field prepareDatasetSize(Document doc, DataInfo info); + + protected abstract Field prepareDatasetDevice(Document doc, DataInfo info); + + protected abstract Field prepareDatasetStorageDate(Document doc, DataInfo info); + + private Journal prepareJournal(final Document doc, final DataInfo info) { + final Node n = doc.selectSingleNode("//oaf:journal"); + if (n != null) { + final String name = n.getText(); + final String issnPrinted = n.valueOf("@issn"); + final String issnOnline = n.valueOf("@eissn"); + final String issnLinking = n.valueOf("@lissn"); + final String ep = n.valueOf("@ep"); + final String iss = n.valueOf("@iss"); + final String sp = n.valueOf("@sp"); + final String vol = n.valueOf("@vol"); + final String edition = n.valueOf("@edition"); + if (StringUtils.isNotBlank(name)) { + return journal( + name, + issnPrinted, + issnOnline, + issnLinking, + ep, + iss, + sp, + vol, + edition, + null, + null, + info); + } + } + return null; + } + + protected Qualifier prepareQualifier( + final Node node, final String xpath, final String schemeId, final String schemeName) { + final String classId = node.valueOf(xpath); + final String className = code2name.get(classId); + return qualifier(classId, className, schemeId, schemeName); + } + + protected List prepareListStructProps( + final Node node, + final String xpath, + final String xpathClassId, + final String schemeId, + final String schemeName, + final DataInfo info) { + final List res = new ArrayList<>(); + for (final Object o : node.selectNodes(xpath)) { + final Node n = (Node) o; + final String classId = n.valueOf(xpathClassId); + final String className = 
code2name.get(classId); + res.add( + structuredProperty( + n.getText(), classId, className, schemeId, schemeName, info)); + } + return res; + } + + protected List prepareListStructProps( + final Node node, final String xpath, final Qualifier qualifier, final DataInfo info) { + final List res = new ArrayList<>(); + for (final Object o : node.selectNodes(xpath)) { + final Node n = (Node) o; + res.add(structuredProperty(n.getText(), qualifier, info)); + } + return res; + } + + protected List prepareListStructProps( + final Node node, final String xpath, final DataInfo info) { + final List res = new ArrayList<>(); + for (final Object o : node.selectNodes(xpath)) { + final Node n = (Node) o; + res.add( + structuredProperty( + n.getText(), + n.valueOf("@classid"), + n.valueOf("@classname"), + n.valueOf("@schemeid"), + n.valueOf("@schemename"), + info)); + } + return res; + } + + protected OAIProvenance prepareOAIprovenance(final Document doc) { + final Node n = + doc.selectSingleNode( + "//*[local-name()='provenance']/*[local-name()='originDescription']"); + + if (n == null) { + return null; + } + + final String identifier = n.valueOf("./*[local-name()='identifier']"); + final String baseURL = n.valueOf("./*[local-name()='baseURL']"); + ; + final String metadataNamespace = n.valueOf("./*[local-name()='metadataNamespace']"); + ; + final boolean altered = n.valueOf("@altered").equalsIgnoreCase("true"); + final String datestamp = n.valueOf("./*[local-name()='datestamp']"); + ; + final String harvestDate = n.valueOf("@harvestDate"); + ; + + return oaiIProvenance( + identifier, baseURL, metadataNamespace, altered, datestamp, harvestDate); + } + + protected DataInfo prepareDataInfo(final Document doc) { + final Node n = doc.selectSingleNode("//oaf:datainfo"); + + if (n == null) { + return dataInfo( + false, + null, + false, + false, + MigrationConstants.REPOSITORY_PROVENANCE_ACTIONS, + "0.9"); + } + + final String paClassId = n.valueOf("./oaf:provenanceaction/@classid"); + 
final String paClassName = n.valueOf("./oaf:provenanceaction/@classname"); + final String paSchemeId = n.valueOf("./oaf:provenanceaction/@schemeid"); + final String paSchemeName = n.valueOf("./oaf:provenanceaction/@schemename"); + + final boolean deletedbyinference = + Boolean.parseBoolean(n.valueOf("./oaf:deletedbyinference")); + final String inferenceprovenance = n.valueOf("./oaf:inferenceprovenance"); + final Boolean inferred = Boolean.parseBoolean(n.valueOf("./oaf:inferred")); + final String trust = n.valueOf("./oaf:trust"); + + return dataInfo( + deletedbyinference, + inferenceprovenance, + inferred, + false, + qualifier(paClassId, paClassName, paSchemeId, paSchemeName), + trust); + } + + protected Field prepareField(final Node node, final String xpath, final DataInfo info) { + return field(node.valueOf(xpath), info); + } + + protected List> prepareListFields( + final Node node, final String xpath, final DataInfo info) { + return listFields(info, prepareListString(node, xpath)); + } + + protected List prepareListString(final Node node, final String xpath) { + final List res = new ArrayList<>(); + for (final Object o : node.selectNodes(xpath)) { + final String s = ((Node) o).getText().trim(); + if (StringUtils.isNotBlank(s)) { + res.add(s); + } + } + return res; + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java index eb3a6a8c7..b59056528 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java @@ -10,23 +10,6 @@ import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.listKeyValues; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.qualifier; import static 
eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty; -import java.io.Closeable; -import java.io.IOException; -import java.sql.Array; -import java.sql.ResultSet; -import java.sql.SQLException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Date; -import java.util.List; -import java.util.function.Consumer; -import java.util.function.Function; - -import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.StringUtils; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; - import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.oa.graph.raw.common.AbstractMigrationApplication; import eu.dnetlib.dhp.oa.graph.raw.common.DbClient; @@ -48,460 +31,531 @@ import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Software; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; +import java.io.Closeable; +import java.io.IOException; +import java.sql.Array; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Date; +import java.util.List; +import java.util.function.Consumer; +import java.util.function.Function; +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; public class MigrateDbEntitiesApplication extends AbstractMigrationApplication - implements Closeable { - - private static final Log log = LogFactory.getLog(MigrateDbEntitiesApplication.class); - - private final DbClient dbClient; - - private final long lastUpdateTimestamp; - - public static void main(final String[] args) throws Exception { - final ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString(MigrateDbEntitiesApplication.class - .getResourceAsStream("/eu/dnetlib/dhp/oa/graph/migrate_db_entities_parameters.json"))); 
- - parser.parseArgument(args); - - final String dbUrl = parser.get("postgresUrl"); - final String dbUser = parser.get("postgresUser"); - final String dbPassword = parser.get("postgresPassword"); - - final String hdfsPath = parser.get("hdfsPath"); - - final boolean processClaims = - parser.get("action") != null && parser.get("action").equalsIgnoreCase("claims"); - - try (final MigrateDbEntitiesApplication smdbe = - new MigrateDbEntitiesApplication(hdfsPath, dbUrl, dbUser, dbPassword)) { - if (processClaims) { - log.info("Processing claims..."); - smdbe.execute("queryClaims.sql", smdbe::processClaims); - } else { - log.info("Processing datasources..."); - smdbe.execute("queryDatasources.sql", smdbe::processDatasource); - - log.info("Processing projects..."); - smdbe.execute("queryProjects.sql", smdbe::processProject); - - log.info("Processing orgs..."); - smdbe.execute("queryOrganizations.sql", smdbe::processOrganization); - - log.info("Processing relations ds <-> orgs ..."); - smdbe.execute("queryDatasourceOrganization.sql", smdbe::processDatasourceOrganization); - - log.info("Processing projects <-> orgs ..."); - smdbe.execute("queryProjectOrganization.sql", smdbe::processProjectOrganization); - } - log.info("All done."); - } - } - - protected MigrateDbEntitiesApplication() { // ONLY FOR UNIT TEST - super(); - this.dbClient = null; - this.lastUpdateTimestamp = new Date().getTime(); - } - - public MigrateDbEntitiesApplication( - final String hdfsPath, final String dbUrl, final String dbUser, final String dbPassword) - throws Exception { - super(hdfsPath); - this.dbClient = new DbClient(dbUrl, dbUser, dbPassword); - this.lastUpdateTimestamp = new Date().getTime(); - } - - public void execute(final String sqlFile, final Function> producer) - throws Exception { - final String sql = - IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/sql/" + sqlFile)); - - final Consumer consumer = rs -> producer.apply(rs).forEach(oaf -> emitOaf(oaf)); - - 
dbClient.processResults(sql, consumer); - } - - public List processDatasource(final ResultSet rs) { - - try { - - final DataInfo info = prepareDataInfo(rs); - - final Datasource ds = new Datasource(); - - ds.setId(createOpenaireId(10, rs.getString("datasourceid"), true)); - ds.setOriginalId(Arrays.asList(rs.getString("datasourceid"))); - ds.setCollectedfrom(listKeyValues(createOpenaireId(10, rs.getString("collectedfromid"), true), rs.getString("collectedfromname"))); - ds.setPid(new ArrayList<>()); - ds.setDateofcollection(asString(rs.getDate("dateofcollection"))); - ds.setDateoftransformation(null); // Value not returned by the SQL query - ds.setExtraInfo(new ArrayList<>()); // Values not present in the DB - ds.setOaiprovenance(null); // Values not present in the DB - ds.setDatasourcetype(prepareQualifierSplitting(rs.getString("datasourcetype"))); - ds.setOpenairecompatibility(prepareQualifierSplitting(rs.getString("openairecompatibility"))); - ds.setOfficialname(field(rs.getString("officialname"), info)); - ds.setEnglishname(field(rs.getString("englishname"), info)); - ds.setWebsiteurl(field(rs.getString("websiteurl"), info)); - ds.setLogourl(field(rs.getString("logourl"), info)); - ds.setContactemail(field(rs.getString("contactemail"), info)); - ds.setNamespaceprefix(field(rs.getString("namespaceprefix"), info)); - ds.setLatitude(field(Double.toString(rs.getDouble("latitude")), info)); - ds.setLongitude(field(Double.toString(rs.getDouble("longitude")), info)); - ds.setDateofvalidation(field(asString(rs.getDate("dateofvalidation")), info)); - ds.setDescription(field(rs.getString("description"), info)); - ds.setSubjects(prepareListOfStructProps(rs.getArray("subjects"), info)); - ds.setOdnumberofitems(field(Double.toString(rs.getInt("odnumberofitems")), info)); - ds.setOdnumberofitemsdate(field(asString(rs.getDate("odnumberofitemsdate")), info)); - ds.setOdpolicies(field(rs.getString("odpolicies"), info)); - 
ds.setOdlanguages(prepareListFields(rs.getArray("odlanguages"), info)); - ds.setOdcontenttypes(prepareListFields(rs.getArray("odcontenttypes"), info)); - ds.setAccessinfopackage(prepareListFields(rs.getArray("accessinfopackage"), info)); - ds.setReleasestartdate(field(asString(rs.getDate("releasestartdate")), info)); - ds.setReleaseenddate(field(asString(rs.getDate("releaseenddate")), info)); - ds.setMissionstatementurl(field(rs.getString("missionstatementurl"), info)); - ds.setDataprovider(field(rs.getBoolean("dataprovider"), info)); - ds.setServiceprovider(field(rs.getBoolean("serviceprovider"), info)); - ds.setDatabaseaccesstype(field(rs.getString("databaseaccesstype"), info)); - ds.setDatauploadtype(field(rs.getString("datauploadtype"), info)); - ds.setDatabaseaccessrestriction(field(rs.getString("databaseaccessrestriction"), info)); - ds.setDatauploadrestriction(field(rs.getString("datauploadrestriction"), info)); - ds.setVersioning(field(rs.getBoolean("versioning"), info)); - ds.setCitationguidelineurl(field(rs.getString("citationguidelineurl"), info)); - ds.setQualitymanagementkind(field(rs.getString("qualitymanagementkind"), info)); - ds.setPidsystems(field(rs.getString("pidsystems"), info)); - ds.setCertificates(field(rs.getString("certificates"), info)); - ds.setPolicies(new ArrayList<>()); // The sql query returns an empty array - ds.setJournal(prepareJournal(rs.getString("officialname"), rs.getString("journal"), info)); // Journal - ds.setDataInfo(info); - ds.setLastupdatetimestamp(lastUpdateTimestamp); - - return Arrays.asList(ds); - } catch (final Exception e) { - throw new RuntimeException(e); - } - } - - public List processProject(final ResultSet rs) { - try { - - final DataInfo info = prepareDataInfo(rs); - - final Project p = new Project(); - - p.setId(createOpenaireId(40, rs.getString("projectid"), true)); - p.setOriginalId(Arrays.asList(rs.getString("projectid"))); - p.setCollectedfrom(listKeyValues(createOpenaireId(10, 
rs.getString("collectedfromid"), true), rs.getString("collectedfromname"))); - p.setPid(new ArrayList<>()); - p.setDateofcollection(asString(rs.getDate("dateofcollection"))); - p.setDateoftransformation(asString(rs.getDate("dateoftransformation"))); - p.setExtraInfo(new ArrayList<>()); // Values not present in the DB - p.setOaiprovenance(null); // Values not present in the DB - p.setWebsiteurl(field(rs.getString("websiteurl"), info)); - p.setCode(field(rs.getString("code"), info)); - p.setAcronym(field(rs.getString("acronym"), info)); - p.setTitle(field(rs.getString("title"), info)); - p.setStartdate(field(asString(rs.getDate("startdate")), info)); - p.setEnddate(field(asString(rs.getDate("enddate")), info)); - p.setCallidentifier(field(rs.getString("callidentifier"), info)); - p.setKeywords(field(rs.getString("keywords"), info)); - p.setDuration(field(Integer.toString(rs.getInt("duration")), info)); - p.setEcsc39(field(Boolean.toString(rs.getBoolean("ecsc39")), info)); - p.setOamandatepublications(field(Boolean.toString(rs.getBoolean("oamandatepublications")), info)); - p.setEcarticle29_3(field(Boolean.toString(rs.getBoolean("ecarticle29_3")), info)); - p.setSubjects(prepareListOfStructProps(rs.getArray("subjects"), info)); - p.setFundingtree(prepareListFields(rs.getArray("fundingtree"), info)); - p.setContracttype(prepareQualifierSplitting(rs.getString("contracttype"))); - p.setOptional1(field(rs.getString("optional1"), info)); - p.setOptional2(field(rs.getString("optional2"), info)); - p.setJsonextrainfo(field(rs.getString("jsonextrainfo"), info)); - p.setContactfullname(field(rs.getString("contactfullname"), info)); - p.setContactfax(field(rs.getString("contactfax"), info)); - p.setContactphone(field(rs.getString("contactphone"), info)); - p.setContactemail(field(rs.getString("contactemail"), info)); - p.setSummary(field(rs.getString("summary"), info)); - p.setCurrency(field(rs.getString("currency"), info)); - p.setTotalcost(new 
Float(rs.getDouble("totalcost"))); - p.setFundedamount(new Float(rs.getDouble("fundedamount"))); - p.setDataInfo(info); - p.setLastupdatetimestamp(lastUpdateTimestamp); - - return Arrays.asList(p); - - } catch (final Exception e) { - throw new RuntimeException(e); - } - } - - public List processOrganization(final ResultSet rs) { - - try { - - final DataInfo info = prepareDataInfo(rs); - - final Organization o = new Organization(); - - o.setId(createOpenaireId(20, rs.getString("organizationid"), true)); - o.setOriginalId(Arrays.asList(rs.getString("organizationid"))); - o.setCollectedfrom(listKeyValues(createOpenaireId(10, rs.getString("collectedfromid"), true), rs.getString("collectedfromname"))); - o.setPid(new ArrayList<>()); - o.setDateofcollection(asString(rs.getDate("dateofcollection"))); - o.setDateoftransformation(asString(rs.getDate("dateoftransformation"))); - o.setExtraInfo(new ArrayList<>()); // Values not present in the DB - o.setOaiprovenance(null); // Values not present in the DB - o.setLegalshortname(field(rs.getString("legalshortname"), info)); - o.setLegalname(field(rs.getString("legalname"), info)); - o.setAlternativeNames(new ArrayList<>()); // Values not returned by the SQL query - o.setWebsiteurl(field(rs.getString("websiteurl"), info)); - o.setLogourl(field(rs.getString("logourl"), info)); - o.setEclegalbody(field(Boolean.toString(rs.getBoolean("eclegalbody")), info)); - o.setEclegalperson(field(Boolean.toString(rs.getBoolean("eclegalperson")), info)); - o.setEcnonprofit(field(Boolean.toString(rs.getBoolean("ecnonprofit")), info)); - o.setEcresearchorganization(field(Boolean.toString(rs.getBoolean("ecresearchorganization")), info)); - o.setEchighereducation(field(Boolean.toString(rs.getBoolean("echighereducation")), info)); - o.setEcinternationalorganizationeurinterests(field(Boolean.toString(rs.getBoolean("ecinternationalorganizationeurinterests")), info)); - 
o.setEcinternationalorganization(field(Boolean.toString(rs.getBoolean("ecinternationalorganization")), info)); - o.setEcenterprise(field(Boolean.toString(rs.getBoolean("ecenterprise")), info)); - o.setEcsmevalidated(field(Boolean.toString(rs.getBoolean("ecsmevalidated")), info)); - o.setEcnutscode(field(Boolean.toString(rs.getBoolean("ecnutscode")), info)); - o.setCountry(prepareQualifierSplitting(rs.getString("country"))); - o.setDataInfo(info); - o.setLastupdatetimestamp(lastUpdateTimestamp); - - return Arrays.asList(o); - } catch (final Exception e) { - throw new RuntimeException(e); - } - } - - public List processDatasourceOrganization(final ResultSet rs) { - try { - final DataInfo info = prepareDataInfo(rs); - final String orgId = createOpenaireId(20, rs.getString("organization"), true); - final String dsId = createOpenaireId(10, rs.getString("datasource"), true); - final List collectedFrom = - listKeyValues(createOpenaireId(10, rs.getString("collectedfromid"), true), rs.getString("collectedfromname")); - - final Relation r1 = new Relation(); - r1.setRelType("datasourceOrganization"); - r1.setSubRelType("provision"); - r1.setRelClass("isProvidedBy"); - r1.setSource(dsId); - r1.setTarget(orgId); - r1.setCollectedfrom(collectedFrom); - r1.setDataInfo(info); - r1.setLastupdatetimestamp(lastUpdateTimestamp); - - final Relation r2 = new Relation(); - r2.setRelType("datasourceOrganization"); - r2.setSubRelType("provision"); - r2.setRelClass("provides"); - r2.setSource(orgId); - r2.setTarget(dsId); - r2.setCollectedfrom(collectedFrom); - r2.setDataInfo(info); - r2.setLastupdatetimestamp(lastUpdateTimestamp); - - return Arrays.asList(r1, r2); - } catch (final Exception e) { - throw new RuntimeException(e); - } - } - - public List processProjectOrganization(final ResultSet rs) { - try { - final DataInfo info = prepareDataInfo(rs); - final String orgId = createOpenaireId(20, rs.getString("resporganization"), true); - final String projectId = createOpenaireId(40, 
rs.getString("project"), true); - final List collectedFrom = - listKeyValues(createOpenaireId(10, rs.getString("collectedfromid"), true), rs.getString("collectedfromname")); - - final Relation r1 = new Relation(); - r1.setRelType("projectOrganization"); - r1.setSubRelType("participation"); - r1.setRelClass("isParticipant"); - r1.setSource(projectId); - r1.setTarget(orgId); - r1.setCollectedfrom(collectedFrom); - r1.setDataInfo(info); - r1.setLastupdatetimestamp(lastUpdateTimestamp); - - final Relation r2 = new Relation(); - r2.setRelType("projectOrganization"); - r2.setSubRelType("participation"); - r2.setRelClass("hasParticipant"); - r2.setSource(orgId); - r2.setTarget(projectId); - r2.setCollectedfrom(collectedFrom); - r2.setDataInfo(info); - r2.setLastupdatetimestamp(lastUpdateTimestamp); - - return Arrays.asList(r1, r2); - } catch (final Exception e) { - throw new RuntimeException(e); - } - } - - public List processClaims(final ResultSet rs) { - - final DataInfo info = - dataInfo(false, null, false, false, qualifier("user:claim", "user:claim", "dnet:provenanceActions", "dnet:provenanceActions"), "0.9"); - - final List collectedFrom = - listKeyValues(createOpenaireId(10, "infrastruct_::openaire", true), "OpenAIRE"); - - try { - - if (rs.getString("source_type").equals("context")) { - final Result r; - - if (rs.getString("target_type").equals("dataset")) { - r = new Dataset(); - r.setResulttype(MigrationConstants.DATASET_RESULTTYPE_QUALIFIER); - } else if (rs.getString("target_type").equals("software")) { - r = new Software(); - r.setResulttype(MigrationConstants.SOFTWARE_RESULTTYPE_QUALIFIER); - } else if (rs.getString("target_type").equals("other")) { - r = new OtherResearchProduct(); - r.setResulttype(MigrationConstants.OTHER_RESULTTYPE_QUALIFIER); - } else { - r = new Publication(); - r.setResulttype(MigrationConstants.PUBLICATION_RESULTTYPE_QUALIFIER); - } - r.setId(createOpenaireId(50, rs.getString("target_id"), false)); - 
r.setLastupdatetimestamp(lastUpdateTimestamp); - r.setContext(prepareContext(rs.getString("source_id"), info)); - r.setDataInfo(info); - r.setCollectedfrom(collectedFrom); - - return Arrays.asList(r); - } else { - final String sourceId = - createOpenaireId(rs.getString("source_type"), rs.getString("source_id"), false); - final String targetId = - createOpenaireId(rs.getString("target_type"), rs.getString("target_id"), false); - - final Relation r1 = new Relation(); - final Relation r2 = new Relation(); - - if (rs.getString("source_type").equals("project")) { - r1.setCollectedfrom(collectedFrom); - r1.setRelType("resultProject"); - r1.setSubRelType("outcome"); - r1.setRelClass("produces"); - - r2.setCollectedfrom(collectedFrom); - r2.setRelType("resultProject"); - r2.setSubRelType("outcome"); - r2.setRelClass("isProducedBy"); - } else { - r1.setCollectedfrom(collectedFrom); - r1.setRelType("resultResult"); - r1.setSubRelType("relationship"); - r1.setRelClass("isRelatedTo"); - - r2.setCollectedfrom(collectedFrom); - r2.setRelType("resultResult"); - r2.setSubRelType("relationship"); - r2.setRelClass("isRelatedTo"); - } - - r1.setSource(sourceId); - r1.setTarget(targetId); - r1.setDataInfo(info); - r1.setLastupdatetimestamp(lastUpdateTimestamp); - - r2.setSource(targetId); - r2.setTarget(sourceId); - r2.setDataInfo(info); - r2.setLastupdatetimestamp(lastUpdateTimestamp); - - return Arrays.asList(r1, r2); - } - - } catch (final Exception e) { - throw new RuntimeException(e); - } - } - - private List prepareContext(final String id, final DataInfo dataInfo) { - final Context context = new Context(); - context.setId(id); - context.setDataInfo(Arrays.asList(dataInfo)); - return Arrays.asList(context); - } - - private DataInfo prepareDataInfo(final ResultSet rs) throws SQLException { - final Boolean deletedbyinference = rs.getBoolean("deletedbyinference"); - final String inferenceprovenance = rs.getString("inferenceprovenance"); - final Boolean inferred = 
rs.getBoolean("inferred"); - final String trust = rs.getString("trust"); - return dataInfo(deletedbyinference, inferenceprovenance, inferred, false, MigrationConstants.ENTITYREGISTRY_PROVENANCE_ACTION, trust); - } - - private Qualifier prepareQualifierSplitting(final String s) { - if (StringUtils.isBlank(s)) { return null; } - final String[] arr = s.split("@@@"); - return arr.length == 4 ? qualifier(arr[0], arr[1], arr[2], arr[3]) : null; - } - - private List> prepareListFields(final Array array, final DataInfo info) { - try { - return array != null - ? listFields(info, (String[]) array.getArray()) - : new ArrayList<>(); - } catch (final SQLException e) { - throw new RuntimeException("Invalid SQL array", e); - } - } - - private StructuredProperty prepareStructProp(final String s, final DataInfo dataInfo) { - if (StringUtils.isBlank(s)) { return null; } - final String[] parts = s.split("###"); - if (parts.length == 2) { - final String value = parts[0]; - final String[] arr = parts[1].split("@@@"); - if (arr.length == 4) { return structuredProperty(value, arr[0], arr[1], arr[2], arr[3], dataInfo); } - } - return null; - } - - private List prepareListOfStructProps( - final Array array, - final DataInfo dataInfo) throws SQLException { - final List res = new ArrayList<>(); - if (array != null) { - for (final String s : (String[]) array.getArray()) { - final StructuredProperty sp = prepareStructProp(s, dataInfo); - if (sp != null) { - res.add(sp); - } - } - } - - return res; - } - - private Journal prepareJournal(final String name, final String sj, final DataInfo info) { - if (StringUtils.isNotBlank(sj)) { - final String[] arr = sj.split("@@@"); - if (arr.length == 3) { - final String issn = StringUtils.isNotBlank(arr[0]) ? arr[0] : null; - final String eissn = StringUtils.isNotBlank(arr[1]) ? arr[1] : null;; - final String lissn = StringUtils.isNotBlank(arr[2]) ? 
arr[2] : null;; - if (issn != null || eissn != null || lissn != null) { - return journal(name, issn, eissn, eissn, null, null, null, null, null, null, null, info); - } - } - } - return null; - } - - @Override - public void close() throws IOException { - super.close(); - dbClient.close(); - } + implements Closeable { + + private static final Log log = LogFactory.getLog(MigrateDbEntitiesApplication.class); + + private final DbClient dbClient; + + private final long lastUpdateTimestamp; + + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = + new ArgumentApplicationParser( + IOUtils.toString( + MigrateDbEntitiesApplication.class.getResourceAsStream( + "/eu/dnetlib/dhp/oa/graph/migrate_db_entities_parameters.json"))); + + parser.parseArgument(args); + + final String dbUrl = parser.get("postgresUrl"); + final String dbUser = parser.get("postgresUser"); + final String dbPassword = parser.get("postgresPassword"); + + final String hdfsPath = parser.get("hdfsPath"); + + final boolean processClaims = + parser.get("action") != null && parser.get("action").equalsIgnoreCase("claims"); + + try (final MigrateDbEntitiesApplication smdbe = + new MigrateDbEntitiesApplication(hdfsPath, dbUrl, dbUser, dbPassword)) { + if (processClaims) { + log.info("Processing claims..."); + smdbe.execute("queryClaims.sql", smdbe::processClaims); + } else { + log.info("Processing datasources..."); + smdbe.execute("queryDatasources.sql", smdbe::processDatasource); + + log.info("Processing projects..."); + smdbe.execute("queryProjects.sql", smdbe::processProject); + + log.info("Processing orgs..."); + smdbe.execute("queryOrganizations.sql", smdbe::processOrganization); + + log.info("Processing relations ds <-> orgs ..."); + smdbe.execute( + "queryDatasourceOrganization.sql", smdbe::processDatasourceOrganization); + + log.info("Processing projects <-> orgs ..."); + smdbe.execute("queryProjectOrganization.sql", smdbe::processProjectOrganization); + 
} + log.info("All done."); + } + } + + protected MigrateDbEntitiesApplication() { // ONLY FOR UNIT TEST + super(); + this.dbClient = null; + this.lastUpdateTimestamp = new Date().getTime(); + } + + public MigrateDbEntitiesApplication( + final String hdfsPath, final String dbUrl, final String dbUser, final String dbPassword) + throws Exception { + super(hdfsPath); + this.dbClient = new DbClient(dbUrl, dbUser, dbPassword); + this.lastUpdateTimestamp = new Date().getTime(); + } + + public void execute(final String sqlFile, final Function> producer) + throws Exception { + final String sql = + IOUtils.toString( + getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/sql/" + sqlFile)); + + final Consumer consumer = rs -> producer.apply(rs).forEach(oaf -> emitOaf(oaf)); + + dbClient.processResults(sql, consumer); + } + + public List processDatasource(final ResultSet rs) { + + try { + + final DataInfo info = prepareDataInfo(rs); + + final Datasource ds = new Datasource(); + + ds.setId(createOpenaireId(10, rs.getString("datasourceid"), true)); + ds.setOriginalId(Arrays.asList(rs.getString("datasourceid"))); + ds.setCollectedfrom( + listKeyValues( + createOpenaireId(10, rs.getString("collectedfromid"), true), + rs.getString("collectedfromname"))); + ds.setPid(new ArrayList<>()); + ds.setDateofcollection(asString(rs.getDate("dateofcollection"))); + ds.setDateoftransformation(null); // Value not returned by the SQL query + ds.setExtraInfo(new ArrayList<>()); // Values not present in the DB + ds.setOaiprovenance(null); // Values not present in the DB + ds.setDatasourcetype(prepareQualifierSplitting(rs.getString("datasourcetype"))); + ds.setOpenairecompatibility( + prepareQualifierSplitting(rs.getString("openairecompatibility"))); + ds.setOfficialname(field(rs.getString("officialname"), info)); + ds.setEnglishname(field(rs.getString("englishname"), info)); + ds.setWebsiteurl(field(rs.getString("websiteurl"), info)); + ds.setLogourl(field(rs.getString("logourl"), info)); + 
ds.setContactemail(field(rs.getString("contactemail"), info)); + ds.setNamespaceprefix(field(rs.getString("namespaceprefix"), info)); + ds.setLatitude(field(Double.toString(rs.getDouble("latitude")), info)); + ds.setLongitude(field(Double.toString(rs.getDouble("longitude")), info)); + ds.setDateofvalidation(field(asString(rs.getDate("dateofvalidation")), info)); + ds.setDescription(field(rs.getString("description"), info)); + ds.setSubjects(prepareListOfStructProps(rs.getArray("subjects"), info)); + ds.setOdnumberofitems(field(Double.toString(rs.getInt("odnumberofitems")), info)); + ds.setOdnumberofitemsdate(field(asString(rs.getDate("odnumberofitemsdate")), info)); + ds.setOdpolicies(field(rs.getString("odpolicies"), info)); + ds.setOdlanguages(prepareListFields(rs.getArray("odlanguages"), info)); + ds.setOdcontenttypes(prepareListFields(rs.getArray("odcontenttypes"), info)); + ds.setAccessinfopackage(prepareListFields(rs.getArray("accessinfopackage"), info)); + ds.setReleasestartdate(field(asString(rs.getDate("releasestartdate")), info)); + ds.setReleaseenddate(field(asString(rs.getDate("releaseenddate")), info)); + ds.setMissionstatementurl(field(rs.getString("missionstatementurl"), info)); + ds.setDataprovider(field(rs.getBoolean("dataprovider"), info)); + ds.setServiceprovider(field(rs.getBoolean("serviceprovider"), info)); + ds.setDatabaseaccesstype(field(rs.getString("databaseaccesstype"), info)); + ds.setDatauploadtype(field(rs.getString("datauploadtype"), info)); + ds.setDatabaseaccessrestriction(field(rs.getString("databaseaccessrestriction"), info)); + ds.setDatauploadrestriction(field(rs.getString("datauploadrestriction"), info)); + ds.setVersioning(field(rs.getBoolean("versioning"), info)); + ds.setCitationguidelineurl(field(rs.getString("citationguidelineurl"), info)); + ds.setQualitymanagementkind(field(rs.getString("qualitymanagementkind"), info)); + ds.setPidsystems(field(rs.getString("pidsystems"), info)); + 
ds.setCertificates(field(rs.getString("certificates"), info)); + ds.setPolicies(new ArrayList<>()); // The sql query returns an empty array + ds.setJournal( + prepareJournal( + rs.getString("officialname"), + rs.getString("journal"), + info)); // Journal + ds.setDataInfo(info); + ds.setLastupdatetimestamp(lastUpdateTimestamp); + + return Arrays.asList(ds); + } catch (final Exception e) { + throw new RuntimeException(e); + } + } + + public List processProject(final ResultSet rs) { + try { + + final DataInfo info = prepareDataInfo(rs); + + final Project p = new Project(); + + p.setId(createOpenaireId(40, rs.getString("projectid"), true)); + p.setOriginalId(Arrays.asList(rs.getString("projectid"))); + p.setCollectedfrom( + listKeyValues( + createOpenaireId(10, rs.getString("collectedfromid"), true), + rs.getString("collectedfromname"))); + p.setPid(new ArrayList<>()); + p.setDateofcollection(asString(rs.getDate("dateofcollection"))); + p.setDateoftransformation(asString(rs.getDate("dateoftransformation"))); + p.setExtraInfo(new ArrayList<>()); // Values not present in the DB + p.setOaiprovenance(null); // Values not present in the DB + p.setWebsiteurl(field(rs.getString("websiteurl"), info)); + p.setCode(field(rs.getString("code"), info)); + p.setAcronym(field(rs.getString("acronym"), info)); + p.setTitle(field(rs.getString("title"), info)); + p.setStartdate(field(asString(rs.getDate("startdate")), info)); + p.setEnddate(field(asString(rs.getDate("enddate")), info)); + p.setCallidentifier(field(rs.getString("callidentifier"), info)); + p.setKeywords(field(rs.getString("keywords"), info)); + p.setDuration(field(Integer.toString(rs.getInt("duration")), info)); + p.setEcsc39(field(Boolean.toString(rs.getBoolean("ecsc39")), info)); + p.setOamandatepublications( + field(Boolean.toString(rs.getBoolean("oamandatepublications")), info)); + p.setEcarticle29_3(field(Boolean.toString(rs.getBoolean("ecarticle29_3")), info)); + 
p.setSubjects(prepareListOfStructProps(rs.getArray("subjects"), info)); + p.setFundingtree(prepareListFields(rs.getArray("fundingtree"), info)); + p.setContracttype(prepareQualifierSplitting(rs.getString("contracttype"))); + p.setOptional1(field(rs.getString("optional1"), info)); + p.setOptional2(field(rs.getString("optional2"), info)); + p.setJsonextrainfo(field(rs.getString("jsonextrainfo"), info)); + p.setContactfullname(field(rs.getString("contactfullname"), info)); + p.setContactfax(field(rs.getString("contactfax"), info)); + p.setContactphone(field(rs.getString("contactphone"), info)); + p.setContactemail(field(rs.getString("contactemail"), info)); + p.setSummary(field(rs.getString("summary"), info)); + p.setCurrency(field(rs.getString("currency"), info)); + p.setTotalcost(new Float(rs.getDouble("totalcost"))); + p.setFundedamount(new Float(rs.getDouble("fundedamount"))); + p.setDataInfo(info); + p.setLastupdatetimestamp(lastUpdateTimestamp); + + return Arrays.asList(p); + + } catch (final Exception e) { + throw new RuntimeException(e); + } + } + + public List processOrganization(final ResultSet rs) { + + try { + + final DataInfo info = prepareDataInfo(rs); + + final Organization o = new Organization(); + + o.setId(createOpenaireId(20, rs.getString("organizationid"), true)); + o.setOriginalId(Arrays.asList(rs.getString("organizationid"))); + o.setCollectedfrom( + listKeyValues( + createOpenaireId(10, rs.getString("collectedfromid"), true), + rs.getString("collectedfromname"))); + o.setPid(new ArrayList<>()); + o.setDateofcollection(asString(rs.getDate("dateofcollection"))); + o.setDateoftransformation(asString(rs.getDate("dateoftransformation"))); + o.setExtraInfo(new ArrayList<>()); // Values not present in the DB + o.setOaiprovenance(null); // Values not present in the DB + o.setLegalshortname(field(rs.getString("legalshortname"), info)); + o.setLegalname(field(rs.getString("legalname"), info)); + o.setAlternativeNames(new ArrayList<>()); // Values not 
returned by the SQL query + o.setWebsiteurl(field(rs.getString("websiteurl"), info)); + o.setLogourl(field(rs.getString("logourl"), info)); + o.setEclegalbody(field(Boolean.toString(rs.getBoolean("eclegalbody")), info)); + o.setEclegalperson(field(Boolean.toString(rs.getBoolean("eclegalperson")), info)); + o.setEcnonprofit(field(Boolean.toString(rs.getBoolean("ecnonprofit")), info)); + o.setEcresearchorganization( + field(Boolean.toString(rs.getBoolean("ecresearchorganization")), info)); + o.setEchighereducation( + field(Boolean.toString(rs.getBoolean("echighereducation")), info)); + o.setEcinternationalorganizationeurinterests( + field( + Boolean.toString( + rs.getBoolean("ecinternationalorganizationeurinterests")), + info)); + o.setEcinternationalorganization( + field(Boolean.toString(rs.getBoolean("ecinternationalorganization")), info)); + o.setEcenterprise(field(Boolean.toString(rs.getBoolean("ecenterprise")), info)); + o.setEcsmevalidated(field(Boolean.toString(rs.getBoolean("ecsmevalidated")), info)); + o.setEcnutscode(field(Boolean.toString(rs.getBoolean("ecnutscode")), info)); + o.setCountry(prepareQualifierSplitting(rs.getString("country"))); + o.setDataInfo(info); + o.setLastupdatetimestamp(lastUpdateTimestamp); + + return Arrays.asList(o); + } catch (final Exception e) { + throw new RuntimeException(e); + } + } + + public List processDatasourceOrganization(final ResultSet rs) { + try { + final DataInfo info = prepareDataInfo(rs); + final String orgId = createOpenaireId(20, rs.getString("organization"), true); + final String dsId = createOpenaireId(10, rs.getString("datasource"), true); + final List collectedFrom = + listKeyValues( + createOpenaireId(10, rs.getString("collectedfromid"), true), + rs.getString("collectedfromname")); + + final Relation r1 = new Relation(); + r1.setRelType("datasourceOrganization"); + r1.setSubRelType("provision"); + r1.setRelClass("isProvidedBy"); + r1.setSource(dsId); + r1.setTarget(orgId); + 
r1.setCollectedfrom(collectedFrom); + r1.setDataInfo(info); + r1.setLastupdatetimestamp(lastUpdateTimestamp); + + final Relation r2 = new Relation(); + r2.setRelType("datasourceOrganization"); + r2.setSubRelType("provision"); + r2.setRelClass("provides"); + r2.setSource(orgId); + r2.setTarget(dsId); + r2.setCollectedfrom(collectedFrom); + r2.setDataInfo(info); + r2.setLastupdatetimestamp(lastUpdateTimestamp); + + return Arrays.asList(r1, r2); + } catch (final Exception e) { + throw new RuntimeException(e); + } + } + + public List processProjectOrganization(final ResultSet rs) { + try { + final DataInfo info = prepareDataInfo(rs); + final String orgId = createOpenaireId(20, rs.getString("resporganization"), true); + final String projectId = createOpenaireId(40, rs.getString("project"), true); + final List collectedFrom = + listKeyValues( + createOpenaireId(10, rs.getString("collectedfromid"), true), + rs.getString("collectedfromname")); + + final Relation r1 = new Relation(); + r1.setRelType("projectOrganization"); + r1.setSubRelType("participation"); + r1.setRelClass("isParticipant"); + r1.setSource(projectId); + r1.setTarget(orgId); + r1.setCollectedfrom(collectedFrom); + r1.setDataInfo(info); + r1.setLastupdatetimestamp(lastUpdateTimestamp); + + final Relation r2 = new Relation(); + r2.setRelType("projectOrganization"); + r2.setSubRelType("participation"); + r2.setRelClass("hasParticipant"); + r2.setSource(orgId); + r2.setTarget(projectId); + r2.setCollectedfrom(collectedFrom); + r2.setDataInfo(info); + r2.setLastupdatetimestamp(lastUpdateTimestamp); + + return Arrays.asList(r1, r2); + } catch (final Exception e) { + throw new RuntimeException(e); + } + } + + public List processClaims(final ResultSet rs) { + + final DataInfo info = + dataInfo( + false, + null, + false, + false, + qualifier( + "user:claim", + "user:claim", + "dnet:provenanceActions", + "dnet:provenanceActions"), + "0.9"); + + final List collectedFrom = + listKeyValues(createOpenaireId(10, 
"infrastruct_::openaire", true), "OpenAIRE"); + + try { + + if (rs.getString("source_type").equals("context")) { + final Result r; + + if (rs.getString("target_type").equals("dataset")) { + r = new Dataset(); + r.setResulttype(MigrationConstants.DATASET_RESULTTYPE_QUALIFIER); + } else if (rs.getString("target_type").equals("software")) { + r = new Software(); + r.setResulttype(MigrationConstants.SOFTWARE_RESULTTYPE_QUALIFIER); + } else if (rs.getString("target_type").equals("other")) { + r = new OtherResearchProduct(); + r.setResulttype(MigrationConstants.OTHER_RESULTTYPE_QUALIFIER); + } else { + r = new Publication(); + r.setResulttype(MigrationConstants.PUBLICATION_RESULTTYPE_QUALIFIER); + } + r.setId(createOpenaireId(50, rs.getString("target_id"), false)); + r.setLastupdatetimestamp(lastUpdateTimestamp); + r.setContext(prepareContext(rs.getString("source_id"), info)); + r.setDataInfo(info); + r.setCollectedfrom(collectedFrom); + + return Arrays.asList(r); + } else { + final String sourceId = + createOpenaireId( + rs.getString("source_type"), rs.getString("source_id"), false); + final String targetId = + createOpenaireId( + rs.getString("target_type"), rs.getString("target_id"), false); + + final Relation r1 = new Relation(); + final Relation r2 = new Relation(); + + if (rs.getString("source_type").equals("project")) { + r1.setCollectedfrom(collectedFrom); + r1.setRelType("resultProject"); + r1.setSubRelType("outcome"); + r1.setRelClass("produces"); + + r2.setCollectedfrom(collectedFrom); + r2.setRelType("resultProject"); + r2.setSubRelType("outcome"); + r2.setRelClass("isProducedBy"); + } else { + r1.setCollectedfrom(collectedFrom); + r1.setRelType("resultResult"); + r1.setSubRelType("relationship"); + r1.setRelClass("isRelatedTo"); + + r2.setCollectedfrom(collectedFrom); + r2.setRelType("resultResult"); + r2.setSubRelType("relationship"); + r2.setRelClass("isRelatedTo"); + } + + r1.setSource(sourceId); + r1.setTarget(targetId); + r1.setDataInfo(info); + 
r1.setLastupdatetimestamp(lastUpdateTimestamp);
+
+                r2.setSource(targetId);
+                r2.setTarget(sourceId);
+                r2.setDataInfo(info);
+                r2.setLastupdatetimestamp(lastUpdateTimestamp);
+
+                return Arrays.asList(r1, r2);
+            }
+
+        } catch (final Exception e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    /**
+     * Wraps a context id into a single-element list of {@code Context}.
+     *
+     * @param id identifier set on the new context
+     * @param dataInfo data info attached to the context (as a one-element list)
+     * @return a fixed-size list holding exactly one context
+     */
+    private List<Context> prepareContext(final String id, final DataInfo dataInfo) {
+        final Context context = new Context();
+        context.setId(id);
+        context.setDataInfo(Arrays.asList(dataInfo));
+        return Arrays.asList(context);
+    }
+
+    /**
+     * Builds the {@code DataInfo} for the current row of the given result set.
+     *
+     * <p>Reads the columns {@code deletedbyinference}, {@code inferenceprovenance},
+     * {@code inferred} and {@code trust}; the provenance action is always
+     * {@code MigrationConstants.ENTITYREGISTRY_PROVENANCE_ACTION}.
+     *
+     * @param rs result set positioned on the row to read
+     * @return the data info assembled by {@code dataInfo(...)}
+     * @throws SQLException if a column cannot be read
+     */
+    private DataInfo prepareDataInfo(final ResultSet rs) throws SQLException {
+        final Boolean deletedbyinference = rs.getBoolean("deletedbyinference");
+        final String inferenceprovenance = rs.getString("inferenceprovenance");
+        final Boolean inferred = rs.getBoolean("inferred");
+        final String trust = rs.getString("trust");
+        // NOTE(review): the fourth argument is hard-coded to false; presumably the
+        // "invisible" flag -- confirm against the dataInfo(...) helper.
+        return dataInfo(
+                deletedbyinference,
+                inferenceprovenance,
+                inferred,
+                false,
+                MigrationConstants.ENTITYREGISTRY_PROVENANCE_ACTION,
+                trust);
+    }
+
+    /**
+     * Parses a string made of four {@code @@@}-separated parts into a {@code Qualifier}.
+     *
+     * @param s the encoded qualifier, may be blank
+     * @return {@code qualifier(p0, p1, p2, p3)}, or {@code null} when {@code s} is blank or
+     *     does not split into exactly four parts
+     */
+    private Qualifier prepareQualifierSplitting(final String s) {
+        if (StringUtils.isBlank(s)) {
+            return null;
+        }
+        final String[] arr = s.split("@@@");
+        return arr.length == 4 ? qualifier(arr[0], arr[1], arr[2], arr[3]) : null;
+    }
+
+    /**
+     * Converts an SQL array of strings into a list of fields sharing the same data info.
+     *
+     * @param array SQL array whose content is cast to {@code String[]}; may be {@code null}
+     * @param info data info handed to {@code listFields(...)}
+     * @return the result of {@code listFields(...)}, or a new empty list when {@code array}
+     *     is {@code null}
+     * @throws RuntimeException wrapping the {@code SQLException} raised while reading the array
+     */
+    private List<Field<String>> prepareListFields(final Array array, final DataInfo info) {
+        try {
+            return array != null
+                    ? listFields(info, (String[]) array.getArray())
+                    : new ArrayList<>();
+        } catch (final SQLException e) {
+            throw new RuntimeException("Invalid SQL array", e);
+        }
+    }
+
+    /**
+     * Parses a string of the form {@code value###p0@@@p1@@@p2@@@p3} into a
+     * {@code StructuredProperty}.
+     *
+     * @param s the encoded property, may be blank
+     * @param dataInfo data info passed through to {@code structuredProperty(...)}
+     * @return the structured property, or {@code null} when {@code s} is blank, does not
+     *     contain exactly two {@code ###}-separated parts, or its second part does not split
+     *     into exactly four {@code @@@}-separated parts
+     */
+    private StructuredProperty prepareStructProp(final String s, final DataInfo dataInfo) {
+        if (StringUtils.isBlank(s)) {
+            return null;
+        }
+        final String[] parts = s.split("###");
+        if (parts.length == 2) {
+            final String value = parts[0];
+            final String[] arr = parts[1].split("@@@");
+            if (arr.length == 4) {
+                return structuredProperty(value, arr[0], arr[1], arr[2], arr[3], dataInfo);
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Converts an SQL array of encoded strings into structured properties, dropping every
+     * entry that {@code prepareStructProp} cannot parse.
+     *
+     * @param array SQL array whose content is cast to {@code String[]}; may be {@code null}
+     * @param dataInfo data info applied to every property
+     * @return a mutable list of the successfully parsed properties; empty when {@code array}
+     *     is {@code null}
+     * @throws SQLException if the array content cannot be read
+     */
+    private List<StructuredProperty> prepareListOfStructProps(
+            final Array array, final DataInfo dataInfo) throws SQLException {
+        final List<StructuredProperty> res = new ArrayList<>();
+        if (array != null) {
+            for (final String s : (String[]) array.getArray()) {
+                final StructuredProperty sp = prepareStructProp(s, dataInfo);
+                if (sp != null) {
+                    res.add(sp);
+                }
+            }
+        }
+
+        return res;
+    }
+
+    /**
+     * Builds a {@code Journal} from its name and a string of the form
+     * {@code issn@@@eissn@@@lissn}.
+     *
+     * @param name journal name, passed through unchanged
+     * @param sj the three encoded ISSNs; blank parts are treated as missing
+     * @param info data info passed through to {@code journal(...)}
+     * @return the journal, or {@code null} when {@code sj} is blank, does not split into
+     *     exactly three parts, or all three parts are blank
+     */
+    private Journal prepareJournal(final String name, final String sj, final DataInfo info) {
+        if (StringUtils.isNotBlank(sj)) {
+            // NOTE(review): String.split drops trailing empty strings, so an input whose
+            // last parts are empty (not merely blank) yields fewer than three elements
+            // and is rejected -- confirm that the producer pads empty parts.
+            final String[] arr = sj.split("@@@");
+            if (arr.length == 3) {
+                final String issn = StringUtils.isNotBlank(arr[0]) ? arr[0] : null;
+                final String eissn = StringUtils.isNotBlank(arr[1]) ? arr[1] : null;
+                final String lissn = StringUtils.isNotBlank(arr[2]) ? arr[2] : null;
+                if (issn != null || eissn != null || lissn != null) {
+                    // Each parsed ISSN goes into its own argument: lissn (not a second
+                    // copy of eissn) follows eissn. Presumably that slot is the linking
+                    // ISSN -- confirm against the journal(...) helper.
+                    return journal(
+                            name, issn, eissn, lissn, null, null, null, null, null, null, null,
+                            info);
+                }
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Closes the parent resources and then the database client.
+     *
+     * @throws IOException if either close operation fails
+     */
+    @Override
+    public void close() throws IOException {
+        try {
+            super.close();
+        } finally {
+            // Release the database client even when the parent close fails.
+            dbClient.close();
+        }
+    }
 }
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/MigrationConstants.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/MigrationConstants.java
index f9ff105b0..2f31b1e03 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/MigrationConstants.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/MigrationConstants.java
@@ -6,17 +6,28 @@ import eu.dnetlib.dhp.schema.oaf.Qualifier;
 
 public class MigrationConstants {
 
-    public static final Qualifier PUBLICATION_RESULTTYPE_QUALIFIER =
-            qualifier("publication", "publication", "dnet:result_typologies", "dnet:result_typologies");
-    public static final Qualifier DATASET_RESULTTYPE_QUALIFIER =
-            qualifier("dataset", "dataset", "dnet:result_typologies", "dnet:result_typologies");
-    public static final Qualifier SOFTWARE_RESULTTYPE_QUALIFIER =
-            qualifier("software", "software", "dnet:result_typologies", "dnet:result_typologies");
-    public static final Qualifier OTHER_RESULTTYPE_QUALIFIER =
-            qualifier("other", "other", "dnet:result_typologies", "dnet:result_typologies");
-    public static final Qualifier REPOSITORY_PROVENANCE_ACTIONS =
-            qualifier("sysimport:crosswalk:repository", "sysimport:crosswalk:repository", "dnet:provenanceActions", "dnet:provenanceActions");
-    public static final Qualifier ENTITYREGISTRY_PROVENANCE_ACTION =
-            qualifier("sysimport:crosswalk:entityregistry", "sysimport:crosswalk:entityregistry", "dnet:provenanceActions", "dnet:provenanceActions");
-
+    public static final Qualifier PUBLICATION_RESULTTYPE_QUALIFIER =
+            qualifier(
+                    "publication",
+                    "publication",
+                    "dnet:result_typologies",
+                    "dnet:result_typologies");
+    public static final Qualifier DATASET_RESULTTYPE_QUALIFIER =
+            qualifier("dataset", "dataset", "dnet:result_typologies", "dnet:result_typologies");
+    public static final Qualifier SOFTWARE_RESULTTYPE_QUALIFIER =
+            qualifier("software", "software", "dnet:result_typologies", "dnet:result_typologies");
+    public static final Qualifier OTHER_RESULTTYPE_QUALIFIER =
+            qualifier("other", "other", "dnet:result_typologies", "dnet:result_typologies");
+    public static final Qualifier REPOSITORY_PROVENANCE_ACTIONS =
+            qualifier(
+                    "sysimport:crosswalk:repository",
+                    "sysimport:crosswalk:repository",
+                    "dnet:provenanceActions",
+                    "dnet:provenanceActions");
+    public static final Qualifier ENTITYREGISTRY_PROVENANCE_ACTION =
+            qualifier(
+                    "sysimport:crosswalk:entityregistry",
+                    "sysimport:crosswalk:entityregistry",
+                    "dnet:provenanceActions",
+                    "dnet:provenanceActions");
 }