From 4269e2f0d15f7f310d254a661030acfede7aabe2 Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Thu, 9 Dec 2021 14:59:01 +0100 Subject: [PATCH] added citation retrieving and conversion; added ped test --- .../bioschema/model/BioSchemaProtein.java | 29 +-- .../dhp/rdfconverter/utils/RDFConverter.java | 18 +- .../rdfconverter/bioschema/ConverterTest.java | 15 +- .../dnetlib/dhp/rdfconverter/bioschema/ped.nq | 229 ++++++++++++++++++ .../src/test/resources/log4j.properties | 11 + 5 files changed, 276 insertions(+), 26 deletions(-) create mode 100644 dhp-workflows/dhp-rdfconverter/src/test/resources/eu/dnetlib/dhp/rdfconverter/bioschema/ped.nq create mode 100644 dhp-workflows/dhp-rdfconverter/src/test/resources/log4j.properties diff --git a/dhp-workflows/dhp-rdfconverter/src/main/java/eu/dnetlib/dhp/rdfconverter/bioschema/model/BioSchemaProtein.java b/dhp-workflows/dhp-rdfconverter/src/main/java/eu/dnetlib/dhp/rdfconverter/bioschema/model/BioSchemaProtein.java index ffac36459..962644c07 100644 --- a/dhp-workflows/dhp-rdfconverter/src/main/java/eu/dnetlib/dhp/rdfconverter/bioschema/model/BioSchemaProtein.java +++ b/dhp-workflows/dhp-rdfconverter/src/main/java/eu/dnetlib/dhp/rdfconverter/bioschema/model/BioSchemaProtein.java @@ -14,8 +14,6 @@ public class BioSchemaProtein { private List entryList; @JsonProperty("http://purl.org/pav/retrievedOn") private DateTimeType retrievedOn; - @JsonProperty("citation") - private Citation citation; public static class Entry { @JsonProperty("@id") @@ -50,6 +48,8 @@ public class BioSchemaProtein { private IsPartOfBioChemEntity isPartOfBioChemEntity; @JsonProperty("mainEntityOfPage") private Link mainEntityOfPage; + @JsonProperty("https://schema.org/citation") + private Citation citation; public String getId() { return id; @@ -190,6 +190,13 @@ public class BioSchemaProtein { this.mainEntityOfPage = mainEntityOfPage; } + public Citation getCitation() { + return citation; + } + + public void setCitation(Citation citation) { + this.citation = citation; + } } public static class IsPartOfBioChemEntity { @@ -328,19 +335,9 @@ public class BioSchemaProtein { } public static class Citation { - @JsonProperty("@type") - private String type; @JsonProperty("@id") private String id; - public String getType() { - return type; - } - - public void setType(String type) { - this.type = type; - } - public String getId() { return id; } @@ -373,12 +370,4 @@ public class BioSchemaProtein { public void setRetrievedOn(DateTimeType retrievedOn) { this.retrievedOn = retrievedOn; } - - public Citation getCitation() { - return citation; - } - - public void setCitation(Citation citation) { - this.citation = citation; - } } diff --git a/dhp-workflows/dhp-rdfconverter/src/main/java/eu/dnetlib/dhp/rdfconverter/utils/RDFConverter.java b/dhp-workflows/dhp-rdfconverter/src/main/java/eu/dnetlib/dhp/rdfconverter/utils/RDFConverter.java index db5a38f2c..d57a548bf 100644 --- a/dhp-workflows/dhp-rdfconverter/src/main/java/eu/dnetlib/dhp/rdfconverter/utils/RDFConverter.java +++ b/dhp-workflows/dhp-rdfconverter/src/main/java/eu/dnetlib/dhp/rdfconverter/utils/RDFConverter.java @@ -4,6 +4,7 @@ package eu.dnetlib.dhp.rdfconverter.utils; import java.io.StringReader; import java.io.StringWriter; import java.util.*; +import java.util.stream.Collectors; import org.eclipse.rdf4j.model.Model; import org.eclipse.rdf4j.rio.RDFFormat; @@ -42,6 +43,7 @@ public class RDFConverter { Object jsonObject = JsonUtils.fromString(jsonLDBuffer); Object compact = JsonLdProcessor.compact(jsonObject, new HashMap<>(), new JsonLdOptions()); String compactContent = JsonUtils.toString(compact); + log.debug("jsonld: " + compactContent); ObjectMapper objectMapper = new ObjectMapper(); objectMapper.enable(DeserializationFeature.ACCEPT_SINGLE_VALUE_AS_ARRAY); @@ -50,7 +52,14 @@ public class RDFConverter { BioSchemaProtein bioSchemaProtein = objectMapper.readValue(compactContent, BioSchemaProtein.class); log.debug("BioSchema id: " + bioSchemaProtein.getId()); BioSchemaProtein.DateTimeType retrievedOnType = bioSchemaProtein.getRetrievedOn(); - BioSchemaProtein.Citation citation = bioSchemaProtein.getCitation(); + + List citations = bioSchemaProtein.getEntryList().stream().map(entry -> { + if (entry.getCitation() != null) { + BioSchemaProtein.Citation citationInfo = entry.getCitation(); + return citationInfo.getId(); + } + return null; + }).filter(id -> id != null).collect(Collectors.toList()); ArrayList results = new ArrayList(); bioSchemaProtein.getEntryList().stream().forEach(entry -> { @@ -59,9 +68,10 @@ public class RDFConverter { && entry.getType().stream().filter(type -> type.equals("https://schema.org/Protein")).count() == 1) { DataciteProtein dataciteProtein = new DataciteProtein(); - if (citation != null) { - addRelatedIdentifier(dataciteProtein, citation.getId(), "CitedBy"); - } + + citations.forEach(citation -> { + addRelatedIdentifier(dataciteProtein, citation, "CitedBy"); + }); DataciteProtein.Types types = new DataciteProtein.Types(); types.setResourceType("Protein"); diff --git a/dhp-workflows/dhp-rdfconverter/src/test/java/eu/dnetlib/dhp/rdfconverter/bioschema/ConverterTest.java b/dhp-workflows/dhp-rdfconverter/src/test/java/eu/dnetlib/dhp/rdfconverter/bioschema/ConverterTest.java index e4042fe8b..a6945ced8 100644 --- a/dhp-workflows/dhp-rdfconverter/src/test/java/eu/dnetlib/dhp/rdfconverter/bioschema/ConverterTest.java +++ b/dhp-workflows/dhp-rdfconverter/src/test/java/eu/dnetlib/dhp/rdfconverter/bioschema/ConverterTest.java @@ -17,11 +17,22 @@ public class ConverterTest { static Logger logger = LoggerFactory.getLogger(ConverterTest.class); @Test -// @Disabled public void nqToDataciteTest() throws Exception { InputStream is = ConverterTest.class.getResourceAsStream("/eu/dnetlib/dhp/rdfconverter/bioschema/disprot.nq"); String nq = IOUtils.toString(is); - logger.info("NQ: " + nq); + logger.debug("NQ: " + nq); + RDFConverter converter = new RDFConverter(); + ArrayList results = converter.nQuadsFile2DataciteJson(nq, "Protein"); + if (results != null && !results.isEmpty()) { + logger.info("JSON DATACITE: " + results.get(0)); + } + } + + @Test + public void pedCitationTest() throws Exception { + InputStream is = ConverterTest.class.getResourceAsStream("/eu/dnetlib/dhp/rdfconverter/bioschema/ped.nq"); + String nq = IOUtils.toString(is); + logger.debug("NQ: " + nq); RDFConverter converter = new RDFConverter(); ArrayList results = converter.nQuadsFile2DataciteJson(nq, "Protein"); if (results != null && !results.isEmpty()) { diff --git a/dhp-workflows/dhp-rdfconverter/src/test/resources/eu/dnetlib/dhp/rdfconverter/bioschema/ped.nq b/dhp-workflows/dhp-rdfconverter/src/test/resources/eu/dnetlib/dhp/rdfconverter/bioschema/ped.nq new file mode 100644 index 000000000..babe3aeef --- /dev/null +++ b/dhp-workflows/dhp-rdfconverter/src/test/resources/eu/dnetlib/dhp/rdfconverter/bioschema/ped.nq @@ -0,0 +1,229 @@ + . + "2021-12-06T11:52:22"^^ . + . + . + . + "https://identifiers.org/ped:PED00014" . + "https://proteinensemble.org/#2021-02-12" . + . + "Structural ensemble of pSic1 (1-90) phosphorylated at Thr5, Thr33, Thr45, Ser69, Ser76 and Ser80, in complex with SKP1 (4-186) and CDC4 (270-744)" . + . + . + . + . + . + "3" . + . + . + "LKRDLITSLPFEISLKIFNYLQFEDIINSLGVSQNWNKIIRKSTSLWKKLLISENFVSPKGFNSLNLKLSQKYPKLSQQDRLRLSFLENIFILKNWYNPKFVPQRTTLRGHMTSVITCLQFEDNYVITGADDKMIRVYDSINKKFLLQLSGHDGGVWALKYAHGGILVSGSTDRTVRVWDIKKGCCTHVFKGHNSTVRCLDIVEYKNIKYIVTGSRDNTLHVWKLPKESSVPDHGEEHDYPLVFHTPEENPYFVGVLRGHMASVRTVSGHGNIVVSGSYDNTLIVWDVAQMKCLYILSGHTDRIYSTIYDHERKRCISASMDTTIRIWDLENIWNNGECSYATNSASPCAKILGAMYTLQGHTALVGLLRLSDKFLVSAAADGSIRGWDANDYSRKFSYHHTNLSAITTFYVSDNILVSGSENQFNIYNLRSGKLVHANILKDADQIWSVNFKGKTLVAAVEKDGQSFLEILDFS" . + . + "https://identifiers.org/uniprot:P07834" . + "Cell division control protein 4" . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + "Term" . + . + . + . + "NMR" . + "IDPO:00120" . + . + "IDP ontology" . + . + "Term" . + . + . + . + "RDC" . + "IDPO:00166" . + . + "Term" . + . + . + . + "chemical shift" . + "IDPO:00167" . + . + "Term" . + . + . + . + "relaxation" . + "IDPO:00168" . + . + "Term" . + . + . + . + "T2 relaxation" . + "IDPO:00169" . + . + "Term" . + . + . + . + "SAXS" . + "IDPO:00125" . + . + "Term" . + . + . + . + "TraDES" . + "IDPO:00186" . + . + "Term" . + . + . + . + "CNS" . + "IDPO:00192" . + . + "Term" . + . + . + . + "CRYSOL" . + "IDPO:00208" . + . + "Term" . + . + . + . + "ShiftX" . + "IDPO:00210" . + . + "Term" . + . + . + . + "ENSEMBLE" . + "IDPO:00216" . + . + "744" . + "270" . + . + . + "SNVVLVSGEGERFTVDKKIAERSLLLKNYLNDMHDSNLQNNSDSESDSDSETNHKSKDNNNGDDDDEDDDEIVMPVPNVRSSVLQKVIEWAEHHRDSNFPDEDDDDSRKSAPVDSWDREFLKVDQEMLYEIILAANYLNIKPLLDAGCKVVAEMIRGRSPEEIRRTFNIVNDFTPEEEAAIRR" . + . + "https://identifiers.org/uniprot:P52286" . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + "Term" . + . + . + "Term" . + . + . + "Term" . + . + . + "Term" . + . + . + "Term" . + . + . + "Term" . + . + . + "Term" . + . + . + "Term" . + . + . + "Term" . + . + . + "Term" . + . + . + "Term" . + . + . + "186" . + "4" . + . + . + "MTPSTPPRSRGTRYLAQPSGNTSSSALMQGQKTPQKPSQNLVPVTPSTTKSFKNAPLLAPPNSNMGMTSPFNGLTSPQRSPFPKSSVKRT" . + . + "https://identifiers.org/uniprot:P38634" . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + . + "Term" . + . + . + "Term" . + . + . + "Term" . + . + . + "Term" . + . + . + "Term" . + . + . + "Term" . + . + . + "Term" . + . + . + "Term" . + . + . + "Term" . + . + . + "Term" . + . + . + "Term" . + . + . + "90" . + "1" . + "PED" . diff --git a/dhp-workflows/dhp-rdfconverter/src/test/resources/log4j.properties b/dhp-workflows/dhp-rdfconverter/src/test/resources/log4j.properties new file mode 100644 index 000000000..20f56e38d --- /dev/null +++ b/dhp-workflows/dhp-rdfconverter/src/test/resources/log4j.properties @@ -0,0 +1,11 @@ +# Set root logger level to DEBUG and its only appender to A1. +log4j.rootLogger=INFO, A1 + +# A1 is set to be a ConsoleAppender. +log4j.appender.A1=org.apache.log4j.ConsoleAppender + +# A1 uses PatternLayout. +log4j.logger.org = ERROR +log4j.logger.eu.dnetlib = DEBUG +log4j.appender.A1.layout=org.apache.log4j.PatternLayout +log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n \ No newline at end of file