From f43bfdb594b4249a11a55a0b16551e57ff89e81c Mon Sep 17 00:00:00 2001 From: Enrico Ottonello Date: Thu, 17 Mar 2022 19:24:07 +0100 Subject: [PATCH] added subjects --- .../bioschema/model/BioSchemaProtein.java | 81 +++++++++++---- .../bioschema/model/DataciteProtein.java | 40 ++++++++ .../dhp/rdfconverter/utils/RDFConverter.java | 98 ++++++++++++++++--- .../rdfconverter/bioschema/ConverterTest.java | 24 +++-- 4 files changed, 199 insertions(+), 44 deletions(-) diff --git a/dhp-workflows/dhp-rdfconverter/src/main/java/eu/dnetlib/dhp/rdfconverter/bioschema/model/BioSchemaProtein.java b/dhp-workflows/dhp-rdfconverter/src/main/java/eu/dnetlib/dhp/rdfconverter/bioschema/model/BioSchemaProtein.java index d9f123963..ececaf413 100644 --- a/dhp-workflows/dhp-rdfconverter/src/main/java/eu/dnetlib/dhp/rdfconverter/bioschema/model/BioSchemaProtein.java +++ b/dhp-workflows/dhp-rdfconverter/src/main/java/eu/dnetlib/dhp/rdfconverter/bioschema/model/BioSchemaProtein.java @@ -19,7 +19,8 @@ public class BioSchemaProtein { @JsonProperty("@id") private String id; @JsonProperty("@type") - private List type; +// private List type; + private String type; @JsonProperty("https://schema.org/identifier") private String identifier; @JsonProperty("https://schema.org/name") @@ -50,8 +51,16 @@ public class BioSchemaProtein { private Link mainEntityOfPage; @JsonProperty("https://schema.org/citation") private Citation citation; + @JsonProperty("https://schema.org/sameAs") + private List sameAs; @JsonProperty("https://schema.org/hasSequenceAnnotation") - private SequenceAnnotation sequenceAnnotation; + private Link hasSequenceAnnotation; + @JsonProperty("https://schema.org/additionalProperty") + private List sequenceAnnotation; + @JsonProperty("https://schema.org/value") + private Link propertyValue; + @JsonProperty("https://schema.org/termCode") + private String termCode; public String getId() { return id; @@ -61,11 +70,11 @@ public class BioSchemaProtein { this.id = id; } - public List getType() { + public String getType() { return type; } - public void setType(List type) { + public void setType(String type) { this.type = type; } @@ -77,9 +86,6 @@ public class BioSchemaProtein { this.name = name; } - @JsonProperty("https://schema.org/sameAs") - private List sameAs; - public List getSameAs() { return sameAs; } @@ -192,11 +198,19 @@ public class BioSchemaProtein { this.mainEntityOfPage = mainEntityOfPage; } - public SequenceAnnotation getSequenceAnnotation() { + public Link getHasSequenceAnnotation() { + return hasSequenceAnnotation; + } + + public void setHasSequenceAnnotation(Link hasSequenceAnnotation) { + this.hasSequenceAnnotation = hasSequenceAnnotation; + } + + public List getSequenceAnnotation() { return sequenceAnnotation; } - public void setSequenceAnnotation(SequenceAnnotation sequenceAnnotation) { + public void setSequenceAnnotation(List sequenceAnnotation) { this.sequenceAnnotation = sequenceAnnotation; } @@ -207,6 +221,22 @@ public class BioSchemaProtein { public void setCitation(Citation citation) { this.citation = citation; } + + public Link getPropertyValue() { + return propertyValue; + } + + public void setPropertyValue(Link propertyValue) { + this.propertyValue = propertyValue; + } + + public String getTermCode() { + return termCode; + } + + public void setTermCode(String termCode) { + this.termCode = termCode; + } } public static class IsPartOfBioChemEntity { @@ -357,33 +387,46 @@ public class BioSchemaProtein { } } + public static class SequenceAnnotationId { + @JsonProperty("@id") + private String id; + + public String getId() { + return id; + } + + public void setId(String id) { + this.id = id; + } + } + public static class SequenceAnnotation { @JsonProperty("https://schema.org/additionalProperty") - private List additionalProperty; + private List additionalProperty; - public List getAdditionalProperty() { + public List getAdditionalProperty() { return additionalProperty; } - public void setAdditionalProperty(List additionalProperty) { + public void setAdditionalProperty(List additionalProperty) { this.additionalProperty = additionalProperty; } } - public static class AdditionalProperty { + public static class PropertyValue { @JsonProperty("https://schema.org/value") - private List propertyValue; + private DefinedTerm definedTerm; - public List getPropertyValue() { - return propertyValue; + public DefinedTerm getDefinedTerm() { + return definedTerm; } - public void setPropertyValue(List propertyValue) { - this.propertyValue = propertyValue; + public void setDefinedTerm(DefinedTerm definedTerm) { + this.definedTerm = definedTerm; } } - public static class PropertyValue { + public static class DefinedTerm { @JsonProperty("@id") private String id; @JsonProperty("https://schema.org/termCode") diff --git a/dhp-workflows/dhp-rdfconverter/src/main/java/eu/dnetlib/dhp/rdfconverter/bioschema/model/DataciteProtein.java b/dhp-workflows/dhp-rdfconverter/src/main/java/eu/dnetlib/dhp/rdfconverter/bioschema/model/DataciteProtein.java index 6fe0963e2..939d04202 100644 --- a/dhp-workflows/dhp-rdfconverter/src/main/java/eu/dnetlib/dhp/rdfconverter/bioschema/model/DataciteProtein.java +++ b/dhp-workflows/dhp-rdfconverter/src/main/java/eu/dnetlib/dhp/rdfconverter/bioschema/model/DataciteProtein.java @@ -21,6 +21,7 @@ public class DataciteProtein { List descriptions = new ArrayList(); List titles = new ArrayList<Title>(); private List<DataciteDate> dates = new ArrayList<DataciteDate>(); + private List<Subject> subjects = new ArrayList<Subject>(); @JsonInclude(JsonInclude.Include.NON_NULL) public static class Types { @@ -189,6 +190,37 @@ public class DataciteProtein { } } + @JsonInclude(JsonInclude.Include.NON_NULL) + public static class Subject { + private String schemeURI; + private String value; + private String subjectScheme; + + public String getSchemeURI() { + return schemeURI; + } + + public void setSchemeURI(String schemeURI) { + this.schemeURI = schemeURI; + } + + public String getValue() { + return value; + } + + public void setValue(String value) { + this.value = value; + } + + public String getSubjectScheme() { + return subjectScheme; + } + + public void setSubjectScheme(String subjectScheme) { + this.subjectScheme = subjectScheme; + } + } + public String getId() { return id; } @@ -288,4 +320,12 @@ public class DataciteProtein { public void setDates(List<DataciteDate> dates) { this.dates = dates; } + + public List<Subject> getSubjects() { + return subjects; + } + + public void setSubjects(List<Subject> subjects) { + this.subjects = subjects; + } } diff --git a/dhp-workflows/dhp-rdfconverter/src/main/java/eu/dnetlib/dhp/rdfconverter/utils/RDFConverter.java b/dhp-workflows/dhp-rdfconverter/src/main/java/eu/dnetlib/dhp/rdfconverter/utils/RDFConverter.java index bc67565e1..ca9421022 100644 --- a/dhp-workflows/dhp-rdfconverter/src/main/java/eu/dnetlib/dhp/rdfconverter/utils/RDFConverter.java +++ b/dhp-workflows/dhp-rdfconverter/src/main/java/eu/dnetlib/dhp/rdfconverter/utils/RDFConverter.java @@ -1,10 +1,12 @@ package eu.dnetlib.dhp.rdfconverter.utils; +import java.io.IOException; import java.io.StringReader; import java.io.StringWriter; import java.util.*; import java.util.stream.Collectors; +import java.util.stream.Stream; import org.eclipse.rdf4j.model.Model; import org.eclipse.rdf4j.rio.RDFFormat; @@ -19,6 +21,7 @@ import com.github.jsonldjava.core.JsonLdOptions; import com.github.jsonldjava.core.JsonLdProcessor; import com.github.jsonldjava.utils.JsonUtils; +import eu.dnetlib.dhp.common.vocabulary.Vocabulary; import eu.dnetlib.dhp.rdfconverter.bioschema.model.BioSchemaProtein; import eu.dnetlib.dhp.rdfconverter.bioschema.model.DataciteProtein; @@ -62,10 +65,18 @@ public class RDFConverter { }).filter(id -> id != null).collect(Collectors.toList()); ArrayList<String> results = new ArrayList<String>(); + final List<DataciteProtein> dataciteProteins = new ArrayList<>(); + final Map<String, List<BioSchemaProtein.Link>> sequenceAnnotations = new HashMap<>(); + final Map<String, String> propertyValues = new HashMap<>(); + final Map<String, BioSchemaProtein.DefinedTerm> definedTerms = new HashMap<>(); + final Map<String, BioSchemaProtein.Entry> proteins = new HashMap<>(); + bioSchemaProtein.getEntryList().stream().forEach(entry -> { if (entry.getType() != null - && entry.getType().stream().filter(type -> type.equals("https://schema.org/Protein")).count() == 1) { + && entry + .getType() + .equals("https://schema.org/Protein")) { DataciteProtein dataciteProtein = new DataciteProtein(); @@ -173,8 +184,8 @@ public class RDFConverter { }); } - if (entry.getSequenceAnnotation() !=null) { - log.debug("Sequence Annotation found " ); + if (entry.getHasSequenceAnnotation() != null) { + log.debug("Sequence Annotation id found: " + entry.getHasSequenceAnnotation().getId()); } String proteinId = ""; @@ -186,17 +197,80 @@ public class RDFConverter { } dataciteProtein.setId(proteinId); - - ObjectMapper mapper = new ObjectMapper(); - try { - StringWriter writer = new StringWriter(); - mapper.writeValue(writer, dataciteProtein); - results.add(writer.toString()); - } catch (Exception e) { - throw new RuntimeException(e); - } + proteins.put(entry.getId(), entry); + dataciteProteins.add(dataciteProtein); + } + if (entry.getType() != null + && entry + .getType() + .equals("https://schema.org/SequenceAnnotation")) { + log.debug("Sequence Annotation found "); + log.debug("sequence id > " + entry.getId()); + entry.getSequenceAnnotation().forEach(l -> { + log.debug(l.getId()); + }); + sequenceAnnotations.put(entry.getId(), entry.getSequenceAnnotation()); + } + if (entry.getType() != null + && entry + .getType() + .equals("https://schema.org/PropertyValue")) { + log.debug("Property found "); + log.debug(entry.getPropertyValue().getId()); + propertyValues.put(entry.getId(), entry.getPropertyValue().getId()); + } + if (entry.getType() != null + && entry + .getType() + .equals("https://schema.org/DefinedTerm")) { + log.debug("Term found "); + log.debug(entry.getTermCode()); + BioSchemaProtein.DefinedTerm term = new BioSchemaProtein.DefinedTerm(); + term.setId(entry.getId()); + term.setTermCode(entry.getTermCode()); + term.setName(entry.getName()); + definedTerms.put(term.getId(), term); } }); + proteins.entrySet().stream().forEach(p -> { + String sequenceAnnotationId = p.getValue().getHasSequenceAnnotation().getId(); + List<BioSchemaProtein.Link> propertyIds = sequenceAnnotations.get(sequenceAnnotationId); + List<String> termIds = propertyIds + .stream() + .map(propertyId -> propertyValues.get(propertyId.getId())) + .collect(Collectors.toList()); + List<BioSchemaProtein.DefinedTerm> terms = termIds + .stream() + .map(term -> definedTerms.get(term)) + .collect(Collectors.toList()); + terms.forEach(t -> { + log.debug("protein id: " + p.getKey() + " >>> term: " + t.getId()); + dataciteProteins.stream().filter(d -> { + return p.getKey().contains(d.getId()); + }).forEach(d -> { + DataciteProtein.Subject subject = new DataciteProtein.Subject(); + subject.setSchemeURI(t.getId()); + subject.setValue(t.getName()); + subject.setSubjectScheme(t.getTermCode()); + d.getSubjects().add(subject); + }); + }); + }); + + ObjectMapper mapper = new ObjectMapper(); +// try { + dataciteProteins.forEach(d -> { + StringWriter writer = new StringWriter(); + try { + mapper.writeValue(writer, d); + } catch (IOException e) { + throw new RuntimeException(e); + } + results.add(writer.toString()); + }); +// } catch (Exception e) { +// throw new RuntimeException(e); +// } return results; } diff --git a/dhp-workflows/dhp-rdfconverter/src/test/java/eu/dnetlib/dhp/rdfconverter/bioschema/ConverterTest.java b/dhp-workflows/dhp-rdfconverter/src/test/java/eu/dnetlib/dhp/rdfconverter/bioschema/ConverterTest.java index a6945ced8..b2fbd954a 100644 --- a/dhp-workflows/dhp-rdfconverter/src/test/java/eu/dnetlib/dhp/rdfconverter/bioschema/ConverterTest.java +++ b/dhp-workflows/dhp-rdfconverter/src/test/java/eu/dnetlib/dhp/rdfconverter/bioschema/ConverterTest.java @@ -5,7 +5,6 @@ import java.io.InputStream; import java.util.ArrayList; import org.apache.commons.io.IOUtils; -import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -16,23 +15,22 @@ public class ConverterTest { static Logger logger = LoggerFactory.getLogger(ConverterTest.class); - @Test - public void nqToDataciteTest() throws Exception { - InputStream is = ConverterTest.class.getResourceAsStream("/eu/dnetlib/dhp/rdfconverter/bioschema/disprot.nq"); - String nq = IOUtils.toString(is); - logger.debug("NQ: " + nq); - RDFConverter converter = new RDFConverter(); - ArrayList<String> results = converter.nQuadsFile2DataciteJson(nq, "Protein"); - if (results != null && !results.isEmpty()) { - logger.info("JSON DATACITE: " + results.get(0)); - } - } +// @Test +// private void nqToDataciteTest() throws Exception { +// InputStream is = ConverterTest.class.getResourceAsStream("/eu/dnetlib/dhp/rdfconverter/bioschema/disprot.nq"); +// String nq = IOUtils.toString(is); +// logger.debug("NQ: " + nq); +// RDFConverter converter = new RDFConverter(); +// ArrayList<String> results = converter.nQuadsFile2DataciteJson(nq, "Protein"); +// if (results != null && !results.isEmpty()) { +// logger.info("JSON DATACITE: " + results.get(0)); +// } +// } @Test public void pedCitationTest() throws Exception { InputStream is = ConverterTest.class.getResourceAsStream("/eu/dnetlib/dhp/rdfconverter/bioschema/ped.nq"); String nq = IOUtils.toString(is); - logger.debug("NQ: " + nq); RDFConverter converter = new RDFConverter(); ArrayList<String> results = converter.nQuadsFile2DataciteJson(nq, "Protein"); if (results != null && !results.isEmpty()) {