diff --git a/dhp-workflows/dhp-rdfconverter/src/main/java/eu/dnetlib/dhp/rdfconverter/utils/RDFConverter.java b/dhp-workflows/dhp-rdfconverter/src/main/java/eu/dnetlib/dhp/rdfconverter/utils/RDFConverter.java index ca9421022..9172ff087 100644 --- a/dhp-workflows/dhp-rdfconverter/src/main/java/eu/dnetlib/dhp/rdfconverter/utils/RDFConverter.java +++ b/dhp-workflows/dhp-rdfconverter/src/main/java/eu/dnetlib/dhp/rdfconverter/utils/RDFConverter.java @@ -6,7 +6,6 @@ import java.io.StringReader; import java.io.StringWriter; import java.util.*; import java.util.stream.Collectors; -import java.util.stream.Stream; import org.eclipse.rdf4j.model.Model; import org.eclipse.rdf4j.rio.RDFFormat; @@ -21,7 +20,6 @@ import com.github.jsonldjava.core.JsonLdOptions; import com.github.jsonldjava.core.JsonLdProcessor; import com.github.jsonldjava.utils.JsonUtils; -import eu.dnetlib.dhp.common.vocabulary.Vocabulary; import eu.dnetlib.dhp.rdfconverter.bioschema.model.BioSchemaProtein; import eu.dnetlib.dhp.rdfconverter.bioschema.model.DataciteProtein; @@ -53,7 +51,6 @@ public class RDFConverter { objectMapper.enable(DeserializationFeature.ACCEPT_EMPTY_STRING_AS_NULL_OBJECT); objectMapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); BioSchemaProtein bioSchemaProtein = objectMapper.readValue(compactContent, BioSchemaProtein.class); - log.debug("BioSchema id: " + bioSchemaProtein.getId()); BioSchemaProtein.DateTimeType retrievedOnType = bioSchemaProtein.getRetrievedOn(); List citations = bioSchemaProtein.getEntryList().stream().map(entry -> { @@ -69,7 +66,8 @@ public class RDFConverter { final Map> sequenceAnnotations = new HashMap<>(); final Map propertyValues = new HashMap<>(); final Map definedTerms = new HashMap<>(); - final Map proteins = new HashMap<>(); + final Map bioschemaProteins = new HashMap<>(); + final List mainTitles = new ArrayList<>(); bioSchemaProtein.getEntryList().stream().forEach(entry -> { @@ -94,42 +92,30 @@ public class RDFConverter { dataciteDate.setDateType("Collected"); dataciteProtein.getDates().add(dataciteDate); - if (entry.getName() != null) { - log.debug("Name: " + entry.getName()); - DataciteProtein.Title title = new DataciteProtein.Title(); - title.setTitle(entry.getName()); - dataciteProtein.getTitles().add(title); - } DataciteProtein.Identifier identifier = new DataciteProtein.Identifier(); - log.debug("Id: " + entry.getId()); identifier.setIdentifier(entry.getId()); identifier.setIdentifierType("URL"); dataciteProtein.getIdentifiers().add(identifier); if (entry.getIdentifier() != null) { - log.debug("Identifier: " + entry.getIdentifier()); addAlternateIdentifier(dataciteProtein, entry.getIdentifier()); } if (entry.getDescription() != null) { - log.debug("description: " + entry.getDescription()); DataciteProtein.Description description = new DataciteProtein.Description(); description.setDescription(entry.getDescription()); dataciteProtein.getDescriptions().add(description); } if (entry.getIsEncodedByBioChemEntity() != null) { - log.debug("isEncodedByBioChemEntity: " + entry.getIsEncodedByBioChemEntity()); addRelatedIdentifier(dataciteProtein, entry.getIsEncodedByBioChemEntity(), ""); } if (entry.getUrl() != null) { - log.debug("url: " + entry.getUrl()); addAlternateIdentifier(dataciteProtein, entry.getUrl()); } if (entry.getAlternateName() != null) { - log.debug("alternateName: " + entry.getAlternateName()); DataciteProtein.Title title = new DataciteProtein.Title(); title.setTitle(entry.getAlternateName()); title.setTitleType("AlternativeTitle"); @@ -138,56 +124,44 @@ public class RDFConverter { if (entry.getBioChemInteraction() != null) { entry.getBioChemInteraction().stream().filter(Objects::nonNull).forEach(bc -> { - log.debug("bioChemInteraction: " + bc.getId()); addRelatedIdentifier(dataciteProtein, bc.getId(), ""); }); } if (entry.getBioChemSimilarity() != null) { entry.getBioChemSimilarity().stream().filter(Objects::nonNull).forEach(bc -> { - log.debug("bioChemSimilarity: " + bc.getId()); addRelatedIdentifier(dataciteProtein, bc.getId(), ""); }); } if (entry.getHasMolecularFunction() != null) { - log.debug("hasMolecularFunction: " + entry.getHasMolecularFunction()); addRelatedIdentifier(dataciteProtein, entry.getHasMolecularFunction(), ""); } if (entry.getIsInvolvedInBiologicalProcess() != null) { - log.debug("isInvolvedInBiologicalProcess: " + entry.getIsInvolvedInBiologicalProcess()); addRelatedIdentifier(dataciteProtein, entry.getIsInvolvedInBiologicalProcess(), ""); } if (entry.getIsEncodedByBioChemEntity() != null) { - log.debug("isEncodedByBioChemEntity: " + entry.getIsEncodedByBioChemEntity()); addRelatedIdentifier(dataciteProtein, entry.getIsEncodedByBioChemEntity(), ""); } if (entry.getIsPartOfBioChemEntity() != null) { - log.debug("isPartOfBioChemEntity: " + entry.getIsPartOfBioChemEntity()); addRelatedIdentifier(dataciteProtein, entry.getIsPartOfBioChemEntity().getUrl(), ""); } if (entry.getSameAs() != null) { entry.getSameAs().stream().filter(Objects::nonNull).forEach(sameAs -> { - log.debug("sameAs value: " + sameAs.getId()); addRelatedIdentifier(dataciteProtein, sameAs.getId(), "IsIdenticalTo"); }); } if (entry.getAssociatedDisease() != null) { entry.getAssociatedDisease().stream().filter(Objects::nonNull).forEach(ad -> { - log.debug("associated disease: " + ad.getName()); addRelatedIdentifier(dataciteProtein, ad.getName(), "IsIdenticalTo"); }); } - if (entry.getHasSequenceAnnotation() != null) { - log.debug("Sequence Annotation id found: " + entry.getHasSequenceAnnotation().getId()); - } - String proteinId = ""; try { String[] identifierParts = dataciteProtein.getIdentifiers().get(0).getIdentifier().split("/"); @@ -197,68 +171,77 @@ public class RDFConverter { } dataciteProtein.setId(proteinId); - proteins.put(entry.getId(), entry); + bioschemaProteins.put(entry.getId(), entry); dataciteProteins.add(dataciteProtein); } if (entry.getType() != null && entry .getType() .equals("https://schema.org/SequenceAnnotation")) { - log.debug("Sequence Annotation found "); - log.debug("sequence id > " + entry.getId()); - entry.getSequenceAnnotation().forEach(l -> { - log.debug(l.getId()); - }); sequenceAnnotations.put(entry.getId(), entry.getSequenceAnnotation()); } if (entry.getType() != null && entry .getType() .equals("https://schema.org/PropertyValue")) { - log.debug("Property found "); - log.debug(entry.getPropertyValue().getId()); propertyValues.put(entry.getId(), entry.getPropertyValue().getId()); } if (entry.getType() != null && entry .getType() .equals("https://schema.org/DefinedTerm")) { - log.debug("Term found "); - log.debug(entry.getTermCode()); BioSchemaProtein.DefinedTerm term = new BioSchemaProtein.DefinedTerm(); term.setId(entry.getId()); term.setTermCode(entry.getTermCode()); term.setName(entry.getName()); definedTerms.put(term.getId(), term); } + if (entry.getType() != null + && entry + .getType() + .equals("https://schema.org/CollectionPage")) { + mainTitles.add(entry.getName()); + } }); - proteins.entrySet().stream().forEach(p -> { - String sequenceAnnotationId = p.getValue().getHasSequenceAnnotation().getId(); - List propertyIds = sequenceAnnotations.get(sequenceAnnotationId); - List termIds = propertyIds - .stream() - .map(propertyId -> propertyValues.get(propertyId.getId())) - .collect(Collectors.toList()); - List terms = termIds - .stream() - .map(term -> definedTerms.get(term)) - .collect(Collectors.toList()); - terms.forEach(t -> { - log.debug("protein id: " + p.getKey() + " >>> term: " + t.getId()); - dataciteProteins.stream().filter(d -> { - return p.getKey().contains(d.getId()); - }).forEach(d -> { - DataciteProtein.Subject subject = new DataciteProtein.Subject(); - subject.setSchemeURI(t.getId()); - subject.setValue(t.getName()); - subject.setSubjectScheme(t.getTermCode()); - d.getSubjects().add(subject); - }); + bioschemaProteins + .entrySet() + .stream() + .filter(bioProtein -> Objects.nonNull(bioProtein.getValue().getHasSequenceAnnotation())) + .forEach(bioProtein -> { + sequenceAnnotations + .get(bioProtein.getValue().getHasSequenceAnnotation().getId()) + .stream() + .map(propertyId -> propertyValues.get(propertyId.getId())) + .filter(term -> Objects.nonNull(term)) + .map(term -> definedTerms.get(term)) + .filter(term -> Objects.nonNull(term)) + .forEach(term -> { + dataciteProteins + .stream() + .filter( + dcProtein -> dcProtein + .getIdentifiers() + .get(0) + .getIdentifier() + .equals(bioProtein.getKey())) + .filter(dcProtein -> Objects.nonNull(dcProtein)) + .forEach(d -> { + DataciteProtein.Subject subject = new DataciteProtein.Subject(); + subject.setSchemeURI(term.getId()); + subject.setValue(term.getName()); + subject.setSubjectScheme(term.getTermCode()); + d.getSubjects().add(subject); + }); + }); }); + + dataciteProteins.forEach(d -> { + DataciteProtein.Title title = new DataciteProtein.Title(); + title.setTitle(d.getId() + " - ".concat(mainTitles.get(0))); + d.getTitles().add(title); }); ObjectMapper mapper = new ObjectMapper(); -// try { dataciteProteins.forEach(d -> { StringWriter writer = new StringWriter(); try { @@ -268,9 +251,6 @@ public class RDFConverter { } results.add(writer.toString()); }); -// } catch (Exception e) { -// throw new RuntimeException(e); -// } return results; } diff --git a/dhp-workflows/dhp-rdfconverter/src/test/java/eu/dnetlib/dhp/rdfconverter/bioschema/ConverterTest.java b/dhp-workflows/dhp-rdfconverter/src/test/java/eu/dnetlib/dhp/rdfconverter/bioschema/ConverterTest.java index b2fbd954a..a203dcd40 100644 --- a/dhp-workflows/dhp-rdfconverter/src/test/java/eu/dnetlib/dhp/rdfconverter/bioschema/ConverterTest.java +++ b/dhp-workflows/dhp-rdfconverter/src/test/java/eu/dnetlib/dhp/rdfconverter/bioschema/ConverterTest.java @@ -33,8 +33,8 @@ public class ConverterTest { String nq = IOUtils.toString(is); RDFConverter converter = new RDFConverter(); ArrayList results = converter.nQuadsFile2DataciteJson(nq, "Protein"); - if (results != null && !results.isEmpty()) { - logger.info("JSON DATACITE: " + results.get(0)); - } + results.stream().forEach(r -> { + logger.info("JSON DATACITE >> " + r); + }); } }