package eu.dnetlib.dhp.rdfconverter.utils;

import java.io.IOException;
import java.io.StringReader;
import java.io.StringWriter;
import java.util.*;
import java.util.stream.Collectors;

import org.eclipse.rdf4j.model.Model;
import org.eclipse.rdf4j.rio.RDFFormat;
import org.eclipse.rdf4j.rio.RDFWriter;
import org.eclipse.rdf4j.rio.Rio;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.module.SimpleModule;
import com.github.jsonldjava.core.JsonLdOptions;
import com.github.jsonldjava.core.JsonLdProcessor;
import com.github.jsonldjava.utils.JsonUtils;

import eu.dnetlib.dhp.rdfconverter.bioschema.model.BioSchemaProtein;
import eu.dnetlib.dhp.rdfconverter.bioschema.model.DataciteProtein;

/**
 * Converts an N-Quads document into Datacite JSON records, one JSON string per
 * {@code https://schema.org/Protein} entry found in the compacted JSON-LD graph.
 */
public class RDFConverter {

	private static final Logger log = LoggerFactory.getLogger(RDFConverter.class);

	/** The only profile accepted by {@link #nQuadsFile2DataciteJson(String, String)}. */
	private static final String PROTEIN_PROFILE = "Protein";

	private static final String TYPE_PROTEIN = "https://schema.org/Protein";
	private static final String TYPE_SEQUENCE_ANNOTATION = "https://schema.org/SequenceAnnotation";
	private static final String TYPE_PROPERTY_VALUE = "https://schema.org/PropertyValue";
	private static final String TYPE_DEFINED_TERM = "https://schema.org/DefinedTerm";
	private static final String TYPE_COLLECTION_PAGE = "https://schema.org/CollectionPage";

	/** Passed to {@link #addRelatedIdentifier} when no relation type has to be set. */
	private static final String NO_RELATION_TYPE = "";

	/** Lenient mapper used to read the compacted JSON-LD; configured once and only read afterwards. */
	private static final ObjectMapper JSONLD_READER = newLenientMapper();

	/** Default mapper used to serialize the Datacite records. */
	private static final ObjectMapper DATACITE_WRITER = new ObjectMapper();

	/** Pairs a Datacite record with the ids of the sequence annotations its source entry refers to. */
	private static final class ProteinRecord {

		private final DataciteProtein datacite;
		private final Set<String> annotationIds;

		private ProteinRecord(DataciteProtein datacite, Set<String> annotationIds) {
			this.datacite = datacite;
			this.annotationIds = annotationIds;
		}
	}

	/**
	 * Converts an N-Quads document to Datacite JSON records using the given profile.
	 *
	 * @param nquads  the N-Quads document
	 * @param profile the conversion profile; only {@code "Protein"} is supported
	 * @return one JSON string per protein entry (empty when the graph has no entries)
	 * @throws RuntimeException if the profile is not {@code "Protein"} (including {@code null})
	 * @throws Exception        propagated from RDF parsing, JSON-LD compaction or JSON deserialization
	 */
	public ArrayList<String> nQuadsFile2DataciteJson(String nquads, String profile) throws Exception {
		// Constant-first equals: a null profile is reported as unsupported instead of raising an NPE.
		if (PROTEIN_PROFILE.equals(profile)) {
			return nQuadsFile2DataciteJson(nquads);
		}
		throw new RuntimeException("Profile not supported");
	}

	/**
	 * Converts an N-Quads document to Datacite JSON records following the protein mapping.
	 *
	 * @param nquads the N-Quads document
	 * @return one JSON string per {@code https://schema.org/Protein} entry
	 * @throws Exception propagated from RDF parsing, JSON-LD compaction or JSON deserialization
	 */
	private ArrayList<String> nQuadsFile2DataciteJson(String nquads) throws Exception {
		final String compactContent = toCompactJsonLd(nquads);
		log.debug("jsonld: {}", compactContent);

		final BioSchemaProtein bioSchemaProtein = JSONLD_READER.readValue(compactContent, BioSchemaProtein.class);
		if (bioSchemaProtein == null || bioSchemaProtein.getEntryList() == null) {
			log.warn("No entries found in the JSON-LD graph, nothing to convert");
			return new ArrayList<>();
		}
		final BioSchemaProtein.DateTimeType retrievedOnType = bioSchemaProtein.getRetrievedOn();

		// Ids of every citation found in the graph; each one is attached to every protein.
		final List<String> citations = bioSchemaProtein
			.getEntryList()
			.stream()
			.filter(Objects::nonNull)
			.map(entry -> entry.getCitation())
			.filter(Objects::nonNull)
			.map(citation -> citation.getId())
			.filter(Objects::nonNull)
			.collect(Collectors.toList());

		final List<ProteinRecord> proteins = new ArrayList<>();
		// SequenceAnnotation entry id -> ids of the links that entry carries
		final Map<String, List<String>> propertyIdsByAnnotation = new HashMap<>();
		// PropertyValue entry id -> id of the value it points to
		final Map<String, String> propertyValues = new HashMap<>();
		// DefinedTerm entry id -> term
		final Map<String, BioSchemaProtein.DefinedTerm> definedTerms = new HashMap<>();
		// Candidate main titles in encounter order; only the first one is used.
		final List<String> mainTitles = new ArrayList<>();

		bioSchemaProtein.getEntryList().stream().filter(Objects::nonNull).forEach(entry -> {
			if (TYPE_PROTEIN.equals(entry.getType())) {
				final DataciteProtein dataciteProtein = new DataciteProtein();
				final String entryId = entry.getId();

				citations.forEach(citation -> addRelatedIdentifier(dataciteProtein, citation, "IsCitedBy"));

				DataciteProtein.Types types = new DataciteProtein.Types();
				types.setResourceType("Protein");
				types.setResourceTypeGeneral("Dataset");
				dataciteProtein.setTypes(types);

				// The retrieval date is optional: without it no "Collected" date is emitted.
				if (retrievedOnType != null) {
					DataciteProtein.DataciteDate dataciteDate = new DataciteProtein.DataciteDate();
					dataciteDate.setDate(retrievedOnType.getValue());
					dataciteDate.setDateType("Collected");
					dataciteProtein.getDates().add(dataciteDate);
				}

				DataciteProtein.Identifier identifier = new DataciteProtein.Identifier();
				identifier.setIdentifier(entryId);
				identifier.setIdentifierType("URL");
				dataciteProtein.getIdentifiers().add(identifier);

				if (entry.getIdentifier() != null) {
					addAlternateIdentifier(dataciteProtein, entry.getIdentifier());
				}
				if (entry.getDescription() != null) {
					DataciteProtein.Description description = new DataciteProtein.Description();
					description.setDescription(entry.getDescription());
					dataciteProtein.getDescriptions().add(description);
				}
				// Added exactly once (it used to be emitted twice for the same entry).
				if (entry.getIsEncodedByBioChemEntity() != null) {
					addRelatedIdentifier(dataciteProtein, entry.getIsEncodedByBioChemEntity(), NO_RELATION_TYPE);
				}
				if (entry.getUrl() != null) {
					addAlternateIdentifier(dataciteProtein, entry.getUrl());
				}
				if (entry.getAlternateName() != null) {
					DataciteProtein.Title alternativeTitle = new DataciteProtein.Title();
					alternativeTitle.setTitle(entry.getAlternateName());
					alternativeTitle.setTitleType("AlternativeTitle");
					dataciteProtein.getTitles().add(alternativeTitle);
				}
				if (entry.getBioChemInteraction() != null) {
					entry
						.getBioChemInteraction()
						.stream()
						.filter(Objects::nonNull)
						.forEach(bc -> addRelatedIdentifier(dataciteProtein, bc.getId(), NO_RELATION_TYPE));
				}
				if (entry.getBioChemSimilarity() != null) {
					entry
						.getBioChemSimilarity()
						.stream()
						.filter(Objects::nonNull)
						.forEach(bc -> addRelatedIdentifier(dataciteProtein, bc.getId(), NO_RELATION_TYPE));
				}
				if (entry.getHasMolecularFunction() != null) {
					addRelatedIdentifier(dataciteProtein, entry.getHasMolecularFunction(), NO_RELATION_TYPE);
				}
				if (entry.getIsInvolvedInBiologicalProcess() != null) {
					addRelatedIdentifier(dataciteProtein, entry.getIsInvolvedInBiologicalProcess(), NO_RELATION_TYPE);
				}
				if (entry.getIsPartOfBioChemEntity() != null) {
					addRelatedIdentifier(dataciteProtein, entry.getIsPartOfBioChemEntity().getUrl(), NO_RELATION_TYPE);
				}
				if (entry.getSameAs() != null) {
					entry
						.getSameAs()
						.stream()
						.filter(Objects::nonNull)
						.forEach(sameAs -> addRelatedIdentifier(dataciteProtein, sameAs.getId(), "IsIdenticalTo"));
				}
				// NOTE(review): the disease *name* is stored as an "IsIdenticalTo" related identifier;
				// kept as in the previous implementation, confirm this mapping is intended.
				if (entry.getAssociatedDisease() != null) {
					entry
						.getAssociatedDisease()
						.stream()
						.filter(Objects::nonNull)
						.forEach(ad -> addRelatedIdentifier(dataciteProtein, ad.getName(), "IsIdenticalTo"));
				}

				// The record id is the last path segment of the entry id; empty when it cannot be derived.
				String proteinId = lastPathSegment(entryId);
				if (proteinId == null) {
					log.error("Identifier not found for entry {}", entryId);
					proteinId = "";
				}
				dataciteProtein.setId(proteinId);

				mainTitles.add(" " + entry.getName() + " ");

				Set<String> annotationIds = Collections.emptySet();
				if (entry.getHasSequenceAnnotation() != null) {
					annotationIds = entry
						.getHasSequenceAnnotation()
						.stream()
						.filter(Objects::nonNull)
						.map(link -> link.getId())
						.collect(Collectors.toSet());
				}
				proteins.add(new ProteinRecord(dataciteProtein, annotationIds));
			} else if (TYPE_SEQUENCE_ANNOTATION.equals(entry.getType())) {
				List<String> propertyIds = Collections.emptyList();
				if (entry.getSequenceAnnotation() != null) {
					propertyIds = entry
						.getSequenceAnnotation()
						.stream()
						.filter(Objects::nonNull)
						.map(link -> link.getId())
						.collect(Collectors.toList());
				}
				propertyIdsByAnnotation.put(entry.getId(), propertyIds);
			} else if (TYPE_PROPERTY_VALUE.equals(entry.getType())) {
				if (entry.getPropertyValue() != null) {
					propertyValues.put(entry.getId(), entry.getPropertyValue().getId());
				}
			} else if (TYPE_DEFINED_TERM.equals(entry.getType())) {
				BioSchemaProtein.DefinedTerm term = new BioSchemaProtein.DefinedTerm();
				term.setId(entry.getId());
				term.setTermCode(entry.getTermCode());
				term.setName(entry.getName());
				definedTerms.put(term.getId(), term);
			} else if (TYPE_COLLECTION_PAGE.equals(entry.getType())) {
				// A page without a name cannot be used as a title.
				if (entry.getName() != null) {
					mainTitles.add(entry.getName());
				}
			}
		});

		final ArrayList<String> results = new ArrayList<>(proteins.size());
		for (ProteinRecord proteinRecord : proteins) {
			final DataciteProtein datacite = proteinRecord.datacite;
			addSubjects(datacite, proteinRecord.annotationIds, propertyIdsByAnnotation, propertyValues, definedTerms);

			// Every protein contributes a title candidate, so mainTitles is never empty here.
			// NOTE(review): all records share the first candidate (first Protein or CollectionPage
			// name in graph order) - confirm this is intended when a graph holds several proteins.
			DataciteProtein.Title mainTitle = new DataciteProtein.Title();
			mainTitle.setTitle(datacite.getId() + " - " + mainTitles.get(0));
			datacite.getTitles().add(mainTitle);

			results.add(toJson(datacite));
		}
		return results;
	}

	/**
	 * Parses the N-Quads document and returns its compacted JSON-LD serialization
	 * (compaction is performed against an empty context).
	 */
	private static String toCompactJsonLd(String nquads) throws Exception {
		Objects.requireNonNull(nquads, "nquads");
		final Model model;
		try (StringReader reader = new StringReader(nquads)) {
			model = Rio.parse(reader, "", RDFFormat.NQUADS);
		}
		StringWriter jsonLdWriter = new StringWriter();
		RDFWriter rdfRecordWriter = Rio.createWriter(RDFFormat.JSONLD, jsonLdWriter);
		Rio.write(model, rdfRecordWriter);

		Object jsonObject = JsonUtils.fromString(jsonLdWriter.toString());
		Object compact = JsonLdProcessor.compact(jsonObject, new HashMap<>(), new JsonLdOptions());
		return JsonUtils.toString(compact);
	}

	/** Builds the mapper used to read the JSON-LD: single values as arrays, unknown properties ignored. */
	private static ObjectMapper newLenientMapper() {
		ObjectMapper objectMapper = new ObjectMapper();
		objectMapper.enable(DeserializationFeature.ACCEPT_SINGLE_VALUE_AS_ARRAY);
		objectMapper.enable(DeserializationFeature.ACCEPT_EMPTY_STRING_AS_NULL_OBJECT);
		objectMapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
		objectMapper.configure(DeserializationFeature.FAIL_ON_INVALID_SUBTYPE, false);
		objectMapper.configure(DeserializationFeature.FAIL_ON_MISSING_CREATOR_PROPERTIES, false);
		return objectMapper;
	}

	/** Returns the text after the last {@code /} of the value, or {@code null} when there is none to return. */
	private static String lastPathSegment(String value) {
		if (value == null) {
			return null;
		}
		String[] parts = value.split("/");
		return parts.length == 0 ? null : parts[parts.length - 1];
	}

	/**
	 * Adds one subject to the record for every defined term reachable from its sequence annotations
	 * (annotation -> property value -> defined term); links that do not resolve are skipped.
	 */
	private static void addSubjects(DataciteProtein dataciteProtein, Set<String> annotationIds,
		Map<String, List<String>> propertyIdsByAnnotation, Map<String, String> propertyValues,
		Map<String, BioSchemaProtein.DefinedTerm> definedTerms) {
		propertyIdsByAnnotation
			.entrySet()
			.stream()
			.filter(annotation -> annotationIds.contains(annotation.getKey()))
			.flatMap(annotation -> annotation.getValue().stream())
			.map(propertyValues::get)
			.filter(Objects::nonNull)
			.map(definedTerms::get)
			.filter(Objects::nonNull)
			.forEach(term -> {
				DataciteProtein.Subject subject = new DataciteProtein.Subject();
				subject.setSchemeURI(term.getId());
				subject.setValue(term.getName());
				subject.setSubjectScheme(term.getTermCode());
				dataciteProtein.getSubjects().add(subject);
			});
	}

	/** Serializes a record to JSON, rethrowing I/O failures as {@link RuntimeException}. */
	private static String toJson(DataciteProtein dataciteProtein) {
		try {
			return DATACITE_WRITER.writeValueAsString(dataciteProtein);
		} catch (IOException e) {
			throw new RuntimeException(e);
		}
	}

	/**
	 * Appends a related identifier to the record.
	 *
	 * @param dataciteProtein        the record to update
	 * @param relatedIdentifierValue the identifier value; {@code null} values are skipped
	 * @param relationType           the relation type; left unset when {@code null} or empty
	 */
	private static void addRelatedIdentifier(DataciteProtein dataciteProtein, String relatedIdentifierValue,
		String relationType) {
		if (relatedIdentifierValue == null) {
			// Nothing to relate to; skipping keeps one incomplete link from aborting the conversion.
			return;
		}
		DataciteProtein.RelatedIdentifier relatedIdentifier = new DataciteProtein.RelatedIdentifier();
		relatedIdentifier.setRelatedIdentifier(relatedIdentifierValue);
		if (relationType != null && !relationType.isEmpty()) {
			relatedIdentifier.setRelationType(relationType);
		}
		if (relatedIdentifierValue.contains("http://") || relatedIdentifierValue.contains("https://")) {
			relatedIdentifier.setRelatedIdentifierType("URL");
		}
		dataciteProtein.getRelatedIdentifiers().add(relatedIdentifier);
	}

	/**
	 * Appends an alternate identifier to the record.
	 *
	 * @param dataciteProtein          the record to update
	 * @param alternateIdentifierValue the identifier value
	 */
	private static void addAlternateIdentifier(DataciteProtein dataciteProtein, String alternateIdentifierValue) {
		DataciteProtein.AlternateIdentifier alternateIdentifier = new DataciteProtein.AlternateIdentifier();
		alternateIdentifier.setAlternateIdentifier(alternateIdentifierValue);
		dataciteProtein.getAlternateIdentifiers().add(alternateIdentifier);
	}
}