Added titles that merge the protein identifier with the collection page title

This commit is contained in:
Enrico Ottonello 2022-03-18 14:51:57 +01:00
parent f43bfdb594
commit 861f2a3306
2 changed files with 47 additions and 67 deletions

View File

@ -6,7 +6,6 @@ import java.io.StringReader;
import java.io.StringWriter; import java.io.StringWriter;
import java.util.*; import java.util.*;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.eclipse.rdf4j.model.Model; import org.eclipse.rdf4j.model.Model;
import org.eclipse.rdf4j.rio.RDFFormat; import org.eclipse.rdf4j.rio.RDFFormat;
@ -21,7 +20,6 @@ import com.github.jsonldjava.core.JsonLdOptions;
import com.github.jsonldjava.core.JsonLdProcessor; import com.github.jsonldjava.core.JsonLdProcessor;
import com.github.jsonldjava.utils.JsonUtils; import com.github.jsonldjava.utils.JsonUtils;
import eu.dnetlib.dhp.common.vocabulary.Vocabulary;
import eu.dnetlib.dhp.rdfconverter.bioschema.model.BioSchemaProtein; import eu.dnetlib.dhp.rdfconverter.bioschema.model.BioSchemaProtein;
import eu.dnetlib.dhp.rdfconverter.bioschema.model.DataciteProtein; import eu.dnetlib.dhp.rdfconverter.bioschema.model.DataciteProtein;
@ -53,7 +51,6 @@ public class RDFConverter {
objectMapper.enable(DeserializationFeature.ACCEPT_EMPTY_STRING_AS_NULL_OBJECT); objectMapper.enable(DeserializationFeature.ACCEPT_EMPTY_STRING_AS_NULL_OBJECT);
objectMapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); objectMapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
BioSchemaProtein bioSchemaProtein = objectMapper.readValue(compactContent, BioSchemaProtein.class); BioSchemaProtein bioSchemaProtein = objectMapper.readValue(compactContent, BioSchemaProtein.class);
log.debug("BioSchema id: " + bioSchemaProtein.getId());
BioSchemaProtein.DateTimeType retrievedOnType = bioSchemaProtein.getRetrievedOn(); BioSchemaProtein.DateTimeType retrievedOnType = bioSchemaProtein.getRetrievedOn();
List<String> citations = bioSchemaProtein.getEntryList().stream().map(entry -> { List<String> citations = bioSchemaProtein.getEntryList().stream().map(entry -> {
@ -69,7 +66,8 @@ public class RDFConverter {
final Map<String, List<BioSchemaProtein.Link>> sequenceAnnotations = new HashMap<>(); final Map<String, List<BioSchemaProtein.Link>> sequenceAnnotations = new HashMap<>();
final Map<String, String> propertyValues = new HashMap<>(); final Map<String, String> propertyValues = new HashMap<>();
final Map<String, BioSchemaProtein.DefinedTerm> definedTerms = new HashMap<>(); final Map<String, BioSchemaProtein.DefinedTerm> definedTerms = new HashMap<>();
final Map<String, BioSchemaProtein.Entry> proteins = new HashMap<>(); final Map<String, BioSchemaProtein.Entry> bioschemaProteins = new HashMap<>();
final List<String> mainTitles = new ArrayList<>();
bioSchemaProtein.getEntryList().stream().forEach(entry -> { bioSchemaProtein.getEntryList().stream().forEach(entry -> {
@ -94,42 +92,30 @@ public class RDFConverter {
dataciteDate.setDateType("Collected"); dataciteDate.setDateType("Collected");
dataciteProtein.getDates().add(dataciteDate); dataciteProtein.getDates().add(dataciteDate);
if (entry.getName() != null) {
log.debug("Name: " + entry.getName());
DataciteProtein.Title title = new DataciteProtein.Title();
title.setTitle(entry.getName());
dataciteProtein.getTitles().add(title);
}
DataciteProtein.Identifier identifier = new DataciteProtein.Identifier(); DataciteProtein.Identifier identifier = new DataciteProtein.Identifier();
log.debug("Id: " + entry.getId());
identifier.setIdentifier(entry.getId()); identifier.setIdentifier(entry.getId());
identifier.setIdentifierType("URL"); identifier.setIdentifierType("URL");
dataciteProtein.getIdentifiers().add(identifier); dataciteProtein.getIdentifiers().add(identifier);
if (entry.getIdentifier() != null) { if (entry.getIdentifier() != null) {
log.debug("Identifier: " + entry.getIdentifier());
addAlternateIdentifier(dataciteProtein, entry.getIdentifier()); addAlternateIdentifier(dataciteProtein, entry.getIdentifier());
} }
if (entry.getDescription() != null) { if (entry.getDescription() != null) {
log.debug("description: " + entry.getDescription());
DataciteProtein.Description description = new DataciteProtein.Description(); DataciteProtein.Description description = new DataciteProtein.Description();
description.setDescription(entry.getDescription()); description.setDescription(entry.getDescription());
dataciteProtein.getDescriptions().add(description); dataciteProtein.getDescriptions().add(description);
} }
if (entry.getIsEncodedByBioChemEntity() != null) { if (entry.getIsEncodedByBioChemEntity() != null) {
log.debug("isEncodedByBioChemEntity: " + entry.getIsEncodedByBioChemEntity());
addRelatedIdentifier(dataciteProtein, entry.getIsEncodedByBioChemEntity(), ""); addRelatedIdentifier(dataciteProtein, entry.getIsEncodedByBioChemEntity(), "");
} }
if (entry.getUrl() != null) { if (entry.getUrl() != null) {
log.debug("url: " + entry.getUrl());
addAlternateIdentifier(dataciteProtein, entry.getUrl()); addAlternateIdentifier(dataciteProtein, entry.getUrl());
} }
if (entry.getAlternateName() != null) { if (entry.getAlternateName() != null) {
log.debug("alternateName: " + entry.getAlternateName());
DataciteProtein.Title title = new DataciteProtein.Title(); DataciteProtein.Title title = new DataciteProtein.Title();
title.setTitle(entry.getAlternateName()); title.setTitle(entry.getAlternateName());
title.setTitleType("AlternativeTitle"); title.setTitleType("AlternativeTitle");
@ -138,56 +124,44 @@ public class RDFConverter {
if (entry.getBioChemInteraction() != null) { if (entry.getBioChemInteraction() != null) {
entry.getBioChemInteraction().stream().filter(Objects::nonNull).forEach(bc -> { entry.getBioChemInteraction().stream().filter(Objects::nonNull).forEach(bc -> {
log.debug("bioChemInteraction: " + bc.getId());
addRelatedIdentifier(dataciteProtein, bc.getId(), ""); addRelatedIdentifier(dataciteProtein, bc.getId(), "");
}); });
} }
if (entry.getBioChemSimilarity() != null) { if (entry.getBioChemSimilarity() != null) {
entry.getBioChemSimilarity().stream().filter(Objects::nonNull).forEach(bc -> { entry.getBioChemSimilarity().stream().filter(Objects::nonNull).forEach(bc -> {
log.debug("bioChemSimilarity: " + bc.getId());
addRelatedIdentifier(dataciteProtein, bc.getId(), ""); addRelatedIdentifier(dataciteProtein, bc.getId(), "");
}); });
} }
if (entry.getHasMolecularFunction() != null) { if (entry.getHasMolecularFunction() != null) {
log.debug("hasMolecularFunction: " + entry.getHasMolecularFunction());
addRelatedIdentifier(dataciteProtein, entry.getHasMolecularFunction(), ""); addRelatedIdentifier(dataciteProtein, entry.getHasMolecularFunction(), "");
} }
if (entry.getIsInvolvedInBiologicalProcess() != null) { if (entry.getIsInvolvedInBiologicalProcess() != null) {
log.debug("isInvolvedInBiologicalProcess: " + entry.getIsInvolvedInBiologicalProcess());
addRelatedIdentifier(dataciteProtein, entry.getIsInvolvedInBiologicalProcess(), ""); addRelatedIdentifier(dataciteProtein, entry.getIsInvolvedInBiologicalProcess(), "");
} }
if (entry.getIsEncodedByBioChemEntity() != null) { if (entry.getIsEncodedByBioChemEntity() != null) {
log.debug("isEncodedByBioChemEntity: " + entry.getIsEncodedByBioChemEntity());
addRelatedIdentifier(dataciteProtein, entry.getIsEncodedByBioChemEntity(), ""); addRelatedIdentifier(dataciteProtein, entry.getIsEncodedByBioChemEntity(), "");
} }
if (entry.getIsPartOfBioChemEntity() != null) { if (entry.getIsPartOfBioChemEntity() != null) {
log.debug("isPartOfBioChemEntity: " + entry.getIsPartOfBioChemEntity());
addRelatedIdentifier(dataciteProtein, entry.getIsPartOfBioChemEntity().getUrl(), ""); addRelatedIdentifier(dataciteProtein, entry.getIsPartOfBioChemEntity().getUrl(), "");
} }
if (entry.getSameAs() != null) { if (entry.getSameAs() != null) {
entry.getSameAs().stream().filter(Objects::nonNull).forEach(sameAs -> { entry.getSameAs().stream().filter(Objects::nonNull).forEach(sameAs -> {
log.debug("sameAs value: " + sameAs.getId());
addRelatedIdentifier(dataciteProtein, sameAs.getId(), "IsIdenticalTo"); addRelatedIdentifier(dataciteProtein, sameAs.getId(), "IsIdenticalTo");
}); });
} }
if (entry.getAssociatedDisease() != null) { if (entry.getAssociatedDisease() != null) {
entry.getAssociatedDisease().stream().filter(Objects::nonNull).forEach(ad -> { entry.getAssociatedDisease().stream().filter(Objects::nonNull).forEach(ad -> {
log.debug("associated disease: " + ad.getName());
addRelatedIdentifier(dataciteProtein, ad.getName(), "IsIdenticalTo"); addRelatedIdentifier(dataciteProtein, ad.getName(), "IsIdenticalTo");
}); });
} }
if (entry.getHasSequenceAnnotation() != null) {
log.debug("Sequence Annotation id found: " + entry.getHasSequenceAnnotation().getId());
}
String proteinId = ""; String proteinId = "";
try { try {
String[] identifierParts = dataciteProtein.getIdentifiers().get(0).getIdentifier().split("/"); String[] identifierParts = dataciteProtein.getIdentifiers().get(0).getIdentifier().split("/");
@ -197,68 +171,77 @@ public class RDFConverter {
} }
dataciteProtein.setId(proteinId); dataciteProtein.setId(proteinId);
proteins.put(entry.getId(), entry); bioschemaProteins.put(entry.getId(), entry);
dataciteProteins.add(dataciteProtein); dataciteProteins.add(dataciteProtein);
} }
if (entry.getType() != null if (entry.getType() != null
&& entry && entry
.getType() .getType()
.equals("https://schema.org/SequenceAnnotation")) { .equals("https://schema.org/SequenceAnnotation")) {
log.debug("Sequence Annotation found ");
log.debug("sequence id > " + entry.getId());
entry.getSequenceAnnotation().forEach(l -> {
log.debug(l.getId());
});
sequenceAnnotations.put(entry.getId(), entry.getSequenceAnnotation()); sequenceAnnotations.put(entry.getId(), entry.getSequenceAnnotation());
} }
if (entry.getType() != null if (entry.getType() != null
&& entry && entry
.getType() .getType()
.equals("https://schema.org/PropertyValue")) { .equals("https://schema.org/PropertyValue")) {
log.debug("Property found ");
log.debug(entry.getPropertyValue().getId());
propertyValues.put(entry.getId(), entry.getPropertyValue().getId()); propertyValues.put(entry.getId(), entry.getPropertyValue().getId());
} }
if (entry.getType() != null if (entry.getType() != null
&& entry && entry
.getType() .getType()
.equals("https://schema.org/DefinedTerm")) { .equals("https://schema.org/DefinedTerm")) {
log.debug("Term found ");
log.debug(entry.getTermCode());
BioSchemaProtein.DefinedTerm term = new BioSchemaProtein.DefinedTerm(); BioSchemaProtein.DefinedTerm term = new BioSchemaProtein.DefinedTerm();
term.setId(entry.getId()); term.setId(entry.getId());
term.setTermCode(entry.getTermCode()); term.setTermCode(entry.getTermCode());
term.setName(entry.getName()); term.setName(entry.getName());
definedTerms.put(term.getId(), term); definedTerms.put(term.getId(), term);
} }
if (entry.getType() != null
&& entry
.getType()
.equals("https://schema.org/CollectionPage")) {
mainTitles.add(entry.getName());
}
}); });
proteins.entrySet().stream().forEach(p -> { bioschemaProteins
String sequenceAnnotationId = p.getValue().getHasSequenceAnnotation().getId(); .entrySet()
List<BioSchemaProtein.Link> propertyIds = sequenceAnnotations.get(sequenceAnnotationId); .stream()
List<String> termIds = propertyIds .filter(bioProtein -> Objects.nonNull(bioProtein.getValue().getHasSequenceAnnotation()))
.stream() .forEach(bioProtein -> {
.map(propertyId -> propertyValues.get(propertyId.getId())) sequenceAnnotations
.collect(Collectors.toList()); .get(bioProtein.getValue().getHasSequenceAnnotation().getId())
List<BioSchemaProtein.DefinedTerm> terms = termIds .stream()
.stream() .map(propertyId -> propertyValues.get(propertyId.getId()))
.map(term -> definedTerms.get(term)) .filter(term -> Objects.nonNull(term))
.collect(Collectors.toList()); .map(term -> definedTerms.get(term))
terms.forEach(t -> { .filter(term -> Objects.nonNull(term))
log.debug("protein id: " + p.getKey() + " >>> term: " + t.getId()); .forEach(term -> {
dataciteProteins.stream().filter(d -> { dataciteProteins
return p.getKey().contains(d.getId()); .stream()
}).forEach(d -> { .filter(
DataciteProtein.Subject subject = new DataciteProtein.Subject(); dcProtein -> dcProtein
subject.setSchemeURI(t.getId()); .getIdentifiers()
subject.setValue(t.getName()); .get(0)
subject.setSubjectScheme(t.getTermCode()); .getIdentifier()
d.getSubjects().add(subject); .equals(bioProtein.getKey()))
}); .filter(dcProtein -> Objects.nonNull(dcProtein))
.forEach(d -> {
DataciteProtein.Subject subject = new DataciteProtein.Subject();
subject.setSchemeURI(term.getId());
subject.setValue(term.getName());
subject.setSubjectScheme(term.getTermCode());
d.getSubjects().add(subject);
});
});
}); });
dataciteProteins.forEach(d -> {
DataciteProtein.Title title = new DataciteProtein.Title();
title.setTitle(d.getId() + " - ".concat(mainTitles.get(0)));
d.getTitles().add(title);
}); });
ObjectMapper mapper = new ObjectMapper(); ObjectMapper mapper = new ObjectMapper();
// try {
dataciteProteins.forEach(d -> { dataciteProteins.forEach(d -> {
StringWriter writer = new StringWriter(); StringWriter writer = new StringWriter();
try { try {
@ -268,9 +251,6 @@ public class RDFConverter {
} }
results.add(writer.toString()); results.add(writer.toString());
}); });
// } catch (Exception e) {
// throw new RuntimeException(e);
// }
return results; return results;
} }

View File

@ -33,8 +33,8 @@ public class ConverterTest {
String nq = IOUtils.toString(is); String nq = IOUtils.toString(is);
RDFConverter converter = new RDFConverter(); RDFConverter converter = new RDFConverter();
ArrayList<String> results = converter.nQuadsFile2DataciteJson(nq, "Protein"); ArrayList<String> results = converter.nQuadsFile2DataciteJson(nq, "Protein");
if (results != null && !results.isEmpty()) { results.stream().forEach(r -> {
logger.info("JSON DATACITE: " + results.get(0)); logger.info("JSON DATACITE >> " + r);
} });
} }
} }