dnet-hadoop/dhp-workflows/dhp-rdfconverter/src/main/java/eu/dnetlib/dhp/rdfconverter/utils/RDFConverter.java

296 lines
11 KiB
Java

package eu.dnetlib.dhp.rdfconverter.utils;
import java.io.IOException;
import java.io.StringReader;
import java.io.StringWriter;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.eclipse.rdf4j.model.Model;
import org.eclipse.rdf4j.rio.RDFFormat;
import org.eclipse.rdf4j.rio.RDFWriter;
import org.eclipse.rdf4j.rio.Rio;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.github.jsonldjava.core.JsonLdOptions;
import com.github.jsonldjava.core.JsonLdProcessor;
import com.github.jsonldjava.utils.JsonUtils;
import eu.dnetlib.dhp.common.vocabulary.Vocabulary;
import eu.dnetlib.dhp.rdfconverter.bioschema.model.BioSchemaProtein;
import eu.dnetlib.dhp.rdfconverter.bioschema.model.DataciteProtein;
public class RDFConverter {
private static final Logger log = LoggerFactory.getLogger(RDFConverter.class);
public ArrayList<String> nQuadsFile2DataciteJson(String nquads, String profile) throws Exception {
if (profile.equals("Protein")) {
return nQuadsFile2DataciteJson(nquads);
}
throw new RuntimeException("Profile not supported");
}
private ArrayList<String> nQuadsFile2DataciteJson(String nquads) throws Exception {
StringReader reader = new StringReader(nquads);
Model model = Rio.parse(reader, "", RDFFormat.NQUADS);
StringWriter jsonLDWriter = new StringWriter();
RDFWriter rdfRecordWriter = Rio.createWriter(RDFFormat.JSONLD, jsonLDWriter);
Rio.write(model, rdfRecordWriter);
String jsonLDBuffer = jsonLDWriter.toString();
Object jsonObject = JsonUtils.fromString(jsonLDBuffer);
Object compact = JsonLdProcessor.compact(jsonObject, new HashMap<>(), new JsonLdOptions());
String compactContent = JsonUtils.toString(compact);
log.debug("jsonld: " + compactContent);
ObjectMapper objectMapper = new ObjectMapper();
objectMapper.enable(DeserializationFeature.ACCEPT_SINGLE_VALUE_AS_ARRAY);
objectMapper.enable(DeserializationFeature.ACCEPT_EMPTY_STRING_AS_NULL_OBJECT);
objectMapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
BioSchemaProtein bioSchemaProtein = objectMapper.readValue(compactContent, BioSchemaProtein.class);
log.debug("BioSchema id: " + bioSchemaProtein.getId());
BioSchemaProtein.DateTimeType retrievedOnType = bioSchemaProtein.getRetrievedOn();
List<String> citations = bioSchemaProtein.getEntryList().stream().map(entry -> {
if (entry.getCitation() != null) {
BioSchemaProtein.Citation citationInfo = entry.getCitation();
return citationInfo.getId();
}
return null;
}).filter(id -> id != null).collect(Collectors.toList());
ArrayList<String> results = new ArrayList<String>();
final List<DataciteProtein> dataciteProteins = new ArrayList<>();
final Map<String, List<BioSchemaProtein.Link>> sequenceAnnotations = new HashMap<>();
final Map<String, String> propertyValues = new HashMap<>();
final Map<String, BioSchemaProtein.DefinedTerm> definedTerms = new HashMap<>();
final Map<String, BioSchemaProtein.Entry> proteins = new HashMap<>();
bioSchemaProtein.getEntryList().stream().forEach(entry -> {
if (entry.getType() != null
&& entry
.getType()
.equals("https://schema.org/Protein")) {
DataciteProtein dataciteProtein = new DataciteProtein();
citations.forEach(citation -> {
addRelatedIdentifier(dataciteProtein, citation, "IsCitedBy");
});
DataciteProtein.Types types = new DataciteProtein.Types();
types.setResourceType("Protein");
types.setResourceTypeGeneral("Dataset");
dataciteProtein.setTypes(types);
DataciteProtein.DataciteDate dataciteDate = new DataciteProtein.DataciteDate();
dataciteDate.setDate(retrievedOnType.getValue());
dataciteDate.setDateType("Collected");
dataciteProtein.getDates().add(dataciteDate);
if (entry.getName() != null) {
log.debug("Name: " + entry.getName());
DataciteProtein.Title title = new DataciteProtein.Title();
title.setTitle(entry.getName());
dataciteProtein.getTitles().add(title);
}
DataciteProtein.Identifier identifier = new DataciteProtein.Identifier();
log.debug("Id: " + entry.getId());
identifier.setIdentifier(entry.getId());
identifier.setIdentifierType("URL");
dataciteProtein.getIdentifiers().add(identifier);
if (entry.getIdentifier() != null) {
log.debug("Identifier: " + entry.getIdentifier());
addAlternateIdentifier(dataciteProtein, entry.getIdentifier());
}
if (entry.getDescription() != null) {
log.debug("description: " + entry.getDescription());
DataciteProtein.Description description = new DataciteProtein.Description();
description.setDescription(entry.getDescription());
dataciteProtein.getDescriptions().add(description);
}
if (entry.getIsEncodedByBioChemEntity() != null) {
log.debug("isEncodedByBioChemEntity: " + entry.getIsEncodedByBioChemEntity());
addRelatedIdentifier(dataciteProtein, entry.getIsEncodedByBioChemEntity(), "");
}
if (entry.getUrl() != null) {
log.debug("url: " + entry.getUrl());
addAlternateIdentifier(dataciteProtein, entry.getUrl());
}
if (entry.getAlternateName() != null) {
log.debug("alternateName: " + entry.getAlternateName());
DataciteProtein.Title title = new DataciteProtein.Title();
title.setTitle(entry.getAlternateName());
title.setTitleType("AlternativeTitle");
dataciteProtein.getTitles().add(title);
}
if (entry.getBioChemInteraction() != null) {
entry.getBioChemInteraction().stream().filter(Objects::nonNull).forEach(bc -> {
log.debug("bioChemInteraction: " + bc.getId());
addRelatedIdentifier(dataciteProtein, bc.getId(), "");
});
}
if (entry.getBioChemSimilarity() != null) {
entry.getBioChemSimilarity().stream().filter(Objects::nonNull).forEach(bc -> {
log.debug("bioChemSimilarity: " + bc.getId());
addRelatedIdentifier(dataciteProtein, bc.getId(), "");
});
}
if (entry.getHasMolecularFunction() != null) {
log.debug("hasMolecularFunction: " + entry.getHasMolecularFunction());
addRelatedIdentifier(dataciteProtein, entry.getHasMolecularFunction(), "");
}
if (entry.getIsInvolvedInBiologicalProcess() != null) {
log.debug("isInvolvedInBiologicalProcess: " + entry.getIsInvolvedInBiologicalProcess());
addRelatedIdentifier(dataciteProtein, entry.getIsInvolvedInBiologicalProcess(), "");
}
if (entry.getIsEncodedByBioChemEntity() != null) {
log.debug("isEncodedByBioChemEntity: " + entry.getIsEncodedByBioChemEntity());
addRelatedIdentifier(dataciteProtein, entry.getIsEncodedByBioChemEntity(), "");
}
if (entry.getIsPartOfBioChemEntity() != null) {
log.debug("isPartOfBioChemEntity: " + entry.getIsPartOfBioChemEntity());
addRelatedIdentifier(dataciteProtein, entry.getIsPartOfBioChemEntity().getUrl(), "");
}
if (entry.getSameAs() != null) {
entry.getSameAs().stream().filter(Objects::nonNull).forEach(sameAs -> {
log.debug("sameAs value: " + sameAs.getId());
addRelatedIdentifier(dataciteProtein, sameAs.getId(), "IsIdenticalTo");
});
}
if (entry.getAssociatedDisease() != null) {
entry.getAssociatedDisease().stream().filter(Objects::nonNull).forEach(ad -> {
log.debug("associated disease: " + ad.getName());
addRelatedIdentifier(dataciteProtein, ad.getName(), "IsIdenticalTo");
});
}
if (entry.getHasSequenceAnnotation() != null) {
log.debug("Sequence Annotation id found: " + entry.getHasSequenceAnnotation().getId());
}
String proteinId = "";
try {
String[] identifierParts = dataciteProtein.getIdentifiers().get(0).getIdentifier().split("/");
proteinId = identifierParts[identifierParts.length - 1];
} catch (Exception e) {
log.error("Identifier not found", e.getMessage());
}
dataciteProtein.setId(proteinId);
proteins.put(entry.getId(), entry);
dataciteProteins.add(dataciteProtein);
}
if (entry.getType() != null
&& entry
.getType()
.equals("https://schema.org/SequenceAnnotation")) {
log.debug("Sequence Annotation found ");
log.debug("sequence id > " + entry.getId());
entry.getSequenceAnnotation().forEach(l -> {
log.debug(l.getId());
});
sequenceAnnotations.put(entry.getId(), entry.getSequenceAnnotation());
}
if (entry.getType() != null
&& entry
.getType()
.equals("https://schema.org/PropertyValue")) {
log.debug("Property found ");
log.debug(entry.getPropertyValue().getId());
propertyValues.put(entry.getId(), entry.getPropertyValue().getId());
}
if (entry.getType() != null
&& entry
.getType()
.equals("https://schema.org/DefinedTerm")) {
log.debug("Term found ");
log.debug(entry.getTermCode());
BioSchemaProtein.DefinedTerm term = new BioSchemaProtein.DefinedTerm();
term.setId(entry.getId());
term.setTermCode(entry.getTermCode());
term.setName(entry.getName());
definedTerms.put(term.getId(), term);
}
});
proteins.entrySet().stream().forEach(p -> {
String sequenceAnnotationId = p.getValue().getHasSequenceAnnotation().getId();
List<BioSchemaProtein.Link> propertyIds = sequenceAnnotations.get(sequenceAnnotationId);
List<String> termIds = propertyIds
.stream()
.map(propertyId -> propertyValues.get(propertyId.getId()))
.collect(Collectors.toList());
List<BioSchemaProtein.DefinedTerm> terms = termIds
.stream()
.map(term -> definedTerms.get(term))
.collect(Collectors.toList());
terms.forEach(t -> {
log.debug("protein id: " + p.getKey() + " >>> term: " + t.getId());
dataciteProteins.stream().filter(d -> {
return p.getKey().contains(d.getId());
}).forEach(d -> {
DataciteProtein.Subject subject = new DataciteProtein.Subject();
subject.setSchemeURI(t.getId());
subject.setValue(t.getName());
subject.setSubjectScheme(t.getTermCode());
d.getSubjects().add(subject);
});
});
});
ObjectMapper mapper = new ObjectMapper();
// try {
dataciteProteins.forEach(d -> {
StringWriter writer = new StringWriter();
try {
mapper.writeValue(writer, d);
} catch (IOException e) {
throw new RuntimeException(e);
}
results.add(writer.toString());
});
// } catch (Exception e) {
// throw new RuntimeException(e);
// }
return results;
}
private void addRelatedIdentifier(DataciteProtein DataciteProtein, String relatedIdentifierValue,
String relationType) {
DataciteProtein.RelatedIdentifier relatedIdentifier = new DataciteProtein.RelatedIdentifier();
relatedIdentifier.setRelatedIdentifier(relatedIdentifierValue);
if (!relationType.isEmpty()) {
relatedIdentifier.setRelationType(relationType);
}
if (relatedIdentifierValue.contains("http://") || relatedIdentifierValue.contains("https://")) {
relatedIdentifier.setRelatedIdentifierType("URL");
}
DataciteProtein.getRelatedIdentifiers().add(relatedIdentifier);
}
private void addAlternateIdentifier(DataciteProtein DataciteProtein, String alternateIdentifierValue) {
DataciteProtein.AlternateIdentifier alternateIdentifier = new DataciteProtein.AlternateIdentifier();
alternateIdentifier.setAlternateIdentifier(alternateIdentifierValue);
DataciteProtein.getAlternateIdentifiers().add(alternateIdentifier);
}
}