// Forked from D-Net/dnet-hadoop (218 lines, 8.5 KiB, Java).
|
|
package eu.dnetlib.dhp.rdfconverter.utils;
|
|
|
|
import java.io.StringReader;
|
|
import java.io.StringWriter;
|
|
import java.util.*;
|
|
import java.util.stream.Collectors;
|
|
|
|
import org.eclipse.rdf4j.model.Model;
|
|
import org.eclipse.rdf4j.rio.RDFFormat;
|
|
import org.eclipse.rdf4j.rio.RDFWriter;
|
|
import org.eclipse.rdf4j.rio.Rio;
|
|
import org.slf4j.Logger;
|
|
import org.slf4j.LoggerFactory;
|
|
|
|
import com.fasterxml.jackson.databind.DeserializationFeature;
|
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
|
import com.github.jsonldjava.core.JsonLdOptions;
|
|
import com.github.jsonldjava.core.JsonLdProcessor;
|
|
import com.github.jsonldjava.utils.JsonUtils;
|
|
|
|
import eu.dnetlib.dhp.rdfconverter.bioschema.model.BioSchemaProtein;
|
|
import eu.dnetlib.dhp.rdfconverter.bioschema.model.DataciteProtein;
|
|
|
|
public class RDFConverter {
|
|
|
|
private static final Logger log = LoggerFactory.getLogger(RDFConverter.class);
|
|
|
|
public ArrayList<String> nQuadsFile2DataciteJson(String nquads, String profile) throws Exception {
|
|
if (profile.equals("Protein")) {
|
|
return nQuadsFile2DataciteJson(nquads);
|
|
}
|
|
throw new RuntimeException("Profile not supported");
|
|
}
|
|
|
|
private ArrayList<String> nQuadsFile2DataciteJson(String nquads) throws Exception {
|
|
StringReader reader = new StringReader(nquads);
|
|
Model model = Rio.parse(reader, "", RDFFormat.NQUADS);
|
|
StringWriter jsonLDWriter = new StringWriter();
|
|
RDFWriter rdfRecordWriter = Rio.createWriter(RDFFormat.JSONLD, jsonLDWriter);
|
|
Rio.write(model, rdfRecordWriter);
|
|
String jsonLDBuffer = jsonLDWriter.toString();
|
|
Object jsonObject = JsonUtils.fromString(jsonLDBuffer);
|
|
Object compact = JsonLdProcessor.compact(jsonObject, new HashMap<>(), new JsonLdOptions());
|
|
String compactContent = JsonUtils.toString(compact);
|
|
log.debug("jsonld: " + compactContent);
|
|
|
|
ObjectMapper objectMapper = new ObjectMapper();
|
|
objectMapper.enable(DeserializationFeature.ACCEPT_SINGLE_VALUE_AS_ARRAY);
|
|
objectMapper.enable(DeserializationFeature.ACCEPT_EMPTY_STRING_AS_NULL_OBJECT);
|
|
objectMapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
|
BioSchemaProtein bioSchemaProtein = objectMapper.readValue(compactContent, BioSchemaProtein.class);
|
|
log.debug("BioSchema id: " + bioSchemaProtein.getId());
|
|
BioSchemaProtein.DateTimeType retrievedOnType = bioSchemaProtein.getRetrievedOn();
|
|
|
|
List<String> citations = bioSchemaProtein.getEntryList().stream().map(entry -> {
|
|
if (entry.getCitation() != null) {
|
|
BioSchemaProtein.Citation citationInfo = entry.getCitation();
|
|
return citationInfo.getId();
|
|
}
|
|
return null;
|
|
}).filter(id -> id != null).collect(Collectors.toList());
|
|
|
|
ArrayList<String> results = new ArrayList<String>();
|
|
bioSchemaProtein.getEntryList().stream().forEach(entry -> {
|
|
|
|
if (entry.getType() != null
|
|
&& entry.getType().stream().filter(type -> type.equals("https://schema.org/Protein")).count() == 1) {
|
|
|
|
DataciteProtein dataciteProtein = new DataciteProtein();
|
|
|
|
citations.forEach(citation -> {
|
|
addRelatedIdentifier(dataciteProtein, citation, "IsCitedBy");
|
|
});
|
|
|
|
DataciteProtein.Types types = new DataciteProtein.Types();
|
|
types.setResourceType("Protein");
|
|
types.setResourceTypeGeneral("Dataset");
|
|
dataciteProtein.setTypes(types);
|
|
|
|
DataciteProtein.DataciteDate dataciteDate = new DataciteProtein.DataciteDate();
|
|
dataciteDate.setDate(retrievedOnType.getValue());
|
|
dataciteDate.setDateType("Collected");
|
|
dataciteProtein.getDates().add(dataciteDate);
|
|
|
|
if (entry.getName() != null) {
|
|
log.debug("Name: " + entry.getName());
|
|
DataciteProtein.Title title = new DataciteProtein.Title();
|
|
title.setTitle(entry.getName());
|
|
dataciteProtein.getTitles().add(title);
|
|
}
|
|
DataciteProtein.Identifier identifier = new DataciteProtein.Identifier();
|
|
log.debug("Id: " + entry.getId());
|
|
identifier.setIdentifier(entry.getId());
|
|
identifier.setIdentifierType("URL");
|
|
dataciteProtein.getIdentifiers().add(identifier);
|
|
|
|
if (entry.getIdentifier() != null) {
|
|
log.debug("Identifier: " + entry.getIdentifier());
|
|
addAlternateIdentifier(dataciteProtein, entry.getIdentifier());
|
|
}
|
|
|
|
if (entry.getDescription() != null) {
|
|
log.debug("description: " + entry.getDescription());
|
|
DataciteProtein.Description description = new DataciteProtein.Description();
|
|
description.setDescription(entry.getDescription());
|
|
dataciteProtein.getDescriptions().add(description);
|
|
}
|
|
|
|
if (entry.getIsEncodedByBioChemEntity() != null) {
|
|
log.debug("isEncodedByBioChemEntity: " + entry.getIsEncodedByBioChemEntity());
|
|
addRelatedIdentifier(dataciteProtein, entry.getIsEncodedByBioChemEntity(), "");
|
|
}
|
|
|
|
if (entry.getUrl() != null) {
|
|
log.debug("url: " + entry.getUrl());
|
|
addAlternateIdentifier(dataciteProtein, entry.getUrl());
|
|
}
|
|
|
|
if (entry.getAlternateName() != null) {
|
|
log.debug("alternateName: " + entry.getAlternateName());
|
|
DataciteProtein.Title title = new DataciteProtein.Title();
|
|
title.setTitle(entry.getAlternateName());
|
|
title.setTitleType("AlternativeTitle");
|
|
dataciteProtein.getTitles().add(title);
|
|
}
|
|
|
|
if (entry.getBioChemInteraction() != null) {
|
|
entry.getBioChemInteraction().stream().filter(Objects::nonNull).forEach(bc -> {
|
|
log.debug("bioChemInteraction: " + bc.getId());
|
|
addRelatedIdentifier(dataciteProtein, bc.getId(), "");
|
|
});
|
|
}
|
|
|
|
if (entry.getBioChemSimilarity() != null) {
|
|
entry.getBioChemSimilarity().stream().filter(Objects::nonNull).forEach(bc -> {
|
|
log.debug("bioChemSimilarity: " + bc.getId());
|
|
addRelatedIdentifier(dataciteProtein, bc.getId(), "");
|
|
});
|
|
}
|
|
|
|
if (entry.getHasMolecularFunction() != null) {
|
|
log.debug("hasMolecularFunction: " + entry.getHasMolecularFunction());
|
|
addRelatedIdentifier(dataciteProtein, entry.getHasMolecularFunction(), "");
|
|
}
|
|
|
|
if (entry.getIsInvolvedInBiologicalProcess() != null) {
|
|
log.debug("isInvolvedInBiologicalProcess: " + entry.getIsInvolvedInBiologicalProcess());
|
|
addRelatedIdentifier(dataciteProtein, entry.getIsInvolvedInBiologicalProcess(), "");
|
|
}
|
|
|
|
if (entry.getIsEncodedByBioChemEntity() != null) {
|
|
log.debug("isEncodedByBioChemEntity: " + entry.getIsEncodedByBioChemEntity());
|
|
addRelatedIdentifier(dataciteProtein, entry.getIsEncodedByBioChemEntity(), "");
|
|
}
|
|
|
|
if (entry.getIsPartOfBioChemEntity() != null) {
|
|
log.debug("isPartOfBioChemEntity: " + entry.getIsPartOfBioChemEntity());
|
|
addRelatedIdentifier(dataciteProtein, entry.getIsPartOfBioChemEntity().getUrl(), "");
|
|
}
|
|
|
|
if (entry.getSameAs() != null) {
|
|
entry.getSameAs().stream().filter(Objects::nonNull).forEach(sameAs -> {
|
|
log.debug("sameAs: " + sameAs.getId());
|
|
addRelatedIdentifier(dataciteProtein, sameAs.getId(), "IsIdenticalTo");
|
|
});
|
|
}
|
|
|
|
if (entry.getAssociatedDisease() != null) {
|
|
entry.getAssociatedDisease().stream().filter(Objects::nonNull).forEach(ad -> {
|
|
log.debug("associated disease: " + ad.getName());
|
|
addRelatedIdentifier(dataciteProtein, ad.getName(), "IsIdenticalTo");
|
|
});
|
|
}
|
|
|
|
String proteinId = "";
|
|
try {
|
|
String[] identifierParts = dataciteProtein.getIdentifiers().get(0).getIdentifier().split("/");
|
|
proteinId = identifierParts[identifierParts.length - 1];
|
|
} catch (Exception e) {
|
|
log.error("Identifier not found", e.getMessage());
|
|
}
|
|
|
|
dataciteProtein.setId(proteinId);
|
|
|
|
ObjectMapper mapper = new ObjectMapper();
|
|
try {
|
|
StringWriter writer = new StringWriter();
|
|
mapper.writeValue(writer, dataciteProtein);
|
|
results.add(writer.toString());
|
|
} catch (Exception e) {
|
|
throw new RuntimeException(e);
|
|
}
|
|
}
|
|
});
|
|
return results;
|
|
}
|
|
|
|
private void addRelatedIdentifier(DataciteProtein DataciteProtein, String relatedIdentifierValue,
|
|
String relationType) {
|
|
DataciteProtein.RelatedIdentifier relatedIdentifier = new DataciteProtein.RelatedIdentifier();
|
|
relatedIdentifier.setRelatedIdentifier(relatedIdentifierValue);
|
|
if (!relationType.isEmpty()) {
|
|
relatedIdentifier.setRelationType(relationType);
|
|
}
|
|
if (relatedIdentifierValue.contains("http://") || relatedIdentifierValue.contains("https://")) {
|
|
relatedIdentifier.setRelatedIdentifierType("URL");
|
|
}
|
|
DataciteProtein.getRelatedIdentifiers().add(relatedIdentifier);
|
|
}
|
|
|
|
private void addAlternateIdentifier(DataciteProtein DataciteProtein, String alternateIdentifierValue) {
|
|
DataciteProtein.AlternateIdentifier alternateIdentifier = new DataciteProtein.AlternateIdentifier();
|
|
alternateIdentifier.setAlternateIdentifier(alternateIdentifierValue);
|
|
DataciteProtein.getAlternateIdentifiers().add(alternateIdentifier);
|
|
}
|
|
}
|