forked from D-Net/dnet-hadoop
added titles merging title page and protein identifier
This commit is contained in:
parent
f43bfdb594
commit
861f2a3306
|
@ -6,7 +6,6 @@ import java.io.StringReader;
|
||||||
import java.io.StringWriter;
|
import java.io.StringWriter;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
import java.util.stream.Stream;
|
|
||||||
|
|
||||||
import org.eclipse.rdf4j.model.Model;
|
import org.eclipse.rdf4j.model.Model;
|
||||||
import org.eclipse.rdf4j.rio.RDFFormat;
|
import org.eclipse.rdf4j.rio.RDFFormat;
|
||||||
|
@ -21,7 +20,6 @@ import com.github.jsonldjava.core.JsonLdOptions;
|
||||||
import com.github.jsonldjava.core.JsonLdProcessor;
|
import com.github.jsonldjava.core.JsonLdProcessor;
|
||||||
import com.github.jsonldjava.utils.JsonUtils;
|
import com.github.jsonldjava.utils.JsonUtils;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.common.vocabulary.Vocabulary;
|
|
||||||
import eu.dnetlib.dhp.rdfconverter.bioschema.model.BioSchemaProtein;
|
import eu.dnetlib.dhp.rdfconverter.bioschema.model.BioSchemaProtein;
|
||||||
import eu.dnetlib.dhp.rdfconverter.bioschema.model.DataciteProtein;
|
import eu.dnetlib.dhp.rdfconverter.bioschema.model.DataciteProtein;
|
||||||
|
|
||||||
|
@ -53,7 +51,6 @@ public class RDFConverter {
|
||||||
objectMapper.enable(DeserializationFeature.ACCEPT_EMPTY_STRING_AS_NULL_OBJECT);
|
objectMapper.enable(DeserializationFeature.ACCEPT_EMPTY_STRING_AS_NULL_OBJECT);
|
||||||
objectMapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
objectMapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
||||||
BioSchemaProtein bioSchemaProtein = objectMapper.readValue(compactContent, BioSchemaProtein.class);
|
BioSchemaProtein bioSchemaProtein = objectMapper.readValue(compactContent, BioSchemaProtein.class);
|
||||||
log.debug("BioSchema id: " + bioSchemaProtein.getId());
|
|
||||||
BioSchemaProtein.DateTimeType retrievedOnType = bioSchemaProtein.getRetrievedOn();
|
BioSchemaProtein.DateTimeType retrievedOnType = bioSchemaProtein.getRetrievedOn();
|
||||||
|
|
||||||
List<String> citations = bioSchemaProtein.getEntryList().stream().map(entry -> {
|
List<String> citations = bioSchemaProtein.getEntryList().stream().map(entry -> {
|
||||||
|
@ -69,7 +66,8 @@ public class RDFConverter {
|
||||||
final Map<String, List<BioSchemaProtein.Link>> sequenceAnnotations = new HashMap<>();
|
final Map<String, List<BioSchemaProtein.Link>> sequenceAnnotations = new HashMap<>();
|
||||||
final Map<String, String> propertyValues = new HashMap<>();
|
final Map<String, String> propertyValues = new HashMap<>();
|
||||||
final Map<String, BioSchemaProtein.DefinedTerm> definedTerms = new HashMap<>();
|
final Map<String, BioSchemaProtein.DefinedTerm> definedTerms = new HashMap<>();
|
||||||
final Map<String, BioSchemaProtein.Entry> proteins = new HashMap<>();
|
final Map<String, BioSchemaProtein.Entry> bioschemaProteins = new HashMap<>();
|
||||||
|
final List<String> mainTitles = new ArrayList<>();
|
||||||
|
|
||||||
bioSchemaProtein.getEntryList().stream().forEach(entry -> {
|
bioSchemaProtein.getEntryList().stream().forEach(entry -> {
|
||||||
|
|
||||||
|
@ -94,42 +92,30 @@ public class RDFConverter {
|
||||||
dataciteDate.setDateType("Collected");
|
dataciteDate.setDateType("Collected");
|
||||||
dataciteProtein.getDates().add(dataciteDate);
|
dataciteProtein.getDates().add(dataciteDate);
|
||||||
|
|
||||||
if (entry.getName() != null) {
|
|
||||||
log.debug("Name: " + entry.getName());
|
|
||||||
DataciteProtein.Title title = new DataciteProtein.Title();
|
|
||||||
title.setTitle(entry.getName());
|
|
||||||
dataciteProtein.getTitles().add(title);
|
|
||||||
}
|
|
||||||
DataciteProtein.Identifier identifier = new DataciteProtein.Identifier();
|
DataciteProtein.Identifier identifier = new DataciteProtein.Identifier();
|
||||||
log.debug("Id: " + entry.getId());
|
|
||||||
identifier.setIdentifier(entry.getId());
|
identifier.setIdentifier(entry.getId());
|
||||||
identifier.setIdentifierType("URL");
|
identifier.setIdentifierType("URL");
|
||||||
dataciteProtein.getIdentifiers().add(identifier);
|
dataciteProtein.getIdentifiers().add(identifier);
|
||||||
|
|
||||||
if (entry.getIdentifier() != null) {
|
if (entry.getIdentifier() != null) {
|
||||||
log.debug("Identifier: " + entry.getIdentifier());
|
|
||||||
addAlternateIdentifier(dataciteProtein, entry.getIdentifier());
|
addAlternateIdentifier(dataciteProtein, entry.getIdentifier());
|
||||||
}
|
}
|
||||||
|
|
||||||
if (entry.getDescription() != null) {
|
if (entry.getDescription() != null) {
|
||||||
log.debug("description: " + entry.getDescription());
|
|
||||||
DataciteProtein.Description description = new DataciteProtein.Description();
|
DataciteProtein.Description description = new DataciteProtein.Description();
|
||||||
description.setDescription(entry.getDescription());
|
description.setDescription(entry.getDescription());
|
||||||
dataciteProtein.getDescriptions().add(description);
|
dataciteProtein.getDescriptions().add(description);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (entry.getIsEncodedByBioChemEntity() != null) {
|
if (entry.getIsEncodedByBioChemEntity() != null) {
|
||||||
log.debug("isEncodedByBioChemEntity: " + entry.getIsEncodedByBioChemEntity());
|
|
||||||
addRelatedIdentifier(dataciteProtein, entry.getIsEncodedByBioChemEntity(), "");
|
addRelatedIdentifier(dataciteProtein, entry.getIsEncodedByBioChemEntity(), "");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (entry.getUrl() != null) {
|
if (entry.getUrl() != null) {
|
||||||
log.debug("url: " + entry.getUrl());
|
|
||||||
addAlternateIdentifier(dataciteProtein, entry.getUrl());
|
addAlternateIdentifier(dataciteProtein, entry.getUrl());
|
||||||
}
|
}
|
||||||
|
|
||||||
if (entry.getAlternateName() != null) {
|
if (entry.getAlternateName() != null) {
|
||||||
log.debug("alternateName: " + entry.getAlternateName());
|
|
||||||
DataciteProtein.Title title = new DataciteProtein.Title();
|
DataciteProtein.Title title = new DataciteProtein.Title();
|
||||||
title.setTitle(entry.getAlternateName());
|
title.setTitle(entry.getAlternateName());
|
||||||
title.setTitleType("AlternativeTitle");
|
title.setTitleType("AlternativeTitle");
|
||||||
|
@ -138,56 +124,44 @@ public class RDFConverter {
|
||||||
|
|
||||||
if (entry.getBioChemInteraction() != null) {
|
if (entry.getBioChemInteraction() != null) {
|
||||||
entry.getBioChemInteraction().stream().filter(Objects::nonNull).forEach(bc -> {
|
entry.getBioChemInteraction().stream().filter(Objects::nonNull).forEach(bc -> {
|
||||||
log.debug("bioChemInteraction: " + bc.getId());
|
|
||||||
addRelatedIdentifier(dataciteProtein, bc.getId(), "");
|
addRelatedIdentifier(dataciteProtein, bc.getId(), "");
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
if (entry.getBioChemSimilarity() != null) {
|
if (entry.getBioChemSimilarity() != null) {
|
||||||
entry.getBioChemSimilarity().stream().filter(Objects::nonNull).forEach(bc -> {
|
entry.getBioChemSimilarity().stream().filter(Objects::nonNull).forEach(bc -> {
|
||||||
log.debug("bioChemSimilarity: " + bc.getId());
|
|
||||||
addRelatedIdentifier(dataciteProtein, bc.getId(), "");
|
addRelatedIdentifier(dataciteProtein, bc.getId(), "");
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
if (entry.getHasMolecularFunction() != null) {
|
if (entry.getHasMolecularFunction() != null) {
|
||||||
log.debug("hasMolecularFunction: " + entry.getHasMolecularFunction());
|
|
||||||
addRelatedIdentifier(dataciteProtein, entry.getHasMolecularFunction(), "");
|
addRelatedIdentifier(dataciteProtein, entry.getHasMolecularFunction(), "");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (entry.getIsInvolvedInBiologicalProcess() != null) {
|
if (entry.getIsInvolvedInBiologicalProcess() != null) {
|
||||||
log.debug("isInvolvedInBiologicalProcess: " + entry.getIsInvolvedInBiologicalProcess());
|
|
||||||
addRelatedIdentifier(dataciteProtein, entry.getIsInvolvedInBiologicalProcess(), "");
|
addRelatedIdentifier(dataciteProtein, entry.getIsInvolvedInBiologicalProcess(), "");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (entry.getIsEncodedByBioChemEntity() != null) {
|
if (entry.getIsEncodedByBioChemEntity() != null) {
|
||||||
log.debug("isEncodedByBioChemEntity: " + entry.getIsEncodedByBioChemEntity());
|
|
||||||
addRelatedIdentifier(dataciteProtein, entry.getIsEncodedByBioChemEntity(), "");
|
addRelatedIdentifier(dataciteProtein, entry.getIsEncodedByBioChemEntity(), "");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (entry.getIsPartOfBioChemEntity() != null) {
|
if (entry.getIsPartOfBioChemEntity() != null) {
|
||||||
log.debug("isPartOfBioChemEntity: " + entry.getIsPartOfBioChemEntity());
|
|
||||||
addRelatedIdentifier(dataciteProtein, entry.getIsPartOfBioChemEntity().getUrl(), "");
|
addRelatedIdentifier(dataciteProtein, entry.getIsPartOfBioChemEntity().getUrl(), "");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (entry.getSameAs() != null) {
|
if (entry.getSameAs() != null) {
|
||||||
entry.getSameAs().stream().filter(Objects::nonNull).forEach(sameAs -> {
|
entry.getSameAs().stream().filter(Objects::nonNull).forEach(sameAs -> {
|
||||||
log.debug("sameAs value: " + sameAs.getId());
|
|
||||||
addRelatedIdentifier(dataciteProtein, sameAs.getId(), "IsIdenticalTo");
|
addRelatedIdentifier(dataciteProtein, sameAs.getId(), "IsIdenticalTo");
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
if (entry.getAssociatedDisease() != null) {
|
if (entry.getAssociatedDisease() != null) {
|
||||||
entry.getAssociatedDisease().stream().filter(Objects::nonNull).forEach(ad -> {
|
entry.getAssociatedDisease().stream().filter(Objects::nonNull).forEach(ad -> {
|
||||||
log.debug("associated disease: " + ad.getName());
|
|
||||||
addRelatedIdentifier(dataciteProtein, ad.getName(), "IsIdenticalTo");
|
addRelatedIdentifier(dataciteProtein, ad.getName(), "IsIdenticalTo");
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
if (entry.getHasSequenceAnnotation() != null) {
|
|
||||||
log.debug("Sequence Annotation id found: " + entry.getHasSequenceAnnotation().getId());
|
|
||||||
}
|
|
||||||
|
|
||||||
String proteinId = "";
|
String proteinId = "";
|
||||||
try {
|
try {
|
||||||
String[] identifierParts = dataciteProtein.getIdentifiers().get(0).getIdentifier().split("/");
|
String[] identifierParts = dataciteProtein.getIdentifiers().get(0).getIdentifier().split("/");
|
||||||
|
@ -197,68 +171,77 @@ public class RDFConverter {
|
||||||
}
|
}
|
||||||
|
|
||||||
dataciteProtein.setId(proteinId);
|
dataciteProtein.setId(proteinId);
|
||||||
proteins.put(entry.getId(), entry);
|
bioschemaProteins.put(entry.getId(), entry);
|
||||||
dataciteProteins.add(dataciteProtein);
|
dataciteProteins.add(dataciteProtein);
|
||||||
}
|
}
|
||||||
if (entry.getType() != null
|
if (entry.getType() != null
|
||||||
&& entry
|
&& entry
|
||||||
.getType()
|
.getType()
|
||||||
.equals("https://schema.org/SequenceAnnotation")) {
|
.equals("https://schema.org/SequenceAnnotation")) {
|
||||||
log.debug("Sequence Annotation found ");
|
|
||||||
log.debug("sequence id > " + entry.getId());
|
|
||||||
entry.getSequenceAnnotation().forEach(l -> {
|
|
||||||
log.debug(l.getId());
|
|
||||||
});
|
|
||||||
sequenceAnnotations.put(entry.getId(), entry.getSequenceAnnotation());
|
sequenceAnnotations.put(entry.getId(), entry.getSequenceAnnotation());
|
||||||
}
|
}
|
||||||
if (entry.getType() != null
|
if (entry.getType() != null
|
||||||
&& entry
|
&& entry
|
||||||
.getType()
|
.getType()
|
||||||
.equals("https://schema.org/PropertyValue")) {
|
.equals("https://schema.org/PropertyValue")) {
|
||||||
log.debug("Property found ");
|
|
||||||
log.debug(entry.getPropertyValue().getId());
|
|
||||||
propertyValues.put(entry.getId(), entry.getPropertyValue().getId());
|
propertyValues.put(entry.getId(), entry.getPropertyValue().getId());
|
||||||
}
|
}
|
||||||
if (entry.getType() != null
|
if (entry.getType() != null
|
||||||
&& entry
|
&& entry
|
||||||
.getType()
|
.getType()
|
||||||
.equals("https://schema.org/DefinedTerm")) {
|
.equals("https://schema.org/DefinedTerm")) {
|
||||||
log.debug("Term found ");
|
|
||||||
log.debug(entry.getTermCode());
|
|
||||||
BioSchemaProtein.DefinedTerm term = new BioSchemaProtein.DefinedTerm();
|
BioSchemaProtein.DefinedTerm term = new BioSchemaProtein.DefinedTerm();
|
||||||
term.setId(entry.getId());
|
term.setId(entry.getId());
|
||||||
term.setTermCode(entry.getTermCode());
|
term.setTermCode(entry.getTermCode());
|
||||||
term.setName(entry.getName());
|
term.setName(entry.getName());
|
||||||
definedTerms.put(term.getId(), term);
|
definedTerms.put(term.getId(), term);
|
||||||
}
|
}
|
||||||
|
if (entry.getType() != null
|
||||||
|
&& entry
|
||||||
|
.getType()
|
||||||
|
.equals("https://schema.org/CollectionPage")) {
|
||||||
|
mainTitles.add(entry.getName());
|
||||||
|
}
|
||||||
});
|
});
|
||||||
proteins.entrySet().stream().forEach(p -> {
|
bioschemaProteins
|
||||||
String sequenceAnnotationId = p.getValue().getHasSequenceAnnotation().getId();
|
.entrySet()
|
||||||
List<BioSchemaProtein.Link> propertyIds = sequenceAnnotations.get(sequenceAnnotationId);
|
.stream()
|
||||||
List<String> termIds = propertyIds
|
.filter(bioProtein -> Objects.nonNull(bioProtein.getValue().getHasSequenceAnnotation()))
|
||||||
|
.forEach(bioProtein -> {
|
||||||
|
sequenceAnnotations
|
||||||
|
.get(bioProtein.getValue().getHasSequenceAnnotation().getId())
|
||||||
.stream()
|
.stream()
|
||||||
.map(propertyId -> propertyValues.get(propertyId.getId()))
|
.map(propertyId -> propertyValues.get(propertyId.getId()))
|
||||||
.collect(Collectors.toList());
|
.filter(term -> Objects.nonNull(term))
|
||||||
List<BioSchemaProtein.DefinedTerm> terms = termIds
|
|
||||||
.stream()
|
|
||||||
.map(term -> definedTerms.get(term))
|
.map(term -> definedTerms.get(term))
|
||||||
.collect(Collectors.toList());
|
.filter(term -> Objects.nonNull(term))
|
||||||
terms.forEach(t -> {
|
.forEach(term -> {
|
||||||
log.debug("protein id: " + p.getKey() + " >>> term: " + t.getId());
|
dataciteProteins
|
||||||
dataciteProteins.stream().filter(d -> {
|
.stream()
|
||||||
return p.getKey().contains(d.getId());
|
.filter(
|
||||||
}).forEach(d -> {
|
dcProtein -> dcProtein
|
||||||
|
.getIdentifiers()
|
||||||
|
.get(0)
|
||||||
|
.getIdentifier()
|
||||||
|
.equals(bioProtein.getKey()))
|
||||||
|
.filter(dcProtein -> Objects.nonNull(dcProtein))
|
||||||
|
.forEach(d -> {
|
||||||
DataciteProtein.Subject subject = new DataciteProtein.Subject();
|
DataciteProtein.Subject subject = new DataciteProtein.Subject();
|
||||||
subject.setSchemeURI(t.getId());
|
subject.setSchemeURI(term.getId());
|
||||||
subject.setValue(t.getName());
|
subject.setValue(term.getName());
|
||||||
subject.setSubjectScheme(t.getTermCode());
|
subject.setSubjectScheme(term.getTermCode());
|
||||||
d.getSubjects().add(subject);
|
d.getSubjects().add(subject);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
dataciteProteins.forEach(d -> {
|
||||||
|
DataciteProtein.Title title = new DataciteProtein.Title();
|
||||||
|
title.setTitle(d.getId() + " - ".concat(mainTitles.get(0)));
|
||||||
|
d.getTitles().add(title);
|
||||||
|
});
|
||||||
|
|
||||||
ObjectMapper mapper = new ObjectMapper();
|
ObjectMapper mapper = new ObjectMapper();
|
||||||
// try {
|
|
||||||
dataciteProteins.forEach(d -> {
|
dataciteProteins.forEach(d -> {
|
||||||
StringWriter writer = new StringWriter();
|
StringWriter writer = new StringWriter();
|
||||||
try {
|
try {
|
||||||
|
@ -268,9 +251,6 @@ public class RDFConverter {
|
||||||
}
|
}
|
||||||
results.add(writer.toString());
|
results.add(writer.toString());
|
||||||
});
|
});
|
||||||
// } catch (Exception e) {
|
|
||||||
// throw new RuntimeException(e);
|
|
||||||
// }
|
|
||||||
return results;
|
return results;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -33,8 +33,8 @@ public class ConverterTest {
|
||||||
String nq = IOUtils.toString(is);
|
String nq = IOUtils.toString(is);
|
||||||
RDFConverter converter = new RDFConverter();
|
RDFConverter converter = new RDFConverter();
|
||||||
ArrayList<String> results = converter.nQuadsFile2DataciteJson(nq, "Protein");
|
ArrayList<String> results = converter.nQuadsFile2DataciteJson(nq, "Protein");
|
||||||
if (results != null && !results.isEmpty()) {
|
results.stream().forEach(r -> {
|
||||||
logger.info("JSON DATACITE: " + results.get(0));
|
logger.info("JSON DATACITE >> " + r);
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue