Compare commits

...

2 Commits

6 changed files with 143 additions and 70 deletions

View File

@@ -2,64 +2,13 @@
<!-- OCEAN -->
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<value>http://hadoop-rm2.garr-pa1.d4science.org:19888</value>
</property>
<property>
<name>spark2EventLogDir</name>
<value>/user/spark/spark2ApplicationHistory</value>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>"com.cloudera.spark.lineage.NavigatorAppListener"</value>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value>
</property>
<!-- GARR -->
<!-- <property>-->
<!-- <name>jobTracker</name>-->
<!-- <value>hadoop-rm3.garr-pa1.d4science.org:8032</value>-->
<!-- <value>yarnRM</value>-->
<!-- </property>-->
<!-- <property>-->
<!-- <name>nameNode</name>-->
<!-- <value>hdfs://hadoop-rm1.garr-pa1.d4science.org:8020</value>-->
<!-- </property>-->
<!-- <property>-->
<!-- <name>hive_metastore_uris</name>-->
<!-- <value>thrift://hadoop-edge3.garr-pa1.d4science.org:9083</value>-->
<!-- </property>-->
<!-- <property>-->
<!-- <name>spark2YarnHistoryServerAddress</name>-->
<!-- <value>http://hadoop-rm2.garr-pa1.d4science.org:19888</value>-->
<!-- </property>-->
<!-- <property>-->
<!-- <name>oozie.launcher.mapreduce.user.classpath.first</name>-->
<!-- <value>true</value>-->
<!-- <value>hdfs://nameservice1</value>-->
<!-- </property>-->
<!-- <property>-->
<!-- <name>oozie.use.system.libpath</name>-->
@@ -70,6 +19,14 @@
<!-- <value>spark2</value>-->
<!-- </property>-->
<!-- <property>-->
<!-- <name>oozie.launcher.mapreduce.user.classpath.first</name>-->
<!-- <value>true</value>-->
<!-- </property>-->
<!-- <property>-->
<!-- <name>spark2YarnHistoryServerAddress</name>-->
<!-- <value>http://hadoop-rm2.garr-pa1.d4science.org:19888</value>-->
<!-- </property>-->
<!-- <property>-->
<!-- <name>spark2EventLogDir</name>-->
<!-- <value>/user/spark/spark2ApplicationHistory</value>-->
<!-- </property>-->
@@ -81,4 +38,47 @@
<!-- <name>spark2SqlQueryExecutionListeners</name>-->
<!-- <value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value>-->
<!-- </property>-->
<!-- GARR -->
<property>
<name>jobTracker</name>
<value>hadoop-rm3.garr-pa1.d4science.org:8032</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://hadoop-rm1.garr-pa1.d4science.org:8020</value>
</property>
<property>
<name>hive_metastore_uris</name>
<value>thrift://hadoop-edge3.garr-pa1.d4science.org:9083</value>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<value>http://hadoop-rm2.garr-pa1.d4science.org:19888</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
<property>
<name>spark2EventLogDir</name>
<value>/user/spark/spark2ApplicationHistory</value>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>"com.cloudera.spark.lineage.NavigatorAppListener"</value>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value>
</property>
</configuration>
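
Note: this hunk drops the active OCEAN endpoints and promotes the previously commented GARR block, so the workflow now targets hadoop-rm3 (YARN), hadoop-rm1 (HDFS) and hadoop-edge3 (Hive metastore) on garr-pa1.d4science.org. A minimal sanity-check sketch, assuming these properties sit in a Hadoop-style XML file such as config-default.xml on a local path (the file name and the check itself are illustrative, not part of the change):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

public class CheckClusterConfig {
    public static void main(String[] args) {
        // Load the Oozie app configuration without Hadoop's built-in defaults
        Configuration conf = new Configuration(false);
        conf.addResource(new Path("config-default.xml")); // assumed local path

        // After this change the GARR endpoints should be the active values
        System.out.println("jobTracker = " + conf.get("jobTracker"));          // hadoop-rm3.garr-pa1.d4science.org:8032
        System.out.println("nameNode   = " + conf.get("nameNode"));            // hdfs://hadoop-rm1.garr-pa1.d4science.org:8020
        System.out.println("metastore  = " + conf.get("hive_metastore_uris")); // thrift://hadoop-edge3.garr-pa1.d4science.org:9083
    }
}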

View File

@@ -17,12 +17,20 @@
</property>
</parameters>
<start to="TransformJob"/>
<start to="ResetDataset"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="ResetDataset">
<fs>
<delete path='${mainPath}/dataset/*'/>
</fs>
<ok to="TransformJob"/>
<error to="Kill"/>
</action>
<action name="StartTransaction">
<java>
<configuration>
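
The new ResetDataset entry action wipes ${mainPath}/dataset/* before handing off to TransformJob, so a rerun always starts from an empty dataset directory; on error both actions route to the shared Kill node. A rough Java equivalent of what that <fs><delete> step performs on HDFS (the class name and path handling are illustrative only):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ResetDatasetSketch {
    public static void main(String[] args) throws Exception {
        String mainPath = args[0]; // corresponds to ${mainPath} in the workflow
        FileSystem fs = FileSystem.get(new Configuration());

        // Expand the glob the same way <delete path='${mainPath}/dataset/*'/> does
        FileStatus[] matches = fs.globStatus(new Path(mainPath + "/dataset/*"));
        if (matches != null) {
            for (FileStatus status : matches) {
                fs.delete(status.getPath(), true); // recursive delete of each matched entry
            }
        }
    }
}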

View File

@@ -5,6 +5,9 @@ import java.util.List;
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import eu.dnetlib.dhp.rdfconverter.utils.CustomPropertyValueDeserializer;
@JsonIgnoreProperties(ignoreUnknown = true)
public class BioSchemaProtein {
@@ -54,10 +57,11 @@ public class BioSchemaProtein {
@JsonProperty("https://schema.org/sameAs")
private List<Link> sameAs;
@JsonProperty("https://schema.org/hasSequenceAnnotation")
private Link hasSequenceAnnotation;
private List<Link> hasSequenceAnnotation;
@JsonProperty("https://schema.org/additionalProperty")
private List<Link> sequenceAnnotation;
@JsonProperty("https://schema.org/value")
@JsonDeserialize(using = CustomPropertyValueDeserializer.class)
private Link propertyValue;
@JsonProperty("https://schema.org/termCode")
private String termCode;
@@ -198,11 +202,11 @@ public class BioSchemaProtein {
this.mainEntityOfPage = mainEntityOfPage;
}
public Link getHasSequenceAnnotation() {
public List<Link> getHasSequenceAnnotation() {
return hasSequenceAnnotation;
}
public void setHasSequenceAnnotation(Link hasSequenceAnnotation) {
public void setHasSequenceAnnotation(List<Link> hasSequenceAnnotation) {
this.hasSequenceAnnotation = hasSequenceAnnotation;
}
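
hasSequenceAnnotation changes from a single Link to a List<Link>, since a protein record may carry several sequence annotations; together with ACCEPT_SINGLE_VALUE_AS_ARRAY (enabled in RDFConverter below) Jackson then accepts either one object or an array for the property. A self-contained sketch of that behaviour using stand-in classes (Protein and Annotation here are hypothetical, not the project's model):

import java.util.List;

import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;

public class SingleValueAsArrayDemo {

    // Stand-in for BioSchemaProtein: only the field relevant to this change
    public static class Protein {
        @JsonProperty("https://schema.org/hasSequenceAnnotation")
        public List<Annotation> hasSequenceAnnotation;
    }

    public static class Annotation {
        @JsonProperty("@id")
        public String id;
    }

    public static void main(String[] args) throws Exception {
        ObjectMapper mapper = new ObjectMapper();
        mapper.enable(DeserializationFeature.ACCEPT_SINGLE_VALUE_AS_ARRAY);

        // A single object is wrapped into a one-element list...
        String single = "{\"https://schema.org/hasSequenceAnnotation\": {\"@id\": \"#ann1\"}}";
        // ...and an array maps onto the list as-is.
        String many = "{\"https://schema.org/hasSequenceAnnotation\": [{\"@id\": \"#ann1\"}, {\"@id\": \"#ann2\"}]}";

        System.out.println(mapper.readValue(single, Protein.class).hasSequenceAnnotation.size()); // 1
        System.out.println(mapper.readValue(many, Protein.class).hasSequenceAnnotation.size());   // 2
    }
}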

View File

@@ -0,0 +1,43 @@
package eu.dnetlib.dhp.rdfconverter.utils;
import java.io.IOException;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.ObjectCodec;
import com.fasterxml.jackson.databind.DeserializationContext;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.deser.std.StdDeserializer;
import eu.dnetlib.dhp.rdfconverter.bioschema.model.BioSchemaProtein;
public class CustomPropertyValueDeserializer
extends StdDeserializer<BioSchemaProtein.Link> {
public CustomPropertyValueDeserializer() {
this(null);
}
public CustomPropertyValueDeserializer(Class<?> vc) {
super(vc);
}
@Override
public BioSchemaProtein.Link deserialize(
JsonParser jsonparser, DeserializationContext context)
throws IOException {
ObjectCodec oc = jsonparser.getCodec();
JsonNode node = oc.readTree(jsonparser);
JsonNode id = node.get("@id");
if (id != null) {
BioSchemaProtein.Link link = new BioSchemaProtein.Link();
link.setId(id.asText());
return link;
} else {
String txt = node.asText();
BioSchemaProtein.Link link = new BioSchemaProtein.Link();
link.setId(txt);
return link;
}
}
}
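
The new deserializer accepts https://schema.org/value either as a node carrying an @id or as a bare literal, mapping both onto a Link. In BioSchemaProtein it is attached with @JsonDeserialize(using = ...); the snippet below is only an illustrative way to exercise it in isolation through a SimpleModule, and it assumes Link exposes a getId() accessor matching the setId(...) used above:

import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.module.SimpleModule;

import eu.dnetlib.dhp.rdfconverter.bioschema.model.BioSchemaProtein;
import eu.dnetlib.dhp.rdfconverter.utils.CustomPropertyValueDeserializer;

public class PropertyValueDeserializerDemo {
    public static void main(String[] args) throws Exception {
        // Register the deserializer explicitly instead of via the field annotation
        SimpleModule module = new SimpleModule();
        module.addDeserializer(BioSchemaProtein.Link.class, new CustomPropertyValueDeserializer());
        ObjectMapper mapper = new ObjectMapper().registerModule(module);

        // Node form: the @id is copied into the Link
        BioSchemaProtein.Link fromNode = mapper.readValue(
            "{\"@id\": \"https://example.org/protein#annotation\"}", BioSchemaProtein.Link.class);

        // Literal form: the raw text ends up in the same field
        BioSchemaProtein.Link fromLiteral = mapper.readValue(
            "\"some-literal-value\"", BioSchemaProtein.Link.class);

        System.out.println(fromNode.getId() + " / " + fromLiteral.getId()); // getId() assumed
    }
}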

View File

@@ -16,6 +16,7 @@ import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.module.SimpleModule;
import com.github.jsonldjava.core.JsonLdOptions;
import com.github.jsonldjava.core.JsonLdProcessor;
import com.github.jsonldjava.utils.JsonUtils;
@@ -50,6 +51,8 @@ public class RDFConverter {
objectMapper.enable(DeserializationFeature.ACCEPT_SINGLE_VALUE_AS_ARRAY);
objectMapper.enable(DeserializationFeature.ACCEPT_EMPTY_STRING_AS_NULL_OBJECT);
objectMapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
objectMapper.configure(DeserializationFeature.FAIL_ON_INVALID_SUBTYPE, false);
objectMapper.configure(DeserializationFeature.FAIL_ON_MISSING_CREATOR_PROPERTIES, false);
BioSchemaProtein bioSchemaProtein = objectMapper.readValue(compactContent, BioSchemaProtein.class);
BioSchemaProtein.DateTimeType retrievedOnType = bioSchemaProtein.getRetrievedOn();
@@ -170,6 +173,8 @@
log.error("Identifier not found", e.getMessage());
}
mainTitles.add(" " + entry.getName() + " ");
dataciteProtein.setId(proteinId);
bioschemaProteins.put(entry.getId(), entry);
dataciteProteins.add(dataciteProtein);
@@ -208,8 +213,22 @@
.stream()
.filter(bioProtein -> Objects.nonNull(bioProtein.getValue().getHasSequenceAnnotation()))
.forEach(bioProtein -> {
sequenceAnnotations
.get(bioProtein.getValue().getHasSequenceAnnotation().getId())
List<String> seqAnnIds = bioProtein
.getValue()
.getHasSequenceAnnotation()
.stream()
.map(s -> s.getId())
.collect(Collectors.toList());
List<BioSchemaProtein.Link> propertyIds = sequenceAnnotations
.entrySet()
.stream()
.filter(s -> seqAnnIds.contains(s.getKey()))
.flatMap(s -> {
return s.getValue().stream();
})
.collect(Collectors.toList());
propertyIds
.stream()
.map(propertyId -> propertyValues.get(propertyId.getId()))
.filter(term -> Objects.nonNull(term))
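
With hasSequenceAnnotation now a list, the resolution becomes: collect every annotation id referenced by the protein, keep the matching sequenceAnnotations entries, flatten their property Links, and look each one up in propertyValues, dropping misses. A stripped-down sketch of that stream chain over plain string maps (the data and simplified types are made up; only the shape of the pipeline mirrors the diff):

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.stream.Collectors;

public class AnnotationLookupSketch {
    public static void main(String[] args) {
        // annotation id -> property ids found on that annotation (stand-in for sequenceAnnotations)
        Map<String, List<String>> sequenceAnnotations = new HashMap<>();
        sequenceAnnotations.put("#ann1", Arrays.asList("#prop1"));
        sequenceAnnotations.put("#ann2", Arrays.asList("#prop2", "#prop3"));

        // property id -> resolved term (stand-in for propertyValues)
        Map<String, String> propertyValues = new HashMap<>();
        propertyValues.put("#prop1", "term-A");
        propertyValues.put("#prop3", "term-B"); // #prop2 intentionally unresolved

        // the protein now references several annotations at once
        List<String> seqAnnIds = Arrays.asList("#ann1", "#ann2");

        List<String> terms = sequenceAnnotations
            .entrySet()
            .stream()
            .filter(e -> seqAnnIds.contains(e.getKey())) // keep the annotations this protein points at
            .flatMap(e -> e.getValue().stream())         // collect all their property ids
            .map(propertyValues::get)                    // resolve each id to its term
            .filter(Objects::nonNull)                    // drop ids with no resolved value
            .collect(Collectors.toList());

        System.out.println(terms); // e.g. [term-A, term-B]
    }
}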

View File

@@ -15,17 +15,16 @@ public class ConverterTest {
static Logger logger = LoggerFactory.getLogger(ConverterTest.class);
// @Test
// private void nqToDataciteTest() throws Exception {
// InputStream is = ConverterTest.class.getResourceAsStream("/eu/dnetlib/dhp/rdfconverter/bioschema/disprot.nq");
// String nq = IOUtils.toString(is);
// logger.debug("NQ: " + nq);
// RDFConverter converter = new RDFConverter();
// ArrayList<String> results = converter.nQuadsFile2DataciteJson(nq, "Protein");
// if (results != null && !results.isEmpty()) {
// logger.info("JSON DATACITE: " + results.get(0));
// }
// }
@Test
public void disprotToDataciteTest() throws Exception {
InputStream is = ConverterTest.class.getResourceAsStream("/eu/dnetlib/dhp/rdfconverter/bioschema/disprot.nq");
String nq = IOUtils.toString(is);
RDFConverter converter = new RDFConverter();
ArrayList<String> results = converter.nQuadsFile2DataciteJson(nq, "Protein");
results.stream().forEach(r -> {
logger.info("JSON DATACITE >> " + r);
});
}
@Test
public void pedCitationTest() throws Exception {