Compare commits

No commits in common. "afb46d71f7128e02a0a50e759c3699f74ccb1180" and "f11dfc51f7056d076f8bf3c7b7f2fcb1ce54c1c1" have entirely different histories.

6 changed files with 70 additions and 143 deletions

View File

@@ -2,64 +2,13 @@
 <!-- OCEAN -->
-<!-- <property>-->
-<!-- <name>jobTracker</name>-->
-<!-- <value>yarnRM</value>-->
-<!-- </property>-->
-<!-- <property>-->
-<!-- <name>nameNode</name>-->
-<!-- <value>hdfs://nameservice1</value>-->
-<!-- </property>-->
-<!-- <property>-->
-<!-- <name>oozie.use.system.libpath</name>-->
-<!-- <value>true</value>-->
-<!-- </property>-->
-<!-- <property>-->
-<!-- <name>oozie.action.sharelib.for.spark</name>-->
-<!-- <value>spark2</value>-->
-<!-- </property>-->
-<!-- <property>-->
-<!-- <name>oozie.launcher.mapreduce.user.classpath.first</name>-->
-<!-- <value>true</value>-->
-<!-- </property>-->
-<!-- <property>-->
-<!-- <name>spark2YarnHistoryServerAddress</name>-->
-<!-- <value>http://hadoop-rm2.garr-pa1.d4science.org:19888</value>-->
-<!-- </property>-->
-<!-- <property>-->
-<!-- <name>spark2EventLogDir</name>-->
-<!-- <value>/user/spark/spark2ApplicationHistory</value>-->
-<!-- </property>-->
-<!-- <property>-->
-<!-- <name>spark2ExtraListeners</name>-->
-<!-- <value>"com.cloudera.spark.lineage.NavigatorAppListener"</value>-->
-<!-- </property>-->
-<!-- <property>-->
-<!-- <name>spark2SqlQueryExecutionListeners</name>-->
-<!-- <value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value>-->
-<!-- </property>-->
-<!-- GARR -->
 <property>
 <name>jobTracker</name>
-<value>hadoop-rm3.garr-pa1.d4science.org:8032</value>
+<value>yarnRM</value>
 </property>
 <property>
 <name>nameNode</name>
-<value>hdfs://hadoop-rm1.garr-pa1.d4science.org:8020</value>
-</property>
-<property>
-<name>hive_metastore_uris</name>
-<value>thrift://hadoop-edge3.garr-pa1.d4science.org:9083</value>
-</property>
-<property>
-<name>spark2YarnHistoryServerAddress</name>
-<value>http://hadoop-rm2.garr-pa1.d4science.org:19888</value>
-</property>
-<property>
-<name>oozie.launcher.mapreduce.user.classpath.first</name>
-<value>true</value>
+<value>hdfs://nameservice1</value>
 </property>
 <property>
 <name>oozie.use.system.libpath</name>
@@ -69,6 +18,14 @@
 <name>oozie.action.sharelib.for.spark</name>
 <value>spark2</value>
 </property>
+<property>
+<name>oozie.launcher.mapreduce.user.classpath.first</name>
+<value>true</value>
+</property>
+<property>
+<name>spark2YarnHistoryServerAddress</name>
+<value>http://hadoop-rm2.garr-pa1.d4science.org:19888</value>
+</property>
 <property>
 <name>spark2EventLogDir</name>
 <value>/user/spark/spark2ApplicationHistory</value>
@@ -81,4 +38,47 @@
 <name>spark2SqlQueryExecutionListeners</name>
 <value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value>
 </property>
+<!-- GARR -->
+<!-- <property>-->
+<!-- <name>jobTracker</name>-->
+<!-- <value>hadoop-rm3.garr-pa1.d4science.org:8032</value>-->
+<!-- </property>-->
+<!-- <property>-->
+<!-- <name>nameNode</name>-->
+<!-- <value>hdfs://hadoop-rm1.garr-pa1.d4science.org:8020</value>-->
+<!-- </property>-->
+<!-- <property>-->
+<!-- <name>hive_metastore_uris</name>-->
+<!-- <value>thrift://hadoop-edge3.garr-pa1.d4science.org:9083</value>-->
+<!-- </property>-->
+<!-- <property>-->
+<!-- <name>spark2YarnHistoryServerAddress</name>-->
+<!-- <value>http://hadoop-rm2.garr-pa1.d4science.org:19888</value>-->
+<!-- </property>-->
+<!-- <property>-->
+<!-- <name>oozie.launcher.mapreduce.user.classpath.first</name>-->
+<!-- <value>true</value>-->
+<!-- </property>-->
+<!-- <property>-->
+<!-- <name>oozie.use.system.libpath</name>-->
+<!-- <value>true</value>-->
+<!-- </property>-->
+<!-- <property>-->
+<!-- <name>oozie.action.sharelib.for.spark</name>-->
+<!-- <value>spark2</value>-->
+<!-- </property>-->
+<!-- <property>-->
+<!-- <name>spark2EventLogDir</name>-->
+<!-- <value>/user/spark/spark2ApplicationHistory</value>-->
+<!-- </property>-->
+<!-- <property>-->
+<!-- <name>spark2ExtraListeners</name>-->
+<!-- <value>"com.cloudera.spark.lineage.NavigatorAppListener"</value>-->
+<!-- </property>-->
+<!-- <property>-->
+<!-- <name>spark2SqlQueryExecutionListeners</name>-->
+<!-- <value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value>-->
+<!-- </property>-->
 </configuration>

View File

@@ -17,20 +17,12 @@
 </property>
 </parameters>
-<start to="ResetDataset"/>
+<start to="TransformJob"/>
 <kill name="Kill">
 <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
 </kill>
-<action name="ResetDataset">
-<fs>
-<delete path='${mainPath}/dataset/*'/>
-</fs>
-<ok to="TransformJob"/>
-<error to="Kill"/>
-</action>
 <action name="StartTransaction">
 <java>
 <configuration>

View File

@@ -5,9 +5,6 @@ import java.util.List;
 import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
 import com.fasterxml.jackson.annotation.JsonProperty;
-import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
-import eu.dnetlib.dhp.rdfconverter.utils.CustomPropertyValueDeserializer;
 @JsonIgnoreProperties(ignoreUnknown = true)
 public class BioSchemaProtein {
@@ -57,11 +54,10 @@ public class BioSchemaProtein {
 @JsonProperty("https://schema.org/sameAs")
 private List<Link> sameAs;
 @JsonProperty("https://schema.org/hasSequenceAnnotation")
-private List<Link> hasSequenceAnnotation;
+private Link hasSequenceAnnotation;
 @JsonProperty("https://schema.org/additionalProperty")
 private List<Link> sequenceAnnotation;
 @JsonProperty("https://schema.org/value")
-@JsonDeserialize(using = CustomPropertyValueDeserializer.class)
 private Link propertyValue;
 @JsonProperty("https://schema.org/termCode")
 private String termCode;
@@ -202,11 +198,11 @@ public class BioSchemaProtein {
 this.mainEntityOfPage = mainEntityOfPage;
 }
-public List<Link> getHasSequenceAnnotation() {
+public Link getHasSequenceAnnotation() {
 return hasSequenceAnnotation;
 }
-public void setHasSequenceAnnotation(List<Link> hasSequenceAnnotation) {
+public void setHasSequenceAnnotation(Link hasSequenceAnnotation) {
 this.hasSequenceAnnotation = hasSequenceAnnotation;
 }

View File

@@ -1,43 +0,0 @@
-package eu.dnetlib.dhp.rdfconverter.utils;
-import java.io.IOException;
-import com.fasterxml.jackson.core.JsonParser;
-import com.fasterxml.jackson.core.ObjectCodec;
-import com.fasterxml.jackson.databind.DeserializationContext;
-import com.fasterxml.jackson.databind.JsonNode;
-import com.fasterxml.jackson.databind.deser.std.StdDeserializer;
-import eu.dnetlib.dhp.rdfconverter.bioschema.model.BioSchemaProtein;
-public class CustomPropertyValueDeserializer
-extends StdDeserializer<BioSchemaProtein.Link> {
-public CustomPropertyValueDeserializer() {
-this(null);
-}
-public CustomPropertyValueDeserializer(Class<?> vc) {
-super(vc);
-}
-@Override
-public BioSchemaProtein.Link deserialize(
-JsonParser jsonparser, DeserializationContext context)
-throws IOException {
-ObjectCodec oc = jsonparser.getCodec();
-JsonNode node = oc.readTree(jsonparser);
-JsonNode id = node.get("@id");
-if (id != null) {
-BioSchemaProtein.Link link = new BioSchemaProtein.Link();
-link.setId(id.asText());
-return link;
-} else {
-String txt = node.asText();
-BioSchemaProtein.Link link = new BioSchemaProtein.Link();
-link.setId(txt);
-return link;
-}
-}
-}

View File

@@ -16,7 +16,6 @@ import org.slf4j.LoggerFactory;
 import com.fasterxml.jackson.databind.DeserializationFeature;
 import com.fasterxml.jackson.databind.ObjectMapper;
-import com.fasterxml.jackson.databind.module.SimpleModule;
 import com.github.jsonldjava.core.JsonLdOptions;
 import com.github.jsonldjava.core.JsonLdProcessor;
 import com.github.jsonldjava.utils.JsonUtils;
@@ -51,8 +50,6 @@ public class RDFConverter {
 objectMapper.enable(DeserializationFeature.ACCEPT_SINGLE_VALUE_AS_ARRAY);
 objectMapper.enable(DeserializationFeature.ACCEPT_EMPTY_STRING_AS_NULL_OBJECT);
 objectMapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
-objectMapper.configure(DeserializationFeature.FAIL_ON_INVALID_SUBTYPE, false);
-objectMapper.configure(DeserializationFeature.FAIL_ON_MISSING_CREATOR_PROPERTIES, false);
 BioSchemaProtein bioSchemaProtein = objectMapper.readValue(compactContent, BioSchemaProtein.class);
 BioSchemaProtein.DateTimeType retrievedOnType = bioSchemaProtein.getRetrievedOn();
@@ -173,8 +170,6 @@ public class RDFConverter {
 log.error("Identifier not found", e.getMessage());
 }
-mainTitles.add(" " + entry.getName() + " ");
 dataciteProtein.setId(proteinId);
 bioschemaProteins.put(entry.getId(), entry);
 dataciteProteins.add(dataciteProtein);
@@ -213,22 +208,8 @@ public class RDFConverter {
 .stream()
 .filter(bioProtein -> Objects.nonNull(bioProtein.getValue().getHasSequenceAnnotation()))
 .forEach(bioProtein -> {
-List<String> seqAnnIds = bioProtein
-.getValue()
-.getHasSequenceAnnotation()
-.stream()
-.map(s -> s.getId())
-.collect(Collectors.toList());
-List<BioSchemaProtein.Link> propertyIds = sequenceAnnotations
-.entrySet()
-.stream()
-.filter(s -> seqAnnIds.contains(s.getKey()))
-.flatMap(s -> {
-return s.getValue().stream();
-})
-.collect(Collectors.toList());
-propertyIds
+sequenceAnnotations
+.get(bioProtein.getValue().getHasSequenceAnnotation().getId())
 .stream()
 .map(propertyId -> propertyValues.get(propertyId.getId()))
 .filter(term -> Objects.nonNull(term))

View File

@@ -15,16 +15,17 @@ public class ConverterTest {
 static Logger logger = LoggerFactory.getLogger(ConverterTest.class);
-@Test
-public void disprotToDataciteTest() throws Exception {
-InputStream is = ConverterTest.class.getResourceAsStream("/eu/dnetlib/dhp/rdfconverter/bioschema/disprot.nq");
-String nq = IOUtils.toString(is);
-RDFConverter converter = new RDFConverter();
-ArrayList<String> results = converter.nQuadsFile2DataciteJson(nq, "Protein");
-results.stream().forEach(r -> {
-logger.info("JSON DATACITE >> " + r);
-});
-}
+// @Test
+// private void nqToDataciteTest() throws Exception {
+// InputStream is = ConverterTest.class.getResourceAsStream("/eu/dnetlib/dhp/rdfconverter/bioschema/disprot.nq");
+// String nq = IOUtils.toString(is);
+// logger.debug("NQ: " + nq);
+// RDFConverter converter = new RDFConverter();
+// ArrayList<String> results = converter.nQuadsFile2DataciteJson(nq, "Protein");
+// if (results != null && !results.isEmpty()) {
+// logger.info("JSON DATACITE: " + results.get(0));
+// }
+// }
 @Test
 public void pedCitationTest() throws Exception {