forked from D-Net/dnet-hadoop
Compare commits
2 Commits
f11dfc51f7
...
afb46d71f7
Author | SHA1 | Date |
---|---|---|
Enrico Ottonello | afb46d71f7 | |
Enrico Ottonello | 98178b3165 |
|
@ -2,64 +2,13 @@
|
||||||
|
|
||||||
<!-- OCEAN -->
|
<!-- OCEAN -->
|
||||||
|
|
||||||
<property>
|
|
||||||
<name>jobTracker</name>
|
|
||||||
<value>yarnRM</value>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>nameNode</name>
|
|
||||||
<value>hdfs://nameservice1</value>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>oozie.use.system.libpath</name>
|
|
||||||
<value>true</value>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>oozie.action.sharelib.for.spark</name>
|
|
||||||
<value>spark2</value>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
|
||||||
<value>true</value>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>spark2YarnHistoryServerAddress</name>
|
|
||||||
<value>http://hadoop-rm2.garr-pa1.d4science.org:19888</value>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>spark2EventLogDir</name>
|
|
||||||
<value>/user/spark/spark2ApplicationHistory</value>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>spark2ExtraListeners</name>
|
|
||||||
<value>"com.cloudera.spark.lineage.NavigatorAppListener"</value>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>spark2SqlQueryExecutionListeners</name>
|
|
||||||
<value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value>
|
|
||||||
</property>
|
|
||||||
|
|
||||||
<!-- GARR -->
|
|
||||||
|
|
||||||
<!-- <property>-->
|
<!-- <property>-->
|
||||||
<!-- <name>jobTracker</name>-->
|
<!-- <name>jobTracker</name>-->
|
||||||
<!-- <value>hadoop-rm3.garr-pa1.d4science.org:8032</value>-->
|
<!-- <value>yarnRM</value>-->
|
||||||
<!-- </property>-->
|
<!-- </property>-->
|
||||||
<!-- <property>-->
|
<!-- <property>-->
|
||||||
<!-- <name>nameNode</name>-->
|
<!-- <name>nameNode</name>-->
|
||||||
<!-- <value>hdfs://hadoop-rm1.garr-pa1.d4science.org:8020</value>-->
|
<!-- <value>hdfs://nameservice1</value>-->
|
||||||
<!-- </property>-->
|
|
||||||
<!-- <property>-->
|
|
||||||
<!-- <name>hive_metastore_uris</name>-->
|
|
||||||
<!-- <value>thrift://hadoop-edge3.garr-pa1.d4science.org:9083</value>-->
|
|
||||||
<!-- </property>-->
|
|
||||||
<!-- <property>-->
|
|
||||||
<!-- <name>spark2YarnHistoryServerAddress</name>-->
|
|
||||||
<!-- <value>http://hadoop-rm2.garr-pa1.d4science.org:19888</value>-->
|
|
||||||
<!-- </property>-->
|
|
||||||
<!-- <property>-->
|
|
||||||
<!-- <name>oozie.launcher.mapreduce.user.classpath.first</name>-->
|
|
||||||
<!-- <value>true</value>-->
|
|
||||||
<!-- </property>-->
|
<!-- </property>-->
|
||||||
<!-- <property>-->
|
<!-- <property>-->
|
||||||
<!-- <name>oozie.use.system.libpath</name>-->
|
<!-- <name>oozie.use.system.libpath</name>-->
|
||||||
|
@ -70,6 +19,14 @@
|
||||||
<!-- <value>spark2</value>-->
|
<!-- <value>spark2</value>-->
|
||||||
<!-- </property>-->
|
<!-- </property>-->
|
||||||
<!-- <property>-->
|
<!-- <property>-->
|
||||||
|
<!-- <name>oozie.launcher.mapreduce.user.classpath.first</name>-->
|
||||||
|
<!-- <value>true</value>-->
|
||||||
|
<!-- </property>-->
|
||||||
|
<!-- <property>-->
|
||||||
|
<!-- <name>spark2YarnHistoryServerAddress</name>-->
|
||||||
|
<!-- <value>http://hadoop-rm2.garr-pa1.d4science.org:19888</value>-->
|
||||||
|
<!-- </property>-->
|
||||||
|
<!-- <property>-->
|
||||||
<!-- <name>spark2EventLogDir</name>-->
|
<!-- <name>spark2EventLogDir</name>-->
|
||||||
<!-- <value>/user/spark/spark2ApplicationHistory</value>-->
|
<!-- <value>/user/spark/spark2ApplicationHistory</value>-->
|
||||||
<!-- </property>-->
|
<!-- </property>-->
|
||||||
|
@ -81,4 +38,47 @@
|
||||||
<!-- <name>spark2SqlQueryExecutionListeners</name>-->
|
<!-- <name>spark2SqlQueryExecutionListeners</name>-->
|
||||||
<!-- <value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value>-->
|
<!-- <value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value>-->
|
||||||
<!-- </property>-->
|
<!-- </property>-->
|
||||||
|
|
||||||
|
<!-- GARR -->
|
||||||
|
|
||||||
|
<property>
|
||||||
|
<name>jobTracker</name>
|
||||||
|
<value>hadoop-rm3.garr-pa1.d4science.org:8032</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>nameNode</name>
|
||||||
|
<value>hdfs://hadoop-rm1.garr-pa1.d4science.org:8020</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>hive_metastore_uris</name>
|
||||||
|
<value>thrift://hadoop-edge3.garr-pa1.d4science.org:9083</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>spark2YarnHistoryServerAddress</name>
|
||||||
|
<value>http://hadoop-rm2.garr-pa1.d4science.org:19888</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||||
|
<value>true</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.use.system.libpath</name>
|
||||||
|
<value>true</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.action.sharelib.for.spark</name>
|
||||||
|
<value>spark2</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>spark2EventLogDir</name>
|
||||||
|
<value>/user/spark/spark2ApplicationHistory</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>spark2ExtraListeners</name>
|
||||||
|
<value>"com.cloudera.spark.lineage.NavigatorAppListener"</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>spark2SqlQueryExecutionListeners</name>
|
||||||
|
<value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value>
|
||||||
|
</property>
|
||||||
</configuration>
|
</configuration>
|
|
@ -17,12 +17,20 @@
|
||||||
</property>
|
</property>
|
||||||
</parameters>
|
</parameters>
|
||||||
|
|
||||||
<start to="TransformJob"/>
|
<start to="ResetDataset"/>
|
||||||
|
|
||||||
<kill name="Kill">
|
<kill name="Kill">
|
||||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
</kill>
|
</kill>
|
||||||
|
|
||||||
|
<action name="ResetDataset">
|
||||||
|
<fs>
|
||||||
|
<delete path='${mainPath}/dataset/*'/>
|
||||||
|
</fs>
|
||||||
|
<ok to="TransformJob"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
<action name="StartTransaction">
|
<action name="StartTransaction">
|
||||||
<java>
|
<java>
|
||||||
<configuration>
|
<configuration>
|
||||||
|
|
|
@ -5,6 +5,9 @@ import java.util.List;
|
||||||
|
|
||||||
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
|
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
|
||||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||||
|
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.rdfconverter.utils.CustomPropertyValueDeserializer;
|
||||||
|
|
||||||
@JsonIgnoreProperties(ignoreUnknown = true)
|
@JsonIgnoreProperties(ignoreUnknown = true)
|
||||||
public class BioSchemaProtein {
|
public class BioSchemaProtein {
|
||||||
|
@ -54,10 +57,11 @@ public class BioSchemaProtein {
|
||||||
@JsonProperty("https://schema.org/sameAs")
|
@JsonProperty("https://schema.org/sameAs")
|
||||||
private List<Link> sameAs;
|
private List<Link> sameAs;
|
||||||
@JsonProperty("https://schema.org/hasSequenceAnnotation")
|
@JsonProperty("https://schema.org/hasSequenceAnnotation")
|
||||||
private Link hasSequenceAnnotation;
|
private List<Link> hasSequenceAnnotation;
|
||||||
@JsonProperty("https://schema.org/additionalProperty")
|
@JsonProperty("https://schema.org/additionalProperty")
|
||||||
private List<Link> sequenceAnnotation;
|
private List<Link> sequenceAnnotation;
|
||||||
@JsonProperty("https://schema.org/value")
|
@JsonProperty("https://schema.org/value")
|
||||||
|
@JsonDeserialize(using = CustomPropertyValueDeserializer.class)
|
||||||
private Link propertyValue;
|
private Link propertyValue;
|
||||||
@JsonProperty("https://schema.org/termCode")
|
@JsonProperty("https://schema.org/termCode")
|
||||||
private String termCode;
|
private String termCode;
|
||||||
|
@ -198,11 +202,11 @@ public class BioSchemaProtein {
|
||||||
this.mainEntityOfPage = mainEntityOfPage;
|
this.mainEntityOfPage = mainEntityOfPage;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Link getHasSequenceAnnotation() {
|
public List<Link> getHasSequenceAnnotation() {
|
||||||
return hasSequenceAnnotation;
|
return hasSequenceAnnotation;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setHasSequenceAnnotation(Link hasSequenceAnnotation) {
|
public void setHasSequenceAnnotation(List<Link> hasSequenceAnnotation) {
|
||||||
this.hasSequenceAnnotation = hasSequenceAnnotation;
|
this.hasSequenceAnnotation = hasSequenceAnnotation;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,43 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.rdfconverter.utils;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.core.JsonParser;
|
||||||
|
import com.fasterxml.jackson.core.ObjectCodec;
|
||||||
|
import com.fasterxml.jackson.databind.DeserializationContext;
|
||||||
|
import com.fasterxml.jackson.databind.JsonNode;
|
||||||
|
import com.fasterxml.jackson.databind.deser.std.StdDeserializer;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.rdfconverter.bioschema.model.BioSchemaProtein;
|
||||||
|
|
||||||
|
public class CustomPropertyValueDeserializer
|
||||||
|
extends StdDeserializer<BioSchemaProtein.Link> {
|
||||||
|
|
||||||
|
public CustomPropertyValueDeserializer() {
|
||||||
|
this(null);
|
||||||
|
}
|
||||||
|
|
||||||
|
public CustomPropertyValueDeserializer(Class<?> vc) {
|
||||||
|
super(vc);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public BioSchemaProtein.Link deserialize(
|
||||||
|
JsonParser jsonparser, DeserializationContext context)
|
||||||
|
throws IOException {
|
||||||
|
ObjectCodec oc = jsonparser.getCodec();
|
||||||
|
JsonNode node = oc.readTree(jsonparser);
|
||||||
|
JsonNode id = node.get("@id");
|
||||||
|
if (id != null) {
|
||||||
|
BioSchemaProtein.Link link = new BioSchemaProtein.Link();
|
||||||
|
link.setId(id.asText());
|
||||||
|
return link;
|
||||||
|
} else {
|
||||||
|
String txt = node.asText();
|
||||||
|
BioSchemaProtein.Link link = new BioSchemaProtein.Link();
|
||||||
|
link.setId(txt);
|
||||||
|
return link;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -16,6 +16,7 @@ import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.DeserializationFeature;
|
import com.fasterxml.jackson.databind.DeserializationFeature;
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
import com.fasterxml.jackson.databind.module.SimpleModule;
|
||||||
import com.github.jsonldjava.core.JsonLdOptions;
|
import com.github.jsonldjava.core.JsonLdOptions;
|
||||||
import com.github.jsonldjava.core.JsonLdProcessor;
|
import com.github.jsonldjava.core.JsonLdProcessor;
|
||||||
import com.github.jsonldjava.utils.JsonUtils;
|
import com.github.jsonldjava.utils.JsonUtils;
|
||||||
|
@ -50,6 +51,8 @@ public class RDFConverter {
|
||||||
objectMapper.enable(DeserializationFeature.ACCEPT_SINGLE_VALUE_AS_ARRAY);
|
objectMapper.enable(DeserializationFeature.ACCEPT_SINGLE_VALUE_AS_ARRAY);
|
||||||
objectMapper.enable(DeserializationFeature.ACCEPT_EMPTY_STRING_AS_NULL_OBJECT);
|
objectMapper.enable(DeserializationFeature.ACCEPT_EMPTY_STRING_AS_NULL_OBJECT);
|
||||||
objectMapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
objectMapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
||||||
|
objectMapper.configure(DeserializationFeature.FAIL_ON_INVALID_SUBTYPE, false);
|
||||||
|
objectMapper.configure(DeserializationFeature.FAIL_ON_MISSING_CREATOR_PROPERTIES, false);
|
||||||
BioSchemaProtein bioSchemaProtein = objectMapper.readValue(compactContent, BioSchemaProtein.class);
|
BioSchemaProtein bioSchemaProtein = objectMapper.readValue(compactContent, BioSchemaProtein.class);
|
||||||
BioSchemaProtein.DateTimeType retrievedOnType = bioSchemaProtein.getRetrievedOn();
|
BioSchemaProtein.DateTimeType retrievedOnType = bioSchemaProtein.getRetrievedOn();
|
||||||
|
|
||||||
|
@ -170,6 +173,8 @@ public class RDFConverter {
|
||||||
log.error("Identifier not found", e.getMessage());
|
log.error("Identifier not found", e.getMessage());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
mainTitles.add(" " + entry.getName() + " ");
|
||||||
|
|
||||||
dataciteProtein.setId(proteinId);
|
dataciteProtein.setId(proteinId);
|
||||||
bioschemaProteins.put(entry.getId(), entry);
|
bioschemaProteins.put(entry.getId(), entry);
|
||||||
dataciteProteins.add(dataciteProtein);
|
dataciteProteins.add(dataciteProtein);
|
||||||
|
@ -208,8 +213,22 @@ public class RDFConverter {
|
||||||
.stream()
|
.stream()
|
||||||
.filter(bioProtein -> Objects.nonNull(bioProtein.getValue().getHasSequenceAnnotation()))
|
.filter(bioProtein -> Objects.nonNull(bioProtein.getValue().getHasSequenceAnnotation()))
|
||||||
.forEach(bioProtein -> {
|
.forEach(bioProtein -> {
|
||||||
sequenceAnnotations
|
List<String> seqAnnIds = bioProtein
|
||||||
.get(bioProtein.getValue().getHasSequenceAnnotation().getId())
|
.getValue()
|
||||||
|
.getHasSequenceAnnotation()
|
||||||
|
.stream()
|
||||||
|
.map(s -> s.getId())
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
List<BioSchemaProtein.Link> propertyIds = sequenceAnnotations
|
||||||
|
.entrySet()
|
||||||
|
.stream()
|
||||||
|
.filter(s -> seqAnnIds.contains(s.getKey()))
|
||||||
|
.flatMap(s -> {
|
||||||
|
return s.getValue().stream();
|
||||||
|
})
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
|
||||||
|
propertyIds
|
||||||
.stream()
|
.stream()
|
||||||
.map(propertyId -> propertyValues.get(propertyId.getId()))
|
.map(propertyId -> propertyValues.get(propertyId.getId()))
|
||||||
.filter(term -> Objects.nonNull(term))
|
.filter(term -> Objects.nonNull(term))
|
||||||
|
|
|
@ -15,17 +15,16 @@ public class ConverterTest {
|
||||||
|
|
||||||
static Logger logger = LoggerFactory.getLogger(ConverterTest.class);
|
static Logger logger = LoggerFactory.getLogger(ConverterTest.class);
|
||||||
|
|
||||||
// @Test
|
@Test
|
||||||
// private void nqToDataciteTest() throws Exception {
|
public void disprotToDataciteTest() throws Exception {
|
||||||
// InputStream is = ConverterTest.class.getResourceAsStream("/eu/dnetlib/dhp/rdfconverter/bioschema/disprot.nq");
|
InputStream is = ConverterTest.class.getResourceAsStream("/eu/dnetlib/dhp/rdfconverter/bioschema/disprot.nq");
|
||||||
// String nq = IOUtils.toString(is);
|
String nq = IOUtils.toString(is);
|
||||||
// logger.debug("NQ: " + nq);
|
RDFConverter converter = new RDFConverter();
|
||||||
// RDFConverter converter = new RDFConverter();
|
ArrayList<String> results = converter.nQuadsFile2DataciteJson(nq, "Protein");
|
||||||
// ArrayList<String> results = converter.nQuadsFile2DataciteJson(nq, "Protein");
|
results.stream().forEach(r -> {
|
||||||
// if (results != null && !results.isEmpty()) {
|
logger.info("JSON DATACITE >> " + r);
|
||||||
// logger.info("JSON DATACITE: " + results.get(0));
|
});
|
||||||
// }
|
}
|
||||||
// }
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void pedCitationTest() throws Exception {
|
public void pedCitationTest() throws Exception {
|
||||||
|
|
Loading…
Reference in New Issue