forked from D-Net/dnet-hadoop
Compare commits
No commits in common. "afb46d71f7128e02a0a50e759c3699f74ccb1180" and "f11dfc51f7056d076f8bf3c7b7f2fcb1ce54c1c1" have entirely different histories.
afb46d71f7 ... f11dfc51f7
@@ -2,64 +2,13 @@

     <!-- OCEAN -->

-<!-- <property>-->
-<!-- <name>jobTracker</name>-->
-<!-- <value>yarnRM</value>-->
-<!-- </property>-->
-<!-- <property>-->
-<!-- <name>nameNode</name>-->
-<!-- <value>hdfs://nameservice1</value>-->
-<!-- </property>-->
-<!-- <property>-->
-<!-- <name>oozie.use.system.libpath</name>-->
-<!-- <value>true</value>-->
-<!-- </property>-->
-<!-- <property>-->
-<!-- <name>oozie.action.sharelib.for.spark</name>-->
-<!-- <value>spark2</value>-->
-<!-- </property>-->
-<!-- <property>-->
-<!-- <name>oozie.launcher.mapreduce.user.classpath.first</name>-->
-<!-- <value>true</value>-->
-<!-- </property>-->
-<!-- <property>-->
-<!-- <name>spark2YarnHistoryServerAddress</name>-->
-<!-- <value>http://hadoop-rm2.garr-pa1.d4science.org:19888</value>-->
-<!-- </property>-->
-<!-- <property>-->
-<!-- <name>spark2EventLogDir</name>-->
-<!-- <value>/user/spark/spark2ApplicationHistory</value>-->
-<!-- </property>-->
-<!-- <property>-->
-<!-- <name>spark2ExtraListeners</name>-->
-<!-- <value>"com.cloudera.spark.lineage.NavigatorAppListener"</value>-->
-<!-- </property>-->
-<!-- <property>-->
-<!-- <name>spark2SqlQueryExecutionListeners</name>-->
-<!-- <value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value>-->
-<!-- </property>-->
-
-    <!-- GARR -->
-
     <property>
         <name>jobTracker</name>
-        <value>hadoop-rm3.garr-pa1.d4science.org:8032</value>
+        <value>yarnRM</value>
     </property>
     <property>
         <name>nameNode</name>
-        <value>hdfs://hadoop-rm1.garr-pa1.d4science.org:8020</value>
-    </property>
-    <property>
-        <name>hive_metastore_uris</name>
-        <value>thrift://hadoop-edge3.garr-pa1.d4science.org:9083</value>
-    </property>
-    <property>
-        <name>spark2YarnHistoryServerAddress</name>
-        <value>http://hadoop-rm2.garr-pa1.d4science.org:19888</value>
-    </property>
-    <property>
-        <name>oozie.launcher.mapreduce.user.classpath.first</name>
-        <value>true</value>
+        <value>hdfs://nameservice1</value>
     </property>
     <property>
         <name>oozie.use.system.libpath</name>
@@ -69,6 +18,14 @@
         <name>oozie.action.sharelib.for.spark</name>
         <value>spark2</value>
     </property>
+    <property>
+        <name>oozie.launcher.mapreduce.user.classpath.first</name>
+        <value>true</value>
+    </property>
+    <property>
+        <name>spark2YarnHistoryServerAddress</name>
+        <value>http://hadoop-rm2.garr-pa1.d4science.org:19888</value>
+    </property>
     <property>
         <name>spark2EventLogDir</name>
         <value>/user/spark/spark2ApplicationHistory</value>
@@ -81,4 +38,47 @@
         <name>spark2SqlQueryExecutionListeners</name>
         <value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value>
     </property>
+
+    <!-- GARR -->
+
+<!-- <property>-->
+<!-- <name>jobTracker</name>-->
+<!-- <value>hadoop-rm3.garr-pa1.d4science.org:8032</value>-->
+<!-- </property>-->
+<!-- <property>-->
+<!-- <name>nameNode</name>-->
+<!-- <value>hdfs://hadoop-rm1.garr-pa1.d4science.org:8020</value>-->
+<!-- </property>-->
+<!-- <property>-->
+<!-- <name>hive_metastore_uris</name>-->
+<!-- <value>thrift://hadoop-edge3.garr-pa1.d4science.org:9083</value>-->
+<!-- </property>-->
+<!-- <property>-->
+<!-- <name>spark2YarnHistoryServerAddress</name>-->
+<!-- <value>http://hadoop-rm2.garr-pa1.d4science.org:19888</value>-->
+<!-- </property>-->
+<!-- <property>-->
+<!-- <name>oozie.launcher.mapreduce.user.classpath.first</name>-->
+<!-- <value>true</value>-->
+<!-- </property>-->
+<!-- <property>-->
+<!-- <name>oozie.use.system.libpath</name>-->
+<!-- <value>true</value>-->
+<!-- </property>-->
+<!-- <property>-->
+<!-- <name>oozie.action.sharelib.for.spark</name>-->
+<!-- <value>spark2</value>-->
+<!-- </property>-->
+<!-- <property>-->
+<!-- <name>spark2EventLogDir</name>-->
+<!-- <value>/user/spark/spark2ApplicationHistory</value>-->
+<!-- </property>-->
+<!-- <property>-->
+<!-- <name>spark2ExtraListeners</name>-->
+<!-- <value>"com.cloudera.spark.lineage.NavigatorAppListener"</value>-->
+<!-- </property>-->
+<!-- <property>-->
+<!-- <name>spark2SqlQueryExecutionListeners</name>-->
+<!-- <value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value>-->
+<!-- </property>-->
 </configuration>
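The properties above are resolved by Oozie at submission time: config-default.xml supplies a value for any parameter the caller leaves unset, and explicit job properties override it. A hedged Java sketch of that consumption path, using the standard Oozie client; the server URL and application path here are assumptions for illustration, not values from this repository:

import java.util.Properties;
import org.apache.oozie.client.OozieClient;

public class SubmitSketch {
	public static void main(String[] args) throws Exception {
		OozieClient oozie = new OozieClient("http://oozie-host:11000/oozie"); // assumed URL
		Properties props = oozie.createConfiguration();
		props.setProperty(OozieClient.APP_PATH, "hdfs://nameservice1/apps/bioschema"); // assumed path
		// Explicit values override config-default.xml; omitted ones fall back to it.
		props.setProperty("jobTracker", "yarnRM");
		props.setProperty("nameNode", "hdfs://nameservice1");
		props.setProperty("oozie.use.system.libpath", "true");
		System.out.println("Submitted job: " + oozie.run(props));
	}
}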
@@ -17,20 +17,12 @@
         </property>
     </parameters>

-    <start to="ResetDataset"/>
+    <start to="TransformJob"/>

     <kill name="Kill">
         <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
     </kill>

-    <action name="ResetDataset">
-        <fs>
-            <delete path='${mainPath}/dataset/*'/>
-        </fs>
-        <ok to="TransformJob"/>
-        <error to="Kill"/>
-    </action>
-
     <action name="StartTransaction">
         <java>
             <configuration>
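For reference, the deleted ResetDataset node cleared ${mainPath}/dataset/* on HDFS before the transform step ran. A rough Java equivalent of that <fs> action, sketched with the standard Hadoop FileSystem API (passing mainPath as a program argument is an assumption of this sketch):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ResetDatasetSketch {
	public static void main(String[] args) throws Exception {
		FileSystem fs = FileSystem.get(new Configuration()); // resolves fs.defaultFS
		Path datasetDir = new Path(args[0], "dataset"); // args[0] plays the role of ${mainPath}
		if (fs.exists(datasetDir)) {
			// Mirror the glob ${mainPath}/dataset/*: remove children, keep the directory itself.
			for (FileStatus child : fs.listStatus(datasetDir)) {
				fs.delete(child.getPath(), true); // recursive delete
			}
		}
	}
}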
@@ -5,9 +5,6 @@ import java.util.List;

 import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
 import com.fasterxml.jackson.annotation.JsonProperty;
-import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
-
-import eu.dnetlib.dhp.rdfconverter.utils.CustomPropertyValueDeserializer;

 @JsonIgnoreProperties(ignoreUnknown = true)
 public class BioSchemaProtein {
@@ -57,11 +54,10 @@ public class BioSchemaProtein {
 	@JsonProperty("https://schema.org/sameAs")
 	private List<Link> sameAs;
 	@JsonProperty("https://schema.org/hasSequenceAnnotation")
-	private List<Link> hasSequenceAnnotation;
+	private Link hasSequenceAnnotation;
 	@JsonProperty("https://schema.org/additionalProperty")
 	private List<Link> sequenceAnnotation;
 	@JsonProperty("https://schema.org/value")
-	@JsonDeserialize(using = CustomPropertyValueDeserializer.class)
 	private Link propertyValue;
 	@JsonProperty("https://schema.org/termCode")
 	private String termCode;
@@ -202,11 +198,11 @@ public class BioSchemaProtein {
 		this.mainEntityOfPage = mainEntityOfPage;
 	}

-	public List<Link> getHasSequenceAnnotation() {
+	public Link getHasSequenceAnnotation() {
 		return hasSequenceAnnotation;
 	}

-	public void setHasSequenceAnnotation(List<Link> hasSequenceAnnotation) {
+	public void setHasSequenceAnnotation(Link hasSequenceAnnotation) {
 		this.hasSequenceAnnotation = hasSequenceAnnotation;
 	}
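The narrowing of hasSequenceAnnotation from List<Link> to Link above changes what Jackson will accept for that property: a single JSON object now binds directly to the field. A self-contained sketch of the new shape, using small demo classes rather than the repository's; enabling UNWRAP_SINGLE_VALUE_ARRAYS additionally lets a one-element JSON array still bind to the scalar field:

import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;

public class LinkMappingDemo {
	@JsonIgnoreProperties(ignoreUnknown = true)
	public static class Entity {
		@JsonProperty("https://schema.org/hasSequenceAnnotation")
		public Link hasSequenceAnnotation; // was List<Link> before this change
	}

	@JsonIgnoreProperties(ignoreUnknown = true)
	public static class Link {
		@JsonProperty("@id")
		public String id;
	}

	public static void main(String[] args) throws Exception {
		ObjectMapper mapper = new ObjectMapper()
			.enable(DeserializationFeature.UNWRAP_SINGLE_VALUE_ARRAYS);
		// A single JSON object binds directly to the Link-typed field.
		Entity e = mapper.readValue(
			"{\"https://schema.org/hasSequenceAnnotation\":{\"@id\":\"#ann1\"}}", Entity.class);
		System.out.println(e.hasSequenceAnnotation.id); // prints #ann1
	}
}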
@@ -1,43 +0,0 @@
-
-package eu.dnetlib.dhp.rdfconverter.utils;
-
-import java.io.IOException;
-
-import com.fasterxml.jackson.core.JsonParser;
-import com.fasterxml.jackson.core.ObjectCodec;
-import com.fasterxml.jackson.databind.DeserializationContext;
-import com.fasterxml.jackson.databind.JsonNode;
-import com.fasterxml.jackson.databind.deser.std.StdDeserializer;
-
-import eu.dnetlib.dhp.rdfconverter.bioschema.model.BioSchemaProtein;
-
-public class CustomPropertyValueDeserializer
-	extends StdDeserializer<BioSchemaProtein.Link> {
-
-	public CustomPropertyValueDeserializer() {
-		this(null);
-	}
-
-	public CustomPropertyValueDeserializer(Class<?> vc) {
-		super(vc);
-	}
-
-	@Override
-	public BioSchemaProtein.Link deserialize(
-		JsonParser jsonparser, DeserializationContext context)
-		throws IOException {
-		ObjectCodec oc = jsonparser.getCodec();
-		JsonNode node = oc.readTree(jsonparser);
-		JsonNode id = node.get("@id");
-		if (id != null) {
-			BioSchemaProtein.Link link = new BioSchemaProtein.Link();
-			link.setId(id.asText());
-			return link;
-		} else {
-			String txt = node.asText();
-			BioSchemaProtein.Link link = new BioSchemaProtein.Link();
-			link.setId(txt);
-			return link;
-		}
-	}
-}
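For context on the wiring this deletion removes: besides the @JsonDeserialize annotation used in BioSchemaProtein, a deserializer like the one above is typically registered on the ObjectMapper through a SimpleModule, which is also why RDFConverter below drops its SimpleModule import. A sketch of that usual registration pattern, not code from this commit:

import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.module.SimpleModule;

import eu.dnetlib.dhp.rdfconverter.bioschema.model.BioSchemaProtein;
import eu.dnetlib.dhp.rdfconverter.utils.CustomPropertyValueDeserializer;

public class DeserializerWiringSketch {
	public static ObjectMapper build() {
		SimpleModule module = new SimpleModule();
		// Route every BioSchemaProtein.Link through the custom string-or-object logic.
		module.addDeserializer(BioSchemaProtein.Link.class, new CustomPropertyValueDeserializer());
		return new ObjectMapper().registerModule(module);
	}
}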
@@ -16,7 +16,6 @@ import org.slf4j.LoggerFactory;

 import com.fasterxml.jackson.databind.DeserializationFeature;
 import com.fasterxml.jackson.databind.ObjectMapper;
-import com.fasterxml.jackson.databind.module.SimpleModule;
 import com.github.jsonldjava.core.JsonLdOptions;
 import com.github.jsonldjava.core.JsonLdProcessor;
 import com.github.jsonldjava.utils.JsonUtils;
@@ -51,8 +50,6 @@ public class RDFConverter {
 		objectMapper.enable(DeserializationFeature.ACCEPT_SINGLE_VALUE_AS_ARRAY);
 		objectMapper.enable(DeserializationFeature.ACCEPT_EMPTY_STRING_AS_NULL_OBJECT);
 		objectMapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
-		objectMapper.configure(DeserializationFeature.FAIL_ON_INVALID_SUBTYPE, false);
-		objectMapper.configure(DeserializationFeature.FAIL_ON_MISSING_CREATOR_PROPERTIES, false);
 		BioSchemaProtein bioSchemaProtein = objectMapper.readValue(compactContent, BioSchemaProtein.class);
 		BioSchemaProtein.DateTimeType retrievedOnType = bioSchemaProtein.getRetrievedOn();
@@ -173,8 +170,6 @@ public class RDFConverter {
 			log.error("Identifier not found", e.getMessage());
 		}

-		mainTitles.add(" " + entry.getName() + " ");
-
 		dataciteProtein.setId(proteinId);
 		bioschemaProteins.put(entry.getId(), entry);
 		dataciteProteins.add(dataciteProtein);
@@ -213,22 +208,8 @@ public class RDFConverter {
 			.stream()
 			.filter(bioProtein -> Objects.nonNull(bioProtein.getValue().getHasSequenceAnnotation()))
 			.forEach(bioProtein -> {
-				List<String> seqAnnIds = bioProtein
-					.getValue()
-					.getHasSequenceAnnotation()
-					.stream()
-					.map(s -> s.getId())
-					.collect(Collectors.toList());
-				List<BioSchemaProtein.Link> propertyIds = sequenceAnnotations
-					.entrySet()
-					.stream()
-					.filter(s -> seqAnnIds.contains(s.getKey()))
-					.flatMap(s -> {
-						return s.getValue().stream();
-					})
-					.collect(Collectors.toList());
-
-				propertyIds
+				sequenceAnnotations
+					.get(bioProtein.getValue().getHasSequenceAnnotation().getId())
 					.stream()
 					.map(propertyId -> propertyValues.get(propertyId.getId()))
 					.filter(term -> Objects.nonNull(term))
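The last hunk replaces an entrySet scan keyed by a list of annotation ids with a direct Map.get on the single annotation id, which the new Link-typed field makes possible. A toy, self-contained illustration of the two shapes (all names here are assumed, not the repository's):

import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

public class LookupDemo {
	public static void main(String[] args) {
		Map<String, List<String>> sequenceAnnotations = Map.of(
			"#ann1", List.of("prop1", "prop2"),
			"#ann2", List.of("prop3"));
		String annotationId = "#ann1";
		// Old shape: scan every entry, filter by key, flatten the matches.
		List<String> viaScan = sequenceAnnotations
			.entrySet()
			.stream()
			.filter(e -> e.getKey().equals(annotationId))
			.flatMap(e -> e.getValue().stream())
			.collect(Collectors.toList());
		// New shape: one O(1) hash lookup for the single annotation id.
		List<String> viaGet = sequenceAnnotations.get(annotationId);
		System.out.println(viaScan.equals(viaGet)); // true
	}
}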
@@ -15,16 +15,17 @@ public class ConverterTest {

 	static Logger logger = LoggerFactory.getLogger(ConverterTest.class);

-	@Test
-	public void disprotToDataciteTest() throws Exception {
-		InputStream is = ConverterTest.class.getResourceAsStream("/eu/dnetlib/dhp/rdfconverter/bioschema/disprot.nq");
-		String nq = IOUtils.toString(is);
-		RDFConverter converter = new RDFConverter();
-		ArrayList<String> results = converter.nQuadsFile2DataciteJson(nq, "Protein");
-		results.stream().forEach(r -> {
-			logger.info("JSON DATACITE >> " + r);
-		});
-	}
+	// @Test
+	// private void nqToDataciteTest() throws Exception {
+	// InputStream is = ConverterTest.class.getResourceAsStream("/eu/dnetlib/dhp/rdfconverter/bioschema/disprot.nq");
+	// String nq = IOUtils.toString(is);
+	// logger.debug("NQ: " + nq);
+	// RDFConverter converter = new RDFConverter();
+	// ArrayList<String> results = converter.nQuadsFile2DataciteJson(nq, "Protein");
+	// if (results != null && !results.isEmpty()) {
+	// logger.info("JSON DATACITE: " + results.get(0));
+	// }
+	// }

 	@Test
 	public void pedCitationTest() throws Exception {