forked from D-Net/dnet-hadoop
Compare commits
No commits in common. "afb46d71f7128e02a0a50e759c3699f74ccb1180" and "f11dfc51f7056d076f8bf3c7b7f2fcb1ce54c1c1" have entirely different histories.
afb46d71f7
...
f11dfc51f7
|
@ -2,64 +2,13 @@
|
|||
|
||||
<!-- OCEAN -->
|
||||
|
||||
<!-- <property>-->
|
||||
<!-- <name>jobTracker</name>-->
|
||||
<!-- <value>yarnRM</value>-->
|
||||
<!-- </property>-->
|
||||
<!-- <property>-->
|
||||
<!-- <name>nameNode</name>-->
|
||||
<!-- <value>hdfs://nameservice1</value>-->
|
||||
<!-- </property>-->
|
||||
<!-- <property>-->
|
||||
<!-- <name>oozie.use.system.libpath</name>-->
|
||||
<!-- <value>true</value>-->
|
||||
<!-- </property>-->
|
||||
<!-- <property>-->
|
||||
<!-- <name>oozie.action.sharelib.for.spark</name>-->
|
||||
<!-- <value>spark2</value>-->
|
||||
<!-- </property>-->
|
||||
<!-- <property>-->
|
||||
<!-- <name>oozie.launcher.mapreduce.user.classpath.first</name>-->
|
||||
<!-- <value>true</value>-->
|
||||
<!-- </property>-->
|
||||
<!-- <property>-->
|
||||
<!-- <name>spark2YarnHistoryServerAddress</name>-->
|
||||
<!-- <value>http://hadoop-rm2.garr-pa1.d4science.org:19888</value>-->
|
||||
<!-- </property>-->
|
||||
<!-- <property>-->
|
||||
<!-- <name>spark2EventLogDir</name>-->
|
||||
<!-- <value>/user/spark/spark2ApplicationHistory</value>-->
|
||||
<!-- </property>-->
|
||||
<!-- <property>-->
|
||||
<!-- <name>spark2ExtraListeners</name>-->
|
||||
<!-- <value>"com.cloudera.spark.lineage.NavigatorAppListener"</value>-->
|
||||
<!-- </property>-->
|
||||
<!-- <property>-->
|
||||
<!-- <name>spark2SqlQueryExecutionListeners</name>-->
|
||||
<!-- <value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value>-->
|
||||
<!-- </property>-->
|
||||
|
||||
<!-- GARR -->
|
||||
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>hadoop-rm3.garr-pa1.d4science.org:8032</value>
|
||||
<value>yarnRM</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://hadoop-rm1.garr-pa1.d4science.org:8020</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hive_metastore_uris</name>
|
||||
<value>thrift://hadoop-edge3.garr-pa1.d4science.org:9083</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2YarnHistoryServerAddress</name>
|
||||
<value>http://hadoop-rm2.garr-pa1.d4science.org:19888</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
<value>hdfs://nameservice1</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
|
@ -69,6 +18,14 @@
|
|||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2YarnHistoryServerAddress</name>
|
||||
<value>http://hadoop-rm2.garr-pa1.d4science.org:19888</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2EventLogDir</name>
|
||||
<value>/user/spark/spark2ApplicationHistory</value>
|
||||
|
@ -81,4 +38,47 @@
|
|||
<name>spark2SqlQueryExecutionListeners</name>
|
||||
<value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value>
|
||||
</property>
|
||||
|
||||
<!-- GARR -->
|
||||
|
||||
<!-- <property>-->
|
||||
<!-- <name>jobTracker</name>-->
|
||||
<!-- <value>hadoop-rm3.garr-pa1.d4science.org:8032</value>-->
|
||||
<!-- </property>-->
|
||||
<!-- <property>-->
|
||||
<!-- <name>nameNode</name>-->
|
||||
<!-- <value>hdfs://hadoop-rm1.garr-pa1.d4science.org:8020</value>-->
|
||||
<!-- </property>-->
|
||||
<!-- <property>-->
|
||||
<!-- <name>hive_metastore_uris</name>-->
|
||||
<!-- <value>thrift://hadoop-edge3.garr-pa1.d4science.org:9083</value>-->
|
||||
<!-- </property>-->
|
||||
<!-- <property>-->
|
||||
<!-- <name>spark2YarnHistoryServerAddress</name>-->
|
||||
<!-- <value>http://hadoop-rm2.garr-pa1.d4science.org:19888</value>-->
|
||||
<!-- </property>-->
|
||||
<!-- <property>-->
|
||||
<!-- <name>oozie.launcher.mapreduce.user.classpath.first</name>-->
|
||||
<!-- <value>true</value>-->
|
||||
<!-- </property>-->
|
||||
<!-- <property>-->
|
||||
<!-- <name>oozie.use.system.libpath</name>-->
|
||||
<!-- <value>true</value>-->
|
||||
<!-- </property>-->
|
||||
<!-- <property>-->
|
||||
<!-- <name>oozie.action.sharelib.for.spark</name>-->
|
||||
<!-- <value>spark2</value>-->
|
||||
<!-- </property>-->
|
||||
<!-- <property>-->
|
||||
<!-- <name>spark2EventLogDir</name>-->
|
||||
<!-- <value>/user/spark/spark2ApplicationHistory</value>-->
|
||||
<!-- </property>-->
|
||||
<!-- <property>-->
|
||||
<!-- <name>spark2ExtraListeners</name>-->
|
||||
<!-- <value>"com.cloudera.spark.lineage.NavigatorAppListener"</value>-->
|
||||
<!-- </property>-->
|
||||
<!-- <property>-->
|
||||
<!-- <name>spark2SqlQueryExecutionListeners</name>-->
|
||||
<!-- <value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value>-->
|
||||
<!-- </property>-->
|
||||
</configuration>
|
|
@ -17,20 +17,12 @@
|
|||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="ResetDataset"/>
|
||||
<start to="TransformJob"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="ResetDataset">
|
||||
<fs>
|
||||
<delete path='${mainPath}/dataset/*'/>
|
||||
</fs>
|
||||
<ok to="TransformJob"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="StartTransaction">
|
||||
<java>
|
||||
<configuration>
|
||||
|
|
|
@ -5,9 +5,6 @@ import java.util.List;
|
|||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
|
||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
|
||||
|
||||
import eu.dnetlib.dhp.rdfconverter.utils.CustomPropertyValueDeserializer;
|
||||
|
||||
@JsonIgnoreProperties(ignoreUnknown = true)
|
||||
public class BioSchemaProtein {
|
||||
|
@ -57,11 +54,10 @@ public class BioSchemaProtein {
|
|||
@JsonProperty("https://schema.org/sameAs")
|
||||
private List<Link> sameAs;
|
||||
@JsonProperty("https://schema.org/hasSequenceAnnotation")
|
||||
private List<Link> hasSequenceAnnotation;
|
||||
private Link hasSequenceAnnotation;
|
||||
@JsonProperty("https://schema.org/additionalProperty")
|
||||
private List<Link> sequenceAnnotation;
|
||||
@JsonProperty("https://schema.org/value")
|
||||
@JsonDeserialize(using = CustomPropertyValueDeserializer.class)
|
||||
private Link propertyValue;
|
||||
@JsonProperty("https://schema.org/termCode")
|
||||
private String termCode;
|
||||
|
@ -202,11 +198,11 @@ public class BioSchemaProtein {
|
|||
this.mainEntityOfPage = mainEntityOfPage;
|
||||
}
|
||||
|
||||
public List<Link> getHasSequenceAnnotation() {
|
||||
public Link getHasSequenceAnnotation() {
|
||||
return hasSequenceAnnotation;
|
||||
}
|
||||
|
||||
public void setHasSequenceAnnotation(List<Link> hasSequenceAnnotation) {
|
||||
public void setHasSequenceAnnotation(Link hasSequenceAnnotation) {
|
||||
this.hasSequenceAnnotation = hasSequenceAnnotation;
|
||||
}
|
||||
|
||||
|
|
|
@ -1,43 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.rdfconverter.utils;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import com.fasterxml.jackson.core.JsonParser;
|
||||
import com.fasterxml.jackson.core.ObjectCodec;
|
||||
import com.fasterxml.jackson.databind.DeserializationContext;
|
||||
import com.fasterxml.jackson.databind.JsonNode;
|
||||
import com.fasterxml.jackson.databind.deser.std.StdDeserializer;
|
||||
|
||||
import eu.dnetlib.dhp.rdfconverter.bioschema.model.BioSchemaProtein;
|
||||
|
||||
public class CustomPropertyValueDeserializer
|
||||
extends StdDeserializer<BioSchemaProtein.Link> {
|
||||
|
||||
public CustomPropertyValueDeserializer() {
|
||||
this(null);
|
||||
}
|
||||
|
||||
public CustomPropertyValueDeserializer(Class<?> vc) {
|
||||
super(vc);
|
||||
}
|
||||
|
||||
@Override
|
||||
public BioSchemaProtein.Link deserialize(
|
||||
JsonParser jsonparser, DeserializationContext context)
|
||||
throws IOException {
|
||||
ObjectCodec oc = jsonparser.getCodec();
|
||||
JsonNode node = oc.readTree(jsonparser);
|
||||
JsonNode id = node.get("@id");
|
||||
if (id != null) {
|
||||
BioSchemaProtein.Link link = new BioSchemaProtein.Link();
|
||||
link.setId(id.asText());
|
||||
return link;
|
||||
} else {
|
||||
String txt = node.asText();
|
||||
BioSchemaProtein.Link link = new BioSchemaProtein.Link();
|
||||
link.setId(txt);
|
||||
return link;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -16,7 +16,6 @@ import org.slf4j.LoggerFactory;
|
|||
|
||||
import com.fasterxml.jackson.databind.DeserializationFeature;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.fasterxml.jackson.databind.module.SimpleModule;
|
||||
import com.github.jsonldjava.core.JsonLdOptions;
|
||||
import com.github.jsonldjava.core.JsonLdProcessor;
|
||||
import com.github.jsonldjava.utils.JsonUtils;
|
||||
|
@ -51,8 +50,6 @@ public class RDFConverter {
|
|||
objectMapper.enable(DeserializationFeature.ACCEPT_SINGLE_VALUE_AS_ARRAY);
|
||||
objectMapper.enable(DeserializationFeature.ACCEPT_EMPTY_STRING_AS_NULL_OBJECT);
|
||||
objectMapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
||||
objectMapper.configure(DeserializationFeature.FAIL_ON_INVALID_SUBTYPE, false);
|
||||
objectMapper.configure(DeserializationFeature.FAIL_ON_MISSING_CREATOR_PROPERTIES, false);
|
||||
BioSchemaProtein bioSchemaProtein = objectMapper.readValue(compactContent, BioSchemaProtein.class);
|
||||
BioSchemaProtein.DateTimeType retrievedOnType = bioSchemaProtein.getRetrievedOn();
|
||||
|
||||
|
@ -173,8 +170,6 @@ public class RDFConverter {
|
|||
log.error("Identifier not found", e.getMessage());
|
||||
}
|
||||
|
||||
mainTitles.add(" " + entry.getName() + " ");
|
||||
|
||||
dataciteProtein.setId(proteinId);
|
||||
bioschemaProteins.put(entry.getId(), entry);
|
||||
dataciteProteins.add(dataciteProtein);
|
||||
|
@ -213,22 +208,8 @@ public class RDFConverter {
|
|||
.stream()
|
||||
.filter(bioProtein -> Objects.nonNull(bioProtein.getValue().getHasSequenceAnnotation()))
|
||||
.forEach(bioProtein -> {
|
||||
List<String> seqAnnIds = bioProtein
|
||||
.getValue()
|
||||
.getHasSequenceAnnotation()
|
||||
.stream()
|
||||
.map(s -> s.getId())
|
||||
.collect(Collectors.toList());
|
||||
List<BioSchemaProtein.Link> propertyIds = sequenceAnnotations
|
||||
.entrySet()
|
||||
.stream()
|
||||
.filter(s -> seqAnnIds.contains(s.getKey()))
|
||||
.flatMap(s -> {
|
||||
return s.getValue().stream();
|
||||
})
|
||||
.collect(Collectors.toList());
|
||||
|
||||
propertyIds
|
||||
sequenceAnnotations
|
||||
.get(bioProtein.getValue().getHasSequenceAnnotation().getId())
|
||||
.stream()
|
||||
.map(propertyId -> propertyValues.get(propertyId.getId()))
|
||||
.filter(term -> Objects.nonNull(term))
|
||||
|
|
|
@ -15,16 +15,17 @@ public class ConverterTest {
|
|||
|
||||
static Logger logger = LoggerFactory.getLogger(ConverterTest.class);
|
||||
|
||||
@Test
|
||||
public void disprotToDataciteTest() throws Exception {
|
||||
InputStream is = ConverterTest.class.getResourceAsStream("/eu/dnetlib/dhp/rdfconverter/bioschema/disprot.nq");
|
||||
String nq = IOUtils.toString(is);
|
||||
RDFConverter converter = new RDFConverter();
|
||||
ArrayList<String> results = converter.nQuadsFile2DataciteJson(nq, "Protein");
|
||||
results.stream().forEach(r -> {
|
||||
logger.info("JSON DATACITE >> " + r);
|
||||
});
|
||||
}
|
||||
// @Test
|
||||
// private void nqToDataciteTest() throws Exception {
|
||||
// InputStream is = ConverterTest.class.getResourceAsStream("/eu/dnetlib/dhp/rdfconverter/bioschema/disprot.nq");
|
||||
// String nq = IOUtils.toString(is);
|
||||
// logger.debug("NQ: " + nq);
|
||||
// RDFConverter converter = new RDFConverter();
|
||||
// ArrayList<String> results = converter.nQuadsFile2DataciteJson(nq, "Protein");
|
||||
// if (results != null && !results.isEmpty()) {
|
||||
// logger.info("JSON DATACITE: " + results.get(0));
|
||||
// }
|
||||
// }
|
||||
|
||||
@Test
|
||||
public void pedCitationTest() throws Exception {
|
||||
|
|
Loading…
Reference in New Issue