forked from D-Net/dnet-hadoop

fixed bug on missing relation in ANDS

parent 3581244daf
commit cd27df91a1
QueryTest (Scala, package eu.dnetlib.dhp.doiboost):

@@ -1,6 +1,6 @@
 package eu.dnetlib.dhp.doiboost
 
-import eu.dnetlib.dhp.schema.oaf.Publication
+import eu.dnetlib.dhp.schema.oaf.{Publication, Relation}
 import org.apache.spark.SparkContext
 import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
 import org.codehaus.jackson.map.{ObjectMapper, SerializationConfig}
@@ -21,6 +21,13 @@ class QueryTest {
 
 
 
+  }
+
+
+  def has_ands(r:Relation) :Boolean = {
+
+    r.getCollectedfrom!= null && r.getCollectedfrom.asScala.count(k => k.getValue.contains("Australian")) > 0
+
   }
 
   def hasInstanceWithUrl(p:Publication):Boolean = {
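The new has_ands predicate walks the Java collectedfrom list via scala.collection.JavaConverters and flags a relation as coming from ANDS when any collectedfrom value mentions "Australian". A minimal sketch of how such a predicate could be applied to a stored relation dataset follows; the object name, the relationPath parameter, and the use of exists (plus an extra null check on the value) are illustrative assumptions, not part of the commit.

import scala.collection.JavaConverters._

import eu.dnetlib.dhp.schema.oaf.Relation
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}

// Hypothetical helper: count how many relations in a kryo-encoded parquet
// dataset were collected from the Australian National Data Service.
object AndsRelationCheck {

  def hasAnds(r: Relation): Boolean =
    r.getCollectedfrom != null &&
      r.getCollectedfrom.asScala.exists(k => k.getValue != null && k.getValue.contains("Australian"))

  def countAndsRelations(spark: SparkSession, relationPath: String): Long = {
    implicit val relEncoder: Encoder[Relation] = Encoders.kryo[Relation]
    val rels: Dataset[Relation] = spark.read.load(relationPath).as[Relation]
    rels.filter(r => hasAnds(r)).count()
  }
}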
CleaningFunctions (Java):

@@ -109,20 +109,20 @@ public class CleaningFunctions {
 			}
 			if (Objects.nonNull(r.getPid())) {
 				r
 					.setPid(
 						r
 							.getPid()
 							.stream()
 							.filter(Objects::nonNull)
 							.filter(sp -> StringUtils.isNotBlank(StringUtils.trim(sp.getValue())))
 							.filter(sp -> NONE.equalsIgnoreCase(sp.getValue()))
 							.filter(sp -> Objects.nonNull(sp.getQualifier()))
 							.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
 							.map(sp -> {
 								sp.setValue(StringUtils.trim(sp.getValue()));
 								return sp;
 							})
 							.collect(Collectors.toList()));
 			}
 			if (Objects.isNull(r.getResourcetype()) || StringUtils.isBlank(r.getResourcetype().getClassid())) {
 				r
New file — IdReplace case class (package eu.dnetlib.dhp.sx.graph):

@@ -0,0 +1,3 @@
+package eu.dnetlib.dhp.sx.graph
+
+case class IdReplace(newId:String, oldId:String) {}
SparkSplitOafTODLIEntities (Scala, package eu.dnetlib.dhp.sx.graph):

@@ -1,12 +1,15 @@
 package eu.dnetlib.dhp.sx.graph
 
 import eu.dnetlib.dhp.application.ArgumentApplicationParser
-import eu.dnetlib.dhp.schema.oaf.{Oaf, Relation}
+import eu.dnetlib.dhp.schema.oaf.{Oaf, Relation, Result}
 import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication, DLIUnknown}
 import eu.dnetlib.dhp.sx.ebi.EBIAggregator
 import org.apache.commons.io.IOUtils
+import org.apache.commons.lang3.StringUtils
 import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
 import org.slf4j.LoggerFactory
+import org.apache.spark.sql.functions.col
 
 
 object SparkSplitOafTODLIEntities {
 
@@ -83,14 +86,42 @@ object SparkSplitOafTODLIEntities {
  }
 
 
+  def extract_ids(o:Oaf) :(String, String) = {
+
+    o match {
+      case p: DLIPublication =>
+        val prefix = StringUtils.substringBefore(p.getId, "|")
+        val original = StringUtils.substringAfter(p.getOriginalObjIdentifier, "::")
+        (p.getId, s"$prefix|$original")
+      case p: DLIDataset =>
+        val prefix = StringUtils.substringBefore(p.getId, "|")
+        val original = StringUtils.substringAfter(p.getOriginalObjIdentifier, "::")
+        (p.getId, s"$prefix|$original")
+      case _ =>null
+    }
+  }
 
  def extract_relations(spark:SparkSession, workingPath:String) :Unit = {
 
    implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
    implicit val relEncoder: Encoder[Relation] = Encoders.kryo[Relation]
+   import spark.implicits._
 
    val OAFDataset:Dataset[Oaf] = spark.read.load(s"$workingPath/input/OAFDataset").as[Oaf]
    val ebi_relation:Dataset[Relation] = spark.read.load(s"$workingPath/ebi/baseline_relation_ebi").as[Relation].repartition(2000)
 
 
+    OAFDataset
+      .filter(o => o.isInstanceOf[Result])
+      .map(extract_ids)(Encoders.tuple(Encoders.STRING, Encoders.STRING))
+      .filter(r => r != null)
+      .where("_1 != _2")
+      .select(col("_1").alias("newId"), col("_2").alias("oldId"))
+      .distinct()
+      .map(f => IdReplace(f.getString(0), f.getString(1)))
+      .write.mode(SaveMode.Overwrite).save(s"$workingPath/graph/id_replace")
+
+
    OAFDataset
      .filter(s => s != null && s.isInstanceOf[Relation])
      .map(s =>s.asInstanceOf[Relation])
@@ -100,7 +131,41 @@ object SparkSplitOafTODLIEntities {
      .agg(EBIAggregator.getRelationAggregator().toColumn)
      .map(p => p._2)
      .repartition(4000)
-      .write.mode(SaveMode.Overwrite).save(s"$workingPath/graph/relation")
+      .write.mode(SaveMode.Overwrite).save(s"$workingPath/graph/relation_unfixed")
+
+
+    val relations = spark.read.load(s"$workingPath/graph/relation_unfixed").as[Relation]
+    val ids = spark.read.load(s"$workingPath/graph/id_replace").as[IdReplace]
+
+    relations
+      .map(r => (r.getSource, r))(Encoders.tuple(Encoders.STRING, relEncoder))
+      .joinWith(ids, col("_1").equalTo(ids("oldId")), "left")
+      .map(i =>{
+        val r = i._1._2
+        if (i._2 != null)
+        {
+          val id = i._2.newId
+          r.setSource(id)
+        }
+        r
+      }).write.mode(SaveMode.Overwrite).save(s"$workingPath/graph/rel_f_source")
+
+    val rel_source:Dataset[Relation] = spark.read.load(s"$workingPath/graph/rel_f_source").as[Relation]
+
+    rel_source
+      .map(r => (r.getTarget, r))(Encoders.tuple(Encoders.STRING, relEncoder))
+      .joinWith(ids, col("_1").equalTo(ids("oldId")), "left")
+      .map(i =>{
+        val r:Relation = i._1._2
+        if (i._2 != null)
+        {
+          val id = i._2.newId
+          r.setTarget(id)
+        }
+        r
+      }).write.mode(SaveMode.Overwrite).save(s"$workingPath/graph/relation")
+
+
  }
 
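The heart of the fix is the id_replace step above: extract_ids emits (newId, oldId) pairs for every DLIPublication or DLIDataset whose resolved graph id differs from its original object identifier, the relations are first materialized as relation_unfixed, and source and target are then rewritten through a left join against those pairs. Below is a self-contained sketch of that join on toy case classes; the names Rel and IdReplaceSketch and the sample identifiers are made up for illustration and only mirror the pattern used in extract_relations.

import org.apache.spark.sql.{Dataset, SparkSession}

object IdReplaceSketch {

  case class Rel(source: String, target: String)
  case class IdReplace(newId: String, oldId: String)

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("IdReplaceSketch").getOrCreate()
    import spark.implicits._

    // Relations still carrying the "original object identifier" form of the id.
    val relations: Dataset[Rel] = Seq(Rel("ands|old-abc", "ands|old-xyz")).toDS()
    // Mapping from the old identifier to the resolved graph identifier.
    val ids: Dataset[IdReplace] = Seq(IdReplace("ands|resolved-abc", "ands|old-abc")).toDS()

    // Left join on the source id and rewrite it when a replacement exists;
    // the production code repeats the same step for the target id.
    val fixed: Dataset[Rel] = relations
      .joinWith(ids, relations("source") === ids("oldId"), "left")
      .map { case (rel, repl) => if (repl != null) rel.copy(source = repl.newId) else rel }

    fixed.show(truncate = false)
    spark.stop()
  }
}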
ScholexplorerParserTest (Java):

@@ -12,6 +12,7 @@ import com.fasterxml.jackson.databind.SerializationFeature;
 
 import eu.dnetlib.dhp.schema.oaf.Oaf;
 import eu.dnetlib.dhp.sx.graph.parser.DatasetScholexplorerParser;
+import eu.dnetlib.dhp.sx.graph.parser.PublicationScholexplorerParser;
 import eu.dnetlib.scholexplorer.relation.RelationMapper;
 
 public class ScholexplorerParserTest {
@@ -37,4 +38,26 @@ public class ScholexplorerParserTest {
                 }
             });
     }
+
+    @Test
+    public void testPublicationParser() throws Exception {
+        String xml = IOUtils.toString(this.getClass().getResourceAsStream("pmf.xml"));
+
+        PublicationScholexplorerParser p = new PublicationScholexplorerParser();
+        List<Oaf> oaves = p.parseObject(xml, RelationMapper.load());
+
+        ObjectMapper m = new ObjectMapper();
+        m.enable(SerializationFeature.INDENT_OUTPUT);
+
+        oaves
+            .forEach(
+                oaf -> {
+                    try {
+                        System.out.println(m.writeValueAsString(oaf));
+                        System.out.println("----------------------------");
+                    } catch (JsonProcessingException e) {
+
+                    }
+                });
+    }
 }
Dataset record XML (Scholexplorer test resource):

@@ -1,51 +1,38 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <oai:record xmlns:oai="http://www.openarchives.org/OAI/2.0/"
-            xmlns:oaf="http://namespace.openaire.eu/oaf"
-            xmlns:dri="http://www.driver-repository.eu/namespace/dri"
-            xmlns:dc="http://purl.org/dc/elements/1.1/">
-    <oai:header>
-        <dri:repositoryId>aaadf8b3-01a8-4cc2-9964-63cfb19df3b4_UmVwb3NpdG9yeVNlcnZpY2VSZXNvdXJjZXMvUmVwb3NpdG9yeVNlcnZpY2VSZXNvdXJjZVR5cGU=</dri:repositoryId>
-        <dri:recordIdentifier>oai:pangaea.de:doi:10.1594/PANGAEA.432865</dri:recordIdentifier>
-        <dri:datasourceprefix>r3d100010134</dri:datasourceprefix>
-        <dri:objIdentifier>r3d100010134::00002f60593fd1f758fb838fafb46795</dri:objIdentifier>
-        <dri:dateOfCollection>2020-02-18T03:05:02.534Z</dri:dateOfCollection>
-        <oaf:datasourceprefix/>
-        <identifier>oai:pangaea.de:doi:10.1594/PANGAEA.432865</identifier>
-        <setSpec>citable topicOceans</setSpec>
+            xmlns="http://namespace.openaire.eu/">
+    <oai:header xmlns="">
+        <dri:objIdentifier xmlns:dri="http://www.driver-repository.eu/namespace/dri">r3d100010464::0002882a9d38c4f4612e7666ad768ccd</dri:objIdentifier>
+        <dri:recordIdentifier xmlns:dri="http://www.driver-repository.eu/namespace/dri">https://research.jcu.edu.au/researchdata/published/detail/9079e05370d830eb8d416c77c0b761ce::url</dri:recordIdentifier>
+        <dri:dateOfCollection xmlns:dri="http://www.driver-repository.eu/namespace/dri">2020-11-02T16:14:07.831Z</dri:dateOfCollection>
+        <dri:repositoryId xmlns:dri="http://www.driver-repository.eu/namespace/dri">ands_UmVwb3NpdG9yeVNlcnZpY2VSZXNvdXJjZXMvUmVwb3NpdG9yeVNlcnZpY2VSZXNvdXJjZVR5cGU=</dri:repositoryId>
+        <dri:datasourceprefix xmlns:dri="http://www.driver-repository.eu/namespace/dri">r3d100010464</dri:datasourceprefix>
     </oai:header>
-    <oai:metadata>
-        <resource xmlns="http://datacite.org/schema/kernel-3">
-            <identifier identifierType="doi">10.1594/pangaea.432865</identifier>
-            <titles xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
-                <title>Daily sea level from coastal tide gauge station Woods_Hole in 1978 (Research quality database)</title>
+    <metadata xmlns="">
+        <resource xmlns="http://datacite.org/schema/kernel-3"
+                  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+                  xsi:schemaLocation="http://datacite.org/schema/kernel-3 http://schema.datacite.org/meta/kernel-3/metadata.xsd">
+            <identifier xmlns="" identifierType="url">https://research.jcu.edu.au/researchdata/published/detail/9079e05370d830eb8d416c77c0b761ce</identifier>
+            <titles xmlns="">
+                <title>Vertebrate monitoring in the Australian Wet Tropics rainforest at CU6A1 (145.30367623, -16.57767628, 600.0m above MSL) collected by Reptile Surveys</title>
             </titles>
-            <publisher xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">PANGAEA - Data Publisher for Earth & Environmental Science</publisher>
-            <publicationYear xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">2006</publicationYear>
-            <dates xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
-                <date dateType="Collected">1978-01-01T12:00:00/1978-12-31T12:00:00</date>
+            <publisher xmlns="">James Cook University</publisher>
+            <dates xmlns="">
+                <date dateType="Collected">2013-05-07</date>
             </dates>
-            <creators xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
-                <creator>
-                    <creatorName>WOCE Sea Level, WSL</creatorName>
-                </creator>
-            </creators>
-            <subjects xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
-                <subject subjectScheme="Parameter">DATE/TIME</subject>
-                <subject subjectScheme="Parameter">Sea level</subject>
-                <subject subjectScheme="Method">Tide gauge station</subject>
-                <subject subjectScheme="Campaign">SeaLevel</subject>
-                <subject subjectScheme="Project">World Ocean Circulation Experiment (WOCE)</subject>
-            </subjects>
-            <resourceType resourceTypeGeneral="Dataset"/>
-            <relatedIdentifiers>
-                <relatedIdentifier relatedIdentifierType="URL" relationType="isDocumentedBy"
-                                   inverseRelationType="documents">http://store.pangaea.de/Projects/WOCE/SeaLevel_rqds/Woods_Hole.txt</relatedIdentifier>
+            <creators xmlns=""/>
+            <resourceType xmlns="" resourceTypeGeneral="Dataset">Dataset</resourceType>
+            <relatedIdentifiers xmlns="">
+                <relatedIdentifier entityType="publication" inverseRelationType="related"
+                                   relatedIdentifierType="dnet"
+                                   relationType="IsRelatedTo">r3d100010464::57793c5aa995172db237d9da17353f8b</relatedIdentifier>
             </relatedIdentifiers>
         </resource>
-    </oai:metadata>
-    <oaf:about>
+    </metadata>
+    <oaf:about xmlns:oaf="http://namespace.dnet.eu/oaf" xmlns="">
         <oaf:datainfo>
-            <oaf:collectedFrom completionStatus="complete" id="dli_________::r3d100010134" name="Pangaea"/>
+            <oaf:collectedFrom completionStatus="complete" id="dli_________::r3d100010464"
+                               name="Australian National Data Service"/>
             <oaf:completionStatus>complete</oaf:completionStatus>
             <oaf:provisionMode>collected</oaf:provisionMode>
         </oaf:datainfo>
New publication record XML (ANDS):

@@ -0,0 +1,25 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<oai:record xmlns:oai="http://www.openarchives.org/OAI/2.0/"
+            xmlns="http://namespace.openaire.eu/">
+    <oai:header xmlns="">
+        <dri:objIdentifier xmlns:dri="http://www.driver-repository.eu/namespace/dri">r3d100010464::57793c5aa995172db237d9da17353f8b</dri:objIdentifier>
+        <dri:recordIdentifier xmlns:dri="http://www.driver-repository.eu/namespace/dri">10.1111/j.1365-2486.2005.00995.x::doi</dri:recordIdentifier>
+        <dri:dateOfCollection xmlns:dri="http://www.driver-repository.eu/namespace/dri">2020-11-02T16:14:07.831Z</dri:dateOfCollection>
+        <dri:repositoryId xmlns:dri="http://www.driver-repository.eu/namespace/dri">ands_UmVwb3NpdG9yeVNlcnZpY2VSZXNvdXJjZXMvUmVwb3NpdG9yeVNlcnZpY2VSZXNvdXJjZVR5cGU=</dri:repositoryId>
+        <dri:datasourceprefix xmlns:dri="http://www.driver-repository.eu/namespace/dri">r3d100010464</dri:datasourceprefix>
+    </oai:header>
+    <metadata xmlns="">
+        <oaf:pid xmlns:oaf="http://namespace.dnet.eu/oaf" type="doi">10.1111/j.1365-2486.2005.00995.x</oaf:pid>
+        <dc:identifier xmlns:dc="http://purl.org/dc/elements/1.1/">10.1111/j.1365-2486.2005.00995.x</dc:identifier>
+        <dc:title xmlns:dc="http://purl.org/dc/elements/1.1/">Potential decoupling of trends in distribution area and population size of species with climate change.</dc:title>
+        <dc:type xmlns:dc="http://purl.org/dc/elements/1.1/">publication</dc:type>
+    </metadata>
+    <oaf:about xmlns:oaf="http://namespace.dnet.eu/oaf" xmlns="">
+        <oaf:datainfo>
+            <oaf:collectedFrom completionStatus="complete" id="dli_________::r3d100010464"
+                               name="Australian National Data Service"/>
+            <oaf:completionStatus>complete</oaf:completionStatus>
+            <oaf:provisionMode>collected</oaf:provisionMode>
+        </oaf:datainfo>
+    </oaf:about>
+</oai:record>
Scholix (Java, merge helpers):

@@ -97,12 +97,17 @@ public class Scholix implements Serializable {
     }
 
     private List<ScholixEntityId> mergeScholixEntityId(final List<ScholixEntityId> a, final List<ScholixEntityId> b) {
-        final List<ScholixEntityId> m = new ArrayList<>(a);
+        final List<ScholixEntityId> m = a != null ? new ArrayList<>(a) : new ArrayList<>();
         if (b != null)
             b.forEach(s -> {
-                int tt = (int) m.stream().filter(t -> t.getName().equalsIgnoreCase(s.getName())).count();
-                if (tt == 0) {
-                    m.add(s);
+                if (s != null) {
+                    int tt = (int) m
+                        .stream()
+                        .filter(t -> t != null && t.getName() != null && t.getName().equalsIgnoreCase(s.getName()))
+                        .count();
+                    if (tt == 0) {
+                        m.add(s);
+                    }
                 }
             });
         return m;
@@ -110,7 +115,7 @@ public class Scholix implements Serializable {
 
     private List<ScholixIdentifier> mergeScholixIdnetifier(final List<ScholixIdentifier> a,
         final List<ScholixIdentifier> b) {
-        final List<ScholixIdentifier> m = new ArrayList<>(a);
+        final List<ScholixIdentifier> m = a != null ? new ArrayList<>(a) : new ArrayList<>();
         if (b != null)
             b.forEach(s -> {
                 int tt = (int) m.stream().filter(t -> t.getIdentifier().equalsIgnoreCase(s.getIdentifier())).count();
@@ -123,7 +128,7 @@ public class Scholix implements Serializable {
 
     private List<ScholixCollectedFrom> mergeScholixCollectedFrom(final List<ScholixCollectedFrom> a,
         final List<ScholixCollectedFrom> b) {
-        final List<ScholixCollectedFrom> m = new ArrayList<>(a);
+        final List<ScholixCollectedFrom> m = a != null ? new ArrayList<>(a) : new ArrayList<>();
         if (b != null)
             b.forEach(s -> {
                 int tt = (int) m
@@ -139,14 +144,15 @@ public class Scholix implements Serializable {
 
     private ScholixRelationship mergeRelationships(final ScholixRelationship a, final ScholixRelationship b) {
         ScholixRelationship result = new ScholixRelationship();
-        result.setName(StringUtils.isEmpty(a.getName()) ? b.getName() : a.getName());
-        result.setInverse(StringUtils.isEmpty(a.getInverse()) ? b.getInverse() : a.getInverse());
-        result.setSchema(StringUtils.isEmpty(a.getSchema()) ? b.getSchema() : a.getSchema());
+        result.setName(a == null || StringUtils.isEmpty(a.getName()) ? b.getName() : a.getName());
+        result.setInverse(a == null || StringUtils.isEmpty(a.getInverse()) ? b.getInverse() : a.getInverse());
+        result.setSchema(a == null || StringUtils.isEmpty(a.getSchema()) ? b.getSchema() : a.getSchema());
         return result;
     }
 
     private ScholixResource mergeResource(final ScholixResource a, final ScholixResource b) {
+        if (a == null)
+            return b;
         final ScholixResource result = new ScholixResource();
         result.setCollectedFrom(mergeScholixCollectedFrom(a.getCollectedFrom(), b.getCollectedFrom()));
         result.setCreator(mergeScholixEntityId(a.getCreator(), b.getCreator()));
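All of the Scholix changes apply the same defensive rule: tolerate a null list or entry on either side of a merge and skip entries whose name cannot be compared. A rough Scala sketch of that rule, using plain strings in place of ScholixEntityId (the object name and sample values are illustrative only):

object NullSafeMergeSketch {

  // Merge two possibly-null lists, adding entries from b only when no
  // case-insensitively equal entry is already present in a.
  def mergeByName(a: List[String], b: List[String]): List[String] = {
    val base = Option(a).getOrElse(Nil)
    Option(b).getOrElse(Nil).foldLeft(base) { (acc, s) =>
      if (s != null && !acc.exists(t => t != null && t.equalsIgnoreCase(s))) acc :+ s
      else acc
    }
  }

  def main(args: Array[String]): Unit = {
    println(mergeByName(List("ANDS", null), List("ands", "Pangaea", null)))
    // prints: List(ANDS, null, Pangaea)
  }
}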
Oozie job configuration:

@@ -7,4 +7,8 @@
         <name>oozie.action.sharelib.for.spark</name>
         <value>spark2</value>
     </property>
+    <property>
+        <name>oozie.launcher.mapreduce.user.classpath.first</name>
+        <value>true</value>
+    </property>
 </configuration>
Oozie workflow (Index graph to ElasticSearch):

@@ -1,9 +1,17 @@
-<workflow-app name="Index graph to ElasticSearch" xmlns="uri:oozie:workflow:0.5">
+<workflow-app name="Materialize and Index graph to ElasticSearch" xmlns="uri:oozie:workflow:0.5">
     <parameters>
         <property>
             <name>workingDirPath</name>
             <description>the source path</description>
         </property>
+        <property>
+            <name>index</name>
+            <description>the index name</description>
+        </property>
+        <property>
+            <name>esCluster</name>
+            <description>the Index cluster</description>
+        </property>
         <property>
             <name>sparkDriverMemory</name>
             <description>memory for driver process</description>
@@ -12,39 +20,43 @@
             <name>sparkExecutorMemory</name>
             <description>memory for individual executor</description>
         </property>
-        <property>
-            <name>index</name>
-            <description>index name</description>
-        </property>
-
-        <property>
-            <name>indexHost</name>
-            <description>index host name</description>
-        </property>
     </parameters>
 
-    <start to="indexSummary"/>
+    <start to="DropAndCreateIndex"/>
 
     <kill name="Kill">
         <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
     </kill>
 
+    <action name="DropAndCreateIndex">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.dhp.provision.DropAndCreateESIndex</main-class>
+            <arg>-i</arg><arg>${index}</arg>
+            <arg>-c</arg><arg>${esCluster}</arg>
+        </java>
+        <ok to="indexSummary"/>
+        <error to="Kill"/>
+    </action>
+
     <action name="indexSummary">
         <spark xmlns="uri:oozie:spark-action:0.2">
             <job-tracker>${jobTracker}</job-tracker>
             <name-node>${nameNode}</name-node>
             <master>yarn-cluster</master>
             <mode>cluster</mode>
-            <name>index Summary</name>
+            <name>index summary</name>
             <class>eu.dnetlib.dhp.provision.SparkIndexCollectionOnES</class>
             <jar>dhp-graph-provision-scholexplorer-${projectVersion}.jar</jar>
-            <spark-opts>--executor-memory ${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} ${sparkExtraOPT} --conf spark.dynamicAllocation.maxExecutors="32" </spark-opts>
+            <spark-opts>--executor-memory ${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} ${sparkExtraOPT} --conf spark.dynamicAllocation.maxExecutors="8" </spark-opts>
             <arg>-mt</arg> <arg>yarn-cluster</arg>
-            <arg>--sourcePath</arg><arg>${workingDirPath}/summary</arg>
+            <arg>--sourcePath</arg><arg>${workingDirPath}/summary_json</arg>
             <arg>--index</arg><arg>${index}_object</arg>
-            <arg>--esHost</arg><arg>${indexHost}</arg>
             <arg>--idPath</arg><arg>id</arg>
-            <arg>--type</arg><arg>summary</arg>
+            <arg>--cluster</arg><arg>${esCluster}</arg>
         </spark>
         <ok to="indexScholix"/>
         <error to="Kill"/>
@@ -63,9 +75,8 @@
             <arg>-mt</arg> <arg>yarn-cluster</arg>
             <arg>--sourcePath</arg><arg>${workingDirPath}/scholix_json</arg>
             <arg>--index</arg><arg>${index}_scholix</arg>
-            <arg>--esHost</arg><arg>${indexHost}</arg>
             <arg>--idPath</arg><arg>identifier</arg>
-            <arg>--type</arg><arg>scholix</arg>
+            <arg>--cluster</arg><arg>${esCluster}</arg>
         </spark>
         <ok to="End"/>
         <error to="Kill"/>
@@ -112,59 +112,5 @@
         <error to="Kill"/>
     </action>
 
-    <action name="DropAndCreateIndex">
-        <java>
-            <job-tracker>${jobTracker}</job-tracker>
-            <name-node>${nameNode}</name-node>
-            <main-class>eu.dnetlib.dhp.provision.DropAndCreateESIndex</main-class>
-            <arg>-i</arg><arg>${index}</arg>
-            <arg>-c</arg><arg>${esCluster}</arg>
-        </java>
-        <ok to="indexSummary"/>
-        <error to="Kill"/>
-    </action>
-
-    <action name="indexSummary">
-        <spark xmlns="uri:oozie:spark-action:0.2">
-            <job-tracker>${jobTracker}</job-tracker>
-            <name-node>${nameNode}</name-node>
-            <master>yarn-cluster</master>
-            <mode>cluster</mode>
-            <name>index summary</name>
-            <class>eu.dnetlib.dhp.provision.SparkIndexCollectionOnES</class>
-            <jar>dhp-graph-provision-scholexplorer-${projectVersion}.jar</jar>
-            <spark-opts>--executor-memory ${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} ${sparkExtraOPT} --conf spark.dynamicAllocation.maxExecutors="8" </spark-opts>
-            <arg>-mt</arg> <arg>yarn-cluster</arg>
-            <arg>--sourcePath</arg><arg>${workingDirPath}/summary_json</arg>
-            <arg>--index</arg><arg>${index}_object</arg>
-            <arg>--idPath</arg><arg>id</arg>
-            <arg>--cluster</arg><arg>${esCluster}</arg>
-        </spark>
-        <ok to="indexScholix"/>
-        <error to="Kill"/>
-    </action>
-
-    <action name="indexScholix">
-        <spark xmlns="uri:oozie:spark-action:0.2">
-            <job-tracker>${jobTracker}</job-tracker>
-            <name-node>${nameNode}</name-node>
-            <master>yarn-cluster</master>
-            <mode>cluster</mode>
-            <name>index scholix</name>
-            <class>eu.dnetlib.dhp.provision.SparkIndexCollectionOnES</class>
-            <jar>dhp-graph-provision-scholexplorer-${projectVersion}.jar</jar>
-            <spark-opts>--executor-memory ${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} ${sparkExtraOPT} --conf spark.dynamicAllocation.maxExecutors="8" </spark-opts>
-            <arg>-mt</arg> <arg>yarn-cluster</arg>
-            <arg>--sourcePath</arg><arg>${workingDirPath}/scholix_json</arg>
-            <arg>--index</arg><arg>${index}_scholix</arg>
-            <arg>--idPath</arg><arg>identifier</arg>
-            <arg>--cluster</arg><arg>${esCluster}</arg>
-        </spark>
-        <ok to="End"/>
-        <error to="Kill"/>
-    </action>
-
     <end name="End"/>
 </workflow-app>