forked from D-Net/dnet-hadoop

indexing workflow WIP: fixed the projects' fundingtree XML conversion; prioritized links between results and projects when limiting them to 100 per entity in the join procedure

commit bc7cfd5975 (parent 60bc2b1a20)
@@ -1,10 +1,12 @@
-sparkDriverMemory=8G
+sparkDriverMemory=10G
-sparkExecutorMemory=8G
+sparkExecutorMemory=15G
 #isLookupUrl=http://services.openaire.eu:8280/is/services/isLookUp
 isLookupUrl=http://beta.services.openaire.eu:8280/is/services/isLookUp?wsdl
 sourcePath=/tmp/db_openaireplus_services.export_dhp.2020.02.03
 outputPath=/tmp/openaire_provision
 format=TMF
 batchSize=2000
+sparkExecutorCoresForJoining=128
 sparkExecutorCoresForIndexing=64
-reuseRecords=true
+reuseRecords=false
+otherDsTypeId=scholarcomminfra, infospace, pubsrepository::mock, entityregistry, entityregistry::projects, entityregistry::repositories, websource
@@ -1,31 +1,32 @@
 package eu.dnetlib.dhp.graph;

-import com.fasterxml.jackson.annotation.JsonInclude;
+import com.fasterxml.jackson.databind.DeserializationFeature;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import com.google.common.collect.Iterables;
-import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
 import com.jayway.jsonpath.DocumentContext;
 import com.jayway.jsonpath.JsonPath;
 import eu.dnetlib.dhp.graph.model.*;
 import eu.dnetlib.dhp.graph.utils.ContextMapper;
 import eu.dnetlib.dhp.graph.utils.GraphMappingUtils;
+import eu.dnetlib.dhp.graph.utils.RelationPartitioner;
 import eu.dnetlib.dhp.graph.utils.XmlRecordFactory;
 import eu.dnetlib.dhp.schema.oaf.*;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.compress.GzipCodec;
 import org.apache.hadoop.mapred.SequenceFileOutputFormat;
+import org.apache.spark.SparkContext;
 import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.api.java.function.PairFunction;
 import org.apache.spark.sql.SparkSession;
+import org.apache.spark.util.LongAccumulator;
 import scala.Tuple2;

 import java.io.IOException;
 import java.io.Serializable;
-import java.util.HashSet;
-import java.util.List;
-import java.util.stream.Collectors;
+import java.util.Map;

 import static eu.dnetlib.dhp.graph.utils.GraphMappingUtils.asRelatedEntity;
@@ -49,6 +50,8 @@ import static eu.dnetlib.dhp.graph.utils.GraphMappingUtils.asRelatedEntity;
  */
 public class GraphJoiner implements Serializable {

+    private Map<String, LongAccumulator> accumulators = Maps.newHashMap();
+
     public static final int MAX_RELS = 100;

     public static final String schemaLocation = "https://www.openaire.eu/schema/1.0/oaf-1.0.xsd";
@@ -61,24 +64,30 @@ public class GraphJoiner implements Serializable {

     private String outPath;

-    public GraphJoiner(SparkSession spark, ContextMapper contextMapper, String inputPath, String outPath) {
+    private String otherDsTypeId;
+
+    public GraphJoiner(SparkSession spark, ContextMapper contextMapper, String otherDsTypeId, String inputPath, String outPath) {
         this.spark = spark;
         this.contextMapper = contextMapper;
+        this.otherDsTypeId = otherDsTypeId;
         this.inputPath = inputPath;
         this.outPath = outPath;

+        final SparkContext sc = spark.sparkContext();
+        prepareAccumulators(sc);
     }

     public GraphJoiner adjacencyLists() {
-        final JavaSparkContext sc = new JavaSparkContext(getSpark().sparkContext());
+        final JavaSparkContext jsc = new JavaSparkContext(getSpark().sparkContext());

         // read each entity
-        JavaPairRDD<String, TypedRow> datasource = readPathEntity(sc, getInputPath(), "datasource");
-        JavaPairRDD<String, TypedRow> organization = readPathEntity(sc, getInputPath(), "organization");
-        JavaPairRDD<String, TypedRow> project = readPathEntity(sc, getInputPath(), "project");
-        JavaPairRDD<String, TypedRow> dataset = readPathEntity(sc, getInputPath(), "dataset");
-        JavaPairRDD<String, TypedRow> otherresearchproduct = readPathEntity(sc, getInputPath(), "otherresearchproduct");
-        JavaPairRDD<String, TypedRow> software = readPathEntity(sc, getInputPath(), "software");
-        JavaPairRDD<String, TypedRow> publication = readPathEntity(sc, getInputPath(), "publication");
+        JavaPairRDD<String, TypedRow> datasource = readPathEntity(jsc, getInputPath(), "datasource");
+        JavaPairRDD<String, TypedRow> organization = readPathEntity(jsc, getInputPath(), "organization");
+        JavaPairRDD<String, TypedRow> project = readPathEntity(jsc, getInputPath(), "project");
+        JavaPairRDD<String, TypedRow> dataset = readPathEntity(jsc, getInputPath(), "dataset");
+        JavaPairRDD<String, TypedRow> otherresearchproduct = readPathEntity(jsc, getInputPath(), "otherresearchproduct");
+        JavaPairRDD<String, TypedRow> software = readPathEntity(jsc, getInputPath(), "software");
+        JavaPairRDD<String, TypedRow> publication = readPathEntity(jsc, getInputPath(), "publication");

         // create the union between all the entities
         final String entitiesPath = getOutPath() + "/entities";
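The constructor change above wires counting into the join: prepareAccumulators(sc) registers one named LongAccumulator per relation descriptor before any job runs. As a reminder of the underlying Spark API, a minimal sketch (assuming an active SparkContext named sc; not project code):

    LongAccumulator acc = sc.longAccumulator("resultProject_outcome_isProducedBy");
    acc.add(1);                  // typically invoked from executor-side code
    long count = acc.value();    // read back on the driver once the job has completed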
@@ -93,31 +102,43 @@ public class GraphJoiner implements Serializable {
                 .map(GraphMappingUtils::serialize)
                 .saveAsTextFile(entitiesPath, GzipCodec.class);

-        JavaPairRDD<String, EntityRelEntity> entities = sc.textFile(entitiesPath)
+        JavaPairRDD<String, EntityRelEntity> entities = jsc.textFile(entitiesPath)
                 .map(t -> new ObjectMapper().readValue(t, EntityRelEntity.class))
                 .mapToPair(t -> new Tuple2<>(t.getSource().getSourceId(), t));

+        final String relationPath = getOutPath() + "/relation";
         // reads the relationships
-        final JavaPairRDD<String, EntityRelEntity> relation = readPathRelation(sc, getInputPath())
-                .filter(r -> !r.getDeleted()) //only consider those that are not virtually deleted
+        final JavaPairRDD<SortableRelationKey, EntityRelEntity> rels = readPathRelation(jsc, getInputPath())
+                .filter(rel -> !rel.getDeleted()) //only consider those that are not virtually deleted
                 .map(p -> new EntityRelEntity().setRelation(p))
-                .mapToPair(p -> new Tuple2<>(p.getRelation().getSourceId(), p))
-                .groupByKey()
+                .mapToPair(p -> new Tuple2<>(SortableRelationKey.from(p), p));
+        rels
+                .groupByKey(new RelationPartitioner(rels.getNumPartitions()))
                 .map(p -> Iterables.limit(p._2(), MAX_RELS))
                 .flatMap(p -> p.iterator())
+                .map(s -> new ObjectMapper().writeValueAsString(s))
+                .saveAsTextFile(relationPath, GzipCodec.class);
+
+        final JavaPairRDD<String, EntityRelEntity> relation = jsc.textFile(relationPath)
+                .map(s -> new ObjectMapper().readValue(s, EntityRelEntity.class))
                 .mapToPair(p -> new Tuple2<>(p.getRelation().getTargetId(), p));

-        //final String bySource = getOutPath() + "/1_join_by_target";
+        final String bySourcePath = getOutPath() + "/join_by_source";
-        JavaPairRDD<String, EntityRelEntity> bySource = relation
+        relation
                 .join(entities
                         .filter(e -> !e._2().getSource().getDeleted())
                         .mapToPair(e -> new Tuple2<>(e._1(), asRelatedEntity(e._2()))))
                 .map(s -> new EntityRelEntity()
                         .setRelation(s._2()._1().getRelation())
                         .setTarget(s._2()._2().getSource()))
+                .map(j -> new ObjectMapper().writeValueAsString(j))
+                .saveAsTextFile(bySourcePath, GzipCodec.class);
+
+        JavaPairRDD<String, EntityRelEntity> bySource = jsc.textFile(bySourcePath)
+                .map(e -> getObjectMapper().readValue(e, EntityRelEntity.class))
                 .mapToPair(t -> new Tuple2<>(t.getRelation().getSourceId(), t));

-        final XmlRecordFactory recordFactory = new XmlRecordFactory(contextMapper, false, schemaLocation, new HashSet<>());
+        final XmlRecordFactory recordFactory = new XmlRecordFactory(accumulators, contextMapper, false, schemaLocation, otherDsTypeId);
         entities
                 .union(bySource)
                 .groupByKey() // by source id
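The cap applied above with Iterables.limit(p._2(), MAX_RELS) keeps at most 100 links per group without copying it: Guava returns a lazy view over the first N elements. Combined with the SortableRelationKey ordering, the intent of this commit is that the survivors are the highest-priority links, with result-project "outcome" relations kept first. A standalone illustration of the limit call (hypothetical data, assuming Guava on the classpath):

    import com.google.common.collect.Iterables;
    import java.util.Arrays;
    import java.util.List;

    public class LimitDemo {
        public static void main(String[] args) {
            List<Integer> xs = Arrays.asList(1, 2, 3, 4, 5);
            // a lazy view over the first three elements; the source list is not copied
            Iterable<Integer> capped = Iterables.limit(xs, 3);
            System.out.println(Iterables.toString(capped)); // prints [1, 2, 3]
        }
    }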
@@ -130,20 +151,6 @@ public class GraphJoiner implements Serializable {
         return this;
     }

-    public GraphJoiner asXML() {
-        final JavaSparkContext sc = new JavaSparkContext(getSpark().sparkContext());
-        final XmlRecordFactory recordFactory = new XmlRecordFactory(contextMapper, true, "", new HashSet<>());
-        final ObjectMapper mapper = new ObjectMapper();
-
-        final String joinedEntitiesPath = getOutPath() + "/1_joined_entities";
-        sc.textFile(joinedEntitiesPath)
-                .map(s -> mapper.readValue(s, JoinedEntity.class))
-                .mapToPair(je -> new Tuple2<>(new Text(je.getEntity().getId()), new Text(recordFactory.build(je))))
-                .saveAsHadoopFile(getOutPath() + "/2_xml", Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
-
-        return this;
-    }
-
     public SparkSession getSpark() {
         return spark;
     }
@@ -158,24 +165,23 @@ public class GraphJoiner implements Serializable {

     // HELPERS

-    private OafEntity parseOaf(final String json, final String type) {
-        final ObjectMapper o = new ObjectMapper();
+    private OafEntity parseOaf(final String json, final String type, final ObjectMapper mapper) {
         try {
             switch (GraphMappingUtils.EntityType.valueOf(type)) {
                 case publication:
-                    return o.readValue(json, Publication.class);
+                    return mapper.readValue(json, Publication.class);
                 case dataset:
-                    return o.readValue(json, Dataset.class);
+                    return mapper.readValue(json, Dataset.class);
                 case otherresearchproduct:
-                    return o.readValue(json, OtherResearchProduct.class);
+                    return mapper.readValue(json, OtherResearchProduct.class);
                 case software:
-                    return o.readValue(json, Software.class);
+                    return mapper.readValue(json, Software.class);
                 case datasource:
-                    return o.readValue(json, Datasource.class);
+                    return mapper.readValue(json, Datasource.class);
                 case organization:
-                    return o.readValue(json, Organization.class);
+                    return mapper.readValue(json, Organization.class);
                 case project:
-                    return o.readValue(json, Project.class);
+                    return mapper.readValue(json, Project.class);
                 default:
                     throw new IllegalArgumentException("invalid type: " + type);
             }
@@ -185,26 +191,26 @@ public class GraphJoiner implements Serializable {
     }

     private JoinedEntity toJoinedEntity(Tuple2<String, Iterable<EntityRelEntity>> p) {
-        final ObjectMapper o = new ObjectMapper();
+        final ObjectMapper mapper = getObjectMapper();
         final JoinedEntity j = new JoinedEntity();
-        final Links links2 = new Links();
+        final Links links = new Links();
         for(EntityRelEntity rel : p._2()) {
             if (rel.hasMainEntity() & j.getEntity() == null) {
                 j.setType(rel.getSource().getType());
-                j.setEntity(parseOaf(rel.getSource().getOaf(), rel.getSource().getType()));
+                j.setEntity(parseOaf(rel.getSource().getOaf(), rel.getSource().getType(), mapper));
             }
             if (rel.hasRelatedEntity()) {
                 try {
-                    links2.add(
+                    links.add(
                             new eu.dnetlib.dhp.graph.model.Tuple2()
-                                    .setRelation(o.readValue(rel.getRelation().getOaf(), Relation.class))
-                                    .setRelatedEntity(o.readValue(rel.getTarget().getOaf(), RelatedEntity.class)));
+                                    .setRelation(mapper.readValue(rel.getRelation().getOaf(), Relation.class))
+                                    .setRelatedEntity(mapper.readValue(rel.getTarget().getOaf(), RelatedEntity.class)));
                 } catch (IOException e) {
                     throw new IllegalArgumentException(e);
                 }
             }
         }
-        j.setLinks(links2);
+        j.setLinks(links);
         if (j.getEntity() == null) {
             throw new IllegalStateException("missing main entity on '" + p._1() + "'");
         }
@@ -250,8 +256,38 @@ public class GraphJoiner implements Serializable {
                     .setTargetId(json.read("$.target"))
                     .setDeleted(json.read("$.dataInfo.deletedbyinference"))
                     .setType("relation")
+                    .setRelType(json.read("$.relType"))
+                    .setSubRelType(json.read("$.subRelType"))
+                    .setRelClass(json.read("$.relClass"))
                     .setOaf(s);
         });
     }

+    private ObjectMapper getObjectMapper() {
+        return new ObjectMapper().configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
+    }
+
+    private void prepareAccumulators(SparkContext sc) {
+        accumulators.put("resultResult_similarity_isAmongTopNSimilarDocuments", sc.longAccumulator("resultResult_similarity_isAmongTopNSimilarDocuments"));
+        accumulators.put("resultResult_similarity_hasAmongTopNSimilarDocuments", sc.longAccumulator("resultResult_similarity_hasAmongTopNSimilarDocuments"));
+        accumulators.put("resultResult_supplement_isSupplementTo", sc.longAccumulator("resultResult_supplement_isSupplementTo"));
+        accumulators.put("resultResult_supplement_isSupplementedBy", sc.longAccumulator("resultResult_supplement_isSupplementedBy"));
+        accumulators.put("resultResult_dedup_isMergedIn", sc.longAccumulator("resultResult_dedup_isMergedIn"));
+        accumulators.put("resultResult_dedup_merges", sc.longAccumulator("resultResult_dedup_merges"));
+        accumulators.put("resultResult_publicationDataset_isRelatedTo", sc.longAccumulator("resultResult_publicationDataset_isRelatedTo"));
+        accumulators.put("resultResult_relationship_isRelatedTo", sc.longAccumulator("resultResult_relationship_isRelatedTo"));
+        accumulators.put("resultProject_outcome_isProducedBy", sc.longAccumulator("resultProject_outcome_isProducedBy"));
+        accumulators.put("resultProject_outcome_produces", sc.longAccumulator("resultProject_outcome_produces"));
+        accumulators.put("resultOrganization_affiliation_isAuthorInstitutionOf", sc.longAccumulator("resultOrganization_affiliation_isAuthorInstitutionOf"));
+        accumulators.put("resultOrganization_affiliation_hasAuthorInstitution", sc.longAccumulator("resultOrganization_affiliation_hasAuthorInstitution"));
+        accumulators.put("projectOrganization_participation_hasParticipant", sc.longAccumulator("projectOrganization_participation_hasParticipant"));
+        accumulators.put("projectOrganization_participation_isParticipant", sc.longAccumulator("projectOrganization_participation_isParticipant"));
+        accumulators.put("organizationOrganization_dedup_isMergedIn", sc.longAccumulator("organizationOrganization_dedup_isMergedIn"));
+        accumulators.put("organizationOrganization_dedup_merges", sc.longAccumulator("organizationOrganization_dedup_merges"));
+        accumulators.put("datasourceOrganization_provision_isProvidedBy", sc.longAccumulator("datasourceOrganization_provision_isProvidedBy"));
+        accumulators.put("datasourceOrganization_provision_provides", sc.longAccumulator("datasourceOrganization_provision_provides"));
+    }
+
 }
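The new getObjectMapper() helper disables FAIL_ON_UNKNOWN_PROPERTIES, so records serialized with extra fields (such as the relType/subRelType/relClass now carried by TypedRow) can still be read back by bean definitions that lack them. A standalone illustration (the Bean class here is hypothetical):

    import com.fasterxml.jackson.databind.DeserializationFeature;
    import com.fasterxml.jackson.databind.ObjectMapper;

    public class LenientJsonDemo {
        public static class Bean { public int known; }   // hypothetical bean with a single field

        public static void main(String[] args) throws Exception {
            ObjectMapper mapper = new ObjectMapper()
                    .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
            // the extra "unknown" property is ignored instead of raising UnrecognizedPropertyException
            Bean b = mapper.readValue("{\"known\":1,\"unknown\":2}", Bean.class);
            System.out.println(b.known); // prints 1
        }
    }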
@@ -24,6 +24,7 @@ public class SparkXmlRecordBuilderJob {
         final String inputPath = parser.get("sourcePath");
         final String outputPath = parser.get("outputPath");
         final String isLookupUrl = parser.get("isLookupUrl");
+        final String otherDsTypeId = parser.get("otherDsTypeId");

         final FileSystem fs = FileSystem.get(spark.sparkContext().hadoopConfiguration());
         if (fs.exists(new Path(outputPath))) {
@@ -31,8 +32,9 @@ public class SparkXmlRecordBuilderJob {
             fs.mkdirs(new Path(outputPath));
         }

-        new GraphJoiner(spark, ContextMapper.fromIS(isLookupUrl), inputPath, outputPath)
+        new GraphJoiner(spark, ContextMapper.fromIS(isLookupUrl), otherDsTypeId, inputPath, outputPath)
                 .adjacencyLists();
+                //.asXML();
     }
 }
@@ -1,4 +1,99 @@
-package eu.dnetlib.dhp.graph;
+package eu.dnetlib.dhp.graph.model;

-public class SortableRelationKey {
+import com.google.common.collect.ComparisonChain;
+import com.google.common.collect.Maps;
+
+import java.io.Serializable;
+import java.util.Map;
+
+/**
+ * Allows to sort relationships according to the priority defined in the weights map.
+ */
+public class SortableRelationKey implements Comparable<SortableRelationKey>, Serializable {
+
+    private String sourceId;
+    private String targetId;
+
+    private String relType;
+    private String subRelType;
+    private String relClass;
+
+    private final static Map<String, Integer> weights = Maps.newHashMap();
+
+    static {
+        weights.put("outcome", 0);
+        weights.put("supplement", 1);
+        weights.put("publicationDataset", 2);
+        weights.put("relationship", 3);
+        weights.put("similarity", 4);
+        weights.put("affiliation", 5);
+        weights.put("provision", 6);
+        weights.put("participation", 7);
+        weights.put("dedup", 8);
+    }
+
+    public static SortableRelationKey from(final EntityRelEntity e) {
+        return new SortableRelationKey()
+                .setSourceId(e.getRelation().getSourceId())
+                .setTargetId(e.getRelation().getTargetId())
+                .setRelType(e.getRelation().getRelType())
+                .setSubRelType(e.getRelation().getSubRelType())
+                .setRelClass(e.getRelation().getRelClass());
+    }
+
+    public String getSourceId() {
+        return sourceId;
+    }
+
+    public SortableRelationKey setSourceId(String sourceId) {
+        this.sourceId = sourceId;
+        return this;
+    }
+
+    public String getTargetId() {
+        return targetId;
+    }
+
+    public SortableRelationKey setTargetId(String targetId) {
+        this.targetId = targetId;
+        return this;
+    }
+
+    public String getRelType() {
+        return relType;
+    }
+
+    public SortableRelationKey setRelType(String relType) {
+        this.relType = relType;
+        return this;
+    }
+
+    public String getSubRelType() {
+        return subRelType;
+    }
+
+    public SortableRelationKey setSubRelType(String subRelType) {
+        this.subRelType = subRelType;
+        return this;
+    }
+
+    public String getRelClass() {
+        return relClass;
+    }
+
+    public SortableRelationKey setRelClass(String relClass) {
+        this.relClass = relClass;
+        return this;
+    }
+
+    @Override
+    public int compareTo(SortableRelationKey o) {
+        return ComparisonChain.start()
+                .compare(weights.get(getSubRelType()), weights.get(o.getSubRelType()))
+                .compare(getSourceId(), o.getSourceId())
+                .compare(getTargetId(), o.getTargetId())
+                .result();
+    }
+
 }
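compareTo orders keys primarily by the weight of their subRelType, which is what makes the MAX_RELS truncation keep result-project "outcome" links (weight 0) ahead of, say, "similarity" links (weight 4). A standalone sketch of that first comparison leg (hypothetical values, assuming Guava):

    import com.google.common.collect.ComparisonChain;
    import com.google.common.collect.ImmutableMap;
    import java.util.Arrays;
    import java.util.List;

    public class WeightOrderDemo {
        static final ImmutableMap<String, Integer> WEIGHTS =
                ImmutableMap.of("outcome", 0, "supplement", 1, "similarity", 4);

        public static void main(String[] args) {
            List<String> subRelTypes = Arrays.asList("similarity", "outcome", "supplement");
            // lower weight sorts first, as in SortableRelationKey.compareTo
            subRelTypes.sort((a, b) -> ComparisonChain.start()
                    .compare(WEIGHTS.get(a), WEIGHTS.get(b))
                    .result());
            System.out.println(subRelTypes); // prints [outcome, supplement, similarity]
        }
    }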
@@ -12,6 +12,10 @@ public class TypedRow implements Serializable {

     private String type;

+    private String relType;
+    private String subRelType;
+    private String relClass;
+
     private String oaf;

     public String getSourceId() {
@@ -50,6 +54,33 @@ public class TypedRow implements Serializable {
         return this;
     }

+    public String getRelType() {
+        return relType;
+    }
+
+    public TypedRow setRelType(String relType) {
+        this.relType = relType;
+        return this;
+    }
+
+    public String getSubRelType() {
+        return subRelType;
+    }
+
+    public TypedRow setSubRelType(String subRelType) {
+        this.subRelType = subRelType;
+        return this;
+    }
+
+    public String getRelClass() {
+        return relClass;
+    }
+
+    public TypedRow setRelClass(String relClass) {
+        this.relClass = relClass;
+        return this;
+    }
+
     public String getOaf() {
         return oaf;
     }
@@ -26,6 +26,8 @@ import static org.apache.commons.lang3.StringUtils.*;

 public class GraphMappingUtils {

+    public static final String SEPARATOR = "_";
+
     public enum EntityType {
         publication, dataset, otherresearchproduct, software, datasource, organization, project
     }
@@ -250,5 +252,8 @@ public class GraphMappingUtils {
         return s;
     }

+    public static String getRelDescriptor(String relType, String subRelType, String relClass) {
+        return relType + SEPARATOR + subRelType + SEPARATOR + relClass;
+    }
+
 }
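getRelDescriptor joins the three relation coordinates with the "_" SEPARATOR, yielding exactly the names registered by GraphJoiner.prepareAccumulators; for example:

    // evaluates to "resultProject_outcome_isProducedBy"
    String name = GraphMappingUtils.getRelDescriptor("resultProject", "outcome", "isProducedBy");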
@@ -1,4 +1,29 @@
 package eu.dnetlib.dhp.graph.utils;

-public class RelationPartitioner {
+import eu.dnetlib.dhp.graph.model.SortableRelationKey;
+import org.apache.spark.Partitioner;
+import org.apache.spark.util.Utils;
+
+/**
+ * Used in combination with SortableRelationKey: it partitions the records by source id, so that
+ * relations sharing the same source id can be sorted according to the ordering defined in SortableRelationKey.
+ */
+public class RelationPartitioner extends Partitioner {
+
+    private int numPartitions;
+
+    public RelationPartitioner(int numPartitions) {
+        this.numPartitions = numPartitions;
+    }
+
+    @Override
+    public int numPartitions() {
+        return numPartitions;
+    }
+
+    @Override
+    public int getPartition(Object key) {
+        return Utils.nonNegativeMod(((SortableRelationKey) key).getSourceId().hashCode(), numPartitions());
+    }
+
 }
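getPartition delegates to Spark's Utils.nonNegativeMod so that negative hashCode values cannot produce a negative partition index. A minimal equivalent, shown only to make the semantics explicit (the sample id is made up):

    // equivalent of org.apache.spark.util.Utils.nonNegativeMod
    static int nonNegativeMod(int x, int mod) {
        int r = x % mod;
        return r < 0 ? r + mod : r;
    }

    // relations sharing a sourceId always land in the same partition
    int partition = nonNegativeMod("40|sample_source_id".hashCode(), 128);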
@@ -3,6 +3,7 @@ package eu.dnetlib.dhp.graph.utils;
 import com.google.common.base.Joiner;
 import com.google.common.base.Splitter;
 import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
 import com.google.common.collect.Sets;
 import com.mycila.xmltool.XMLDoc;
 import com.mycila.xmltool.XMLTag;
@@ -11,6 +12,8 @@ import eu.dnetlib.dhp.graph.model.RelatedEntity;
 import eu.dnetlib.dhp.graph.model.Tuple2;
 import eu.dnetlib.dhp.schema.oaf.Result;
 import eu.dnetlib.dhp.schema.oaf.*;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.spark.util.LongAccumulator;
 import org.dom4j.Document;
 import org.dom4j.DocumentException;
 import org.dom4j.Element;
@@ -27,6 +30,7 @@ import java.io.Serializable;
 import java.io.StringReader;
 import java.io.StringWriter;
 import java.util.List;
+import java.util.Map;
 import java.util.Set;
 import java.util.stream.Collectors;

@@ -37,6 +41,8 @@ import static org.apache.commons.lang3.StringUtils.substringBefore;

 public class XmlRecordFactory implements Serializable {

+    private Map<String, LongAccumulator> accumulators;
+
     private Set<String> specialDatasourceTypes;

     private ContextMapper contextMapper;
@@ -47,11 +53,20 @@ public class XmlRecordFactory implements Serializable {

     public XmlRecordFactory(
             final ContextMapper contextMapper, final boolean indent,
-            final String schemaLocation, final Set<String> otherDatasourceTypesUForUI) {
+            final String schemaLocation, final String otherDatasourceTypesUForUI) {
+
+        this(Maps.newHashMap(), contextMapper, indent, schemaLocation, otherDatasourceTypesUForUI);
+    }
+
+    public XmlRecordFactory(
+            final Map<String, LongAccumulator> accumulators,
+            final ContextMapper contextMapper, final boolean indent,
+            final String schemaLocation, final String otherDatasourceTypesUForUI) {
+
+        this.accumulators = accumulators;
         this.contextMapper = contextMapper;
         this.schemaLocation = schemaLocation;
-        this.specialDatasourceTypes = otherDatasourceTypesUForUI;
+        this.specialDatasourceTypes = Sets.newHashSet(Splitter.on(",").trimResults().split(otherDatasourceTypesUForUI));

        this.indent = indent;
     }
@@ -583,7 +598,7 @@ public class XmlRecordFactory implements Serializable {
             if (p.getFundingtree() != null) {
                 metadata.addAll(p.getFundingtree()
                         .stream()
-                        .map(ft -> asXmlElement("fundingtree", ft.getValue()))
+                        .map(ft -> ft.getValue())
                         .collect(Collectors.toList()));
             }

@@ -713,12 +728,27 @@ public class XmlRecordFactory implements Serializable {
             }
             final DataInfo info = rel.getDataInfo();

+            final String inverseRelClass = getInverseRelClass(rel.getRelClass());
+            final String scheme = getScheme(re.getType(), targetType);
+
+            if (StringUtils.isBlank(inverseRelClass)) {
+                throw new IllegalArgumentException("missing inverse for: " + rel.getRelClass());
+            }
+            if (StringUtils.isBlank(scheme)) {
+                throw new IllegalArgumentException(String.format("missing scheme for: <%s - %s>", re.getType(), targetType));
+            }
+
+            final String accumulatorName = getRelDescriptor(rel.getRelType(), rel.getSubRelType(), rel.getRelClass());
+            if (accumulators.containsKey(accumulatorName)) {
+                accumulators.get(accumulatorName).add(1);
+            }
+
             rels.add(templateFactory.getRel(
                     targetType,
                     rel.getTarget(),
                     Sets.newHashSet(metadata),
-                    getInverseRelClass(rel.getRelClass()),
-                    getScheme(targetType, re.getType()),
+                    inverseRelClass,
+                    scheme,
                     info));
         }
         return rels;
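The factory now takes the special datasource types as a single comma-separated string (the new otherDsTypeId parameter) and normalizes it into a set in the constructor. The same Guava calls in isolation (values borrowed from the job properties above):

    import com.google.common.base.Splitter;
    import com.google.common.collect.Sets;
    import java.util.Set;

    public class SplitDemo {
        public static void main(String[] args) {
            String otherDsTypeId = "scholarcomminfra, infospace, pubsrepository::mock";
            // trimResults() removes the blanks that follow each comma
            Set<String> special = Sets.newHashSet(Splitter.on(",").trimResults().split(otherDsTypeId));
            System.out.println(special.contains("infospace")); // prints true
        }
    }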
@@ -1,6 +1,7 @@
 [
   {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
   {"paramName":"is", "paramLongName":"isLookupUrl", "paramDescription": "URL of the isLookUp Service", "paramRequired": true},
   {"paramName":"o", "paramLongName":"outputPath", "paramDescription": "the path used to store temporary output files", "paramRequired": true},
-  {"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the path of the sequence file to read", "paramRequired": true}
+  {"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the path of the sequence file to read", "paramRequired": true},
+  {"paramName":"t", "paramLongName":"otherDsTypeId", "paramDescription": "list of datasource types to populate field datasourcetypeui", "paramRequired": true}
 ]
@@ -53,6 +53,7 @@
             --executor-memory ${sparkExecutorMemory}
             --executor-cores ${sparkExecutorCores}
             --driver-memory=${sparkDriverMemory}
+            --conf spark.dynamicAllocation.maxExecutors=${sparkExecutorCoresForJoining}
             --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener"
             --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"
             --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@@ -60,6 +61,7 @@
         </spark-opts>
         <arg>-mt</arg> <arg>yarn</arg>
         <arg>-is</arg> <arg>${isLookupUrl}</arg>
+        <arg>-t</arg> <arg>${otherDsTypeId}</arg>
         <arg>--sourcePath</arg><arg>${sourcePath}</arg>
         <arg>--outputPath</arg><arg>${outputPath}</arg>
     </spark>
@@ -1,4 +1,38 @@
 package eu.dnetlib.dhp.graph;

+import org.junit.Before;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
 public class GraphJoinerTest {

+    private ClassLoader cl = getClass().getClassLoader();
+
+    private Path workingDir;
+    private Path inputDir;
+    private Path outputDir;
+
+    @Before
+    public void before() throws IOException {
+        workingDir = Files.createTempDirectory("promote_action_set");
+        inputDir = workingDir.resolve("input");
+        outputDir = workingDir.resolve("output");
+    }
+
+    private static void copyFiles(Path source, Path target) throws IOException {
+        Files.list(source).forEach(f -> {
+            try {
+                if (Files.isDirectory(f)) {
+                    Path subTarget = Files.createDirectories(target.resolve(f.getFileName()));
+                    copyFiles(f, subTarget);
+                } else {
+                    Files.copy(f, target.resolve(f.getFileName()));
+                }
+            } catch (IOException e) {
+                e.printStackTrace();
+                throw new RuntimeException(e);
+            }
+        });
+    }
 }