diff --git a/dhp-workflows/dhp-graph-provision/README.md b/dhp-workflows/dhp-graph-provision/README.md
new file mode 100644
index 000000000..973a5909d
--- /dev/null
+++ b/dhp-workflows/dhp-graph-provision/README.md
@@ -0,0 +1,21 @@
+Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects. The
+operation considers all the entity types (publication, dataset, software, ORP, project, datasource, organization) and
+all the possible relationships (similarity links produced by the Dedup process are excluded).
+
+The operation is implemented by sequentially joining one entity type at a time (E) with the relationships (R), and
+again by E, finally grouping by E.id.
+
+The workflow is organized in different parts aimed to reduce the complexity of the operation:
+
+1) PrepareRelationsJob: considers only relationships that are not virtually deleted ($.dataInfo.deletedbyinference ==
+false); each entity can be linked to at most 100 other objects.
+
+2) CreateRelatedEntitiesJob (phase 1): prepares the tuples [relation - target entity] (R - T): for each entity type
+E_i, map E_i as RelatedEntity T_i to simplify the model and extract only the necessary information, join (R.target =
+T_i.id), and save the tuples (R_i, T_i); (phase 2): create the union of all the entity types E, hash by id, read the
+tuples (R, T), hash by R.source, join E.id = (R, T).source, where E becomes the Source Entity S, and save the tuples (S, R, T).
+
+3) AdjacencyListBuilderJob: given the tuples (S - R - T), groups by S.id -> List [ R - T ], mapping the
+result as JoinedEntity.
+
+4) XmlConverterJob: converts the JoinedEntities into XML records.
diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/AdjacencyListBuilderJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/AdjacencyListBuilderJob.java
index d9cc03cd5..0bc270e8f 100644
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/AdjacencyListBuilderJob.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/AdjacencyListBuilderJob.java
@@ -25,26 +25,8 @@ import scala.collection.JavaConverters;
 import scala.collection.Seq;
 
 /**
- * Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects. The
- * operation considers all the entity types (publication, dataset, software, ORP, project, datasource, organization, and
- * all the possible relationships (similarity links produced by the Dedup process are excluded).
- * <p>
- * The operation is implemented by sequentially joining one entity type at time (E) with the relationships (R), and
- * again by E, finally grouped by E.id;
- * <p>
- * The workflow is organized in different parts aimed to to reduce the complexity of the operation 1)
- * PrepareRelationsJob: only consider relationships that are not virtually deleted ($.dataInfo.deletedbyinference ==
- * false), each entity can be linked at most to 100 other objects
- * <p>
- * 2) JoinRelationEntityByTargetJob: (phase 1): prepare tuples [relation - target entity] (R - T): for each entity type
- * E_i map E_i as RelatedEntity T_i to simplify the model and extracting only the necessary information join (R.target =
- * T_i.id) save the tuples (R_i, T_i) (phase 2): create the union of all the entity types E, hash by id read the tuples
- * (R, T), hash by R.source join E.id = (R, T).source, where E becomes the Source Entity S save the tuples (S, R, T)
- * <p>
- * 3) AdjacencyListBuilderJob: given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mapping the
+ * AdjacencyListBuilderJob: given the tuples (S - R - T), group by S.id -> List [ R - T ], mapping the
 * result as JoinedEntity
- * <p>
- * 4) XmlConverterJob: convert the JoinedEntities as XML records
 */
 public class AdjacencyListBuilderJob {
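In DataFrame terms, the grouping this job performs amounts to a `groupBy` on the source id that folds the related rows into a list. The sketch below is illustrative only, not the code in this PR: the paths and the flattened (id, name, relClass, targetId, targetName) column layout are assumptions, and the actual job works on typed entity/relation objects.

```java
// Minimal sketch of the adjacency-list grouping, under the assumptions stated above.
import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.collect_list;
import static org.apache.spark.sql.functions.first;
import static org.apache.spark.sql.functions.struct;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class AdjacencyListSketch {

	public static void main(String[] args) {
		SparkSession spark = SparkSession.builder()
			.appName("AdjacencyListSketch")
			.master("local[*]")
			.getOrCreate();

		// (S, R, T) tuples: source entity fields plus one related target per row (assumed layout)
		Dataset<Row> srt = spark.read().parquet("/tmp/provision/join_entities");

		// group by S.id and fold the related rows into a list: one JoinedEntity-like row per source
		srt
			.groupBy(col("id"))
			.agg(
				first(col("name")).as("name"),
				collect_list(struct(col("relClass"), col("targetId"), col("targetName"))).as("links"))
			.write()
			.parquet("/tmp/provision/adjacency_lists");

		spark.stop();
	}
}
```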

diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java
index b08e593f7..86d380409 100644
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java
@@ -31,26 +31,9 @@ import eu.dnetlib.dhp.schema.oaf.*;
 import scala.Tuple2;
 
 /**
- * Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects. The
- * operation considers all the entity types (publication, dataset, software, ORP, project, datasource, organization, and
- * all the possible relationships (similarity links produced by the Dedup process are excluded).
- * <p>
- * The operation is implemented by sequentially joining one entity type at time (E) with the relationships (R), and
- * again by E, finally grouped by E.id;
- * <p>
- * The workflow is organized in different parts aimed to to reduce the complexity of the operation 1)
- * PrepareRelationsJob: only consider relationships that are not virtually deleted ($.dataInfo.deletedbyinference ==
- * false), each entity can be linked at most to 100 other objects
- * <p>
- * 2) JoinRelationEntityByTargetJob: (phase 1): prepare tuples [relation - target entity] (R - T): for each entity type
- * E_i map E_i as RelatedEntity T_i to simplify the model and extracting only the necessary information join (R.target =
- * T_i.id) save the tuples (R_i, T_i) (phase 2): create the union of all the entity types E, hash by id read the tuples
- * (R, T), hash by R.source join E.id = (R, T).source, where E becomes the Source Entity S save the tuples (S, R, T)
- * <p>
- * 3) AdjacencyListBuilderJob: given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mapping the
- * result as JoinedEntity
- * <p>
- * 4) XmlConverterJob: convert the JoinedEntities as XML records
+ * CreateRelatedEntitiesJob (phase 1): prepares the tuples [relation - target entity] (R - T): for each entity type
+ * E_i, map E_i as RelatedEntity T_i to simplify the model and extract only the necessary information, join
+ * (R.target = T_i.id), and save the tuples (R_i, T_i)
 */
 public class CreateRelatedEntitiesJob_phase1 {
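The phase 1 join described in the new javadoc — each relation joined to its target entity, projected down to a RelatedEntity-like shape — can be sketched as follows. This is a sketch under assumed paths and a minimal (id, name) projection, not the PR's typed implementation.

```java
// Minimal sketch of the phase 1 join R.target = T_i.id, under the assumptions stated above.
import static org.apache.spark.sql.functions.col;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class RelatedEntitiesPhase1Sketch {

	public static void main(String[] args) {
		SparkSession spark = SparkSession.builder()
			.appName("RelatedEntitiesPhase1Sketch")
			.master("local[*]")
			.getOrCreate();

		// pruned relations R: (source, target, relClass) (assumed layout)
		Dataset<Row> rels = spark.read().parquet("/tmp/provision/prepared_relations");

		// one entity type E_i, projected down to the few fields needed downstream
		// (the RelatedEntity simplification described in the javadoc)
		Dataset<Row> targets = spark.read().json("/tmp/provision/entities/publication")
			.select(col("id"), col("name"));

		// join R.target = T_i.id and save the (R_i, T_i) tuples
		rels
			.join(targets, rels.col("target").equalTo(targets.col("id")))
			.select(
				rels.col("source"),
				rels.col("relClass"),
				targets.col("id").as("targetId"),
				targets.col("name").as("targetName"))
			.write()
			.parquet("/tmp/provision/related_entities/publication");

		spark.stop();
	}
}
```

Running this once per entity type keeps each join small, which is the point of splitting the operation into per-type phases.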

diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java
index 7e175121e..170835fdb 100644
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java
@@ -34,26 +34,8 @@ import scala.collection.JavaConverters;
 import scala.collection.Seq;
 
 /**
- * Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects. The
- * operation considers all the entity types (publication, dataset, software, ORP, project, datasource, organization, and
- * all the possible relationships (similarity links produced by the Dedup process are excluded).
- * <p>
- * The operation is implemented by sequentially joining one entity type at time (E) with the relationships (R), and
- * again by E, finally grouped by E.id;
- * <p>
- * The workflow is organized in different parts aimed to to reduce the complexity of the operation 1)
- * PrepareRelationsJob: only consider relationships that are not virtually deleted ($.dataInfo.deletedbyinference ==
- * false), each entity can be linked at most to 100 other objects
- * <p>
- * 2) JoinRelationEntityByTargetJob: (phase 1): prepare tuples [relation - target entity] (R - T): for each entity type
- * E_i map E_i as RelatedEntity T_i to simplify the model and extracting only the necessary information join (R.target =
- * T_i.id) save the tuples (R_i, T_i) (phase 2): create the union of all the entity types E, hash by id read the tuples
+ * CreateRelatedEntitiesJob (phase 2): create the union of all the entity types E, hash by id, read the tuples
 * (R, T), hash by R.source join E.id = (R, T).source, where E becomes the Source Entity S save the tuples (S, R, T)
- * <p>
- * 3) AdjacencyListBuilderJob: given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mapping the
- * result as JoinedEntity
- * <p>
- * 4) XmlConverterJob: convert the JoinedEntities as XML records
 */
 public class CreateRelatedEntitiesJob_phase2 {
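Phase 2 is the join of the unioned entities E against the (R, T) tuples on E.id = (R, T).source. A minimal sketch, again under assumed paths and column names rather than the PR's typed code; the left outer join here is an assumption about keeping entities that have no links at all.

```java
// Minimal sketch of the phase 2 join E.id = (R, T).source, under the assumptions stated above.
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class RelatedEntitiesPhase2Sketch {

	public static void main(String[] args) {
		SparkSession spark = SparkSession.builder()
			.appName("RelatedEntitiesPhase2Sketch")
			.master("local[*]")
			.getOrCreate();

		// union of all the entity types E, assumed to share a common (id, name) projection
		Dataset<Row> entities = spark.read().json("/tmp/provision/entities/*");

		// the (R, T) tuples produced by phase 1 for every entity type
		Dataset<Row> relTarget = spark.read().parquet("/tmp/provision/related_entities/*");

		// E.id = (R, T).source: E becomes the source entity S of each (S, R, T) tuple;
		// left outer join (an assumption) so entities without any relationship survive
		entities
			.join(relTarget, entities.col("id").equalTo(relTarget.col("source")), "left_outer")
			.write()
			.parquet("/tmp/provision/join_entities");

		spark.stop();
	}
}
```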

diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java
index da0a81021..20d27f0f3 100644
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java
@@ -36,26 +36,8 @@ import eu.dnetlib.dhp.schema.oaf.Relation;
 import scala.Tuple2;
 
 /**
- * Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects. The
- * operation considers all the entity types (publication, dataset, software, ORP, project, datasource, organization, and
- * all the possible relationships (similarity links produced by the Dedup process are excluded).
- * <p>
- * The operation is implemented by sequentially joining one entity type at time (E) with the relationships (R), and
- * again by E, finally grouped by E.id;
- * <p>
- * The workflow is organized in different parts aimed to to reduce the complexity of the operation 1)
- * PrepareRelationsJob: only consider relationships that are not virtually deleted ($.dataInfo.deletedbyinference ==
- * false), each entity can be linked at most to 100 other objects
- * <p>
- * 2) JoinRelationEntityByTargetJob: (phase 1): prepare tuples [relation - target entity] (R - T): for each entity type
- * E_i map E_i as RelatedEntity T_i to simplify the model and extracting only the necessary information join (R.target =
- * T_i.id) save the tuples (R_i, T_i) (phase 2): create the union of all the entity types E, hash by id read the tuples
- * (R, T), hash by R.source join E.id = (R, T).source, where E becomes the Source Entity S save the tuples (S, R, T)
- * <p>
- * 3) AdjacencyListBuilderJob: given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mapping the
- * result as JoinedEntity
- * <p>
- * 4) XmlConverterJob: convert the JoinedEntities as XML records
+ * PrepareRelationsJob prunes the relationships: only considers relationships that are not virtually deleted
+ * ($.dataInfo.deletedbyinference == false); each entity can be linked to at most 100 other objects
 */
 public class PrepareRelationsJob {
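The two pruning rules — drop virtually deleted relations, then cap each source at 100 outgoing links — can be expressed compactly with the DataFrame API. A sketch under stated assumptions (paths, a JSON layout with a nested dataInfo, and ordering by target as the capping criterion), not the PR's typed implementation.

```java
// Minimal sketch of the relation pruning, under the assumptions stated above.
import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.row_number;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.expressions.Window;
import org.apache.spark.sql.expressions.WindowSpec;

public class PrepareRelationsSketch {

	public static void main(String[] args) {
		SparkSession spark = SparkSession.builder()
			.appName("PrepareRelationsSketch")
			.master("local[*]")
			.getOrCreate();

		// one JSON-encoded relation per line: {source, target, relClass, dataInfo: {deletedbyinference}}
		Dataset<Row> rels = spark.read().json("/tmp/provision/relations");

		// 1) drop relations flagged as virtually deleted
		Dataset<Row> alive = rels.filter(col("dataInfo.deletedbyinference").equalTo(false));

		// 2) keep at most 100 links per source entity; ordering by target makes the cut deterministic
		WindowSpec bySource = Window.partitionBy(col("source")).orderBy(col("target"));
		alive
			.withColumn("rn", row_number().over(bySource))
			.filter(col("rn").leq(100))
			.drop("rn")
			.write()
			.parquet("/tmp/provision/prepared_relations");

		spark.stop();
	}
}
```

Capping the fan-out keeps the downstream group-by from materializing unbounded adjacency lists for highly connected entities.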

diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlConverterJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlConverterJob.java
index a1ed7fd2a..d8eba31b6 100644
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlConverterJob.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlConverterJob.java
@@ -37,23 +37,7 @@ import scala.collection.JavaConverters;
 import scala.collection.Seq;
 
 /**
- * Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects. The
- * operation considers all the entity types (publication, dataset, software, ORP, project, datasource, organization, and
- * all the possible relationships (similarity links produced by the Dedup process are excluded).
- * <p>
- * The workflow is organized in different parts aimed to to reduce the complexity of the operation 1)
- * PrepareRelationsJob: only consider relationships that are not virtually deleted ($.dataInfo.deletedbyinference ==
- * false), each entity can be linked at most to 100 other objects
- * <p>
- * 2) JoinRelationEntityByTargetJob: (phase 1): prepare tuples [relation - target entity] (R - T): for each entity type
- * E_i map E_i as RelatedEntity T_i to simplify the model and extracting only the necessary information join (R.target =
- * T_i.id) save the tuples (R_i, T_i) (phase 2): create the union of all the entity types E, hash by id read the tuples
- * (R, T), hash by R.source join E.id = (R, T).source, where E becomes the Source Entity S save the tuples (S, R, T)
- * <p>
- * 3) AdjacencyListBuilderJob: given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mapping the
- * result as JoinedEntity
- * <p>
- * 4) XmlConverterJob: convert the JoinedEntities as XML records
+ * XmlConverterJob converts the JoinedEntities into XML records
 */
 public class XmlConverterJob {
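The serialization step maps each grouped row to one XML record. The sketch below only illustrates the shape of that operation: the `<record>`/`<rel>` markup, the column names, and the paths are assumptions, not the project's actual record layout, which is produced by its own record factory.

```java
// Minimal sketch of the row -> XML mapping, under the assumptions stated above.
import java.util.List;

import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class XmlConverterSketch {

	public static void main(String[] args) {
		SparkSession spark = SparkSession.builder()
			.appName("XmlConverterSketch")
			.master("local[*]")
			.getOrCreate();

		// JoinedEntity-like rows from the grouping step: (id, name, links[]) (assumed layout)
		Dataset<Row> joined = spark.read().parquet("/tmp/provision/adjacency_lists");

		Dataset<String> xml = joined.map((MapFunction<Row, String>) row -> {
			StringBuilder sb = new StringBuilder();
			sb.append("<record id=\"").append(esc(row.<String> getAs("id"))).append("\">");
			sb.append("<name>").append(esc(row.<String> getAs("name"))).append("</name>");
			int linksIdx = row.fieldIndex("links");
			if (!row.isNullAt(linksIdx)) {
				for (Row link : row.<Row> getList(linksIdx)) {
					sb
						.append("<rel class=\"").append(esc(link.<String> getAs("relClass")))
						.append("\" to=\"").append(esc(link.<String> getAs("targetId"))).append("\"/>");
				}
			}
			return sb.append("</record>").toString();
		}, Encoders.STRING());

		xml.write().text("/tmp/provision/xml_records");

		spark.stop();
	}

	// escape the characters that would break attribute and element content
	private static String esc(String s) {
		return s == null ? "" : s.replace("&", "&amp;").replace("<", "&lt;").replace("\"", "&quot;");
	}
}
```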