[graph provision] added person to the provision workflow

This commit is contained in:
Claudio Atzori 2024-08-02 16:14:10 +02:00
parent 6bdb8643e6
commit 975d44cac7
2 changed files with 70 additions and 0 deletions

View File

@ -5,6 +5,7 @@ import java.io.StringReader;
import java.util.*;
import java.util.stream.Collectors;
import eu.dnetlib.dhp.schema.solr.Person;
import org.apache.commons.lang3.StringUtils;
import org.dom4j.Document;
import org.dom4j.DocumentException;
@ -89,6 +90,8 @@ public class ProvisionModelSupport {
r.setOrganization(mapOrganization((eu.dnetlib.dhp.schema.oaf.Organization) e));
} else if (e instanceof eu.dnetlib.dhp.schema.oaf.Project) {
r.setProject(mapProject((eu.dnetlib.dhp.schema.oaf.Project) e, vocs));
} else if (e instanceof eu.dnetlib.dhp.schema.oaf.Person) {
r.setPerson(mapPerson((eu.dnetlib.dhp.schema.oaf.Person) e));
}
r
.setLinks(
@ -185,6 +188,14 @@ public class ProvisionModelSupport {
return ps;
}
/**
 * Builds the Solr-side {@link Person} from a graph-side
 * {@code eu.dnetlib.dhp.schema.oaf.Person}.
 *
 * Only the given name, family name and alternative names are carried over;
 * each value is passed through as returned by the source getter.
 *
 * @param p the graph person entity to convert
 * @return a new Solr {@code Person} populated from {@code p}
 */
private static Person mapPerson(eu.dnetlib.dhp.schema.oaf.Person p) {
	final Person solrPerson = new Person();
	solrPerson.setGivenName(p.getGivenName());
	solrPerson.setFamilyName(p.getFamilyName());
	solrPerson.setAlternativeNames(p.getAlternativeNames());
	return solrPerson;
}
private static Funding mapFunding(List<String> fundingtree, VocabularyGroup vocs) {
SAXReader reader = new SAXReader();
return Optional

View File

@ -180,6 +180,7 @@
<path start="join_relation_datasource"/>
<path start="join_relation_organization"/>
<path start="join_relation_project"/>
<path start="join_relation_person"/>
</fork>
<action name="join_relation_publication">
@ -378,6 +379,34 @@
<error to="Kill"/>
</action>
<!--
Phase 1 join for the person entity type (Spark job name: Join[relation.target = person.id]).
Runs CreateRelatedEntitiesJob_phase1 on YARN in cluster mode, reading the relations from
${workingDir}/relation and the person entities from ${inputGraphRootPath}/person, and writing
its result to ${workingDir}/join_partial/person.
On success the flow moves to the wait_joins join node; on failure it goes to Kill.
NOTE(review): spark.executor.memoryOverhead is bound to the same property used for
executor-memory (sparkExecutorMemoryForJoining) - confirm this sizing is intended.
-->
<action name="join_relation_person">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Join[relation.target = person.id]</name>
<class>eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase1</class>
<jar>dhp-graph-provision-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCoresForJoining}
--executor-memory=${sparkExecutorMemoryForJoining}
--driver-memory=${sparkDriverMemoryForJoining}
--conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=5000
--conf spark.network.timeout=${sparkNetworkTimeout}
</spark-opts>
<arg>--inputRelationsPath</arg><arg>${workingDir}/relation</arg>
<arg>--inputEntityPath</arg><arg>${inputGraphRootPath}/person</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Person</arg>
<arg>--outputPath</arg><arg>${workingDir}/join_partial/person</arg>
</spark>
<ok to="wait_joins"/>
<error to="Kill"/>
</action>
<join name="wait_joins" to="fork_join_all_entities"/>
<fork name="fork_join_all_entities">
@ -388,6 +417,7 @@
<path start="join_datasource_relations"/>
<path start="join_organization_relations"/>
<path start="join_project_relations"/>
<path start="join_person_relations"/>
</fork>
<action name="join_publication_relations">
@ -593,6 +623,35 @@
<error to="Kill"/>
</action>
<!--
Phase 2 join for the person entity type (Spark job name: Join[person.id = relatedEntity.source]).
Runs CreateRelatedEntitiesJob_phase2 on YARN in cluster mode, reading the person entities from
${inputGraphRootPath}/person and the related entities from ${workingDir}/join_partial
(the parent directory, not only the person sub-folder), and writing its result to
${workingDir}/join_entities/person with numPartitions set to 10000.
On success the flow moves to the wait_join_phase2 join node; on failure it goes to Kill.
NOTE(review): spark.executor.memoryOverhead is bound to the same property used for
executor-memory (sparkExecutorMemoryForJoining) - confirm this sizing is intended.
-->
<action name="join_person_relations">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Join[person.id = relatedEntity.source]</name>
<class>eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase2</class>
<jar>dhp-graph-provision-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCoresForJoining}
--executor-memory=${sparkExecutorMemoryForJoining}
--driver-memory=${sparkDriverMemoryForJoining}
--conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=5000
--conf spark.network.timeout=${sparkNetworkTimeout}
</spark-opts>
<arg>--inputEntityPath</arg><arg>${inputGraphRootPath}/person</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Person</arg>
<arg>--inputRelatedEntitiesPath</arg><arg>${workingDir}/join_partial</arg>
<arg>--outputPath</arg><arg>${workingDir}/join_entities/person</arg>
<arg>--numPartitions</arg><arg>10000</arg>
</spark>
<ok to="wait_join_phase2"/>
<error to="Kill"/>
</action>
<join name="wait_join_phase2" to="create_payloads"/>
<action name="create_payloads">