[DumpOpenAorgs] extended set of dumps to create a dataset containing only OpenORGS organizations

This commit is contained in:
Miriam Baglioni 2024-08-08 12:14:37 +02:00
parent b54f45bd80
commit 8e38d9e01c
5 changed files with 30 additions and 8 deletions

View File

@ -93,7 +93,7 @@ public class SparkDumpOrganizationJob implements Serializable {
private static void organizationMap(SparkSession spark, String inputPath, String outputPath) {
Utils
.readPath(spark, inputPath + "organization", eu.dnetlib.dhp.schema.oaf.Organization.class)
.readPath(spark, inputPath + "/organization", eu.dnetlib.dhp.schema.oaf.Organization.class)
.filter(
(FilterFunction<eu.dnetlib.dhp.schema.oaf.Organization>) o -> !o.getDataInfo().getDeletedbyinference()
&& o.getId().startsWith("20|openorgs"))
@ -104,7 +104,7 @@ public class SparkDumpOrganizationJob implements Serializable {
.write()
.mode(SaveMode.Overwrite)
.option(COMPRESSION, GZIP)
.json(outputPath + "/organization");
.json(outputPath + "/openorgs");
}
private static eu.dnetlib.dhp.oa.model.graph.Organization mapOrganization(

View File

@ -2,4 +2,5 @@
dump_complete classpath eu/dnetlib/dhp/oa/graph/dump/wf/subworkflows/complete/oozie_app
dump_funder classpath eu/dnetlib/dhp/oa/graph/dump/wf/subworkflows/funder/oozie_app
dump_community classpath eu/dnetlib/dhp/oa/graph/dump/wf/subworkflows/community/oozie_app
dump_subset classpath eu/dnetlib/dhp/oa/graph/dump/wf/subworkflows/subset/oozie_app
dump_subset classpath eu/dnetlib/dhp/oa/graph/dump/wf/subworkflows/subset/oozie_app
dump_organization classpath eu/dnetlib/dhp/oa/graph/dump/wf/subworkflows/organizations/oozie_app

View File

@ -174,10 +174,32 @@
<case to="dump_funder">${wf:conf('dumpType') eq "funder"}</case>
<case to="dump_community">${wf:conf('dumpType') eq "community"}</case>
<case to="dump_subset">${wf:conf('dumpType') eq "subset"}</case>
<case to="dump_organization">${wf:conf('dumpType') eq "openorgs"}</case>
<default to="dump_complete"/>
</switch>
</decision>
<!-- Sub-workflow which runs the dump for the organizations with an openorgs identifier -->
<action name="dump_organization">
<sub-workflow>
<app-path>${wf:appPath()}/dump_organizations
</app-path>
<propagate-configuration/>
<configuration>
<property>
<name>outputPath</name>
<value>${outputPath}</value>
</property>
<property>
<name>sourcePath</name>
<value>${sourcePath}</value>
</property>
</configuration>
</sub-workflow>
<ok to="make_archive" />
<error to="Kill" />
</action>
<!-- Sub-workflow which runs the dump subset for the complete graph -->
<action name="dump_subset">
<sub-workflow>

View File

@ -82,14 +82,13 @@
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/project</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Project</arg>
<arg>--outputPath</arg><arg>${workingDir}/project</arg>
<arg>--communityMapPath</arg><arg>noneed</arg>
<arg>--sourcePath</arg><arg>${sourcePath}/organization</arg>
<arg>--outputPath</arg><arg>${outputPath}/dump/openorgs</arg>
</spark>
<ok to="End"/>
<ok to="make_archive"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>