[DumpOpenAorgs] Extended the set of dumps to create a dataset containing only OpenOrgs organizations
This commit is contained in:
parent
b54f45bd80
commit
8e38d9e01c
|
@ -93,7 +93,7 @@ public class SparkDumpOrganizationJob implements Serializable {
|
|||
|
||||
private static void organizationMap(SparkSession spark, String inputPath, String outputPath) {
|
||||
Utils
|
||||
.readPath(spark, inputPath + "organization", eu.dnetlib.dhp.schema.oaf.Organization.class)
|
||||
.readPath(spark, inputPath + "/organization", eu.dnetlib.dhp.schema.oaf.Organization.class)
|
||||
.filter(
|
||||
(FilterFunction<eu.dnetlib.dhp.schema.oaf.Organization>) o -> !o.getDataInfo().getDeletedbyinference()
|
||||
&& o.getId().startsWith("20|openorgs"))
|
||||
|
@ -104,7 +104,7 @@ public class SparkDumpOrganizationJob implements Serializable {
|
|||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option(COMPRESSION, GZIP)
|
||||
.json(outputPath + "/organization");
|
||||
.json(outputPath + "/openorgs");
|
||||
}
|
||||
|
||||
private static eu.dnetlib.dhp.oa.model.graph.Organization mapOrganization(
|
||||
|
|
|
@ -2,4 +2,5 @@
|
|||
dump_complete classpath eu/dnetlib/dhp/oa/graph/dump/wf/subworkflows/complete/oozie_app
|
||||
dump_funder classpath eu/dnetlib/dhp/oa/graph/dump/wf/subworkflows/funder/oozie_app
|
||||
dump_community classpath eu/dnetlib/dhp/oa/graph/dump/wf/subworkflows/community/oozie_app
|
||||
dump_subset classpath eu/dnetlib/dhp/oa/graph/dump/wf/subworkflows/subset/oozie_app
|
||||
dump_subset classpath eu/dnetlib/dhp/oa/graph/dump/wf/subworkflows/subset/oozie_app
|
||||
dump_organization classpath eu/dnetlib/dhp/oa/graph/dump/wf/subworkflows/organizations/oozie_app
|
|
@ -174,10 +174,32 @@
|
|||
<case to="dump_funder">${wf:conf('dumpType') eq "funder"}</case>
|
||||
<case to="dump_community">${wf:conf('dumpType') eq "community"}</case>
|
||||
<case to="dump_subset">${wf:conf('dumpType') eq "subset"}</case>
|
||||
<case to="dump_organization">${wf:conf('dumpType') eq "openorgs"}</case>
|
||||
<default to="dump_complete"/>
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<!-- Sub-workflow which runs the dump for the organizations with an OpenOrgs identifier -->
|
||||
<action name="dump_organization">
|
||||
<sub-workflow>
|
||||
<app-path>${wf:appPath()}/dump_organizations
|
||||
</app-path>
|
||||
<propagate-configuration/>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>outputPath</name>
|
||||
<value>${outputPath}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>sourcePath</name>
|
||||
<value>${sourcePath}</value>
|
||||
</property>
|
||||
</configuration>
|
||||
</sub-workflow>
|
||||
<ok to="make_archive" />
|
||||
<error to="Kill" />
|
||||
</action>
|
||||
|
||||
<!-- Sub-workflow which runs the dump subset for the complete graph -->
|
||||
<action name="dump_subset">
|
||||
<sub-workflow>
|
||||
|
|
|
@ -82,14 +82,13 @@
|
|||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${sourcePath}/project</arg>
|
||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Project</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/project</arg>
|
||||
<arg>--communityMapPath</arg><arg>noneed</arg>
|
||||
<arg>--sourcePath</arg><arg>${sourcePath}/organization</arg>
|
||||
<arg>--outputPath</arg><arg>${outputPath}/dump/openorgs</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<ok to="make_archive"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
Loading…
Reference in New Issue