forked from D-Net/dnet-hadoop

adjustments for country propagation

parent: 4740c808f7
commit: 3afd4aa57b
@@ -45,7 +45,7 @@ public class PrepareDatasourceCountryAssociation {
 			.toString(
 				PrepareDatasourceCountryAssociation.class
 					.getResourceAsStream(
-						"/eu/dnetlib/dhp/countrypropagation/input_prepareassoc_parameters.json"));
+						"/eu/dnetlib/dhp/wf/subworkflows/countrypropagation/input_prepareassoc_parameters.json"));

 		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);

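The four Java hunks in this commit all make the same change: the parameter files have moved under /eu/dnetlib/dhp/wf/subworkflows/, so the classpath lookups must follow. A minimal sketch of the loading idiom these classes share (assuming the project's usual org.apache.commons.io.IOUtils; a stale path makes getResourceAsStream return null and the read then fails):

import org.apache.commons.io.IOUtils;

// Sketch only: load a parameter definition from the classpath; the string
// literal must track the file's new location under /eu/dnetlib/dhp/wf/subworkflows/.
String jsonConfiguration = IOUtils
	.toString(
		PrepareDatasourceCountryAssociation.class
			.getResourceAsStream(
				"/eu/dnetlib/dhp/wf/subworkflows/countrypropagation/input_prepareassoc_parameters.json"));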
@@ -90,7 +90,8 @@ public class PrepareDatasourceCountryAssociation {
 				(FilterFunction<Datasource>) ds -> !ds.getDataInfo().getDeletedbyinference() &&
 					Optional.ofNullable(ds.getDatasourcetype()).isPresent() &&
 					Optional.ofNullable(ds.getDatasourcetype().getClassid()).isPresent() &&
-					(allowedtypes.contains(ds.getJurisdiction().getClassid()) ||
+					((Optional.ofNullable(ds.getJurisdiction()).isPresent() &&
+						allowedtypes.contains(ds.getJurisdiction().getClassid())) ||
 						whitelist.contains(ds.getId())));

 		// filtering of the relations taking the non deleted by inference and those with IsProvidedBy as relclass
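The second PrepareDatasourceCountryAssociation hunk fixes a latent NullPointerException: the old predicate called ds.getJurisdiction().getClassid() unguarded, so any datasource without a jurisdiction crashed the filter. The corrected predicate in isolation (a sketch; Datasource is the eu.dnetlib.dhp.schema.oaf type used above, and the two sets come from the job arguments):

import java.util.Optional;
import java.util.Set;

import eu.dnetlib.dhp.schema.oaf.Datasource;

// Sketch of the new filter logic: only read the jurisdiction's classid once we
// know a jurisdiction exists; otherwise fall back to the explicit whitelist.
static boolean keep(Datasource ds, Set<String> allowedtypes, Set<String> whitelist) {
	return (Optional.ofNullable(ds.getJurisdiction()).isPresent() &&
		allowedtypes.contains(ds.getJurisdiction().getClassid())) ||
		whitelist.contains(ds.getId());
}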
@@ -32,7 +32,7 @@ public class PrepareResultCountrySet {
 			.toString(
 				PrepareResultCountrySet.class
 					.getResourceAsStream(
-						"/eu/dnetlib/dhp/countrypropagation/input_prepareresultcountry_parameters.json"));
+						"/eu/dnetlib/dhp/wf/subworkflows/countrypropagation/input_prepareresultcountry_parameters.json"));

 		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);

@@ -35,7 +35,7 @@ public class SparkCountryPropagationJob {
 			.toString(
 				SparkCountryPropagationJob.class
 					.getResourceAsStream(
-						"/eu/dnetlib/dhp/countrypropagation/input_countrypropagation_parameters.json"));
+						"/eu/dnetlib/dhp/wf/subworkflows/countrypropagation/input_countrypropagation_parameters.json"));

 		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);

@@ -60,7 +60,7 @@ public class PrepareInfo implements Serializable {
 			.toString(
 				SparkResultToOrganizationFromIstRepoJob.class
 					.getResourceAsStream(
-						"/eu/dnetlib/dhp/entitytoorganizationfromsemrel/input_preparation_parameter.json"));
+						"/eu/dnetlib/dhp/wf/subworkflows/entitytoorganizationfromsemrel/input_preparation_parameter.json"));

 		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);

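The same relocation is applied in PrepareResultCountrySet, SparkCountryPropagationJob, and PrepareInfo. Since nothing checks these string literals at compile time, a small hypothetical smoke test (not part of this commit) can fail fast on a mistyped path:

import java.util.Arrays;
import java.util.List;

// Hypothetical check: verify each relocated parameter file is actually
// reachable on the classpath before the Oozie jobs are submitted.
public class ParameterFileSmokeTest {
	public static void main(String[] args) {
		List<String> paths = Arrays.asList(
			"/eu/dnetlib/dhp/wf/subworkflows/countrypropagation/input_prepareassoc_parameters.json",
			"/eu/dnetlib/dhp/wf/subworkflows/countrypropagation/input_prepareresultcountry_parameters.json",
			"/eu/dnetlib/dhp/wf/subworkflows/countrypropagation/input_countrypropagation_parameters.json",
			"/eu/dnetlib/dhp/wf/subworkflows/entitytoorganizationfromsemrel/input_preparation_parameter.json");
		for (String p : paths) {
			if (ParameterFileSmokeTest.class.getResourceAsStream(p) == null) {
				throw new IllegalStateException("missing classpath resource: " + p);
			}
		}
	}
}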
@@ -0,0 +1,32 @@
+[
+	{
+		"paramName":"s",
+		"paramLongName":"sourcePath",
+		"paramDescription": "the path of the sequencial file to read",
+		"paramRequired": true
+	},
+	{
+		"paramName":"tn",
+		"paramLongName":"resultTableName",
+		"paramDescription": "the name of the result table we are currently working on",
+		"paramRequired": true
+	},
+	{
+		"paramName": "out",
+		"paramLongName": "outputPath",
+		"paramDescription": "the path used to store temporary output files",
+		"paramRequired": true
+	},
+	{
+		"paramName": "p",
+		"paramLongName": "preparedInfoPath",
+		"paramDescription": "the path where prepared info have been stored",
+		"paramRequired": false
+	},
+	{
+		"paramName": "ssm",
+		"paramLongName": "isSparkSessionManaged",
+		"paramDescription": "true if the spark session is managed, false otherwise",
+		"paramRequired": false
+	}
+]
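This first new file appears to be the parameter set for the propagation step itself: source, result table, output, and an optional preparedInfoPath. Roughly how a job consumes such a definition, following the ArgumentApplicationParser pattern visible in the hunks above (parseArgument and get are the accessors used throughout dnet-hadoop; treat the exact signatures as an assumption):

// Sketch: bind the CLI arguments against the JSON definition loaded earlier.
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);

String sourcePath = parser.get("sourcePath");             // -s, required
String resultTableName = parser.get("resultTableName");   // -tn, required
String outputPath = parser.get("outputPath");             // -out, required
String preparedInfoPath = parser.get("preparedInfoPath"); // -p, optional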
@@ -0,0 +1,32 @@
+[
+	{
+		"paramName":"s",
+		"paramLongName":"sourcePath",
+		"paramDescription": "the path of the sequencial file to read",
+		"paramRequired": true
+	},
+	{
+		"paramName": "out",
+		"paramLongName": "outputPath",
+		"paramDescription": "the path used to store temporary output files",
+		"paramRequired": true
+	},
+	{
+		"paramName": "w",
+		"paramLongName": "whitelist",
+		"paramDescription": "the datasource having a type different from the allowed ones but that we want to add anyway",
+		"paramRequired": true
+	},
+	{
+		"paramName": "at",
+		"paramLongName": "allowedtypes",
+		"paramDescription": "the allowed datasource types for country propagation",
+		"paramRequired": true
+	},
+	{
+		"paramName": "ssm",
+		"paramLongName": "isSparkSessionManaged",
+		"paramDescription": "true if the spark session is managed, false otherwise",
+		"paramRequired": false
+	}
+]
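This file appears to feed PrepareDatasourceCountryAssociation: whitelist and allowedtypes arrive as single strings. A plausible way to turn them into lookup sets for the datasource filter shown earlier (the ";" separator is an assumption, not stated by this file):

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

// Sketch only: split the list-valued arguments into sets for cheap membership tests.
Set<String> whitelist = new HashSet<>(Arrays.asList(parser.get("whitelist").split(";")));
Set<String> allowedtypes = new HashSet<>(Arrays.asList(parser.get("allowedtypes").split(";")));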
@@ -0,0 +1,38 @@
+[
+	{
+		"paramName":"s",
+		"paramLongName":"sourcePath",
+		"paramDescription": "the path of the sequencial file to read",
+		"paramRequired": true
+	},
+	{
+		"paramName":"out",
+		"paramLongName":"outputPath",
+		"paramDescription": "the output path",
+		"paramRequired": true
+	},
+	{
+		"paramName":"w",
+		"paramLongName":"workingPath",
+		"paramDescription": "the working path",
+		"paramRequired": true
+	},
+	{
+		"paramName":"tn",
+		"paramLongName":"resultTableName",
+		"paramDescription": "the name of the result table we are currently working on",
+		"paramRequired": true
+	},
+	{
+		"paramName": "p",
+		"paramLongName": "preparedInfoPath",
+		"paramDescription": "the path where prepared info have been stored",
+		"paramRequired": true
+	},
+	{
+		"paramName": "ssm",
+		"paramLongName": "isSparkSessionManaged",
+		"paramDescription": "true if the spark session is managed, false otherwise",
+		"paramRequired": false
+	}
+]
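Across all three files, isSparkSessionManaged is consistently non-required; the usual pattern for such optional flags is to default when absent (a sketch, assuming a default of true as elsewhere in dnet-hadoop):

import java.util.Optional;

// Sketch: optional boolean argument, defaulting to true when not provided.
Boolean isSparkSessionManaged = Optional
	.ofNullable(parser.get("isSparkSessionManaged"))
	.map(Boolean::valueOf)
	.orElse(Boolean.TRUE);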
@@ -61,7 +61,7 @@
 				<arg>--sourcePath</arg><arg>${sourcePath}</arg>
 				<arg>--whitelist</arg><arg>${whitelist}</arg>
 				<arg>--allowedtypes</arg><arg>${allowedtypes}</arg>
-				<arg>--workingPath</arg><arg>${workingDir}/country</arg>
+				<arg>--outputPath</arg><arg>${workingDir}/preparedInfo</arg>
 			</spark>
 			<ok to="fork_prepare_result_country"/>
 			<error to="Kill"/>
@@ -95,8 +95,10 @@
 					--conf spark.sql.shuffle.partitions=3840
 				</spark-opts>
 				<arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
-				<arg>--workingPath</arg><arg>${workingDir}/country</arg>
+				<arg>--outputPath</arg><arg>${workingDir}/publication</arg>
+				<arg>--workingPath</arg><arg>${workingDir}/workingP</arg>
 				<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
+				<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
 			</spark>
 			<ok to="wait_prepare"/>
 			<error to="Kill"/>
@@ -123,8 +125,10 @@
 					--conf spark.sql.shuffle.partitions=3840
 				</spark-opts>
 				<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
-				<arg>--workingPath</arg><arg>${workingDir}/country</arg>
+				<arg>--outputPath</arg><arg>${workingDir}/dataset</arg>
+				<arg>--workingPath</arg><arg>${workingDir}/workingD</arg>
 				<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
+				<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
 			</spark>
 			<ok to="wait_prepare"/>
 			<error to="Kill"/>
@@ -151,8 +155,10 @@
 					--conf spark.sql.shuffle.partitions=3840
 				</spark-opts>
 				<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
-				<arg>--workingPath</arg><arg>${workingDir}/country</arg>
+				<arg>--outputPath</arg><arg>${workingDir}/otherresearchproduct</arg>
+				<arg>--workingPath</arg><arg>${workingDir}/workingO</arg>
 				<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
+				<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
 			</spark>
 			<ok to="wait_prepare"/>
 			<error to="Kill"/>
@@ -179,14 +185,16 @@
 					--conf spark.sql.shuffle.partitions=3840
 				</spark-opts>
 				<arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
-				<arg>--workingPath</arg><arg>${workingDir}/country</arg>
+				<arg>--outputPath</arg><arg>${workingDir}/software</arg>
+				<arg>--workingPath</arg><arg>${workingDir}/workingS</arg>
 				<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
+				<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo</arg>
 			</spark>
 			<ok to="wait_prepare"/>
 			<error to="Kill"/>
 		</action>

-		<join name="wait_prepare" to="fork_join_apply_country_propagation"/>
+		<join name="wait_prepare" to="fork_apply_country_propagation"/>

 		<fork name="fork_apply_country_propagation">
 			<path start="propagation_publication"/>
@@ -216,9 +224,9 @@
 					--conf spark.sql.shuffle.partitions=3840
 				</spark-opts>
 				<arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
-				<arg>--workingPath</arg><arg>${workingDir}/country</arg>
+				<arg>--preparedInfoPath</arg><arg>${workingDir}/publication</arg>
 				<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
+				<arg>--outputPath</arg><arg>${workingDir}/country/publication</arg>
 			</spark>
 			<ok to="wait"/>
 			<error to="Kill"/>
@@ -245,9 +253,9 @@
 					--conf spark.sql.shuffle.partitions=3840
 				</spark-opts>
 				<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
-				<arg>--workingPath</arg><arg>${workingDir}/country</arg>
+				<arg>--preparedInfoPath</arg><arg>${workingDir}/dataset</arg>
 				<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
+				<arg>--outputPath</arg><arg>${workingDir}/country/dataset</arg>
 			</spark>
 			<ok to="wait"/>
 			<error to="Kill"/>
@@ -274,9 +282,9 @@
 					--conf spark.sql.shuffle.partitions=3840
 				</spark-opts>
 				<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
-				<arg>--workingPath</arg><arg>${workingDir}/country</arg>
+				<arg>--preparedInfoPath</arg><arg>${workingDir}/otherresearchproduct</arg>
 				<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
+				<arg>--outputPath</arg><arg>${workingDir}/country/otherresearchproduct</arg>
 			</spark>
 			<ok to="wait"/>
 			<error to="Kill"/>
@@ -303,8 +311,9 @@
 					--conf spark.sql.shuffle.partitions=3840
 				</spark-opts>
 				<arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
-				<arg>--workingPath</arg><arg>${workingDir}/country</arg>
+				<arg>--preparedInfoPath</arg><arg>${workingDir}/software</arg>
 				<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
+				<arg>--outputPath</arg><arg>${workingDir}/country/software</arg>
 			</spark>
 			<ok to="wait"/>
 			<error to="Kill"/>
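Taken together, the workflow changes untangle the single --workingPath into explicit inputs and outputs, and fix the join target (the old fork_join_apply_country_propagation referenced a fork that does not exist; the fork is named fork_apply_country_propagation). Prepared datasource-country associations land in ${workingDir}/preparedInfo, each per-type prepare step writes its country set to ${workingDir}/<type>, and each propagation step reads that back through --preparedInfoPath and writes the enriched slice to ${workingDir}/country/<type>. A conceptual sketch of that I/O contract (the join column and JSON layout are assumptions, not this commit's code):

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

// Conceptual sketch only: read the results and their prepared country sets,
// join them, and write the enriched results to the per-type output path.
public class CountryPropagationSketch {
	static void run(SparkSession spark, String sourcePath, String preparedInfoPath, String outputPath) {
		Dataset<Row> results = spark.read().json(sourcePath);        // --sourcePath
		Dataset<Row> prepared = spark.read().json(preparedInfoPath); // --preparedInfoPath
		results
			.join(prepared, results.col("id").equalTo(prepared.col("resultId")), "left_outer")
			.write()
			.mode("overwrite")
			.json(outputPath);                                       // --outputPath
	}
}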