This commit is contained in:
Miriam Baglioni 2020-07-09 11:46:32 +02:00
parent b1dad7959c
commit d77d213d7c
8 changed files with 83 additions and 41 deletions

View File

@@ -101,12 +101,12 @@ public class GCatAPIClient {
OkHttpClient httpClient = new OkHttpClient();
Request request = new Request.Builder()
.url(getGcatBaseURL() + itemPath + "?offset=" + offset + "&limit=" + limit)
.header("gcube-token", getApplicationToken())
.addHeader("Content-Type", "application/json")
.addHeader("Accept", "application/json")
.get()
.build();
.url(getGcatBaseURL() + itemPath + "?offset=" + offset + "&limit=" + limit)
.header("gcube-token", getApplicationToken())
.addHeader("Content-Type", "application/json")
.addHeader("Accept", "application/json")
.get()
.build();
try (Response response = httpClient.newCall(request).execute()) {
int status = response.code();
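
For orientation, a minimal sketch of the request pattern in this hunk as a standalone helper; the wrapper class, the "items" path and the way the JSON body is returned are assumptions for illustration, not part of the commit.

import java.io.IOException;

import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.Response;

public class GCatListSketch {

	// Hypothetical helper: issues the same kind of GET shown above and returns the raw JSON body.
	public static String list(String gcatBaseURL, String applicationToken, int offset, int limit)
		throws IOException {
		OkHttpClient httpClient = new OkHttpClient();
		Request request = new Request.Builder()
			.url(gcatBaseURL + "items?offset=" + offset + "&limit=" + limit) // "items" path is an assumption
			.header("gcube-token", applicationToken)
			.addHeader("Content-Type", "application/json")
			.addHeader("Accept", "application/json")
			.get()
			.build();
		try (Response response = httpClient.newCall(request).execute()) {
			if (!response.isSuccessful()) {
				throw new IOException("unexpected HTTP status: " + response.code());
			}
			return response.body().string();
		}
	}
}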

View File

@@ -28,13 +28,14 @@ public class SendToCatalogue implements Serializable {
.toString(
SendToCatalogue.class
.getResourceAsStream(
"/eu/dnetlib/dhp/blacklist/catalogue_parameters.json")));
"/eu/dnetlib/dhp/oa/graph/gcat/catalogue_parameters.json")));
parser.parseArgument(args);
final String access_token = parser.get("accessToken");
final String access_token = parser.get("gcattoken");
final String hdfsPath = parser.get("hdfsPath");
final String hdfsNameNode = parser.get("hdfsNameNode");
final String gcatBaseUrl = parser.get("gcatBaseUrl");
Configuration conf = new Configuration();
conf.set("fs.defaultFS", hdfsNameNode);
@@ -46,6 +47,7 @@ public class SendToCatalogue implements Serializable {
new Path(hdfsPath), true);
GCatAPIClient gCatAPIClient = new GCatAPIClient();
gCatAPIClient.setApplicationToken(access_token);
gCatAPIClient.setGcatBaseURL(gcatBaseUrl);
int purged = gCatAPIClient.purgeAll();
log.info("purged: " + purged);
while (fileStatusListIterator.hasNext()) {
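
Read together, the two SendToCatalogue hunks change the parameter file, the token argument and the client configuration; a hedged sketch of the resulting flow follows, with argument names and client calls taken from the diff and the publish loop body omitted because the commit does not show it.

	// Sketch only: argument names and client calls come from the hunks above,
	// error handling and the loop body are left out.
	final String access_token = parser.get("gcattoken");
	final String hdfsPath = parser.get("hdfsPath");
	final String hdfsNameNode = parser.get("hdfsNameNode");
	final String gcatBaseUrl = parser.get("gcatBaseUrl");

	GCatAPIClient gCatAPIClient = new GCatAPIClient();
	gCatAPIClient.setApplicationToken(access_token);
	gCatAPIClient.setGcatBaseURL(gcatBaseUrl);

	int purged = gCatAPIClient.purgeAll();
	log.info("purged: " + purged);

	while (fileStatusListIterator.hasNext()) {
		// publish each dumped file found under hdfsPath (body not shown in this commit)
	}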

View File

@@ -35,7 +35,7 @@ public class SparkDumpRISISCatalogue implements Serializable {
.toString(
SparkDumpRISISCatalogue.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/gcat/catalogue_parameters.json"));
"/eu/dnetlib/dhp/oa/graph/gcat/dump_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
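
The only change in SparkDumpRISISCatalogue is the classpath resource it loads; the surrounding pattern, sketched below assuming the `.toString(...)` call belongs to commons-io IOUtils (not visible in this excerpt), stays the same.

	// Sketch of the resource-loading pattern in this hunk; IOUtils is assumed to be
	// org.apache.commons.io.IOUtils.
	String jsonConfiguration = IOUtils
		.toString(
			SparkDumpRISISCatalogue.class
				.getResourceAsStream(
					"/eu/dnetlib/dhp/oa/graph/gcat/dump_parameters.json"));

	final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
	parser.parseArgument(args);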

View File

@@ -23,4 +23,5 @@
<name>hiveDbName</name>
<value>openaire</value>
</property>
</configuration>

View File

@@ -1,35 +1,27 @@
[
{
"paramName":"s",
"paramLongName":"sourcePath",
"paramDescription": "the path of the sequencial file to read",
"paramName":"gct",
"paramLongName":"gcattoken",
"paramDescription": "the token for the deposition on the catalogue",
"paramRequired": true
},
{
"paramName": "out",
"paramLongName": "outputPath",
"paramDescription": "the path used to store temporary output files",
"paramName": "p",
"paramLongName": "hdfsPath",
"paramDescription": "the path where storing the sequential file",
"paramRequired": true
},
{
"paramName": "ssm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "true if the spark session is managed, false otherwise",
"paramRequired": false
},
{
"paramName":"tn",
"paramLongName":"resultTableName",
"paramDescription": "the name of the result table we are currently working on",
"paramName": "nn",
"paramLongName": "hdfsNameNode",
"paramDescription": "the name node on hdfs",
"paramRequired": true
},
{
"paramName":"cm",
"paramLongName":"communityName",
"paramDescription": "the name of the community for which to execute the dump to the catalogue",
"paramName": "gbu",
"paramLongName": "gcatBaseUrl",
"paramDescription": "the base url for the catalogue",
"paramRequired": true
}
]
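
The rewritten parameter file above means the catalogue loader is now driven by four flags; a hypothetical invocation is sketched below, with flag names taken from the file and from the workflow further down, and every value a placeholder.

	// Hypothetical argument list for SendToCatalogue; only the flag names come
	// from catalogue_parameters.json, all values are placeholders.
	String[] args = new String[] {
		"--gcattoken", "<gcube-application-token>",
		"--hdfsPath", "/user/dnet/catalogue_dump",
		"--hdfsNameNode", "hdfs://nameservice1",
		"--gcatBaseUrl", "https://gcat.example.org/"
	};
	SendToCatalogue.main(args);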

View File

@@ -0,0 +1,38 @@
[
{
"paramName":"s",
"paramLongName":"sourcePath",
"paramDescription": "the path of the sequencial file to read",
"paramRequired": true
},
{
"paramName": "out",
"paramLongName": "outputPath",
"paramDescription": "the path used to store temporary output files",
"paramRequired": true
},
{
"paramName": "ssm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "true if the spark session is managed, false otherwise",
"paramRequired": false
},
{
"paramName":"tn",
"paramLongName":"resultTableName",
"paramDescription": "the name of the result table we are currently working on",
"paramRequired": true
},
{
"paramName":"cm",
"paramLongName":"communityName",
"paramDescription": "the name of the community for which to execute the dump to the catalogue",
"paramRequired": true
}
]
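
dump_parameters.json, added above, describes the Spark dump step rather than the catalogue loader; a hypothetical argument set mirroring the publication branch of the workflow below is sketched here, with paths and the community name as placeholders.

	// Hypothetical arguments for SparkDumpRISISCatalogue; the flag names and the
	// resultTableName value come from the parameter file and the workflow,
	// paths and the community name are placeholders.
	String[] args = new String[] {
		"--isSparkSessionManaged", Boolean.FALSE.toString(),
		"--sourcePath", "/tmp/graph/publication",
		"--resultTableName", "eu.dnetlib.dhp.schema.oaf.Publication",
		"--outputPath", "/tmp/workingDir/publication",
		"--communityName", "risis"
	};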

View File

@@ -23,4 +23,8 @@
<name>hiveDbName</name>
<value>openaire</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
</configuration>

View File

@@ -5,14 +5,18 @@
<name>sourcePath</name>
<description>the source path</description>
</property>
<property>
<name>outputPath</name>
<description>the output path</description>
</property>
<property>
<name>communityName</name>
<description>The name of the community for which to execute the dump for the catalogue</description>
</property>
<property>
<name>gcattoken</name>
<description>the access token for the deposition on the catalogue</description>
</property>
<property>
<name>gcatBaseUrl</name>
<description>the baseUrl to access the catalogue</description>
</property>
<property>
<name>hiveDbName</name>
<description>the target hive database name</description>
@@ -88,8 +92,8 @@
<action name="reset_outputpath">
<fs>
<delete path="${outputPath}"/>
<mkdir path="${outputPath}"/>
<delete path="${workingDir}"/>
<mkdir path="${workingDir}"/>
</fs>
<ok to="fork_dump"/>
<error to="Kill"/>
@@ -122,7 +126,7 @@
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--outputPath</arg><arg>${outputPath}/publication</arg>
<arg>--outputPath</arg><arg>${workingDir}/publication</arg>
<arg>--communityName</arg><arg>${communityName}</arg>
</spark>
<ok to="join_dump"/>
@@ -148,7 +152,7 @@
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--outputPath</arg><arg>${outputPath}/dataset</arg>
<arg>--outputPath</arg><arg>${workingDir}/dataset</arg>
<arg>--communityName</arg><arg>${communityName}</arg>
</spark>
<ok to="join_dump"/>
@@ -174,7 +178,7 @@
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--outputPath</arg><arg>${outputPath}/otherresearchproduct</arg>
<arg>--outputPath</arg><arg>${workingDir}/otherresearchproduct</arg>
<arg>--communityName</arg><arg>${communityName}</arg>
</spark>
<ok to="join_dump"/>
@@ -200,7 +204,7 @@
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--outputPath</arg><arg>${outputPath}/software</arg>
<arg>--outputPath</arg><arg>${workingDir}/software</arg>
<arg>--communityName</arg><arg>${communityName}</arg>
</spark>
<ok to="join_dump"/>
@@ -212,9 +216,10 @@
<action name="populate_catalogue">
<java>
<main-class>eu.dnetlib.dhp.oa.graph.dump.gcat.SendToCatalogue</main-class>
<arg>--hdfsPath</arg><arg>${workingDir}/blacklist</arg>
<arg>--hdfsPath</arg><arg>${workingDir}</arg>
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
<arg>--accessToken</arg><arg>${accessToken}</arg>
<arg>--gcattoken</arg><arg>${gcattoken}</arg>
<arg>--gcatBaseUrl</arg><arg>${gcatBaseUrl}</arg>
</java>
<ok to="End"/>
<error to="Kill"/>