forked from D-Net/dnet-hadoop
This commit is contained in:
parent b1dad7959c
commit d77d213d7c
@@ -101,12 +101,12 @@ public class GCatAPIClient {
 		OkHttpClient httpClient = new OkHttpClient();

 		Request request = new Request.Builder()
-			.url(getGcatBaseURL() + itemPath + "?offset=" + offset + "&limit=" + limit)
-			.header("gcube-token", getApplicationToken())
-			.addHeader("Content-Type", "application/json")
-			.addHeader("Accept", "application/json")
-			.get()
-			.build();
+			.url(getGcatBaseURL() + itemPath + "?offset=" + offset + "&limit=" + limit)
+			.header("gcube-token", getApplicationToken())
+			.addHeader("Content-Type", "application/json")
+			.addHeader("Accept", "application/json")
+			.get()
+			.build();

 		try (Response response = httpClient.newCall(request).execute()) {
 			int status = response.code();
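For reference, the list call shown in this hunk can be exercised on its own with plain OkHttp. The sketch below assumes the item path is the literal "items" and uses Jackson to read the JSON array the endpoint is expected to return; neither assumption comes from this commit, which only shows the request construction above.

// Hedged sketch: running the list request outside GCatAPIClient.
// The "items" path and the Jackson mapping are assumptions, not part of this commit.
import java.io.IOException;
import java.util.List;

import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;

import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.Response;

public class GCatListSketch {

	public static List<String> list(String gcatBaseURL, String gcubeToken, int offset, int limit)
		throws IOException {
		OkHttpClient httpClient = new OkHttpClient();

		Request request = new Request.Builder()
			.url(gcatBaseURL + "items" + "?offset=" + offset + "&limit=" + limit) // itemPath assumed to be "items"
			.header("gcube-token", gcubeToken)
			.addHeader("Content-Type", "application/json")
			.addHeader("Accept", "application/json")
			.get()
			.build();

		try (Response response = httpClient.newCall(request).execute()) {
			int status = response.code();
			if (status != 200) {
				throw new IOException("gCat list failed with HTTP " + status);
			}
			// the endpoint is assumed to return a JSON array of item identifiers
			return new ObjectMapper()
				.readValue(response.body().string(), new TypeReference<List<String>>() {
				});
		}
	}
}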
@@ -28,13 +28,14 @@ public class SendToCatalogue implements Serializable {
 				.toString(
 					SendToCatalogue.class
 						.getResourceAsStream(
-							"/eu/dnetlib/dhp/blacklist/catalogue_parameters.json")));
+							"/eu/dnetlib/dhp/oa/graph/gcat/catalogue_parameters.json")));

 		parser.parseArgument(args);

-		final String access_token = parser.get("accessToken");
+		final String access_token = parser.get("gcattoken");
 		final String hdfsPath = parser.get("hdfsPath");
 		final String hdfsNameNode = parser.get("hdfsNameNode");
+		final String gcatBaseUrl = parser.get("gcatBaseUrl");

 		Configuration conf = new Configuration();
 		conf.set("fs.defaultFS", hdfsNameNode);
@@ -46,6 +47,7 @@ public class SendToCatalogue implements Serializable {
 				new Path(hdfsPath), true);
 		GCatAPIClient gCatAPIClient = new GCatAPIClient();
 		gCatAPIClient.setApplicationToken(access_token);
+		gCatAPIClient.setGcatBaseURL(gcatBaseUrl);
 		int purged = gCatAPIClient.purgeAll();
 		log.info("purged: " + purged);
 		while (fileStatusListIterator.hasNext()) {
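The hunk ends at the top of the file loop. A rough sketch of how SendToCatalogue plausibly continues is shown below; the publish(String) call on GCatAPIClient is hypothetical (only setApplicationToken, setGcatBaseURL and purgeAll appear in this commit), as is the one-JSON-record-per-line layout of the dumped files.

// Hypothetical continuation of the loop above: read each dumped part file from HDFS
// and push its records to the catalogue one line at a time.
// GCatAPIClient is the class from this commit, assumed to be in the same package.
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;

public class SendLoopSketch {

	public static void send(String hdfsNameNode, String hdfsPath, GCatAPIClient gCatAPIClient) throws Exception {
		Configuration conf = new Configuration();
		conf.set("fs.defaultFS", hdfsNameNode);
		FileSystem fileSystem = FileSystem.get(conf);

		RemoteIterator<LocatedFileStatus> fileStatusListIterator = fileSystem
			.listFiles(new Path(hdfsPath), true);

		while (fileStatusListIterator.hasNext()) {
			LocatedFileStatus fileStatus = fileStatusListIterator.next();
			try (BufferedReader reader = new BufferedReader(
				new InputStreamReader(fileSystem.open(fileStatus.getPath()), StandardCharsets.UTF_8))) {
				String record;
				while ((record = reader.readLine()) != null) {
					gCatAPIClient.publish(record); // hypothetical method, not shown in this commit
				}
			}
		}
	}
}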
@@ -35,7 +35,7 @@ public class SparkDumpRISISCatalogue implements Serializable {
 			.toString(
 				SparkDumpRISISCatalogue.class
 					.getResourceAsStream(
-						"/eu/dnetlib/dhp/oa/graph/gcat/catalogue_parameters.json"));
+						"/eu/dnetlib/dhp/oa/graph/gcat/dump_parameters.json"));

 		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
 		parser.parseArgument(args);
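SparkDumpRISISCatalogue now loads its arguments from the new dump_parameters.json (added further down in this commit). The isSparkSessionManaged flag declared there is conventionally handled with the pattern sketched below; this is a generic sketch of that convention, not the project's own helper.

// Conventional handling of the isSparkSessionManaged flag: create (or reuse) a session,
// run the dump body, and stop the session only when this job owns it.
import java.util.Optional;
import java.util.function.Consumer;

import org.apache.spark.SparkConf;
import org.apache.spark.sql.SparkSession;

public class SparkSessionSketch {

	public static Boolean readFlag(String value) {
		// the parameter is optional ("paramRequired": false), defaulting to a managed session
		return Optional.ofNullable(value).map(Boolean::valueOf).orElse(Boolean.TRUE);
	}

	public static void runWithSparkSession(SparkConf conf, Boolean isSparkSessionManaged, Consumer<SparkSession> body) {
		SparkSession spark = SparkSession.builder().config(conf).getOrCreate();
		try {
			body.accept(spark);
		} finally {
			if (Boolean.TRUE.equals(isSparkSessionManaged)) {
				spark.stop();
			}
		}
	}
}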
@@ -23,4 +23,5 @@
 		<name>hiveDbName</name>
 		<value>openaire</value>
 	</property>
+
 </configuration>
@@ -1,35 +1,27 @@
 [
 	{
-		"paramName":"s",
-		"paramLongName":"sourcePath",
-		"paramDescription": "the path of the sequencial file to read",
+		"paramName":"gct",
+		"paramLongName":"gcattoken",
+		"paramDescription": "the token for the deposition on the catalogue",
 		"paramRequired": true
 	},
 	{
-		"paramName": "out",
-		"paramLongName": "outputPath",
-		"paramDescription": "the path used to store temporary output files",
+		"paramName": "p",
+		"paramLongName": "hdfsPath",
+		"paramDescription": "the path where storing the sequential file",
 		"paramRequired": true
 	},
-	{
-		"paramName": "ssm",
-		"paramLongName": "isSparkSessionManaged",
-		"paramDescription": "true if the spark session is managed, false otherwise",
-		"paramRequired": false
-	},
 	{
-		"paramName":"tn",
-		"paramLongName":"resultTableName",
-		"paramDescription": "the name of the result table we are currently working on",
+		"paramName": "nn",
+		"paramLongName": "hdfsNameNode",
+		"paramDescription": "the name node on hdfs",
 		"paramRequired": true
 	},
 	{
-		"paramName":"cm",
-		"paramLongName":"communityName",
-		"paramDescription": "the name of the community for which to execute the dump to the catalogue",
+		"paramName": "gbu",
+		"paramLongName": "gcatBaseUrl",
+		"paramDescription": "the base url for the catalogue",
 		"paramRequired": true
 	}
 ]
@@ -0,0 +1,38 @@
+
+[
+
+	{
+		"paramName":"s",
+		"paramLongName":"sourcePath",
+		"paramDescription": "the path of the sequencial file to read",
+		"paramRequired": true
+	},
+	{
+		"paramName": "out",
+		"paramLongName": "outputPath",
+		"paramDescription": "the path used to store temporary output files",
+		"paramRequired": true
+	},
+	{
+		"paramName": "ssm",
+		"paramLongName": "isSparkSessionManaged",
+		"paramDescription": "true if the spark session is managed, false otherwise",
+		"paramRequired": false
+	},
+	{
+		"paramName":"tn",
+		"paramLongName":"resultTableName",
+		"paramDescription": "the name of the result table we are currently working on",
+		"paramRequired": true
+	},
+	{
+		"paramName":"cm",
+		"paramLongName":"communityName",
+		"paramDescription": "the name of the community for which to execute the dump to the catalogue",
+		"paramRequired": true
+	}
+]
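A sketch of how this parameter file is typically consumed by the dump job: load the JSON from the classpath, hand it to ArgumentApplicationParser and query the long names declared above. The resource path and parameter names come from this commit; the class name and the import location of ArgumentApplicationParser are assumptions.

// Sketch of reading dump_parameters.json; only the resource path and the
// parameter long names are taken from this commit.
import org.apache.commons.io.IOUtils;

import eu.dnetlib.dhp.application.ArgumentApplicationParser; // package assumed

public class DumpArgsSketch {

	public static void main(String[] args) throws Exception {
		final String jsonConfiguration = IOUtils
			.toString(
				DumpArgsSketch.class
					.getResourceAsStream(
						"/eu/dnetlib/dhp/oa/graph/gcat/dump_parameters.json"));

		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
		parser.parseArgument(args);

		final String sourcePath = parser.get("sourcePath");
		final String outputPath = parser.get("outputPath");
		final String resultTableName = parser.get("resultTableName");
		final String communityName = parser.get("communityName");
		final String isSparkSessionManaged = parser.get("isSparkSessionManaged"); // optional flag
	}
}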
@@ -23,4 +23,8 @@
 		<name>hiveDbName</name>
 		<value>openaire</value>
 	</property>
+	<property>
+		<name>oozie.launcher.mapreduce.user.classpath.first</name>
+		<value>true</value>
+	</property>
 </configuration>
@@ -5,14 +5,18 @@
 			<name>sourcePath</name>
 			<description>the source path</description>
 		</property>
-		<property>
-			<name>outputPath</name>
-			<description>the output path</description>
-		</property>
 		<property>
 			<name>communityName</name>
 			<description>The name of the community for which execute the dump for the catalogue</description>
 		</property>
+		<property>
+			<name>gcattoken</name>
+			<description>the access token for the deposition on the cataloge</description>
+		</property>
+		<property>
+			<name>gcatBaseUrl</name>
+			<description>the baseUrl to access the catalogue</description>
+		</property>
 		<property>
 			<name>hiveDbName</name>
 			<description>the target hive database name</description>
@@ -88,8 +92,8 @@

 	<action name="reset_outputpath">
 		<fs>
-			<delete path="${outputPath}"/>
-			<mkdir path="${outputPath}"/>
+			<delete path="${workingDir}"/>
+			<mkdir path="${workingDir}"/>
 		</fs>
 		<ok to="fork_dump"/>
 		<error to="Kill"/>
@@ -122,7 +126,7 @@
 			</spark-opts>
 			<arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
 			<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
-			<arg>--outputPath</arg><arg>${outputPath}/publication</arg>
+			<arg>--outputPath</arg><arg>${workingDir}/publication</arg>
 			<arg>--communityName</arg><arg>${communityName}</arg>
 		</spark>
 		<ok to="join_dump"/>
@@ -148,7 +152,7 @@
 			</spark-opts>
 			<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
 			<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
-			<arg>--outputPath</arg><arg>${outputPath}/dataset</arg>
+			<arg>--outputPath</arg><arg>${workingDir}/dataset</arg>
 			<arg>--communityName</arg><arg>${communityName}</arg>
 		</spark>
 		<ok to="join_dump"/>
@@ -174,7 +178,7 @@
 			</spark-opts>
 			<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
 			<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
-			<arg>--outputPath</arg><arg>${outputPath}/otherresearchproduct</arg>
+			<arg>--outputPath</arg><arg>${workingDir}/otherresearchproduct</arg>
 			<arg>--communityName</arg><arg>${communityName}</arg>
 		</spark>
 		<ok to="join_dump"/>
@@ -200,7 +204,7 @@
 			</spark-opts>
 			<arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
 			<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
-			<arg>--outputPath</arg><arg>${outputPath}/software</arg>
+			<arg>--outputPath</arg><arg>${workingDir}/software</arg>
 			<arg>--communityName</arg><arg>${communityName}</arg>
 		</spark>
 		<ok to="join_dump"/>
@@ -212,9 +216,10 @@
 	<action name="populate_catalogue">
 		<java>
 			<main-class>eu.dnetlib.dhp.oa.graph.dump.gcat.SendToCatalogue</main-class>
-			<arg>--hdfsPath</arg><arg>${workingDir}/blacklist</arg>
+			<arg>--hdfsPath</arg><arg>${workingDir}</arg>
 			<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
-			<arg>--accessToken</arg><arg>${accessToken}</arg>
+			<arg>--gcattoken</arg><arg>${gcattoken}</arg>
+			<arg>--gcatBaseUrl</arg><arg>${gcatBaseUrl}</arg>
 		</java>
 		<ok to="End"/>
 		<error to="Kill"/>
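Putting this action together with the SendToCatalogue hunks earlier in the commit, the java action's arguments reach the catalogue client roughly as follows; the surrounding method is only a scaffold assembled from those hunks, not the actual main().

// Scaffold assembled from the hunks in this commit: the workflow passes
// --gcattoken ${gcattoken}, --hdfsPath ${workingDir}, --hdfsNameNode ${nameNode}, --gcatBaseUrl ${gcatBaseUrl},
// which SendToCatalogue parses and feeds into GCatAPIClient before purging and re-publishing.
public class PopulateCatalogueSketch {

	public static void configureAndPurge(String gcattoken, String gcatBaseUrl) throws Exception {
		GCatAPIClient gCatAPIClient = new GCatAPIClient(); // class from this commit, assumed on the classpath
		gCatAPIClient.setApplicationToken(gcattoken); // value of --gcattoken
		gCatAPIClient.setGcatBaseURL(gcatBaseUrl); // value of --gcatBaseUrl
		int purged = gCatAPIClient.purgeAll(); // the catalogue is emptied before the new dump is sent
		System.out.println("purged: " + purged);
	}
}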