simple code to get file from URL

2020-05-15 18:18:01 +02:00 · 2020-05-15 18:18:01 +02:00 · 22cb9e0da7
parent 3aaad753fd
commit 22cb9e0da7
7 changed files with 255 additions and 0 deletions
--- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Programme.java
+++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Programme.java
@ -0,0 +1,4 @@
+package eu.dnetlib.dhp.schema.oaf;
+
+/**
+ * Placeholder class: it currently declares no fields and no methods.
+ */
+public class Programme {
+}
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionset/h2020programme/GetFile.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionset/h2020programme/GetFile.java
@ -0,0 +1,53 @@
+
+package eu.dnetlib.dhp.actionset.h2020programme;
+
+import java.io.*;
+import java.net.URL;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+
+public class GetFile {
+
+	private static final Log log = LogFactory.getLog(GetFile.class);
+
+	public static void main(final String[] args) throws Exception {
+		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
+			IOUtils
+				.toString(
+					GetFile.class
+						.getResourceAsStream(
+							"/eu/dnetlib/dhp/actionset/h2020programme/parameters.json")));
+
+		Configuration conf = new Configuration();
+
+		parser.parseArgument(args);
+
+		final String fileURL = parser.get("fileURL");
+		final String hdfsPath = parser.get("hdfsPath");
+		final String hdfsNameNode = parser.get("hdfsNameNode");
+
+		conf.set("fs.defaultFS", hdfsNameNode);
+		FileSystem fileSystem = FileSystem.get(conf);
+		Path hdfsWritePath = new Path(hdfsPath);
+		FSDataOutputStream fsDataOutputStream = null;
+		if (fileSystem.exists(hdfsWritePath)) {
+			fsDataOutputStream = fileSystem.append(hdfsWritePath);
+		} else {
+			fsDataOutputStream = fileSystem.create(hdfsWritePath);
+		}
+
+		InputStream is = new BufferedInputStream(new URL(fileURL).openStream());
+
+		org.apache.hadoop.io.IOUtils.copyBytes(is, fsDataOutputStream, 4096, true);
+
+	}
+
+}
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionset/h2020programme/action_set_parameters.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionset/h2020programme/action_set_parameters.json
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionset/h2020programme/oozie_app/lib/scripts/getprogrammefile.sh
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionset/h2020programme/oozie_app/lib/scripts/getprogrammefile.sh
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionset/h2020programme/oozie_app/lib/scripts/getprojectfile.sh
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionset/h2020programme/oozie_app/lib/scripts/getprojectfile.sh
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionset/h2020programme/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionset/h2020programme/oozie_app/workflow.xml
@ -0,0 +1,112 @@
+<workflow-app name="CollectionWorkflow" xmlns="uri:oozie:workflow:0.5">
+    <parameters>
+        <property>
+            <name>sequenceFilePath</name>
+            <description>the path to store the sequence file of the native metadata collected</description>
+        </property>
+
+        <property>
+            <name>mdStorePath</name>
+            <description>the path of the native mdstore</description>
+        </property>
+
+        <property>
+            <name>apiDescription</name>
+            <description>A json encoding of the API Description class</description>
+        </property>
+
+        <property>
+            <name>dataSourceInfo</name>
+            <description>A json encoding of the Datasource Info</description>
+        </property>
+        <property>
+            <name>identifierPath</name>
+            <description>An xpath to retrieve the metadata identifier for the generation of DNet Identifier </description>
+        </property>
+
+        <property>
+            <name>metadataEncoding</name>
+            <description> The type of the metadata XML/JSON</description>
+        </property>
+
+        <property>
+            <name>timestamp</name>
+            <description>The timestamp of the collection date</description>
+        </property>
+
+        <property>
+            <name>workflowId</name>
+            <description>The identifier of the workflow</description>
+        </property>
+    </parameters>
+
+    <start to="DeleteMDStoresNative"/>
+    <kill name="Kill">
+        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+    </kill>
+    <!-- First step of the workflow: issues mkdir and then delete on both ${sequenceFilePath} and
+         ${mdStorePath}, then moves on to CollectionWorker (or to Kill on error).
+         NOTE(review): presumably the mkdir is only there so that the following delete always has
+         an existing target; confirm. -->
+    <action name="DeleteMDStoresNative">
+        <fs>
+            <mkdir path='${sequenceFilePath}'/>
+            <mkdir path='${mdStorePath}'/>
+            <delete path='${sequenceFilePath}'/>
+            <delete path='${mdStorePath}'/>
+        </fs>
+        <ok to="CollectionWorker"/>
+        <error to="Kill"/>
+    </action>
+    <!-- Java action running eu.dnetlib.dhp.collection.worker.DnetCollectorWorker with the sequence file
+         path, the API description, the name node, the rmq_* settings and the workflow id.
+         On success the workflow continues with GenerateNativeStoreSparkJob, on error it goes to Kill.
+         NOTE(review): the program flags are supplied through java-opts elements; in Oozie these
+         normally carry JVM options while arg elements carry the main class arguments. Confirm the
+         flags actually reach the main class.
+         NOTE(review): the value following the u flag is a hard-coded user name, and jobTracker,
+         nameNode and the rmq_* properties are not declared in the parameters section of this file. -->
+    <action name="CollectionWorker">
+        <java>
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <main-class>eu.dnetlib.dhp.collection.worker.DnetCollectorWorker</main-class>
+            <java-opts>-p</java-opts><java-opts>${sequenceFilePath}</java-opts>
+            <java-opts>-a</java-opts><java-opts>${apiDescription}</java-opts>
+            <java-opts>-n</java-opts><java-opts>${nameNode}</java-opts>
+            <java-opts>-rh</java-opts><java-opts>${rmq_host}</java-opts>
+            <java-opts>-ru</java-opts><java-opts>${rmq_user}</java-opts>
+            <java-opts>-rp</java-opts><java-opts>${rmq_pwd}</java-opts>
+            <java-opts>-rr</java-opts><java-opts>${rmq_report}</java-opts>
+            <java-opts>-ro</java-opts><java-opts>${rmq_ongoing}</java-opts>
+            <java-opts>-u</java-opts><java-opts>sandro.labruzzo</java-opts>
+            <java-opts>-w</java-opts><java-opts>${workflowId}</java-opts>
+        </java>
+        <ok to="GenerateNativeStoreSparkJob"/>
+        <error to="Kill"/>
+    </action>
+    <!-- Spark action (master yarn, cluster mode) running
+         eu.dnetlib.dhp.collection.GenerateNativeStoreSparkJob with ${sequenceFilePath} as input and
+         ${mdStorePath} as output. On success the workflow ends; on error it goes to DropInvalidStore.
+         NOTE(review): spark-opts hard-codes the executor count and an HDFS URL of one specific
+         cluster for spark.yarn.jars; confirm this is meant to be environment specific.
+         NOTE(review): the provenance argument value starts with a blank before ${dataSourceInfo};
+         confirm the receiving parser tolerates it. -->
+    <action name="GenerateNativeStoreSparkJob">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <job-tracker>${jobTracker}</job-tracker>
+            <name-node>${nameNode}</name-node>
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>GenerateNativeStoreSparkJob</name>
+            <class>eu.dnetlib.dhp.collection.GenerateNativeStoreSparkJob</class>
+            <jar>dhp-aggregations-1.0.0-SNAPSHOT.jar</jar>
+            <spark-opts>--num-executors 50 --conf spark.yarn.jars=&quot;hdfs://hadoop-rm1.garr-pa1.d4science.org:8020/user/oozie/share/lib/lib_20180405103059/spark2&quot;</spark-opts>
+            <arg>--encoding</arg> <arg>${metadataEncoding}</arg>
+            <arg>--dateOfCollection</arg> <arg>${timestamp}</arg>
+            <arg>--provenance</arg> <arg> ${dataSourceInfo}</arg>
+            <arg>--xpath</arg><arg>${identifierPath}</arg>
+            <arg>--input</arg><arg>${sequenceFilePath}</arg>
+            <arg>--output</arg><arg>${mdStorePath}</arg>
+            <arg>-rh</arg><arg>${rmq_host}</arg>
+            <arg>-ru</arg><arg>${rmq_user}</arg>
+            <arg>-rp</arg><arg>${rmq_pwd}</arg>
+            <arg>-rr</arg><arg>${rmq_report}</arg>
+            <arg>-ro</arg><arg>${rmq_ongoing}</arg>
+            <arg>-w</arg><arg>${workflowId}</arg>
+        </spark>
+        <ok to="End"/>
+        <error to="DropInvalidStore"/>
+    </action>
+
+    <!-- Clean-up step reached when GenerateNativeStoreSparkJob fails: deletes ${mdStorePath}/../ and
+         then goes to Kill whether or not the delete succeeds.
+         NOTE(review): the path ends in /../ and therefore presumably targets the parent directory of
+         ${mdStorePath}; confirm that removing the parent is intended. -->
+    <action name="DropInvalidStore">
+        <fs>
+            <delete path='${mdStorePath}/../'/>
+        </fs>
+        <ok to="Kill"/>
+        <error to="Kill"/>
+    </action>
+
+    <end name="End"/>
+</workflow-app>
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionset/h2020programme/parameters.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionset/h2020programme/parameters.json
@ -0,0 +1,86 @@
+[
+  {
+    "paramName": "issm",
+    "paramLongName": "isSparkSessionManaged",
+    "paramDescription": "when true will stop SparkSession after job execution",
+    "paramRequired": false
+  },
+  {
+    "paramName": "e",
+    "paramLongName": "encoding",
+    "paramDescription": "the encoding of the input record should be JSON or XML",
+    "paramRequired": true
+  },
+  {
+    "paramName": "d",
+    "paramLongName": "dateOfCollection",
+    "paramDescription": "the date when the record has been stored",
+    "paramRequired": true
+  },
+  {
+    "paramName": "p",
+    "paramLongName": "provenance",
+    "paramDescription": "the infos about the provenance of the collected records",
+    "paramRequired": true
+  },
+  {
+    "paramName": "x",
+    "paramLongName": "xpath",
+    "paramDescription": "the xpath to identify the record identifier",
+    "paramRequired": true
+  },
+  {
+    "paramName": "i",
+    "paramLongName": "input",
+    "paramDescription": "the path of the sequential file to read",
+    "paramRequired": true
+  },
+  {
+    "paramName": "o",
+    "paramLongName": "output",
+    "paramDescription": "the path of the result DataFrame on HDFS",
+    "paramRequired": true
+  },
+  {
+    "paramName": "ru",
+    "paramLongName": "rabbitUser",
+    "paramDescription": "the user to connect with RabbitMq for messaging",
+    "paramRequired": true
+  },
+  {
+    "paramName": "rp",
+    "paramLongName": "rabbitPassword",
+    "paramDescription": "the password to connect with RabbitMq for messaging",
+    "paramRequired": true
+  },
+  {
+    "paramName": "rh",
+    "paramLongName": "rabbitHost",
+    "paramDescription": "the host of the RabbitMq server",
+    "paramRequired": true
+  },
+  {
+    "paramName": "ro",
+    "paramLongName": "rabbitOngoingQueue",
+    "paramDescription": "the name of the ongoing queue",
+    "paramRequired": true
+  },
+  {
+    "paramName": "rr",
+    "paramLongName": "rabbitReportQueue",
+    "paramDescription": "the name of the report queue",
+    "paramRequired": true
+  },
+  {
+    "paramName": "w",
+    "paramLongName": "workflowId",
+    "paramDescription": "the identifier of the dnet Workflow",
+    "paramRequired": true
+  },
+  {
+    "paramName": "t",
+    "paramLongName": "isTest",
+    "paramDescription": "the name of the report queue",
+    "paramRequired": false
+  }
+]