Merge branch 'beta' into rest-collector-request-header-map

2024-05-29 15:46:23 +02:00 · 2024-05-29 15:46:23 +02:00 · c5f4da16a4
parent 1b165a14a0 e996787be2
commit c5f4da16a4
7 changed files with 110 additions and 45 deletions
--- a/dhp-common/pom.xml
+++ b/dhp-common/pom.xml
@ -169,4 +169,23 @@
 		</dependency>
 	</dependencies>

+	<!-- dependencies required on JDK9+ because the Java EE modules (JAXB, JAX-WS) are no longer available by default (and were removed entirely in JDK 11) -->
+	<profiles>
+		<profile>
+			<id>spark-34</id>
+			<dependencies>
+				<dependency>
+					<groupId>javax.xml.bind</groupId>
+					<artifactId>jaxb-api</artifactId>
+					<version>2.2.11</version>
+				</dependency>
+				<dependency>
+					<groupId>com.sun.xml.ws</groupId>
+					<artifactId>jaxws-ri</artifactId>
+					<version>2.3.3</version>
+					<type>pom</type>
+				</dependency>
+			</dependencies>
+		</profile>
+	</profiles>
 </project>
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/crossref/irish_funder.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/crossref/irish_funder.json
@ -625,12 +625,6 @@
    "name": "Alimentary Health",
    "synonym": []
  },
-  {
-    "id": "501100011103",
-    "uri": "http://dx.doi.org/10.13039/501100011103",
-    "name": "Rann\u00eds",
-    "synonym": []
-  },
  {
    "id": "501100012354",
    "uri": "http://dx.doi.org/10.13039/501100012354",
--- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/OsfPreprintCollectorTest.java
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/rest/OsfPreprintCollectorTest.java
@ -39,8 +39,8 @@ public class OsfPreprintCollectorTest {
 	private final String resumptionType = "page";
 	private final String resumptionXpath = "/*/*[local-name()='links']/*[local-name()='next']";

-	private final String resultSizeParam = "";
-	private final String resultSizeValue = "";
+	private final String resultSizeParam = "page[size]";
+	private final String resultSizeValue = "100";

 	private final String resultFormatParam = "format";
 	private final String resultFormatValue = "json";
@ -74,7 +74,7 @@ public class OsfPreprintCollectorTest {
 		final AtomicInteger i = new AtomicInteger(0);
 		final Stream<String> stream = this.rcp.collect(this.api, new AggregatorReport());

-		stream.limit(200).forEach(s -> {
+		stream.limit(2000).forEach(s -> {
 			Assertions.assertTrue(s.length() > 0);
 			i.incrementAndGet();
 			log.info(s);
--- a/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh
+++ b/dhp-workflows/dhp-stats-hist-snaps/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-hist-snaps/oozie_app/copyDataToImpalaCluster.sh
@ -72,6 +72,8 @@ function copydb() {
    rm -f error.log
    if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
      exit 2
+    else
+      return 2
    fi
  fi

@ -90,19 +92,30 @@ function copydb() {
                -pb \
                ${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}

-  # Check the exit status of the "hadoop distcp" command.
-  if [ $? -eq 0 ]; then
-    echo -e "\nSuccessfully copied the files of '${db}'.\n"
+  if [ $? -eq 0 ]; then # Check the exit status of the "hadoop distcp" command.
+    echo -e "\nSuccessfully copied the files of '${db}' from Ocean to Impala cluster.\n"
  else
    echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
    rm -f error.log
    if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
      exit 3
+    else
+      return 3
    fi
  fi

-  # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
-  #hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db
+  # Give the owner WRITE and EXECUTE permissions on the DB's directory only (non-recursively), so that more tables can be created later on top of that DB.
+  hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod u+wx ${IMPALA_HDFS_DB_BASE_PATH}/${db}.db
+  # In case we ever use this script for a writable DB (using inserts/updates), we should perform the costly recursive operation as well, using the "-R" param.
+  if [ $? -ne 0 ]; then # Check the exit status..
+    echo -e "\n\nERROR: FAILED TO ASSIGN WRITE AND EXECUTE PERMISSIONS TO THE DIRECTORY OF DB: '${db}'. GOT EXIT STATUS: $?\n\n"
+    rm -f error.log
+    if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+      exit 4
+    else
+      return 4
+    fi
+  fi

  echo -e "\nCreating schema for db: '${db}'\n"

@ -131,7 +144,7 @@ function copydb() {
      if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
          echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
          if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
-            exit 4
+            exit 5
          fi
      else
        impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
@ -139,7 +152,7 @@ function copydb() {
        if [ -n "$log_errors" ]; then
          echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
          if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
-            exit 5
+            exit 6
          fi
        fi
      fi
@ -185,7 +198,7 @@ function copydb() {
    if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
      echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
      if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
-        exit 6
+        exit 7
      fi
    elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
      echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
@ -215,7 +228,7 @@ function copydb() {
    echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
    rm -f error.log
    if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
-      exit 7
+      exit 8
    fi
  fi

--- a/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh
+++ b/dhp-workflows/dhp-stats-monitor-irish/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor-irish/oozie_app/copyDataToImpalaCluster.sh
@ -72,6 +72,8 @@ function copydb() {
    rm -f error.log
    if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
      exit 2
+    else
+      return 2
    fi
  fi

@ -90,19 +92,30 @@ function copydb() {
                -pb \
                ${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}

-  # Check the exit status of the "hadoop distcp" command.
-  if [ $? -eq 0 ]; then
-    echo -e "\nSuccessfully copied the files of '${db}'.\n"
+  if [ $? -eq 0 ]; then # Check the exit status of the "hadoop distcp" command.
+    echo -e "\nSuccessfully copied the files of '${db}' from Ocean to Impala cluster.\n"
  else
    echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
    rm -f error.log
    if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
      exit 3
+    else
+      return 3
    fi
  fi

-  # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
-  #hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db
+  # Give the owner WRITE and EXECUTE permissions on the DB's directory only (non-recursively), so that more tables can be created later on top of that DB.
+  hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod u+wx ${IMPALA_HDFS_DB_BASE_PATH}/${db}.db
+  # In case we ever use this script for a writable DB (using inserts/updates), we should perform the costly recursive operation as well, using the "-R" param.
+  if [ $? -ne 0 ]; then # Check the exit status..
+    echo -e "\n\nERROR: FAILED TO ASSIGN WRITE AND EXECUTE PERMISSIONS TO THE DIRECTORY OF DB: '${db}'. GOT EXIT STATUS: $?\n\n"
+    rm -f error.log
+    if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+      exit 4
+    else
+      return 4
+    fi
+  fi

  echo -e "\nCreating schema for db: '${db}'\n"

@ -131,7 +144,7 @@ function copydb() {
      if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
          echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
          if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
-            exit 4
+            exit 5
          fi
      else
        impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
@ -139,7 +152,7 @@ function copydb() {
        if [ -n "$log_errors" ]; then
          echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
          if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
-            exit 5
+            exit 6
          fi
        fi
      fi
@ -185,7 +198,7 @@ function copydb() {
    if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
      echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
      if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
-        exit 6
+        exit 7
      fi
    elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
      echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
@ -215,7 +228,7 @@ function copydb() {
    echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
    rm -f error.log
    if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
-      exit 7
+      exit 8
    fi
  fi

--- a/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh
+++ b/dhp-workflows/dhp-stats-monitor-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats-monitor/oozie_app/copyDataToImpalaCluster.sh
@ -72,6 +72,8 @@ function copydb() {
    rm -f error.log
    if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
      exit 2
+    else
+      return 2
    fi
  fi

@ -90,19 +92,30 @@ function copydb() {
                -pb \
                ${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}

-  # Check the exit status of the "hadoop distcp" command.
-  if [ $? -eq 0 ]; then
-    echo -e "\nSuccessfully copied the files of '${db}'.\n"
+  if [ $? -eq 0 ]; then # Check the exit status of the "hadoop distcp" command.
+    echo -e "\nSuccessfully copied the files of '${db}' from Ocean to Impala cluster.\n"
  else
    echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
    rm -f error.log
    if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
      exit 3
+    else
+      return 3
    fi
  fi

-  # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
-  #hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db
+  # Give the owner WRITE and EXECUTE permissions on the DB's directory only (non-recursively), so that more tables can be created later on top of that DB.
+  hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod u+wx ${IMPALA_HDFS_DB_BASE_PATH}/${db}.db
+  # In case we ever use this script for a writable DB (using inserts/updates), we should perform the costly recursive operation as well, using the "-R" param.
+  if [ $? -ne 0 ]; then # Check the exit status..
+    echo -e "\n\nERROR: FAILED TO ASSIGN WRITE AND EXECUTE PERMISSIONS TO THE DIRECTORY OF DB: '${db}'. GOT EXIT STATUS: $?\n\n"
+    rm -f error.log
+    if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+      exit 4
+    else
+      return 4
+    fi
+  fi

  echo -e "\nCreating schema for db: '${db}'\n"

@ -131,7 +144,7 @@ function copydb() {
      if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
          echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
          if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
-            exit 4
+            exit 5
          fi
      else
        impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
@ -139,7 +152,7 @@ function copydb() {
        if [ -n "$log_errors" ]; then
          echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
          if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
-            exit 5
+            exit 6
          fi
        fi
      fi
@ -185,7 +198,7 @@ function copydb() {
    if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
      echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
      if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
-        exit 6
+        exit 7
      fi
    elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
      echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
@ -215,7 +228,7 @@ function copydb() {
    echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
    rm -f error.log
    if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
-      exit 7
+      exit 8
    fi
  fi

--- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh
+++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/copyDataToImpalaCluster.sh
@ -74,6 +74,8 @@ function copydb() {
    rm -f error.log
    if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
      exit 2
+    else
+      return 2
    fi
  fi

@ -92,19 +94,30 @@ function copydb() {
                -pb \
                ${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}

-  # Check the exit status of the "hadoop distcp" command.
-  if [ $? -eq 0 ]; then
-    echo -e "\nSuccessfully copied the files of '${db}'.\n"
+  if [ $? -eq 0 ]; then # Check the exit status of the "hadoop distcp" command.
+    echo -e "\nSuccessfully copied the files of '${db}' from Ocean to Impala cluster.\n"
  else
    echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
    rm -f error.log
    if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
      exit 3
+    else
+      return 3
    fi
  fi

-  # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
-  #hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod -R 777 ${TEMP_SUBDIR_FULLPATH}/${db}.db
+  # Give the owner WRITE and EXECUTE permissions on the DB's directory only (non-recursively), so that more tables can be created later on top of that DB.
+  hdfs dfs -conf ${IMPALA_CONFIG_FILE} -chmod u+wx ${IMPALA_HDFS_DB_BASE_PATH}/${db}.db
+  # In case we ever use this script for a writable DB (using inserts/updates), we should perform the costly recursive operation as well, using the "-R" param.
+  if [ $? -ne 0 ]; then # Check the exit status..
+    echo -e "\n\nERROR: FAILED TO ASSIGN WRITE AND EXECUTE PERMISSIONS TO THE DIRECTORY OF DB: '${db}'. GOT EXIT STATUS: $?\n\n"
+    rm -f error.log
+    if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
+      exit 4
+    else
+      return 4
+    fi
+  fi

  echo -e "\nCreating schema for db: '${db}'\n"

@ -133,7 +146,7 @@ function copydb() {
      if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
          echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
          if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
-            exit 4
+            exit 5
          fi
      else
        impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
@ -141,7 +154,7 @@ function copydb() {
        if [ -n "$log_errors" ]; then
          echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
          if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
-            exit 5
+            exit 6
          fi
        fi
      fi
@ -187,7 +200,7 @@ function copydb() {
    if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
      echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
      if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
-        exit 6
+        exit 7
      fi
    elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
      echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
@ -217,7 +230,7 @@ function copydb() {
    echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
    rm -f error.log
    if [[ SHOULD_EXIT_WHOLE_SCRIPT_UPON_ERROR -eq 1 ]]; then
-      exit 7
+      exit 8
    fi
  fi