From 9f73d93e624ae08ca61288503171226477fd87fd Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Tue, 3 Oct 2023 14:39:08 +0300 Subject: [PATCH] Add param for limiting repo Urls --- .../swh/CollectLastVisitRepositoryData.java | 34 ++++++++----------- .../swh/CollectSoftwareRepositoryURLs.java | 16 ++++++--- .../dnetlib/dhp/swh/PrepareSWHActionsets.java | 1 - ...nput_collect_software_repository_urls.json | 6 ++++ .../eu/dnetlib/dhp/swh/job.properties | 2 ++ .../eu/dnetlib/dhp/swh/oozie_app/workflow.xml | 1 + 6 files changed, 35 insertions(+), 25 deletions(-) diff --git a/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/CollectLastVisitRepositoryData.java b/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/CollectLastVisitRepositoryData.java index 296a4cce1..e602266a8 100644 --- a/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/CollectLastVisitRepositoryData.java +++ b/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/CollectLastVisitRepositoryData.java @@ -1,32 +1,26 @@ package eu.dnetlib.dhp.swh; -import static eu.dnetlib.dhp.utils.DHPUtils.getHadoopConfiguration; - -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStreamReader; -import java.net.URISyntaxException; -import java.net.URL; -import java.nio.charset.StandardCharsets; - -import org.apache.commons.cli.ParseException; -import org.apache.commons.io.IOUtils; -import org.apache.hadoop.fs.FSDataInputStream; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.SequenceFile; -import org.apache.hadoop.io.Text; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.collection.CollectorException; import eu.dnetlib.dhp.common.collection.HttpClientParams; import eu.dnetlib.dhp.swh.utils.SWHConnection; import eu.dnetlib.dhp.swh.utils.SWHConstants; import eu.dnetlib.dhp.swh.utils.SWHUtils; +import org.apache.commons.cli.ParseException; +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.SequenceFile; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.BufferedReader; +import java.io.IOException; +import java.net.URL; + +import static eu.dnetlib.dhp.utils.DHPUtils.getHadoopConfiguration; /** * Given a file with software repository URLs, this class diff --git a/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/CollectSoftwareRepositoryURLs.java b/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/CollectSoftwareRepositoryURLs.java index 6232fa322..abd51bc5b 100644 --- a/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/CollectSoftwareRepositoryURLs.java +++ b/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/CollectSoftwareRepositoryURLs.java @@ -51,6 +51,9 @@ public class CollectSoftwareRepositoryURLs { final String hiveMetastoreUris = parser.get("hiveMetastoreUris"); log.info("hiveMetastoreUris: {}", hiveMetastoreUris); + final Integer softwareLimit = Integer.parseInt(parser.get("softwareLimit")); + log.info("softwareLimit: {}", softwareLimit); + SparkConf conf = new SparkConf(); conf.set("hive.metastore.uris", hiveMetastoreUris); @@ -58,18 +61,23 @@ public class CollectSoftwareRepositoryURLs { conf, isSparkSessionManaged, spark -> { - doRun(spark, hiveDbName, outputPath); + doRun(spark, hiveDbName, softwareLimit, outputPath); }); } - private static void doRun(SparkSession spark, String hiveDbName, String outputPath) { + private static void doRun(SparkSession spark, String hiveDbName, Integer limit, + String outputPath) { String queryTemplate = "SELECT distinct coderepositoryurl.value " + "FROM %s.software " + "WHERE coderepositoryurl.value IS NOT NULL " + "AND datainfo.deletedbyinference = FALSE " + - "AND datainfo.invisible = FALSE " + - "LIMIT 5000"; + "AND datainfo.invisible = FALSE "; + + if (limit != null) { + queryTemplate += String.format("LIMIT %s", limit); + } + String query = String.format(queryTemplate, hiveDbName); log.info("Hive query to fetch software code URLs: {}", query); diff --git a/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/PrepareSWHActionsets.java b/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/PrepareSWHActionsets.java index c0ab11bc4..c54e10d3e 100644 --- a/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/PrepareSWHActionsets.java +++ b/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/PrepareSWHActionsets.java @@ -82,7 +82,6 @@ public class PrepareSWHActionsets { softwareRDD .saveAsHadoopFile( outputPath, Text.class, Text.class, SequenceFileOutputFormat.class); -// , GzipCodec.class); }); } diff --git a/dhp-workflows/dhp-swh/src/main/resources/eu/dnetlib/dhp/swh/input_collect_software_repository_urls.json b/dhp-workflows/dhp-swh/src/main/resources/eu/dnetlib/dhp/swh/input_collect_software_repository_urls.json index 6e98c7673..4459fe9df 100644 --- a/dhp-workflows/dhp-swh/src/main/resources/eu/dnetlib/dhp/swh/input_collect_software_repository_urls.json +++ b/dhp-workflows/dhp-swh/src/main/resources/eu/dnetlib/dhp/swh/input_collect_software_repository_urls.json @@ -22,5 +22,11 @@ "paramLongName": "hiveMetastoreUris", "paramDescription": "the hive metastore uris", "paramRequired": true + }, + { + "paramName": "slim", + "paramLongName": "softwareLimit", + "paramDescription": "limit on the number of software repo URL to fetch", + "paramRequired": false } ] \ No newline at end of file diff --git a/dhp-workflows/dhp-swh/src/main/resources/eu/dnetlib/dhp/swh/job.properties b/dhp-workflows/dhp-swh/src/main/resources/eu/dnetlib/dhp/swh/job.properties index 651bae337..8dd0689a3 100644 --- a/dhp-workflows/dhp-swh/src/main/resources/eu/dnetlib/dhp/swh/job.properties +++ b/dhp-workflows/dhp-swh/src/main/resources/eu/dnetlib/dhp/swh/job.properties @@ -14,4 +14,6 @@ maxNumberOfRetry=2 retryDelay=1 requestDelay=100 +softwareLimit=500 + resume=collect-software-repository-urls diff --git a/dhp-workflows/dhp-swh/src/main/resources/eu/dnetlib/dhp/swh/oozie_app/workflow.xml b/dhp-workflows/dhp-swh/src/main/resources/eu/dnetlib/dhp/swh/oozie_app/workflow.xml index 7aa667a4a..e0763414f 100644 --- a/dhp-workflows/dhp-swh/src/main/resources/eu/dnetlib/dhp/swh/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-swh/src/main/resources/eu/dnetlib/dhp/swh/oozie_app/workflow.xml @@ -83,6 +83,7 @@ --softwareCodeRepositoryURLs${softwareCodeRepositoryURLs} --hiveDbName${hiveDbName} --hiveMetastoreUris${hiveMetastoreUris} + --softwareLimit${softwareLimit}