Add param for limiting repo URLs

Serafeim Chatzopoulos 2023-10-03 14:39:08 +03:00
parent 839a8524e7
commit 9f73d93e62
6 changed files with 35 additions and 25 deletions


@@ -1,32 +1,26 @@
 package eu.dnetlib.dhp.swh;
+import static eu.dnetlib.dhp.utils.DHPUtils.getHadoopConfiguration;
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.net.URISyntaxException;
+import java.net.URL;
+import java.nio.charset.StandardCharsets;
+import org.apache.commons.cli.ParseException;
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.common.collection.CollectorException;
 import eu.dnetlib.dhp.common.collection.HttpClientParams;
 import eu.dnetlib.dhp.swh.utils.SWHConnection;
 import eu.dnetlib.dhp.swh.utils.SWHConstants;
 import eu.dnetlib.dhp.swh.utils.SWHUtils;
-import org.apache.commons.cli.ParseException;
-import org.apache.commons.io.IOUtils;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.SequenceFile;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.net.URL;
-import static eu.dnetlib.dhp.utils.DHPUtils.getHadoopConfiguration;
 /**
  * Given a file with software repository URLs, this class


@@ -51,6 +51,9 @@ public class CollectSoftwareRepositoryURLs {
 		final String hiveMetastoreUris = parser.get("hiveMetastoreUris");
 		log.info("hiveMetastoreUris: {}", hiveMetastoreUris);
+		final Integer softwareLimit = Integer.parseInt(parser.get("softwareLimit"));
+		log.info("softwareLimit: {}", softwareLimit);
 		SparkConf conf = new SparkConf();
 		conf.set("hive.metastore.uris", hiveMetastoreUris);
@@ -58,18 +61,23 @@ public class CollectSoftwareRepositoryURLs {
 			conf,
 			isSparkSessionManaged,
 			spark -> {
-				doRun(spark, hiveDbName, outputPath);
+				doRun(spark, hiveDbName, softwareLimit, outputPath);
 			});
 	}
-	private static <I extends Result> void doRun(SparkSession spark, String hiveDbName, String outputPath) {
+	private static <I extends Result> void doRun(SparkSession spark, String hiveDbName, Integer limit,
+		String outputPath) {
 		String queryTemplate = "SELECT distinct coderepositoryurl.value " +
 			"FROM %s.software " +
 			"WHERE coderepositoryurl.value IS NOT NULL " +
 			"AND datainfo.deletedbyinference = FALSE " +
-			"AND datainfo.invisible = FALSE " +
-			"LIMIT 5000";
+			"AND datainfo.invisible = FALSE ";
+		if (limit != null) {
+			queryTemplate += String.format("LIMIT %s", limit);
+		}
 		String query = String.format(queryTemplate, hiveDbName);
 		log.info("Hive query to fetch software code URLs: {}", query);


@@ -82,7 +82,6 @@ public class PrepareSWHActionsets {
 			softwareRDD
 				.saveAsHadoopFile(
					outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
-//					, GzipCodec.class);
 		});
 	}
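The deleted line was a leftover hint that gzip-compressed output had once been considered here. For reference, a sketch of what enabling it could look like, assuming Spark's JavaPairRDD.saveAsHadoopFile overload that takes a compression codec class and the Text/Text pair type implied by the hunk above:

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.api.java.JavaPairRDD;

public class CompressedOutputSketch {

	// Same call shape as in PrepareSWHActionsets, plus the optional codec
	// argument: Spark hands GzipCodec down to the Hadoop output format.
	static void save(JavaPairRDD<Text, Text> softwareRDD, String outputPath) {
		softwareRDD
			.saveAsHadoopFile(
				outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
	}
}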


@@ -22,5 +22,11 @@
 		"paramLongName": "hiveMetastoreUris",
 		"paramDescription": "the hive metastore uris",
 		"paramRequired": true
+	},
+	{
+		"paramName": "slim",
+		"paramLongName": "softwareLimit",
+		"paramDescription": "limit on the number of software repo URLs to fetch",
+		"paramRequired": false
 	}
 ]
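To see the optional flag end to end, here is a sketch of how this definition behaves at parse time, assuming the ArgumentApplicationParser shape used throughout dnet-hadoop (JSON definition in the constructor, argv parsed, values read back by long name); the inlined JSON mirrors the entry added above, and the class name is hypothetical.

import eu.dnetlib.dhp.application.ArgumentApplicationParser;

public class SoftwareLimitParamSketch {

	public static void main(String[] args) throws Exception {
		// The parameter definition added by this commit, inlined for the sketch.
		String json = "[{\"paramName\":\"slim\"," +
			"\"paramLongName\":\"softwareLimit\"," +
			"\"paramDescription\":\"limit on the number of software repo URLs to fetch\"," +
			"\"paramRequired\":false}]";

		ArgumentApplicationParser parser = new ArgumentApplicationParser(json);

		// The flag is optional, so parsing succeeds whether or not it is given.
		parser.parseArgument(new String[] { "--softwareLimit", "500" });
		System.out.println(parser.get("softwareLimit")); // "500"; null when the flag is omitted
	}
}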


@@ -14,4 +14,6 @@ maxNumberOfRetry=2
 retryDelay=1
 requestDelay=100
+softwareLimit=500
+resume=collect-software-repository-urls


@@ -83,6 +83,7 @@
 	<arg>--softwareCodeRepositoryURLs</arg><arg>${softwareCodeRepositoryURLs}</arg>
 	<arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
 	<arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
+	<arg>--softwareLimit</arg><arg>${softwareLimit}</arg>
 </spark>
 <ok to="collect-repository-last-visit-data"/>
 <error to="Kill"/>