forked from D-Net/dnet-hadoop
Add param for limiting repo URLs
This commit is contained in:
parent
839a8524e7
commit
9f73d93e62
|
@ -1,32 +1,26 @@
|
|||
|
||||
package eu.dnetlib.dhp.swh;
|
||||
|
||||
import static eu.dnetlib.dhp.utils.DHPUtils.getHadoopConfiguration;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.net.URISyntaxException;
|
||||
import java.net.URL;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
import org.apache.commons.cli.ParseException;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.fs.FSDataInputStream;
|
||||
import org.apache.hadoop.fs.FileStatus;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.io.SequenceFile;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.common.collection.CollectorException;
|
||||
import eu.dnetlib.dhp.common.collection.HttpClientParams;
|
||||
import eu.dnetlib.dhp.swh.utils.SWHConnection;
|
||||
import eu.dnetlib.dhp.swh.utils.SWHConstants;
|
||||
import eu.dnetlib.dhp.swh.utils.SWHUtils;
|
||||
import org.apache.commons.cli.ParseException;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.fs.FileStatus;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.io.SequenceFile;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.net.URL;
|
||||
|
||||
import static eu.dnetlib.dhp.utils.DHPUtils.getHadoopConfiguration;
|
||||
|
||||
/**
|
||||
* Given a file with software repository URLs, this class
|
||||
|
|
|
@ -51,6 +51,9 @@ public class CollectSoftwareRepositoryURLs {
|
|||
final String hiveMetastoreUris = parser.get("hiveMetastoreUris");
|
||||
log.info("hiveMetastoreUris: {}", hiveMetastoreUris);
|
||||
|
||||
// softwareLimit is an optional argument and doRun() treats a null limit as "no LIMIT clause",
// so a missing or blank value must map to null instead of reaching Integer.parseInt, which
// throws NumberFormatException on null input.
// NOTE(review): assumes parser.get() returns null for an argument that was not supplied — confirm.
final String softwareLimitArg = parser.get("softwareLimit");
final Integer softwareLimit = (softwareLimitArg == null || softwareLimitArg.trim().isEmpty())
? null
: Integer.valueOf(softwareLimitArg.trim());
|
||||
log.info("softwareLimit: {}", softwareLimit);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
conf.set("hive.metastore.uris", hiveMetastoreUris);
|
||||
|
||||
|
@ -58,18 +61,23 @@ public class CollectSoftwareRepositoryURLs {
|
|||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
doRun(spark, hiveDbName, outputPath);
|
||||
doRun(spark, hiveDbName, softwareLimit, outputPath);
|
||||
});
|
||||
}
|
||||
|
||||
private static <I extends Result> void doRun(SparkSession spark, String hiveDbName, String outputPath) {
|
||||
private static <I extends Result> void doRun(SparkSession spark, String hiveDbName, Integer limit,
|
||||
String outputPath) {
|
||||
|
||||
String queryTemplate = "SELECT distinct coderepositoryurl.value " +
|
||||
"FROM %s.software " +
|
||||
"WHERE coderepositoryurl.value IS NOT NULL " +
|
||||
"AND datainfo.deletedbyinference = FALSE " +
|
||||
"AND datainfo.invisible = FALSE " +
|
||||
"LIMIT 5000";
|
||||
"AND datainfo.invisible = FALSE ";
|
||||
|
||||
if (limit != null) {
|
||||
queryTemplate += String.format("LIMIT %s", limit);
|
||||
}
|
||||
|
||||
String query = String.format(queryTemplate, hiveDbName);
|
||||
|
||||
log.info("Hive query to fetch software code URLs: {}", query);
|
||||
|
|
|
@ -82,7 +82,6 @@ public class PrepareSWHActionsets {
|
|||
softwareRDD
|
||||
.saveAsHadoopFile(
|
||||
outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
|
||||
// , GzipCodec.class);
|
||||
});
|
||||
}
|
||||
|
||||
|
|
|
@ -22,5 +22,11 @@
|
|||
"paramLongName": "hiveMetastoreUris",
|
||||
"paramDescription": "the hive metastore uris",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "slim",
|
||||
"paramLongName": "softwareLimit",
|
||||
"paramDescription": "limit on the number of software repo URLs to fetch",
|
||||
"paramRequired": false
|
||||
}
|
||||
]
|
|
@ -14,4 +14,6 @@ maxNumberOfRetry=2
|
|||
retryDelay=1
|
||||
requestDelay=100
|
||||
|
||||
softwareLimit=500
|
||||
|
||||
resume=collect-software-repository-urls
|
||||
|
|
|
@ -83,6 +83,7 @@
|
|||
<arg>--softwareCodeRepositoryURLs</arg><arg>${softwareCodeRepositoryURLs}</arg>
|
||||
<arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
|
||||
<arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
|
||||
<arg>--softwareLimit</arg><arg>${softwareLimit}</arg>
|
||||
</spark>
|
||||
<ok to="collect-repository-last-visit-data"/>
|
||||
<error to="Kill"/>
|
||||
|
|
Loading…
Reference in New Issue