forked from D-Net/dnet-hadoop
Add param for limiting repo URLs
This commit is contained in:
parent
839a8524e7
commit
9f73d93e62
|
@ -1,32 +1,26 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.swh;
|
package eu.dnetlib.dhp.swh;
|
||||||
|
|
||||||
import static eu.dnetlib.dhp.utils.DHPUtils.getHadoopConfiguration;
|
|
||||||
|
|
||||||
import java.io.BufferedReader;
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.InputStreamReader;
|
|
||||||
import java.net.URISyntaxException;
|
|
||||||
import java.net.URL;
|
|
||||||
import java.nio.charset.StandardCharsets;
|
|
||||||
|
|
||||||
import org.apache.commons.cli.ParseException;
|
|
||||||
import org.apache.commons.io.IOUtils;
|
|
||||||
import org.apache.hadoop.fs.FSDataInputStream;
|
|
||||||
import org.apache.hadoop.fs.FileStatus;
|
|
||||||
import org.apache.hadoop.fs.FileSystem;
|
|
||||||
import org.apache.hadoop.fs.Path;
|
|
||||||
import org.apache.hadoop.io.SequenceFile;
|
|
||||||
import org.apache.hadoop.io.Text;
|
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.common.collection.CollectorException;
|
import eu.dnetlib.dhp.common.collection.CollectorException;
|
||||||
import eu.dnetlib.dhp.common.collection.HttpClientParams;
|
import eu.dnetlib.dhp.common.collection.HttpClientParams;
|
||||||
import eu.dnetlib.dhp.swh.utils.SWHConnection;
|
import eu.dnetlib.dhp.swh.utils.SWHConnection;
|
||||||
import eu.dnetlib.dhp.swh.utils.SWHConstants;
|
import eu.dnetlib.dhp.swh.utils.SWHConstants;
|
||||||
import eu.dnetlib.dhp.swh.utils.SWHUtils;
|
import eu.dnetlib.dhp.swh.utils.SWHUtils;
|
||||||
|
import org.apache.commons.cli.ParseException;
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.hadoop.fs.FileStatus;
|
||||||
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
|
import org.apache.hadoop.fs.Path;
|
||||||
|
import org.apache.hadoop.io.SequenceFile;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.io.BufferedReader;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.net.URL;
|
||||||
|
|
||||||
|
import static eu.dnetlib.dhp.utils.DHPUtils.getHadoopConfiguration;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Given a file with software repository URLs, this class
|
* Given a file with software repository URLs, this class
|
||||||
|
|
|
@ -51,6 +51,9 @@ public class CollectSoftwareRepositoryURLs {
|
||||||
final String hiveMetastoreUris = parser.get("hiveMetastoreUris");
|
final String hiveMetastoreUris = parser.get("hiveMetastoreUris");
|
||||||
log.info("hiveMetastoreUris: {}", hiveMetastoreUris);
|
log.info("hiveMetastoreUris: {}", hiveMetastoreUris);
|
||||||
|
|
||||||
|
final Integer softwareLimit = Integer.parseInt(parser.get("softwareLimit"));
|
||||||
|
log.info("softwareLimit: {}", softwareLimit);
|
||||||
|
|
||||||
SparkConf conf = new SparkConf();
|
SparkConf conf = new SparkConf();
|
||||||
conf.set("hive.metastore.uris", hiveMetastoreUris);
|
conf.set("hive.metastore.uris", hiveMetastoreUris);
|
||||||
|
|
||||||
|
@ -58,18 +61,23 @@ public class CollectSoftwareRepositoryURLs {
|
||||||
conf,
|
conf,
|
||||||
isSparkSessionManaged,
|
isSparkSessionManaged,
|
||||||
spark -> {
|
spark -> {
|
||||||
doRun(spark, hiveDbName, outputPath);
|
doRun(spark, hiveDbName, softwareLimit, outputPath);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
private static <I extends Result> void doRun(SparkSession spark, String hiveDbName, String outputPath) {
|
private static <I extends Result> void doRun(SparkSession spark, String hiveDbName, Integer limit,
|
||||||
|
String outputPath) {
|
||||||
|
|
||||||
String queryTemplate = "SELECT distinct coderepositoryurl.value " +
|
String queryTemplate = "SELECT distinct coderepositoryurl.value " +
|
||||||
"FROM %s.software " +
|
"FROM %s.software " +
|
||||||
"WHERE coderepositoryurl.value IS NOT NULL " +
|
"WHERE coderepositoryurl.value IS NOT NULL " +
|
||||||
"AND datainfo.deletedbyinference = FALSE " +
|
"AND datainfo.deletedbyinference = FALSE " +
|
||||||
"AND datainfo.invisible = FALSE " +
|
"AND datainfo.invisible = FALSE ";
|
||||||
"LIMIT 5000";
|
|
||||||
|
if (limit != null) {
|
||||||
|
queryTemplate += String.format("LIMIT %s", limit);
|
||||||
|
}
|
||||||
|
|
||||||
String query = String.format(queryTemplate, hiveDbName);
|
String query = String.format(queryTemplate, hiveDbName);
|
||||||
|
|
||||||
log.info("Hive query to fetch software code URLs: {}", query);
|
log.info("Hive query to fetch software code URLs: {}", query);
|
||||||
|
|
|
@ -82,7 +82,6 @@ public class PrepareSWHActionsets {
|
||||||
softwareRDD
|
softwareRDD
|
||||||
.saveAsHadoopFile(
|
.saveAsHadoopFile(
|
||||||
outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
|
outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
|
||||||
// , GzipCodec.class);
|
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -22,5 +22,11 @@
|
||||||
"paramLongName": "hiveMetastoreUris",
|
"paramLongName": "hiveMetastoreUris",
|
||||||
"paramDescription": "the hive metastore uris",
|
"paramDescription": "the hive metastore uris",
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "slim",
|
||||||
|
"paramLongName": "softwareLimit",
|
||||||
|
"paramDescription": "limit on the number of software repo URL to fetch",
|
||||||
|
"paramRequired": false
|
||||||
}
|
}
|
||||||
]
|
]
|
|
@ -14,4 +14,6 @@ maxNumberOfRetry=2
|
||||||
retryDelay=1
|
retryDelay=1
|
||||||
requestDelay=100
|
requestDelay=100
|
||||||
|
|
||||||
|
softwareLimit=500
|
||||||
|
|
||||||
resume=collect-software-repository-urls
|
resume=collect-software-repository-urls
|
||||||
|
|
|
@ -83,6 +83,7 @@
|
||||||
<arg>--softwareCodeRepositoryURLs</arg><arg>${softwareCodeRepositoryURLs}</arg>
|
<arg>--softwareCodeRepositoryURLs</arg><arg>${softwareCodeRepositoryURLs}</arg>
|
||||||
<arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
|
<arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
|
||||||
<arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
|
<arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
|
||||||
|
<arg>--softwareLimit</arg><arg>${softwareLimit}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="collect-repository-last-visit-data"/>
|
<ok to="collect-repository-last-visit-data"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
|
Loading…
Reference in New Issue