package eu.dnetlib.dhp.swh;

import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;

import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.Optional;

import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Result;

/**
 * Collects unique software repository URLs in the Graph using Hive
 *
 * @author Serafeim Chatzopoulos
 */
public class CollectSoftwareRepositoryURLs {

	private static final Logger log = LoggerFactory.getLogger(CollectSoftwareRepositoryURLs.class);

	/**
	 * Job entry point: parses the command-line arguments described by the bundled JSON
	 * parameter specification, configures the Hive metastore URI on the Spark configuration
	 * and runs the URL-collection query.
	 *
	 * @param args command-line arguments, parsed by {@link ArgumentApplicationParser}
	 * @throws Exception if the parameter specification cannot be read, the arguments cannot
	 *         be parsed, or the Spark job fails
	 */
	public static void main(String[] args) throws Exception {
		// Decode the bundled parameter specification explicitly as UTF-8 (rather than with the
		// platform default charset) and close the resource stream once it has been read.
		final String jsonConfiguration;
		try (InputStream specStream = CollectSoftwareRepositoryURLs.class
			.getResourceAsStream("/eu/dnetlib/dhp/swh/input_collect_software_repository_urls.json")) {
			jsonConfiguration = IOUtils.toString(specStream, StandardCharsets.UTF_8);
		}

		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
		parser.parseArgument(args);

		final Boolean isSparkSessionManaged = Optional
			.ofNullable(parser.get("isSparkSessionManaged"))
			.map(Boolean::valueOf)
			.orElse(Boolean.TRUE);
		log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

		final String hiveDbName = parser.get("hiveDbName");
		log.info("hiveDbName: {}", hiveDbName);

		final String outputPath = parser.get("softwareCodeRepositoryURLs");
		log.info("softwareCodeRepositoryURLs: {}", outputPath);

		final String hiveMetastoreUris = parser.get("hiveMetastoreUris");
		log.info("hiveMetastoreUris: {}", hiveMetastoreUris);

		// The limit is optional: doRun() only appends a LIMIT clause when it is non-null.
		// Integer.parseInt(null) would throw NumberFormatException, so a missing value is
		// mapped to null instead (presumably parser.get returns null for an argument that
		// was not supplied, as the isSparkSessionManaged handling above also assumes).
		final Integer softwareLimit = Optional
			.ofNullable(parser.get("softwareLimit"))
			.map(Integer::valueOf)
			.orElse(null);
		log.info("softwareLimit: {}", softwareLimit);

		SparkConf conf = new SparkConf();
		conf.set("hive.metastore.uris", hiveMetastoreUris);

		runWithSparkHiveSession(
			conf,
			isSparkSessionManaged,
			spark -> doRun(spark, hiveDbName, softwareLimit, outputPath));
	}

	/**
	 * Runs the Hive query selecting the distinct code repository URLs and writes the result
	 * as CSV to {@code outputPath}, overwriting any existing output.
	 *
	 * @param spark      the Spark session used to execute the query
	 * @param hiveDbName name of the Hive database containing the {@code software} table
	 * @param limit      maximum number of rows to return, or {@code null} for no limit
	 * @param outputPath destination path of the CSV output
	 */
	private static void doRun(SparkSession spark, String hiveDbName, Integer limit, String outputPath) {
		final String query = buildQuery(hiveDbName, limit);
		log.info("Hive query to fetch software code URLs: {}", query);

		final Dataset<Row> df = spark.sql(query);

		// write distinct repository URLs
		df
			.write()
			.mode(SaveMode.Overwrite)
			.csv(outputPath);
	}

	/**
	 * Builds the query that selects distinct, non-null code repository URLs of software
	 * records that are neither deleted by inference nor invisible.
	 *
	 * @param hiveDbName name of the Hive database containing the {@code software} table
	 * @param limit      maximum number of rows to return, or {@code null} for no limit
	 * @return the query text
	 */
	private static String buildQuery(String hiveDbName, Integer limit) {
		final String baseQuery = String
			.format(
				"SELECT distinct coderepositoryurl.value "
					+ "FROM %s.software "
					+ "WHERE coderepositoryurl.value IS NOT NULL "
					+ "AND datainfo.deletedbyinference = FALSE "
					+ "AND datainfo.invisible = FALSE ",
				hiveDbName);

		if (limit == null) {
			return baseQuery;
		}
		return baseQuery + "LIMIT " + limit;
	}
}