package eu.dnetlib.dhp.swh;

import static eu.dnetlib.dhp.utils.DHPUtils.getHadoopConfiguration;

import java.io.BufferedReader;
import java.io.IOException;
import java.net.URL;

import org.apache.commons.cli.ParseException;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.collection.CollectorException;
import eu.dnetlib.dhp.common.collection.HttpClientParams;
import eu.dnetlib.dhp.swh.utils.SWHConnection;
import eu.dnetlib.dhp.swh.utils.SWHConstants;
import eu.dnetlib.dhp.swh.utils.SWHUtils;

/**
 * Given a file with software repository URLs, this class
 * collects last visit data from the Software Heritage API.
 *
 * @author Serafeim Chatzopoulos
 */
public class CollectLastVisitRepositoryData {

	private static final Logger log = LoggerFactory.getLogger(CollectLastVisitRepositoryData.class);

	private static SWHConnection swhConnection = null;

	public static void main(final String[] args) throws IOException, ParseException {
		final ArgumentApplicationParser argumentParser = new ArgumentApplicationParser(
			IOUtils
				.toString(
					CollectLastVisitRepositoryData.class
						.getResourceAsStream(
							"/eu/dnetlib/dhp/swh/input_collect_last_visit_repository_data.json")));
		argumentParser.parseArgument(args);

		log.info("Java Xmx: {}m", Runtime.getRuntime().maxMemory() / (1024 * 1024));

		final String hdfsuri = argumentParser.get("namenode");
		log.info("hdfsURI: {}", hdfsuri);

		final String inputPath = argumentParser.get("softwareCodeRepositoryURLs");
		log.info("inputPath: {}", inputPath);

		final String outputPath = argumentParser.get("lastVisitsPath");
		log.info("outputPath: {}", outputPath);

		final String apiAccessToken = argumentParser.get("apiAccessToken");
		log.info("apiAccessToken: {}", apiAccessToken);

		final HttpClientParams clientParams = SWHUtils.getClientParams(argumentParser);

		swhConnection = new SWHConnection(clientParams, apiAccessToken);

		final FileSystem fs = FileSystem.get(getHadoopConfiguration(hdfsuri));

		collect(fs, inputPath, outputPath);

		fs.close();
	}

	private static void collect(FileSystem fs, String inputPath, String outputPath) throws IOException {

		SequenceFile.Writer fw = SWHUtils.getSequenceFileWriter(fs, outputPath);

		// Specify the HDFS directory path you want to read
		Path directoryPath = new Path(inputPath);

		// List all files in the directory
		FileStatus[] partStatuses = fs.listStatus(directoryPath);

		for (FileStatus partStatus : partStatuses) {

			// Check if it's a file (not a directory)
			if (partStatus.isFile()) {
				handleFile(fs, partStatus.getPath(), fw);
			}
		}

		fw.close();
	}

	private static void handleFile(FileSystem fs, Path partInputPath, SequenceFile.Writer fw)
		throws IOException {

		BufferedReader br = SWHUtils.getFileReader(fs, partInputPath);

		String repoUrl;
		while ((repoUrl = br.readLine()) != null) {

			URL url = new URL(String.format(SWHConstants.SWH_LATEST_VISIT_URL, repoUrl.trim()));

			String response;
			try {
				response = swhConnection.call(url.toString());
			} catch (CollectorException e) {
				log.error("Error in request: {}", url);
				response = "{}";
			}

			SWHUtils.appendToSequenceFile(fw, repoUrl, response);
		}

		br.close();
	}
}
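
// Usage sketch (illustrative only, not part of the class): the long option names
// below are inferred from the keys passed to argumentParser.get(...) above; the
// actual flags are defined in input_collect_last_visit_repository_data.json, and
// the namenode URI, paths and token shown here are placeholders.
//
//   java eu.dnetlib.dhp.swh.CollectLastVisitRepositoryData \
//       --namenode hdfs://<namenode-host>:8020 \
//       --softwareCodeRepositoryURLs /path/to/software_repo_urls \
//       --lastVisitsPath /path/to/last_visits \
//       --apiAccessToken <SWH_API_TOKEN>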