forked from D-Net/dnet-hadoop
converter input is now read from a text file on HDFS whose rows are Base64-encoded, gzip-compressed nquads
This commit is contained in:
parent
9a0ca0296a
commit
0703e0c65f
|
@ -2,6 +2,7 @@
|
|||
package eu.dnetlib.dhp.bmuse.bioschema;
|
||||
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.stream.Collectors;
|
||||
|
@ -87,7 +88,7 @@ public class ScrapingJob {
|
|||
final Text value = new Text(nquads);
|
||||
writer.append(key, value);
|
||||
} catch (Throwable t) {
|
||||
logger.error(u.text() + " " + t.getMessage());
|
||||
logger.error(u.text() + " -> ", t);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
|
|
@ -7,6 +7,10 @@
|
|||
<name>nameNode</name>
|
||||
<value>hdfs://hadoop-rm1.garr-pa1.d4science.org:8020</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
|
|
|
@ -2,12 +2,12 @@
|
|||
<parameters>
|
||||
<property>
|
||||
<name>workingPath</name>
|
||||
<value>/data/bioschema/ped/</value>
|
||||
<value>/data/bioschema/mobidb/</value>
|
||||
<description>the working path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sitemapUrl</name>
|
||||
<value>https://proteinensemble.org/sitemap2.xml.gz</value>
|
||||
<value>https://mobidb.org/sitemap2.xml.gz</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>sitemapURLKey</name>
|
||||
|
@ -20,7 +20,7 @@
|
|||
</property>
|
||||
<property>
|
||||
<name>maxScrapedPages</name>
|
||||
<value>10</value>
|
||||
<value>5</value>
|
||||
<description>max number of pages that will be scraped, default: no limit</description>
|
||||
</property>
|
||||
<property>
|
||||
|
@ -71,6 +71,7 @@
|
|||
<arg>--sitemapUrl</arg><arg>${sitemapUrl}</arg>
|
||||
<arg>--sitemapURLKey</arg><arg>${sitemapURLKey}</arg>
|
||||
<arg>--dynamic</arg><arg>${dynamic}</arg>
|
||||
<arg>--maxScrapedPages</arg><arg>${maxScrapedPages}</arg>
|
||||
</java>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
|
|
|
@ -19,7 +19,9 @@ import org.slf4j.Logger;
|
|||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.rdfconverter.utils.CompressorUtil;
|
||||
import eu.dnetlib.dhp.rdfconverter.utils.RDFConverter;
|
||||
import ucar.nc2.stream.NcStreamProto;
|
||||
|
||||
public class SparkRdfToDatacite {
|
||||
|
||||
|
@ -49,22 +51,30 @@ public class SparkRdfToDatacite {
|
|||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
String rdfNquadsRecordsPath = workingPath.concat(rdfNquadsRecords);
|
||||
JavaPairRDD<Text, Text> rdfNquadsRecordsRDD = sc
|
||||
.sequenceFile(rdfNquadsRecordsPath, Text.class, Text.class);
|
||||
logger.info("Rdf nquads records retrieved: {}", rdfNquadsRecordsRDD.count());
|
||||
String base64GzippedNquadsPath = workingPath.concat(rdfNquadsRecords);
|
||||
JavaRDD<String> base64GzippedNquadsRDD = sc
|
||||
.textFile(base64GzippedNquadsPath);
|
||||
logger.info("Rdf nquads records retrieved: {}", base64GzippedNquadsRDD.count());
|
||||
|
||||
JavaRDD<Text> proteins = rdfNquadsRecordsRDD.flatMap(nquads -> {
|
||||
JavaRDD<String> proteins2 = base64GzippedNquadsRDD
|
||||
.flatMap(nquads -> {
|
||||
RDFConverter converter = new RDFConverter();
|
||||
ArrayList<String> jsonlds = null;
|
||||
try {
|
||||
jsonlds = converter.nQuadsFile2DataciteJson(nquads._2().toString(), profile);
|
||||
jsonlds = converter
|
||||
.nQuadsFile2DataciteJson(CompressorUtil.decompressValue(nquads), profile);
|
||||
} catch (Exception e) {
|
||||
logger.error(nquads._1().toString(), e);
|
||||
logger.error("converting: " + nquads, e);
|
||||
return Arrays.asList(new String()).iterator();
|
||||
}
|
||||
return jsonlds.iterator();
|
||||
}).filter(Objects::nonNull).filter(jsonld -> !jsonld.isEmpty()).map(jsonld -> new Text(jsonld));
|
||||
});
|
||||
logger.info("json datacite non filtered: {}", proteins2.count());
|
||||
JavaRDD<Text> proteins = proteins2
|
||||
.filter(Objects::nonNull)
|
||||
.filter(jsonld -> !jsonld.isEmpty())
|
||||
.distinct()
|
||||
.map(jsonld -> new Text(jsonld));
|
||||
logger.info("json datacite generated: {}", proteins.count());
|
||||
proteins.saveAsTextFile(workingPath.concat(output), GzipCodec.class);
|
||||
});
|
||||
|
|
|
@ -0,0 +1,35 @@
|
|||
|
||||
package eu.dnetlib.dhp.rdfconverter.utils;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.StringWriter;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
import java.util.zip.GZIPOutputStream;
|
||||
|
||||
import org.apache.commons.codec.binary.Base64;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
|
||||
/**
 * Utility for converting between plain strings and their Base64-encoded,
 * gzip-compressed representation, as exchanged through HDFS text files
 * by the scraping and converter workflows.
 */
public class CompressorUtil {

	/**
	 * Decodes a Base64 string and decompresses the resulting gzip payload.
	 *
	 * @param abstractCompressed Base64 text (line separators tolerated) wrapping gzip-compressed bytes
	 * @return the decompressed content, decoded as UTF-8
	 * @throws IllegalArgumentException if the input is not valid Base64 or not valid gzip data
	 */
	public static String decompressValue(final String abstractCompressed) {
		// MIME decoder accepts standard Base64 with or without embedded line breaks,
		// matching the lenient behaviour of the commons-codec decoder it replaces.
		byte[] byteArray = java.util.Base64.getMimeDecoder().decode(abstractCompressed);
		// try-with-resources: the original leaked the GZIPInputStream on every call
		try (GZIPInputStream gis = new GZIPInputStream(new ByteArrayInputStream(byteArray))) {
			ByteArrayOutputStream out = new ByteArrayOutputStream();
			byte[] buffer = new byte[8192];
			int read;
			while ((read = gis.read(buffer)) != -1) {
				out.write(buffer, 0, read);
			}
			// explicit UTF-8: the original relied on the platform default charset
			return new String(out.toByteArray(), java.nio.charset.StandardCharsets.UTF_8);
		} catch (IOException e) {
			throw new IllegalArgumentException(e);
		}
	}

	/**
	 * Gzip-compresses a string and encodes the compressed bytes as standard Base64.
	 *
	 * @param value plain text to compress; encoded as UTF-8 before compression
	 * @return Base64 representation of the gzip-compressed input
	 * @throws IOException if compression fails
	 */
	public static String compressValue(final String value) throws IOException {
		ByteArrayOutputStream out = new ByteArrayOutputStream();
		// try-with-resources guarantees the gzip trailer is flushed and the
		// stream is closed even if write() throws (the original leaked it then)
		try (GZIPOutputStream gzip = new GZIPOutputStream(out)) {
			gzip.write(value.getBytes(java.nio.charset.StandardCharsets.UTF_8));
		}
		return java.util.Base64.getEncoder().encodeToString(out.toByteArray());
	}
}
|
|
@ -270,6 +270,9 @@ public class RDFConverter {
|
|||
}
|
||||
results.add(writer.toString());
|
||||
});
|
||||
if (dataciteProteins.isEmpty()) {
|
||||
log.error("No Protein data found: " + nquads);
|
||||
}
|
||||
return results;
|
||||
}
|
||||
|
||||
|
|
|
@ -7,7 +7,7 @@
|
|||
</property>
|
||||
<property>
|
||||
<name>rdfInput</name>
|
||||
<value>nquads.seq</value>
|
||||
<value>base64_gzipped_nquads.txt</value>
|
||||
<description>rdf output of scraping workflow</description>
|
||||
</property>
|
||||
<property>
|
||||
|
|
|
@ -9,6 +9,7 @@ import org.junit.jupiter.api.Test;
|
|||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.rdfconverter.utils.CompressorUtil;
|
||||
import eu.dnetlib.dhp.rdfconverter.utils.RDFConverter;
|
||||
|
||||
public class ConverterTest {
|
||||
|
@ -47,4 +48,17 @@ public class ConverterTest {
|
|||
logger.info("JSON DATACITE >> " + r);
|
||||
});
|
||||
}
|
||||
|
||||
@Test
|
||||
public void decompressTest() throws Exception {
|
||||
InputStream is = ConverterTest.class
|
||||
.getResourceAsStream("/eu/dnetlib/dhp/rdfconverter/bioschema/base64_gzipped_nquads.txt");
|
||||
String base64_gzipped_nquads = IOUtils.toString(is);
|
||||
String nq = CompressorUtil.decompressValue(base64_gzipped_nquads);
|
||||
RDFConverter converter = new RDFConverter();
|
||||
ArrayList<String> results = converter.nQuadsFile2DataciteJson(nq, "Protein");
|
||||
results.stream().forEach(r -> {
|
||||
logger.info("JSON DATACITE >> " + r);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
H4sIAAAAAAAAANVY23KjOBB9369wOc+2uNhJ7EqlipiLVQgGkBbGL7OFQbZVayMv4Hj89yOMk2GSSc3O+FLJG0Kt0+d0S92Cu0VZroshAFPGi2RBV3HR5fkcJHm8XYJHGaz4lKVT4Em9a1UGiqQoUk9WgXTfuquWipXrTb7cr1nHjyCnZc7oI03NnK8ONhV8DbM3q6HuW92/7s7l/FN232pX5h2p15FVIitDRR325faXL08Lt9ttd6vulyqSJIPPDsJ7CldpXFLCVvQcDJOcCvQ0YuWiEZy5GG6m3YSvwDjq4Igi8OD8jQ0haEnjghagjOdA6va7yg+kGkH1ZUW9bXhtyJMHgwGQFCCikaezTrHLyvhrJyuuyt2aNljU+uoU5bykLGtM/r7+/8fzOTppAkqarwqQ8GzGxQPhb7tf53zGliIuB6IiNrLcCQxkaNg4N+sXsVrExQPjHl/uVjTH9L8NzRIR1rbj6tD2EYpcy0BuiEIMdQ+byLEt7AUG1EIUwCAUj2SCQnfieAhGrg1tYYNN3UNEx4Y9MSPTGLs28aAXuRMvtEJk6iOkWTbWiEkINg3XD209RMQUgGMUOpaOkecRCD2oYWwTXzNsATP2I9sPRyZEriVea9DBCIeuoQVi1sIhJIFr+roJDYgFD4xx4AVjKGhBn+CAYIuYAcS+QMYOsULXI8jWoWHpEOmhbVjQtVw9dEI48i0B4kVjB6GJjibYRoGQZJloAgkKPWhblm4LbgZqXzZZLKVZyWaM5iJDT/PfX9aea6xhjXNpglmy3KQ0hZkel+Lolw2eDYgr4UXqSIMLs8viqiy2tU3J14t4vmut6/PX6l+YSCGIaMWLMrLJWMVnb3B4fg1yCXaivvKMr1gSxNmcHuH8ddsEfeXmWr3t9W5OpekPSuvR/WVUtUH2SCOe/3smHaeI7dE6dTpjGU2JaGzvWOarEtSgjasKdErn8uCmr6rXN/3bjxORX1abw4HfgX5/oAyUD6rsZ73wSdnwQyt7vmhO+ZJN8zjf7W3EELijB0gqkf98NIXVdXnE06oh19RP1YPPT13k4kVmhE+elXzJ54fUHAZ1frTPn9z3d7ZelbJT9ovjCu/Zpf78Ylglq7UvGe9oL/6K+/fNeHC45nkZv7klmbgbPW/KM1wsn36PvP2FXLJyuf/CFIv0hyMj/Q0PBaE5ARIAAA==
|
Loading…
Reference in New Issue