2020-06-25 18:43:29 +02:00
|
|
|
|
|
|
|
package eu.dnetlib.doiboost.orcidnodoi;
|
|
|
|
|
2020-06-26 17:27:34 +02:00
|
|
|
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URI;
import java.nio.charset.StandardCharsets;

import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.mortbay.log.Log;

import eu.dnetlib.dhp.schema.orcid.WorkDetail;
import eu.dnetlib.doiboost.orcid.json.JsonHelper;
import eu.dnetlib.doiboost.orcidnodoi.xml.XMLRecordParserNoDoi;
|
2020-06-25 18:43:29 +02:00
|
|
|
|
2020-09-15 11:32:49 +02:00
|
|
|
/**
 * Writes to HDFS a single sequence file whose key is an ORCID identifier and whose
 * value is an ORCID publication in JSON format.
 */
|
|
|
|
|
2020-06-25 18:43:29 +02:00
|
|
|
public class ActivitiesDumpReader {
|
|
|
|
|
2020-07-03 23:30:31 +02:00
|
|
|
private static final int MAX_XML_WORKS_PARSED = -1;
|
|
|
|
private static final int XML_WORKS_PARSED_COUNTER_LOG_INTERVAL = 100000;
|
2020-06-25 18:43:29 +02:00
|
|
|
|
2021-08-11 12:13:22 +02:00
|
|
|
private ActivitiesDumpReader() {
|
|
|
|
}
|
|
|
|
|
2020-06-25 18:43:29 +02:00
|
|
|
public static void parseGzActivities(Configuration conf, String inputUri, Path outputPath)
|
|
|
|
throws Exception {
|
|
|
|
String uri = inputUri;
|
|
|
|
FileSystem fs = FileSystem.get(URI.create(uri), conf);
|
|
|
|
Path inputPath = new Path(uri);
|
|
|
|
CompressionCodecFactory factory = new CompressionCodecFactory(conf);
|
|
|
|
CompressionCodec codec = factory.getCodec(inputPath);
|
|
|
|
if (codec == null) {
|
|
|
|
System.err.println("No codec found for " + uri);
|
|
|
|
System.exit(1);
|
|
|
|
}
|
|
|
|
CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());
|
|
|
|
InputStream gzipInputStream = null;
|
|
|
|
try {
|
|
|
|
gzipInputStream = codec.createInputStream(fs.open(inputPath));
|
2021-08-11 12:13:22 +02:00
|
|
|
parseTarActivities(conf, gzipInputStream, outputPath);
|
2020-06-25 18:43:29 +02:00
|
|
|
|
|
|
|
} finally {
|
|
|
|
Log.debug("Closing gzip stream");
|
|
|
|
IOUtils.closeStream(gzipInputStream);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-08-11 12:13:22 +02:00
|
|
|
private static void parseTarActivities(Configuration conf, InputStream gzipInputStream, Path outputPath) {
|
2020-06-25 18:43:29 +02:00
|
|
|
int counter = 0;
|
|
|
|
int noDoiFound = 0;
|
|
|
|
int errorFromOrcidFound = 0;
|
|
|
|
int xmlParserErrorFound = 0;
|
|
|
|
try (TarArchiveInputStream tais = new TarArchiveInputStream(gzipInputStream)) {
|
|
|
|
TarArchiveEntry entry = null;
|
|
|
|
|
|
|
|
try (SequenceFile.Writer writer = SequenceFile
|
|
|
|
.createWriter(
|
|
|
|
conf,
|
|
|
|
SequenceFile.Writer.file(outputPath),
|
|
|
|
SequenceFile.Writer.keyClass(Text.class),
|
|
|
|
SequenceFile.Writer.valueClass(Text.class))) {
|
|
|
|
while ((entry = tais.getNextTarEntry()) != null) {
|
|
|
|
String filename = entry.getName();
|
2021-08-11 12:13:22 +02:00
|
|
|
StringBuilder builder = new StringBuilder();
|
2020-06-25 18:43:29 +02:00
|
|
|
try {
|
|
|
|
if (entry.isDirectory() || !filename.contains("works")) {
|
|
|
|
|
|
|
|
} else {
|
|
|
|
Log.debug("XML work entry name: " + entry.getName());
|
|
|
|
counter++;
|
|
|
|
BufferedReader br = new BufferedReader(new InputStreamReader(tais)); // Read directly from
|
|
|
|
// tarInput
|
|
|
|
String line;
|
2021-08-11 12:13:22 +02:00
|
|
|
builder = new StringBuilder();
|
2020-06-25 18:43:29 +02:00
|
|
|
while ((line = br.readLine()) != null) {
|
2021-08-11 12:13:22 +02:00
|
|
|
builder.append(line);
|
2020-06-25 18:43:29 +02:00
|
|
|
}
|
2021-04-01 17:07:49 +02:00
|
|
|
WorkDetail workDetail = XMLRecordParserNoDoi
|
2021-08-11 12:13:22 +02:00
|
|
|
.VTDParseWorkData(builder.toString().getBytes());
|
2021-04-01 17:07:49 +02:00
|
|
|
if (workDetail != null) {
|
|
|
|
if (workDetail.getErrorCode() != null) {
|
2020-06-25 18:43:29 +02:00
|
|
|
errorFromOrcidFound += 1;
|
|
|
|
Log
|
|
|
|
.debug(
|
|
|
|
"error from Orcid with code "
|
2021-04-01 17:07:49 +02:00
|
|
|
+ workDetail.getErrorCode()
|
2020-06-25 18:43:29 +02:00
|
|
|
+ " for entry "
|
|
|
|
+ entry.getName());
|
|
|
|
continue;
|
|
|
|
}
|
2021-04-01 17:07:49 +02:00
|
|
|
boolean isDoiFound = workDetail
|
2020-06-26 17:27:34 +02:00
|
|
|
.getExtIds()
|
|
|
|
.stream()
|
|
|
|
.filter(e -> e.getType() != null)
|
|
|
|
.anyMatch(e -> e.getType().equals("doi"));
|
2020-06-25 18:43:29 +02:00
|
|
|
if (!isDoiFound) {
|
2021-04-01 17:07:49 +02:00
|
|
|
String jsonData = JsonHelper.createOidWork(workDetail);
|
|
|
|
Log.debug("oid: " + workDetail.getOid() + " data: " + jsonData);
|
2020-06-25 18:43:29 +02:00
|
|
|
|
2021-04-01 17:07:49 +02:00
|
|
|
final Text key = new Text(workDetail.getOid());
|
2020-06-25 18:43:29 +02:00
|
|
|
final Text value = new Text(jsonData);
|
|
|
|
|
|
|
|
try {
|
|
|
|
writer.append(key, value);
|
|
|
|
} catch (IOException e) {
|
|
|
|
Log.debug("Writing to sequence file: " + e.getMessage());
|
|
|
|
Log.debug(e);
|
|
|
|
throw new RuntimeException(e);
|
|
|
|
}
|
|
|
|
noDoiFound += 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
} else {
|
2021-08-11 12:13:22 +02:00
|
|
|
Log.warn("Data not retrievable [" + entry.getName() + "] " + builder);
|
2020-06-25 18:43:29 +02:00
|
|
|
xmlParserErrorFound += 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} catch (Exception e) {
|
2020-10-22 14:02:32 +02:00
|
|
|
throw new Exception(filename, e);
|
2020-06-25 18:43:29 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
if ((counter % XML_WORKS_PARSED_COUNTER_LOG_INTERVAL) == 0) {
|
|
|
|
Log.info("Current xml works parsed: " + counter);
|
|
|
|
}
|
|
|
|
|
|
|
|
if ((MAX_XML_WORKS_PARSED > -1) && (counter > MAX_XML_WORKS_PARSED)) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2020-10-22 14:02:32 +02:00
|
|
|
} catch (Exception e) {
|
2020-06-25 18:43:29 +02:00
|
|
|
Log.warn("Parsing work from gzip archive: " + e.getMessage());
|
|
|
|
Log.warn(e);
|
|
|
|
throw new RuntimeException(e);
|
|
|
|
}
|
|
|
|
Log.info("Activities parse completed");
|
|
|
|
Log.info("Total XML works parsed: " + counter);
|
|
|
|
Log.info("Total no doi work found: " + noDoiFound);
|
|
|
|
Log.info("Error from Orcid found: " + errorFromOrcidFound);
|
|
|
|
Log.info("Error parsing xml work found: " + xmlParserErrorFound);
|
|
|
|
}
|
|
|
|
}
|