2020-06-25 18:43:29 +02:00
|
|
|
|
|
|
|
package eu.dnetlib.doiboost.orcidnodoi;
|
|
|
|
|
2020-06-26 17:27:34 +02:00
|
|
|
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URI;
import java.nio.charset.StandardCharsets;

import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.mortbay.log.Log;

import eu.dnetlib.dhp.schema.orcid.WorkDetail;
import eu.dnetlib.doiboost.orcid.json.JsonHelper;
import eu.dnetlib.doiboost.orcidnodoi.xml.XMLRecordParserNoDoi;
|
2020-06-25 18:43:29 +02:00
|
|
|
|
2020-09-15 11:32:49 +02:00
|
|
|
/**
 * Writes to HDFS a single sequence file whose key is an ORCID identifier and whose
 * value is an ORCID publication in JSON format.
 */
|
|
|
|
|
2020-06-25 18:43:29 +02:00
|
|
|
public class ActivitiesDumpReader {
|
|
|
|
|
2020-07-03 23:30:31 +02:00
|
|
|
private static final int MAX_XML_WORKS_PARSED = -1;
|
|
|
|
private static final int XML_WORKS_PARSED_COUNTER_LOG_INTERVAL = 100000;
|
2020-06-25 18:43:29 +02:00
|
|
|
|
2021-08-11 12:13:22 +02:00
|
|
|
private ActivitiesDumpReader() {
|
|
|
|
}
|
|
|
|
|
2020-06-25 18:43:29 +02:00
|
|
|
public static void parseGzActivities(Configuration conf, String inputUri, Path outputPath)
|
|
|
|
throws Exception {
|
|
|
|
String uri = inputUri;
|
|
|
|
FileSystem fs = FileSystem.get(URI.create(uri), conf);
|
|
|
|
Path inputPath = new Path(uri);
|
|
|
|
CompressionCodecFactory factory = new CompressionCodecFactory(conf);
|
|
|
|
CompressionCodec codec = factory.getCodec(inputPath);
|
|
|
|
if (codec == null) {
|
|
|
|
System.err.println("No codec found for " + uri);
|
|
|
|
System.exit(1);
|
|
|
|
}
|
|
|
|
CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());
|
|
|
|
InputStream gzipInputStream = null;
|
|
|
|
try {
|
|
|
|
gzipInputStream = codec.createInputStream(fs.open(inputPath));
|
2021-08-11 12:13:22 +02:00
|
|
|
parseTarActivities(conf, gzipInputStream, outputPath);
|
2020-06-25 18:43:29 +02:00
|
|
|
|
|
|
|
} finally {
|
|
|
|
Log.debug("Closing gzip stream");
|
|
|
|
IOUtils.closeStream(gzipInputStream);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-08-11 12:13:22 +02:00
|
|
|
private static void parseTarActivities(Configuration conf, InputStream gzipInputStream, Path outputPath) {
|
2020-06-25 18:43:29 +02:00
|
|
|
int counter = 0;
|
|
|
|
int noDoiFound = 0;
|
|
|
|
int errorFromOrcidFound = 0;
|
|
|
|
int xmlParserErrorFound = 0;
|
|
|
|
try (TarArchiveInputStream tais = new TarArchiveInputStream(gzipInputStream)) {
|
|
|
|
TarArchiveEntry entry = null;
|
|
|
|
|
|
|
|
try (SequenceFile.Writer writer = SequenceFile
|
|
|
|
.createWriter(
|
|
|
|
conf,
|
|
|
|
SequenceFile.Writer.file(outputPath),
|
|
|
|
SequenceFile.Writer.keyClass(Text.class),
|
|
|
|
SequenceFile.Writer.valueClass(Text.class))) {
|
|
|
|
while ((entry = tais.getNextTarEntry()) != null) {
|
|
|
|
String filename = entry.getName();
|
2021-08-11 12:13:22 +02:00
|
|
|
StringBuilder builder = new StringBuilder();
|
2020-06-25 18:43:29 +02:00
|
|
|
try {
|
|
|
|
if (entry.isDirectory() || !filename.contains("works")) {
|
|
|
|
|
|
|
|
} else {
|
|
|
|
Log.debug("XML work entry name: " + entry.getName());
|
|
|
|
counter++;
|
|
|
|
BufferedReader br = new BufferedReader(new InputStreamReader(tais)); // Read directly from
|
|
|
|
// tarInput
|
|
|
|
String line;
|
2021-08-11 12:13:22 +02:00
|
|
|
builder = new StringBuilder();
|
2020-06-25 18:43:29 +02:00
|
|
|
while ((line = br.readLine()) != null) {
|
2021-08-11 12:13:22 +02:00
|
|
|
builder.append(line);
|
2020-06-25 18:43:29 +02:00
|
|
|
}
|
2021-04-01 17:07:49 +02:00
|
|
|
WorkDetail workDetail = XMLRecordParserNoDoi
|
2021-08-11 12:13:22 +02:00
|
|
|
.VTDParseWorkData(builder.toString().getBytes());
|
2021-04-01 17:07:49 +02:00
|
|
|
if (workDetail != null) {
|
|
|
|
if (workDetail.getErrorCode() != null) {
|
2020-06-25 18:43:29 +02:00
|
|
|
errorFromOrcidFound += 1;
|
|
|
|
Log
|
|
|
|
.debug(
|
|
|
|
"error from Orcid with code "
|
2021-04-01 17:07:49 +02:00
|
|
|
+ workDetail.getErrorCode()
|
2020-06-25 18:43:29 +02:00
|
|
|
+ " for entry "
|
|
|
|
+ entry.getName());
|
|
|
|
continue;
|
|
|
|
}
|
2021-04-01 17:07:49 +02:00
|
|
|
boolean isDoiFound = workDetail
|
2020-06-26 17:27:34 +02:00
|
|
|
.getExtIds()
|
|
|
|
.stream()
|
|
|
|
.filter(e -> e.getType() != null)
|
|
|
|
.anyMatch(e -> e.getType().equals("doi"));
|
2020-06-25 18:43:29 +02:00
|
|
|
if (!isDoiFound) {
|
2021-04-01 17:07:49 +02:00
|
|
|
String jsonData = JsonHelper.createOidWork(workDetail);
|
|
|
|
Log.debug("oid: " + workDetail.getOid() + " data: " + jsonData);
|
2020-06-25 18:43:29 +02:00
|
|
|
|
2021-04-01 17:07:49 +02:00
|
|
|
final Text key = new Text(workDetail.getOid());
|
2020-06-25 18:43:29 +02:00
|
|
|
final Text value = new Text(jsonData);
|
|
|
|
|
|
|
|
try {
|
|
|
|
writer.append(key, value);
|
|
|
|
} catch (IOException e) {
|
|
|
|
Log.debug("Writing to sequence file: " + e.getMessage());
|
|
|
|
Log.debug(e);
|
|
|
|
throw new RuntimeException(e);
|
|
|
|
}
|
|
|
|
noDoiFound += 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
} else {
|
2021-08-11 12:13:22 +02:00
|
|
|
Log.warn("Data not retrievable [" + entry.getName() + "] " + builder);
|
2020-06-25 18:43:29 +02:00
|
|
|
xmlParserErrorFound += 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} catch (Exception e) {
|
2020-10-22 14:02:32 +02:00
|
|
|
throw new Exception(filename, e);
|
2020-06-25 18:43:29 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
if ((counter % XML_WORKS_PARSED_COUNTER_LOG_INTERVAL) == 0) {
|
|
|
|
Log.info("Current xml works parsed: " + counter);
|
|
|
|
}
|
|
|
|
|
|
|
|
if ((MAX_XML_WORKS_PARSED > -1) && (counter > MAX_XML_WORKS_PARSED)) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2020-10-22 14:02:32 +02:00
|
|
|
} catch (Exception e) {
|
2020-06-25 18:43:29 +02:00
|
|
|
Log.warn("Parsing work from gzip archive: " + e.getMessage());
|
|
|
|
Log.warn(e);
|
|
|
|
throw new RuntimeException(e);
|
|
|
|
}
|
|
|
|
Log.info("Activities parse completed");
|
|
|
|
Log.info("Total XML works parsed: " + counter);
|
|
|
|
Log.info("Total no doi work found: " + noDoiFound);
|
|
|
|
Log.info("Error from Orcid found: " + errorFromOrcidFound);
|
|
|
|
Log.info("Error parsing xml work found: " + xmlParserErrorFound);
|
|
|
|
}
|
|
|
|
}
|