BrBETA_dnet-hadoop/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/orcid/DownloadORCIDTest.java

159 lines
4.7 KiB
Java

package eu.dnetlib.dhp.collection.orcid;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.Objects;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.io.Text;
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.ximpleware.NavException;
import com.ximpleware.ParseException;
import com.ximpleware.XPathEvalException;
import com.ximpleware.XPathParseException;
import eu.dnetlib.dhp.collection.orcid.model.Author;
import eu.dnetlib.dhp.collection.orcid.model.ORCIDItem;
import eu.dnetlib.dhp.parser.utility.VtdException;
public class DownloadORCIDTest {
private final Logger log = LoggerFactory.getLogger(DownloadORCIDTest.class);
// public void test() throws Exception {
//
// Configuration conf = new Configuration();
// // Set FileSystem URI
//// conf.set("fs.defaultFS", "file://");
// // Because of Maven
// conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
// conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
//
// System.setProperty("hadoop.home.dir", "file:///Users/sandro/orcid/");
//
// final FileSystem fileSystem = FileSystem.get(conf);
//
// new ExtractORCIDDump(fileSystem).run("/Users/sandro/orcid/", "/Users/sandro/orcid/extracted");
//
//// final GZIPInputStream gzip = new GZIPInputStream(Files.newInputStream(Paths.get("/Users/sandro/orcid/ORCID_2023_10_activities_1.tar.gz")));
//// try(final TarArchiveInputStream tais = new TarArchiveInputStream(gzip)) {
////
//// TarArchiveEntry entry;
//// while ((entry = tais.getNextTarEntry()) != null) {
////
//// if (entry.isFile() && entry.getName().contains("employments")) {
////
//// System.out.println(entry.getName());
//// final String [] items = entry.getName().split("/");
////
//// final String res = IOUtils.toString(new BufferedReader(new InputStreamReader(tais)));
//// System.out.println("res = " + res);
////
//// System.out.println(items[items.length-2]);
//// break;
//// }
////
////
//// }
//// }
//
// }
@Test
public void testSummary() throws Exception {
final String xml = IOUtils
.toString(
Objects.requireNonNull(getClass().getResourceAsStream("/eu/dnetlib/dhp/collection/orcid/summary.xml")));
final OrcidParser parser = new OrcidParser();
ORCIDItem orcidItem = parser.parseSummary(xml);
final ObjectMapper mapper = new ObjectMapper();
System.out.println(mapper.writeValueAsString(orcidItem));
}
@Test
public void testParsingWork() throws Exception {
final List<String> works_path = Arrays
.asList(
"/eu/dnetlib/dhp/collection/orcid/activity_work_0000-0002-2536-4498.xml",
"/eu/dnetlib/dhp/collection/orcid/activity_work_0000-0002-5982-8983.xml",
"/eu/dnetlib/dhp/collection/orcid/activity_work_0000-0003-2760-1191.xml",
"/eu/dnetlib/dhp/collection/orcid/activity_work_0000-0003-2760-1191-similarity.xml",
"/eu/dnetlib/dhp/collection/orcid/activity_work_0000-0003-2760-1191_contributors.xml"
);
final OrcidParser parser = new OrcidParser();
final ObjectMapper mapper = new ObjectMapper();
works_path.stream().map(s -> {
try {
return IOUtils
.toString(
Objects
.requireNonNull(
getClass()
.getResourceAsStream(
s)));
} catch (IOException e) {
throw new RuntimeException(e);
}
}).forEach(s -> {
try {
System.out.println(mapper.writeValueAsString(parser.parseWork(s)));
} catch (Exception e) {
throw new RuntimeException(e);
}
});
}
@Test
public void testParsingEmployments() throws Exception {
final List<String> works_path = Arrays
.asList(
"/eu/dnetlib/dhp/collection/orcid/employment.xml",
"/eu/dnetlib/dhp/collection/orcid/employment_2.xml",
"/eu/dnetlib/dhp/collection/orcid/employment_3.xml"
);
final OrcidParser parser = new OrcidParser();
final ObjectMapper mapper = new ObjectMapper();
works_path.stream().map(s -> {
try {
return IOUtils
.toString(
Objects
.requireNonNull(
getClass()
.getResourceAsStream(
s)));
} catch (IOException e) {
throw new RuntimeException(e);
}
}).forEach(s -> {
try {
System.out.println(mapper.writeValueAsString(parser.parseEmployment(s)));
} catch (Exception e) {
throw new RuntimeException(e);
}
});
}
}